<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Phys.</journal-id>
<journal-title>Frontiers in Physics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Phys.</abbrev-journal-title>
<issn pub-type="epub">2296-424X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1364372</article-id>
<article-id pub-id-type="doi">10.3389/fphy.2024.1364372</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Physics</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Hair cluster detection model based on dermoscopic images</article-title>
<alt-title alt-title-type="left-running-head">Xiong et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fphy.2024.1364372">10.3389/fphy.2024.1364372</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Xiong</surname>
<given-names>Ya</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yu</surname>
<given-names>Kun</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Lan</surname>
<given-names>Yujie</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Lei</surname>
<given-names>Zeyuan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1607176/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Fan</surname>
<given-names>Dongli</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1499125/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Department of Plastic and Cosmetic Surgery</institution>, <institution>Xinqiao Hospital</institution>, <institution>The Army Medical University</institution>, <addr-line>Chongqing</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>College of Automation</institution>, <institution>Chongqing University of Posts and Telecommunications</institution>, <addr-line>Chongqing</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/669638/overview">Bo Xiao</ext-link>, Imperial College London, United Kingdom</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1307533/overview">Huafeng Li</ext-link>, Kunming University of Science and Technology, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1976485/overview">Puhong Duan</ext-link>, Hunan University, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2367159/overview">Youlin Wang</ext-link>, University of Montreal, Canada</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Dongli Fan, <email>fdltmmu@sina.com</email>; Zeyuan Lei, <email>leizeyuan0854@163.com</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>14</day>
<month>02</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>12</volume>
<elocation-id>1364372</elocation-id>
<history>
<date date-type="received">
<day>02</day>
<month>01</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>02</day>
<month>02</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2024 Xiong, Yu, Lan, Lei and Fan.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Xiong, Yu, Lan, Lei and Fan</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>
<bold>Introduction:</bold> Hair loss has always bothered many people, with numerous individuals potentially facing the issue of sparse hair.</p>
<p>
<bold>Methods:</bold> Due to a scarcity of accurate research on detecting sparse hair, this paper proposes a sparse hair cluster detection model based on improved object detection neural network and medical images of sparse hair under dermatoscope to optimize the evaluation of treatment outcomes for hair loss patients. A new Multi-Level Feature Fusion Module is designed to extract and fuse features at different levels. Additionally, a new Channel-Space Dual Attention Module is proposed to consider both channel and spatial dimensions simultaneously, thereby further enhancing the model&#x2019;s representational capacity and the precision of sparse hair cluster detection.</p>
<p>
<bold>Results:</bold> After testing on self-annotated data, the proposed method is proven capable of accurately identifying and counting sparse hair clusters, surpassing existing methods in terms of accuracy and efficiency.</p>
<p>
<bold>Discussion:</bold> Therefore, it can work as an effective tool for early detection and treatment of sparse hair, and offer greater convenience for medical professionals in diagnosis and treatment.</p>
</abstract>
<kwd-group>
<kwd>hair loss</kwd>
<kwd>dermatoscope</kwd>
<kwd>hair cluster detection</kwd>
<kwd>feature fusion</kwd>
<kwd>dual attention module</kwd>
</kwd-group>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Radiation Detectors and Imaging</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>As a common issue, sparse hair [<xref ref-type="bibr" rid="B1">1</xref>] bothers many people, affecting both males and females alike [<xref ref-type="bibr" rid="B2">2</xref>], [<xref ref-type="bibr" rid="B3">3</xref>]. Hair loss or thinning primarily attributed to genetic factors, hormonal changes, environmental conditions, or medical conditions is a prevalent problem affecting millions worldwide [<xref ref-type="bibr" rid="B4">4</xref>]. Regardless of gender or age, it impacts an individual&#x2019;s self-esteem, personal aesthetics, and overall mental health. Traditional solutions such as drug treatments, hair transplants, or wearing wigs have achieved varying degrees of success and affordability, but they do not fundamentally resolve the problem or prevent its recurrence. Therefore, early detection and predictive analysis of sparse hair conditions are vital for implementing preventative measures and more effective treatments [<xref ref-type="bibr" rid="B5">5</xref>].</p>
<p>Over the past few decades, both domestic and international researchers have been exploring how to accurately detect sparse hair. The earliest research primarily relies on manual feature extraction and traditional image processing techniques [<xref ref-type="bibr" rid="B6">6</xref>]. However, due to the limitations on the selection and representational power of features, these methods are difficult to adapt to the complex and diverse forms of hair clusters. Therefore, with the rapid development of computer vision and deep learning [<xref ref-type="bibr" rid="B7">7</xref>], researchers introduce neural network into the field of sparse hair target detection. In recent years, with the advent of artificial intelligence (AI) and deep learning technologies, their application in the healthcare sector grows exponentially, providing promising results in different fields like diagnosis, prognosis, treatment planning, and public health [<xref ref-type="bibr" rid="B8">8</xref>]. In light of this, the development of AI-driven sparse hair detection models [<xref ref-type="bibr" rid="B9">9</xref>], especially those based on neural network, offers a promising research pathway.</p>
<p>Based on the strong learning capability and adaptability, neural network is able to learn effective feature representations from a large amount of data and train and optimize through the backpropagation algorithm. This provides new opportunities and challenges for the target detection of sparse hair [<xref ref-type="bibr" rid="B10">10</xref>]. Researchers design and improve hair cluster target detection models based on neural network to enhance detection accuracy and robustness.</p>
<p>At present, domestic and international research in the field of sparse hair detection is still in the exploratory stage [<xref ref-type="bibr" rid="B11">11</xref>]. Some studies have utilized traditional Convolutional Neural Network (CNN) to detect hair clusters, improving detection performance by constructing deep-level feature representations and using effective loss functions. Other studies have explored more advanced network structures, such as Recurrent Neural Network (RNN) and Attention Mechanisms, to capture the temporal information and local details of hair clusters. In summary, using neural network in hair cluster target detection models for sparse hair detection has enormous potential to thoroughly transform hair care and treatment [<xref ref-type="bibr" rid="B12">12</xref>].</p>
<p>However, the target detection of sparse hair still faces some challenges. Hair clusters exhibit diverse morphologies with differences in color, texture, and shape [<xref ref-type="bibr" rid="B13">13</xref>], posing difficulties for detection algorithms. Additionally, due to the sparse distribution of hair, hair cluster targets unevenly occupy proportions in images, making target detection more challenging. Currently, dermatoscopy is a non-invasive diagnostic technique that allows the observation of hair shafts, follicles, and capillaries, providing a visual representation of inflammation around the scalp and changes in hair shaft diameter and shape [<xref ref-type="bibr" rid="B14">14</xref>]. It is widely used in the diagnosis and treatment of hair diseases, as well as in the assessment and follow-up of prognosis [<xref ref-type="bibr" rid="B15">15</xref>], [<xref ref-type="bibr" rid="B16">16</xref>], [<xref ref-type="bibr" rid="B17">17</xref>], [<xref ref-type="bibr" rid="B18">18</xref>]. Digital intelligent analysis of dermatoscopy is still in the developmental stage, and research on dermoscopy for androgenetic alopecia is limited. For the daily management and assessment of treatment outcomes for patients with hair loss, hair counting plays a crucial role. However, there are currently no clear standards for a comprehensive evaluation of hair loss across the entire scalp.</p>
<p>In response to these challenges, this study utilizes hair images obtained by dermoscopy, combined with existing advanced target detection techniques, to propose an efficient and accurate sparse hair cluster target detection model. This model sets the hair cluster as the detection target (in this paper, the sparse hair or hair loss area) and predicts the number of hair clusters. This paper has three main contributions as follows.<list list-type="simple">
<list-item>
<p>1. Based on the advanced existing object detection networks, a dermoscopy image hair detection network structure based on an improved object detection neural network is proposed to better adapt to sparse hair detection. Through experiments, it proves that the proposed method surpasses the existing methods in terms of accuracy and efficiency, providing an effective tool for early detection and treatment of sparse hair.</p>
</list-item>
<list-item>
<p>2. Multi-Level Feature Fusion Module: A new multi-level feature fusion Module (MLFF) is designed to extract and fuse features at different levels. The MLFF structure can obtain features from different convolutional layers, then integrate these features through a specific fusion strategy to produce a richer, more representative feature expression.</p>
</list-item>
<list-item>
<p>3. Channel-Space Dual Attention Module: A new attention mechanism, the Channel-Space Dual Attention Module, is proposed to consider both channel and spatial dimensions&#x2019; information simultaneously. The CSDA module can handle channel and spatial correlation in a unified framework, thereby further enhancing the model&#x2019;s expressive capacity and accuracy of sparse hair detection.</p>
</list-item>
</list>
</p>
</sec>
<sec id="s2">
<title>2 Related work</title>
<p>With the rapid development of computer technology and computer-assisted medical diagnostic systems, the continuous growth of computational power and data, deep learning has experienced tremendous development, becoming one of the powerful tools in the medical field. The technology of feature extraction and classification from medical images [<xref ref-type="bibr" rid="B19">19</xref>], [<xref ref-type="bibr" rid="B20">20</xref>] using maturing deep learning models is increasingly mature.</p>
<p>The field of object detection has always been a research hotspot. For instance, one study proposed a safety helmet detection method based on the YOLOv5 algorithm [<xref ref-type="bibr" rid="B21">21</xref>]. This research involved annotating a collected dataset of 6,045, training, and testing the YOLOv5 model with different parameters. In another study, YOLOv4 was employed for small object detection and anti-complex background interference in remote sensing images [<xref ref-type="bibr" rid="B22">22</xref>]. With the use of deep learning-based algorithms, ship detection technology has greatly enriched, allowing monitoring of large, distant seas. Through the use of a custom dataset with four types of ship targets, Kmeans&#x2b;&#x2b; clustering algorithm for prior box framework selection, and transfer learning method, the study enhanced YOLOv4&#x2019;s detection ability. Further improvements were introduced by replacing Spatial Pyramid Pooling (SPP) with a Receptive Field Block with dilated convolution and adding a Convolutional Block Attention Module (CBAM). These modifications have improved the detection performance of small vessels and enhanced the model&#x2019;s resistance to complex backgrounds. Due to the relatively large size and distinct features of vessels, the detection results are satisfactory. However, it remains a challenge for densely packed, small targets.</p>
<p>In recent years, there has been an emergence of research utilizing deep learning methods in skin imaging analysis, particularly in studies related to hair. Researchers have explored the application of deep learning-based object detection [<xref ref-type="bibr" rid="B23">23</xref>], [<xref ref-type="bibr" rid="B24">24</xref>], segmentation [<xref ref-type="bibr" rid="B25">25</xref>], and other algorithms in hair detection and segmentation. These studies primarily focus on aspects such as hair detection, removal, segmentation, and even reconstruction, but there is room for improvement in terms of accuracy.</p>
<p>Various deep learning structures and techniques are introduced in multiple studies to address the challenges related to hair recognition and removal in dermoscopic images. One such study proposed a novel deep learning technique, Chimera Net [<xref ref-type="bibr" rid="B26">26</xref>], an encoder-decoder architecture that uses a pretrained EfficientNet and squeeze-and-excitation residual (SERes) structure. This method exhibited superior performance over well-known deep learning methods like U-Net and ResUNet-a. Additionally, other research explored difficulties and solutions related to hair reconstruction. A novel method was proposed to capture high-fidelity hair geometry with strand-level accuracy [<xref ref-type="bibr" rid="B13">13</xref>]. The multi-stage approach includes a new multiview stereo method and a novel cost function for reconstructing each hair pixel into a 3D line. The task of Digital Hair Removal (DHR) also received ample research. One study proposed a DHR deep learning method using U-Net and free-form image restoration architecture [<xref ref-type="bibr" rid="B9">9</xref>]. It outperforms other state-of-the-art methods on the ISIC2018 dataset. Another study by Attia et al. [<xref ref-type="bibr" rid="B10">10</xref>] explored a similar theme, highlighting the challenges associated with hair segmentation and its impact on subsequent skin lesion diagnosis. Moreover, one paper delved into an important metric for determining the number of hairs on the scalp [<xref ref-type="bibr" rid="B27">27</xref>]. It stressed the need for an automated method to increase speed and throughput while lowering the cost of counting and measuring hair in trichogram images. The proposed deep learning-based method enables rapid, fully automatic hair counting and length measurement. Another study described a real-time hair segmentation method based on a fully convolutional network, the basic structure of which is an encoder-decoder [<xref ref-type="bibr" rid="B28">28</xref>]. 
This method uses Mobile-Unet, a variant of the U-Net segmentation model, which combines the optimization techniques of MobileNetV2.</p>
<p>In summary, the above studies emphasize the enormous potential of deep learning techniques in advancing hair-related dermoscopy research. However, deep learning-based sparse hair detection is still in the exploratory stage. To address these challenges, this paper, based on sparse hair dermoscopic medical images, proposes a dermoscopic image hair detection network structure based on an improved object detection neural network to achieve the detection of sparse hair clusters (sparse hair or hair loss areas in this paper) and predict the number of hair clusters.</p>
</sec>
<sec sec-type="materials|methods" id="s3">
<title>3 Materials and methods</title>
<p>In this section, we will provide a detailed introduction to the proposed sparse hair detection network structure, which is based on the object detection network [<xref ref-type="bibr" rid="B29">29</xref>]. Firstly, we will describe the overall structure of the network in <xref ref-type="sec" rid="s3-1">Section 3.1</xref>. Subsequently, we will highlight the novel contributions of this paper in <xref ref-type="sec" rid="s3-2">Sections 3.2</xref>, <xref ref-type="sec" rid="s3-3">3.3</xref>, namely, the MLFF Module and the CSDA Module, respectively.</p>
<sec id="s3-1">
<title>3.1 Overall structure</title>
<p>The overall framework proposed for sparse hair detection in this article is illustrated in <xref ref-type="fig" rid="F1">Figure 1</xref>, primarily based on enhancements to classical object detection network architectures. Given the crucial significance of the accuracy of the sparse hair detection model for hair target recognition and assisting doctors in obtaining diagnostic results, the model proposed in this article is intended for application in sparse hair target detection models.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>The method proposed in this paper.</p>
</caption>
<graphic xlink:href="fphy-12-1364372-g001.tif"/>
</fig>
<p>It can be divided into three parts: the feature extraction backbone network, the feature enhancement and processing network, and the detection network. Specifically, the feature extraction backbone network is a convolutional neural network that incorporates the concept of a feature pyramid architecture, capable of extracting image features at different levels and reducing model computation while speeding up training. As shallow features contain more semantic information, a MLFF Module is proposed to handle them, preventing the loss of semantic information. At the end of the feature extraction backbone network, there is a Spatial Pyramid Pooling (SPP) module aimed at improving the network&#x2019;s receptive field by transforming feature maps of arbitrary sizes into fixed-size feature vectors. Three main backbone features can be obtained through the feature extraction backbone network.</p>
<p>In the feature enhancement and processing network, the Channel-Spatial Dual Attention module (CSDA) is introduced. The three feature layers obtained from the backbone network undergo processing through this module to generate enhanced features. Subsequently, processing is carried out based on the YOLOv5 network model. This network segment primarily consists of a series of feature aggregation layers that mix and combine image features to generate a Feature Pyramid Network (FPN). The output feature maps are then transferred to the detection network. With the adoption of a novel FPN structure, this design strengthens the bottom-up pathway, improving the transfer of low-level features and enhancing the detection of objects at different scales. Consequently, it enables the accurate identification of the same target object with varying sizes and proportions.</p>
<p>The detection network is primarily employed for the final detection phase of the model. It applies anchor boxes to the feature maps output from the preceding layer and outputs a vector containing the class probability, object score, and position of the bounding box around the object. The detection network of the proposed architecture consists of three detection layers, with inputs being feature maps of sizes 80 &#xd7; 80, 40 &#xd7; 40, and 20 &#xd7; 20, respectively, used for detecting objects of different sizes in the image. Each detection layer ultimately outputs an 18-dimensional vector ((4 &#x2b; 1&#x2b;1)&#xd7;3 anchor boxes). The first four parameters are used for determining the regression parameters for each feature point, and adjusting these regression parameters yields the predicted box. The fifth parameter is utilized to determine whether each feature point contains an object, and the last parameter is employed to identify the category of the object contained in each feature point. Subsequently, the predicted bounding boxes and categories of the targets in the original image are generated and labeled, enabling the detection of clusters of hair targets in the image.</p>
<p>
<xref ref-type="statement" rid="Algorithm_1">Algorithm 1</xref> describes the training process of the hair detection model in dermoscopic images. The computation time increases linearly with the increase of training sample, batch size, and training epochs. The time complexity of the training algorithm is <italic>O</italic> [<italic>E</italic> &#xd7; (<italic>n</italic>/<italic>B</italic>) &#xd7; 2 &#xd7; (<italic>M</italic> &#x2212; 1)].</p>
<p>
<statement content-type="algorithm" id="Algorithm_1">
<label>Algorithm 1</label>
<p>A dermoscopy-image hair detection model based on improved object detection neural network.<list list-type="simple">
<list-item>
<p>
<bold>Input:</bold>&#xa0;Training dataset <italic>D</italic>, segmentation model <italic>M</italic>, number of epochs <italic>E</italic>, learning rate <italic>&#x3b7;</italic>, <italic>n</italic> training samples, loss function <italic>L</italic>, batch size <italic>B</italic>
</p>
</list-item>
<list-item>
<p>
<bold>Output:</bold>&#xa0;Trained segmentation model <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</list-item>
<list-item>
<p>1:&#x2003;Initialize segmentation model <italic>M</italic>
</p>
</list-item>
<list-item>
<p>2:&#x2003;<bold>for</bold>&#xa0;<italic>e</italic> &#x2208; [1, <italic>E</italic>]&#xa0;<bold>do</bold>
</p>
</list-item>
<list-item>
<p>3:&#x2003;&#x2003;&#x2003;<bold>for</bold>&#xa0;<italic>b</italic> &#x2208; [1, <italic>n</italic>/<italic>B</italic>](mini-batch <italic>b</italic> in <italic>D</italic> with size <italic>B</italic>)&#xa0;<bold>do</bold>
</p>
</list-item>
<list-item>
<p>4:&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;Perform forward pass on <italic>M</italic> with mini-batch <italic>b</italic>
</p>
</list-item>
<list-item>
<p>5:&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;Calculate detection loss according to the loss function <italic>L</italic>
</p>
</list-item>
<list-item>
<p>6:&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;Perform backward pass and update model weights and model according to the gradient</p>
</list-item>
<list-item>
<p>7:&#x2003;&#x2003;&#x2003;<bold>end</bold>&#xa0;<bold>for</bold>
</p>
</list-item>
<list-item>
<p>8:&#x2003;&#x2003;&#x2003;Save the trained model <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>9:&#x2003;<bold>end</bold>&#xa0;<bold>for</bold>
</p>
</list-item>
</list>
</p>
</statement>
</p>
</sec>
<sec id="s3-2">
<title>3.2 Multi-level feature fusion structure</title>
<p>The main task of the MLFF (Multi-Level Feature Fusion) structure is to process a large amount of semantic information contained in shallow layers. Its structure is shown in <xref ref-type="fig" rid="F2">Figure 2</xref>. The purpose of this module is to extract and fuse semantic information from shallow features, so that the resulting feature information is more detailed and more suitable for subsequent object detection tasks. Semantic feature information reflects a global feature of homogeneous phenomena in the image, depicting the surface organization and arrangement rules of slow-changing or cyclically-changing structures in the image. However, the low-level information extracted by the original backbone network (such as pixel values or local region attributes) is often of low quality and contrast, making it difficult to obtain and utilize this low-level information effectively. This paper proposes the MLFF module to address this problem.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Multi-level feature fusion structure.</p>
</caption>
<graphic xlink:href="fphy-12-1364372-g002.tif"/>
</fig>
<p>As shown in <xref ref-type="fig" rid="F2">Figure 2</xref>, in this module, a feature <italic>X</italic>
<sub>1</sub> Eq. <xref ref-type="disp-formula" rid="e1">1</xref> before the output of this module serves as the input. It undergoes two consecutive CBS modules, resulting in two feature layers <italic>X</italic>
<sub>2</sub> and <italic>X</italic>
<sub>3</sub> Eq. <xref ref-type="disp-formula" rid="e1">1</xref>, represented as follows:<disp-formula id="e1">
<mml:math id="m3">
<mml:mtable class="aligned">
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:math>
<label>(1)</label>
</disp-formula>
</p>
<p>The CBS module represents a sequence of convolution operation, batch normalization operation, and activation function operation. This sequence is designed to capture local relationships within the input data, facilitating effective feature learning in images. Simultaneously, it helps mitigate the vanishing gradient problem and enhances the model&#x2019;s adaptability to changes in the distribution of input data. The CBS module can be expressed as follows:<disp-formula id="e2">
<mml:math id="m4">
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">out</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>U</mml:mi>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>N</mml:mi>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>v</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">out</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:math>
<label>(2)</label>
</disp-formula>
</p>
<p>Where Conv represents the convolution operation, BN represents batch normalization operation, and SiLU represents the activation function operation. <italic>X</italic>
<sub>
<italic>out</italic>
</sub> represents the output feature of the CBS module, <italic>X</italic>
<sub>
<italic>in</italic>
</sub> represents the input feature of the CBS module, <italic>c</italic>
<sub>
<italic>in</italic>
</sub> represents the number of channels in the input feature, and <italic>c</italic>
<sub>
<italic>out</italic>
</sub>represents the number of channels in the output feature.</p>
<p>After the three features obtained through stacking and fusion, two feature layers are obtained. They will undergo another CBS module (where <italic>c</italic>
<sub>
<italic>in</italic>
</sub> &#x3d; <italic>c</italic>
<sub>
<italic>out</italic>
</sub>) for feature processing. Finally, these features will be stacked together, achieving feature integration. With the depth of feature processing and fusion, the dimension of the image feature vector continuously increases, and the size of each slice changes accordingly. Finally, after passing through a CBS module (where <italic>c</italic>
<sub>
<italic>in</italic>
</sub> &#x3d; <italic>c</italic>
<sub>
<italic>out</italic>
</sub>), as in Eq. <xref ref-type="disp-formula" rid="e2">2</xref>, the output feature is given by Eq. <xref ref-type="disp-formula" rid="e3">3</xref>:<disp-formula id="e3">
<mml:math id="m5">
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">MLFF</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>4</mml:mn>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
<label>(3)</label>
</disp-formula>
</p>
<p>The obtained features will be inputted into the feature enhancement and processing network for further processing, where the abundant semantic information contained in the shallow layers will be fully utilized to achieve better detection performance. The first three branches actually correspond to dense residual structures, which take into account the easy-to-optimize characteristics of residual networks, and the ability of residual networks to improve the overall accuracy of the network by adding a considerable depth. In addition, skip connections are used to alleviate the problem of gradient disappearance caused by the depth of the neural network.</p>
<p>For the CBS module, the SiLU activation function is used, which is an improved version based on the Sigmoid activation function and ReLU activation function. SiLU has the characteristics of no upper bound and a lower bound, smoothness, and non-monotonicity. SiLU performs better than ReLU in deep models and can be considered as a smoothed ReLU activation function. Its specific implementation is shown in Eq. <xref ref-type="disp-formula" rid="e4">4</xref> below:<disp-formula id="e4">
<mml:math id="m6">
<mml:mi>f</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>d</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:math>
<label>(4)</label>
</disp-formula>
</p>
</sec>
<sec id="s3-3">
<title>3.3 Channel-space dual attention module</title>
<p>After obtaining feature information at different depths, it is necessary to further process these features to capture the target information in them. Therefore, this paper proposes a Channel-Space Dual Attention Module (CSDA) for feature inference, as shown in <xref ref-type="fig" rid="F3">Figure 3</xref>. Finally, the inferred information is passed through the second part of the object detection model architecture to obtain three types of feature maps.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Channel spatial dual attention module.</p>
</caption>
<graphic xlink:href="fphy-12-1364372-g003.tif"/>
</fig>
<p>The module proposed in this article takes the feature layers obtained from the feature extraction backbone network, namely, <inline-formula id="inf3">
<mml:math id="m7">
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>80</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>80</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>256</mml:mn>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>, <inline-formula id="inf4">
<mml:math id="m8">
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>40</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>40</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>512</mml:mn>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> and <inline-formula id="inf5">
<mml:math id="m9">
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>20</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>20</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1024</mml:mn>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>, and infers attention maps along two different dimensions. One dimension is the channel attention mechanism, which is based on the SE module [<xref ref-type="bibr" rid="B30">30</xref>] and uses global average pooling to calculate channel attention. The other dimension is the spatial attention mechanism, which focuses on which pixels in different feature maps are important and require significant attention. Then, the channel attention map and the spatial attention map are multiplied successively with the feature maps on the backbone to perform adaptive feature focusing, resulting in corresponding feature maps <inline-formula id="inf6">
<mml:math id="m10">
<mml:msub>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>, <inline-formula id="inf7">
<mml:math id="m11">
<mml:msub>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula> and <inline-formula id="inf8">
<mml:math id="m12">
<mml:msub>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>.</p>
<p>For the Squeeze-and-Excitation module, it can be viewed as a computational unit that mainly embeds the dependency factors of feature map channels into variable <italic>&#x3c5;</italic>. This is to ensure that the network can enhance its sensitivity to information features and suppress less useful features. In the channel-wise optimization process, squeezing and excitation steps are applied to optimize the response of the convolutional kernel, in order to capture the correlation of channel information. The specific implementation is shown in the following equation:<disp-formula id="e5">
<mml:math id="m13">
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">tran</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>&#x2192;</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>;</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
<label>(5)</label>
</disp-formula>
</p>
<p>In the equation, <italic>C</italic>
<sub>
<italic>tran</italic>
</sub> is the convolutional operator, <italic>&#x3c5;</italic> &#x3d; [<italic>v</italic>
<sub>1</sub>, <italic>v</italic>
<sub>2</sub>, &#x2026; , <italic>v</italic>
<sub>
<italic>n</italic>
</sub>] represents the learned weights in the network, where <italic>v</italic><sub><italic>n</italic></sub> denotes the parameters of the <italic>n</italic> &#x2212; <italic>th</italic> convolutional kernel. Therefore, the output of the convolutional operator is <italic>Y</italic> &#x3d; [<italic>y</italic>
<sub>1</sub>, <italic>y</italic>
<sub>2</sub>, &#x2026; , <italic>y</italic>
<sub>
<italic>n</italic>
</sub>], which is implemented as shown in Eq. <xref ref-type="disp-formula" rid="e5">5</xref> and Eq. <xref ref-type="disp-formula" rid="e6">6</xref>. In the proposed attention module, after the channel attention, we can obtain the feature <italic>F</italic>
<sub>
<italic>channel</italic>
</sub>.<disp-formula id="e6">
<mml:math id="m14">
<mml:mi>Y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c5;</mml:mi>
<mml:mo>&#x2217;</mml:mo>
<mml:mi>X</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:munderover accentunder="false" accent="true">
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:munderover>
<mml:msub>
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2217;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
<label>(6)</label>
</disp-formula>
</p>
<p>Regarding the spatial attention module, as shown in the right half of <xref ref-type="fig" rid="F3">Figure 3</xref>, the feature map obtained by the feature extraction network is understood as a three-dimensional space, where each slice corresponds to a channel. Firstly, the values at the same position on different channels are subjected to average pooling and max pooling operations to obtain the features <italic>F</italic>
<sub>max</sub>, <italic>F</italic>
<sub>
<italic>average</italic>
</sub>, as shown in Eq. <xref ref-type="disp-formula" rid="e7">7</xref>.<disp-formula id="e7">
<mml:math id="m15">
<mml:mtable class="aligned">
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>max</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>M</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>x</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">average</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>A</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:math>
<label>(7)</label>
</disp-formula>
</p>
<p>Finally, convolution and normalization operations are applied to generate a 2D spatial attention map <italic>F</italic>
<sub>
<italic>spatial</italic>
</sub>, which is computed as follows Eq. <xref ref-type="disp-formula" rid="e8">8</xref>:<disp-formula id="e8">
<mml:math id="m16">
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">spatial</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>d</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>7</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>7</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>max</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">average</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:math>
<label>(8)</label>
</disp-formula>
</p>
<p>The symbol <italic>f</italic>
<sup>7 &#xd7; 7</sup> represents a convolution operation with a kernel size of 7 &#xd7; 7. After obtaining the channel attention map, it is multiplied with the input feature map <italic>F</italic> to obtain a new feature map <italic>F</italic>&#x2032;. This new feature map <italic>F</italic>&#x2032; is then multiplied with the spatial attention map to obtain the final feature map <italic>F</italic>&#x2033;. The overall process can be described as follows Eq. <xref ref-type="disp-formula" rid="e9">9</xref>:<disp-formula id="e9">
<mml:math id="m17">
<mml:mtable class="aligned">
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">channel</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2297;</mml:mo>
<mml:mi>F</mml:mi>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2033;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2297;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">spatial</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:math>
<label>(9)</label>
</disp-formula>
</p>
<p>Finally, three feature maps, denoted as <inline-formula id="inf9">
<mml:math id="m18">
<mml:msub>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>, <inline-formula id="inf10">
<mml:math id="m19">
<mml:msub>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula> and <inline-formula id="inf11">
<mml:math id="m20">
<mml:msub>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>, can be obtained. The obtained new features are then processed and enhanced using feature processing networks and detection networks to obtain the final object detection results. The experimental results of the proposed network will be discussed in <xref ref-type="sec" rid="s4">Section 4</xref> of this paper.</p>
</sec>
<sec id="s3-4">
<title>3.4 Attention dynamic head</title>
<p>Introducing dynamic heads [<xref ref-type="bibr" rid="B31">31</xref>], based on three feature maps <inline-formula id="inf12">
<mml:math id="m21">
<mml:msub>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>, <inline-formula id="inf13">
<mml:math id="m22">
<mml:msub>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula> and <inline-formula id="inf14">
<mml:math id="m23">
<mml:msub>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>, the general formula for applying self-attention is as follows Eq. <xref ref-type="disp-formula" rid="e10">10</xref>:<disp-formula id="e10">
<mml:math id="m24">
<mml:mi>W</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="script">F</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c0;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="script">F</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi mathvariant="script">F</mml:mi>
</mml:math>
<label>(10)</label>
</disp-formula>
</p>
<p>Where <italic>&#x3c0;</italic>(&#x22c5;) is an attention function. A simple solution to this attention function is achieved through fully connected layers. However, due to the high dimensionality of tensors, directly learning attention functions across all dimensions is computationally expensive and practically unaffordable. Therefore, transforming the attention function into attention along three directions, with each attention focusing on a single direction, is proposed, as shown in Eq. <xref ref-type="disp-formula" rid="e11">11</xref>.<disp-formula id="e11">
<mml:math id="m25">
<mml:msup>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="script">F</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="script">F</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi mathvariant="script">F</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi mathvariant="script">F</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi mathvariant="script">F</mml:mi>
</mml:math>
<label>(11)</label>
</disp-formula>
</p>
<p>Where <italic>&#x3c0;</italic>
<sub>
<italic>L</italic>
</sub> (&#x22c5;), <italic>&#x3c0;</italic>
<sub>
<italic>S</italic>
</sub> (&#x22c5;), <italic>&#x3c0;</italic>
<sub>
<italic>C</italic>
</sub> (&#x22c5;) are three different attention functions applied respectively to dimensions L, S, and C.</p>
</sec>
</sec>
<sec id="s4">
<title>4 Experimental results and analysis</title>
<sec id="s4-1">
<title>4.1 Datasets</title>
<p>In the experiment described in this paper, both the training and testing datasets are sourced entirely from hospitals and collected based on different patients, each with varying degrees of hair sparsity. The original dataset is devoid of any annotations, and labeling is used to annotate it, generating XML-format files to store the labeled tags. Each image corresponds to one XML file, containing multiple hair cluster labels, primarily annotating each hair cluster. In the experiment, each hair cluster does not exceed three strands. A total of 200 images were annotated for the dataset. As neural network-based object detection models are developed on the basis of extensive image data, the dataset is expanded and divided through data augmentation, resulting in 500 images. From these, 50 images are randomly selected as the validation set, and another 50 images are chosen as the test set. This is done to enrich the dataset size, better extract features of hair belonging to different labeled categories, and prevent the trained model from overfitting. The objective of this dataset is to achieve hair detection in populations with sparse hair, identifying the number of hair clusters.</p>
</sec>
<sec id="s4-2">
<title>4.2 Experimental details</title>
<p>During the preprocessing stage, the source dataset had a size of 1,920 &#xd7; 1,080. In this study, all hair datasets underwent image enhancement and partitioning, resulting in a final size of 640 &#xd7; 640 for each slice.</p>
<p>In the experiment, all programs were implemented in the PyTorch framework under the Windows 10 operating system. The training process used one GeForce RTX 3090 GPU and was written in Python language, calling CUDA, CuDNN, OpenCV, and other required libraries. The optimizer used in the experiment was SGD, with a momentum of 0.937 and default parameters for other settings. The initial learning rate, weight decay, and batch size were set to 0.01, 5e-4, and 8, respectively, and the epoch was set to 500. The trained model&#x2019;s weight file was saved, and the model&#x2019;s performance was evaluated using the test set.</p>
<p>The model evaluation metrics adopted include commonly used object detection metrics such as Precision, Recall, mAP (mean average precision), and F1 score, which are used to assess the performance of the trained model. Visual comparison was also conducted. The implementation of these metrics is as follows Eq. <xref ref-type="disp-formula" rid="e12">12</xref>:<disp-formula id="e12">
<mml:math id="m26">
<mml:mtable class="aligned">
<mml:mtr>
<mml:mtd columnalign="right">
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right">
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right">
<mml:mi>m</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:munderover accentunder="false" accent="true">
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:munderover>
<mml:mi>A</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right">
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2b;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfrac>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:math>
<label>(12)</label>
</disp-formula>
</p>
<p>Among them, <italic>TP</italic> represents the number of correctly identified clusters of hair; <italic>FP</italic> represents the number of clusters mistakenly identified as hair; <italic>FN</italic> represents the number of hair cluster targets that were not successfully identified; <italic>C</italic> represents the number of categories of hair cluster targets; <italic>AP</italic> represents the area enclosed by the precision-recall curve and the coordinate axis.</p>
<p>
<xref ref-type="fig" rid="F4">Figure 4</xref> displays the training and validation loss curves, as well as precision, recall, and mAP curves for the entire training process. The model is trained from scratch, and from the curves in the figure, it is evident that the network model descends rapidly in the first 50 epochs and gradually stabilizes thereafter. In the figure, a smaller box_loss indicates more accurate bounding boxes, and a smaller obj_loss indicates more accurate predictions of targets. Precision, recall, and mAP curves stabilize later, indicating a good training outcome. In summary, the figure demonstrates that the model for hair cluster detection is well-trained and does not exhibit overfitting. <xref ref-type="fig" rid="F5">Figure 5</xref> shows the correlation between predicted labels during the training process of the hair cluster object detection model. <xref ref-type="fig" rid="F5">Figure 5</xref> is a set of 2D histograms, illustrating the contrast between each axis of the data. Labels in the image are located in the xywh space, where x and y represent the center values of the label box, and w and h represent the length and width of the label box. The histograms of x and y in <xref ref-type="fig" rid="F5">Figure 5</xref> indicate that the size variation of detected targets is small. Additionally, the distribution plots of x and width, as well as y and height, show that their relationships have a linear correlation. Combined with <xref ref-type="fig" rid="F4">Figure 4</xref>, this suggests that the proposed model for the hair cluster object detection task is trainable.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Network training situation.</p>
</caption>
<graphic xlink:href="fphy-12-1364372-g004.tif"/>
</fig>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Correlation between predicted labels during network training.</p>
</caption>
<graphic xlink:href="fphy-12-1364372-g005.tif"/>
</fig>
</sec>
<sec id="s4-3">
<title>4.3 Comparative experiments</title>
<p>In the comparative experiments, to validate the performance of the proposed hair cluster detection model based on sparse hair, experiments and analyses were conducted on test set images using publicly available source code of classical object detection models. The object detection network developed in this study was compared with YOLOv3 [<xref ref-type="bibr" rid="B32">32</xref>], YOLOv4 [<xref ref-type="bibr" rid="B33">33</xref>], MobileNet YOLOv4, YOLOv5, Detr, FastestV2, YOLOv7, FastestDet, and YOLOv8 on test set images. <xref ref-type="table" rid="T1">Table 1</xref> presents the performance of the proposed method and other methods on the test set.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Comparison with different detection networks (Bold numbers represent best results).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Networks</th>
<th align="center">year</th>
<th align="center">Precision</th>
<th align="center">mAP</th>
<th align="center">F1 score</th>
<th align="center">Recall</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">YOLOv3</td>
<td align="center">2018</td>
<td align="center">0.733</td>
<td align="center">0.500</td>
<td align="center">0.35</td>
<td align="center">0.471</td>
</tr>
<tr>
<td align="left">YOLOv4</td>
<td align="center">2020</td>
<td align="center">0.768</td>
<td align="center">0.561</td>
<td align="center">0.58</td>
<td align="center">0.434</td>
</tr>
<tr>
<td align="left">Mobilenet YOLOv4</td>
<td align="center">2020</td>
<td align="center">0.792</td>
<td align="center">0.406</td>
<td align="center">0.21</td>
<td align="center">0.245</td>
</tr>
<tr>
<td align="left">YOLOv5</td>
<td align="center">2020</td>
<td align="center">0.865</td>
<td align="center">0.706</td>
<td align="center">0.63</td>
<td align="center">0.677</td>
</tr>
<tr>
<td align="left">Detr</td>
<td align="center">2020</td>
<td align="center">0.822</td>
<td align="center">0.717</td>
<td align="center">0.65</td>
<td align="center">0.854</td>
</tr>
<tr>
<td align="left">FastestV2</td>
<td align="center">2021</td>
<td align="center">0.479</td>
<td align="center">0.458</td>
<td align="center">0.52</td>
<td align="center">0.564</td>
</tr>
<tr>
<td align="left">YOLOv7</td>
<td align="center">2022</td>
<td align="center">0.816</td>
<td align="center">0.697</td>
<td align="center">0.66</td>
<td align="center">0.691</td>
</tr>
<tr>
<td align="left">FastestDet</td>
<td align="center">2022</td>
<td align="center">0.609</td>
<td align="center">0.524</td>
<td align="center">0.47</td>
<td align="center">0.593</td>
</tr>
<tr>
<td align="left">YOLOv8</td>
<td align="center">2023</td>
<td align="center">0.820</td>
<td align="center">0.658</td>
<td align="center">0.63</td>
<td align="center">0.712</td>
</tr>
<tr>
<td align="left">Our network</td>
<td align="center">-</td>
<td align="center">
<bold>0.898</bold>
</td>
<td align="center">
<bold>0.734</bold>
</td>
<td align="center">
<bold>0.72</bold>
</td>
<td align="center">
<bold>0.873</bold>
</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The comparative experimental results in <xref ref-type="table" rid="T1">Table 1</xref> indicate that the hair cluster detection model proposed in this study achieves the highest mAP value, surpassing the classical YOLOv5 network model by 2.8%. Additionally, it outperforms the latest YOLOv8 by 7.6%. This suggests that the proposed algorithm has advantages in the task of hair cluster target recognition. Moreover, the proposed model achieves the highest Precision, F1, and Recall scores, demonstrating the superior performance of the sparse hair cluster model proposed in this study. Therefore, the results indicate that the proposed model can ensure accurate identification of sparse hair clusters, comparable to the best methods in terms of metrics, and surpassing most other methods.</p>
<p>To more clearly illustrate the performance of the proposed method, visual experiments were conducted on six images selected from the test set, as shown in <xref ref-type="fig" rid="F6">Figure 6</xref>. <xref ref-type="fig" rid="F6">Figure 6</xref> displays the visual comparison of hair cluster detection results obtained by the proposed method and five other methods (YOLOv8, YOLOv7, Detr, FastestDet, FastestV2) under the same experimental conditions. It is evident that the proposed method achieves more accurate hair cluster detection results compared to other methods.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Visual comparison of hair cluster detection results.</p>
</caption>
<graphic xlink:href="fphy-12-1364372-g006.tif"/>
</fig>
<p>As evident from the obtained detection results above, the proposed hair cluster detection model for sparse hair in this study has achieved significant results. Simultaneously, the algorithm accomplishes counting and visualizing the detected clusters. A comparison reveals that the method developed in this study exhibits the best performance in hair cluster detection. In <xref ref-type="fig" rid="F6">Figure 6</xref>, it can be observed that other methods show instances of hair cluster omission. In summary, the method investigated in this study demonstrates commendable hair cluster detection performance. Finally, for a more comprehensive comparison of the advantages of the proposed method against different approaches, <xref ref-type="fig" rid="F7">Figure 7</xref> depicts bar charts representing the hair cluster detection performance of various methods across different metrics. The performance on four metrics is illustrated separately. It is evident that the proposed method holds a significant advantage in hair cluster detection tasks.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Performance comparison of different detection methods on the four indicators of Precision, Recall, mAP (mean average precision), and F1 score. The method that performs best in each case is marked with an asterisk.</p>
</caption>
<graphic xlink:href="fphy-12-1364372-g007.tif"/>
</fig>
</sec>
<sec id="s4-4">
<title>4.4 Ablation experiment</title>
<p>This study utilizes the developed model as the network for sparse hair target detection (Ours) in hair cluster detection. Experiments were conducted by removing the designed modules from this model. Specifically, the MLFF module was removed from the feature extraction network to assess the extraction of image features, and the CSDA module was removed from the feature enhancement and processing network to examine feature inference and fusion. As shown in the performance metrics results in <xref ref-type="table" rid="T2">Table 2</xref>, removing the corresponding modules leads to a decrease in the model&#x2019;s detection performance. Additionally, as depicted in <xref ref-type="fig" rid="F8">Figure 8A</xref>, it is apparent that some smaller and overlapping hair clusters are missed when certain modules are removed, while the detection results of the method proposed in this study remain superior.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Comparison of ablation experiments of target detection indicators on data sets (Bold numbers represent best results).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Networks</th>
<th align="center">Precision</th>
<th align="center">mAP</th>
<th align="center">F1 score</th>
<th align="center">Recall</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Without MLFF</td>
<td align="center">0.817</td>
<td align="center">0.680</td>
<td align="center">0.57</td>
<td align="center">0.712</td>
</tr>
<tr>
<td align="left">Without CSDA</td>
<td align="center">0.762</td>
<td align="center">0.599</td>
<td align="center">0.33</td>
<td align="center">0.588</td>
</tr>
<tr>
<td align="left">Our network</td>
<td align="center">
<bold>0.898</bold>
</td>
<td align="center">
<bold>0.734</bold>
</td>
<td align="center">
<bold>0.72</bold>
</td>
<td align="center">
<bold>0.873</bold>
</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Visual comparison of ablation experiment results. <bold>(A)</bold>: Comparison of detection results; <bold>(B)</bold>: Comparison of detection heatmaps.</p>
</caption>
<graphic xlink:href="fphy-12-1364372-g008.tif"/>
</fig>
<p>To further explore the differences between different modules and their reasons, a heatmap analysis was conducted. <xref ref-type="fig" rid="F8">Figure 8B</xref> visualizes the objective performance of different modules. It can be observed that removing the CSDA module generates regions of interest extending beyond the actual target area, focusing on some irrelevant background information. While focusing on certain background regions might not significantly impact normal target detection, it proves detrimental for densely distributed small targets, exacerbating background interference and the difficulty of instance recognition. Without the MLFF module, the situation of missed detections is more severe, indicating that the inclusion of the MLFF module in the network brings more information about the target. In conclusion, the proposed modules in this study contribute to improving the model&#x2019;s detection performance to a certain extent, significantly enhancing the overall performance of the target detection network.</p>
</sec>
</sec>
<sec sec-type="conclusion" id="s5">
<title>5 Conclusion</title>
<p>In this study, we have proposed and implemented an efficient and accurate detection model specifically designed for sparse hair clusters. This model is based on an improved neural network for object detection. The construction of this model introduces three innovative aspects: firstly, we designed a new neural network structure based on existing advanced object detection networks to optimize the detection of sparse hair. Secondly, a novel multi-level feature fusion structure was devised to better extract and fuse features at different levels. Lastly, a new attention mechanism, the Channel-Spatial Bi-Attention Module, was introduced to simultaneously consider information in both channel and spatial dimensions, further enhancing the model&#x2019;s expressive power and the accuracy of sparse hair detection.</p>
<p>The model primarily consists of three parts: a feature extraction backbone network, a feature enhancement and processing network, and a detection network. It effectively achieves the detection of hair clusters, predicting the number of hair clusters with promising results in experiments. Despite the application of dermoscopy in hair detection being in an exploratory and developing stage, and related research being incomplete, our study provides a new and effective tool for the precise detection of sparse hair clusters. It opens up new avenues for research and applications in hair detection, contributing to the advancement of dermoscopy in hair detection. This, in turn, assists healthcare professionals in diagnosing conditions and selecting treatment plans, while also providing convenience for daily management and condition monitoring for individuals with hair loss.</p>
<p>If the decisions made by the model are not interpretable, they may not be accepted by individuals. In future research, our project team will explore the interpretability of the hair cluster object detection network, applying these advancements to help healthcare professionals understand the processes in image analysis. Additionally, in order to bring the detection model to edge devices for user convenience, we will explore the development of lightweight hair cluster object detection models in the future.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The raw data supporting the conclusion of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec id="s7">
<title>Ethics statement</title>
<p>The studies involving humans were approved by Medical Ethics Committee of the Second Affiliated Hospital of Army Medical University of Chinese People&#x2019;s Liberation Army. The studies were conducted in accordance with the local legislation and institutional requirements. Written informed consent for participation in this study was provided by the participants&#x2019; legal guardians/next of kin. Written informed consent was obtained from the individual(s) for the publication of any potentially identifiable images or data included in this article.</p>
</sec>
<sec id="s8">
<title>Author contributions</title>
<p>YX: Data curation, Software, Supervision, Visualization, Writing&#x2013;original draft. KY: Resources, Software, Validation, Writing&#x2013;original draft. YL: Data curation, Formal Analysis, Funding acquisition, Investigation, Methodology, Software, Writing&#x2013;review and editing. ZL: Data curation, Formal Analysis, Project administration, Software, Supervision, Writing&#x2013;review and editing. DF: Conceptualization, Data curation, Resources, Writing&#x2013;original draft.</p>
</sec>
<sec sec-type="funding-information" id="s9">
<title>Funding</title>
<p>The author(s) declare that no financial support was received for the research, authorship, and/or publication of this article.</p>
</sec>
<sec sec-type="COI-statement" id="s10">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sperling</surname>
<given-names>LC</given-names>
</name>
<name>
<surname>Mezebish</surname>
<given-names>DS</given-names>
</name>
</person-group>. <article-title>Hair diseases</article-title>. <source>Med Clin North America</source> (<year>1998</year>) <volume>82</volume>:<fpage>1155</fpage>&#x2013;<lpage>69</lpage>. <pub-id pub-id-type="doi">10.1016/s0025-7125(05)70408-9</pub-id>
</citation>
</ref>
<ref id="B2">
<label>2.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Franzoi</surname>
<given-names>SL</given-names>
</name>
<name>
<surname>Anderson</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Frommelt</surname>
<given-names>S</given-names>
</name>
</person-group>. <article-title>Individual differences in men&#x2019;s perceptions of and reactions to thinning hair</article-title>. <source>J Soc Psychol</source> (<year>1990</year>) <volume>130</volume>:<fpage>209</fpage>&#x2013;<lpage>18</lpage>. <pub-id pub-id-type="doi">10.1080/00224545.1990.9924571</pub-id>
</citation>
</ref>
<ref id="B3">
<label>3.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shapiro</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Hair loss in women</article-title>. <source>New Engl J Med</source> (<year>2007</year>) <volume>357</volume>:<fpage>1620</fpage>&#x2013;<lpage>30</lpage>. <pub-id pub-id-type="doi">10.1056/nejmcp072110</pub-id>
</citation>
</ref>
<ref id="B4">
<label>4.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ahmed</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Almohanna</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Griggs</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Tosti</surname>
<given-names>A</given-names>
</name>
</person-group>. <article-title>Genetic hair disorders: a review</article-title>. <source>Dermatol Ther</source> (<year>2019</year>) <volume>9</volume>:<fpage>421</fpage>&#x2013;<lpage>48</lpage>. <pub-id pub-id-type="doi">10.1007/s13555-019-0313-2</pub-id>
</citation>
</ref>
<ref id="B5">
<label>5.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>York</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Meah</surname>
<given-names>N</given-names>
</name>
<name>
<surname>Bhoyrul</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Sinclair</surname>
<given-names>R</given-names>
</name>
</person-group>. <article-title>A review of the treatment of male pattern hair loss</article-title>. <source>Expert Opin Pharmacother</source> (<year>2020</year>) <volume>21</volume>:<fpage>603</fpage>&#x2013;<lpage>12</lpage>. <pub-id pub-id-type="doi">10.1080/14656566.2020.1721463</pub-id>
</citation>
</ref>
<ref id="B6">
<label>6.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>O&#x2019;Mahony</surname>
<given-names>N</given-names>
</name>
<name>
<surname>Campbell</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Carvalho</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Harapanahalli</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Hernandez</surname>
<given-names>GV</given-names>
</name>
<name>
<surname>Krpalkova</surname>
<given-names>L</given-names>
</name>
<etal/>
</person-group> <article-title>Deep learning vs traditional computer vision</article-title>. In <conf-name>Advances in Computer Vision: Proceedings of the 2019 Computer Vision Conference (CVC), Volume 1</conf-name>; <conf-date>25-26 April 2019</conf-date>; <conf-loc>Las Vegas, Nevada, USA</conf-loc>. <publisher-name>Springer</publisher-name> (<year>2020</year>). <fpage>128</fpage>&#x2013;<lpage>44</lpage>.</citation>
</ref>
<ref id="B7">
<label>7.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Voulodimos</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Doulamis</surname>
<given-names>N</given-names>
</name>
<name>
<surname>Doulamis</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Protopapadakis</surname>
<given-names>E</given-names>
</name>
<etal/>
</person-group> <article-title>Deep learning for computer vision: a brief review</article-title>. <source>Comput intelligence Neurosci</source> (<year>2018</year>) <volume>2018</volume>:<fpage>1</fpage>&#x2013;<lpage>13</lpage>. <pub-id pub-id-type="doi">10.1155/2018/7068349</pub-id>
</citation>
</ref>
<ref id="B8">
<label>8.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Esteva</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Chou</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Yeung</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Naik</surname>
<given-names>N</given-names>
</name>
<name>
<surname>Madani</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Mottaghi</surname>
<given-names>A</given-names>
</name>
<etal/>
</person-group> <article-title>Deep learning-enabled medical computer vision</article-title>. <source>NPJ digital Med</source> (<year>2021</year>) <volume>4</volume>:<fpage>5</fpage>. <pub-id pub-id-type="doi">10.1038/s41746-020-00376-2</pub-id>
</citation>
</ref>
<ref id="B9">
<label>9.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Raj</surname>
<given-names>ANJ</given-names>
</name>
<name>
<surname>Tjahjadi</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Zhuang</surname>
<given-names>Z</given-names>
</name>
</person-group>. <article-title>Digital hair removal by deep learning for skin lesion segmentation</article-title>. <source>Pattern Recognition</source> (<year>2021</year>) <volume>117</volume>:<fpage>107994</fpage>. <pub-id pub-id-type="doi">10.1016/j.patcog.2021.107994</pub-id>
</citation>
</ref>
<ref id="B10">
<label>10.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Attia</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Hossny</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Nahavandi</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Asadi</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Yazdabadi</surname>
<given-names>A</given-names>
</name>
</person-group>. <article-title>Digital hair segmentation using hybrid convolutional and recurrent neural networks architecture</article-title>. <source>Comp Methods Programs Biomed</source> (<year>2019</year>) <volume>177</volume>:<fpage>17</fpage>&#x2013;<lpage>30</lpage>. <pub-id pub-id-type="doi">10.1016/j.cmpb.2019.05.010</pub-id>
</citation>
</ref>
<ref id="B11">
<label>11.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kim</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Gil</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Deep-learning-based scalp image analysis using limited data</article-title>. <source>Electronics</source> (<year>2023</year>) <volume>12</volume>:<fpage>1380</fpage>. <pub-id pub-id-type="doi">10.3390/electronics12061380</pub-id>
</citation>
</ref>
<ref id="B12">
<label>12.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hosny</surname>
<given-names>KM</given-names>
</name>
<name>
<surname>Elshora</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Mohamed</surname>
<given-names>ER</given-names>
</name>
<name>
<surname>Vrochidou</surname>
<given-names>E</given-names>
</name>
<name>
<surname>Papakostas</surname>
<given-names>GA</given-names>
</name>
</person-group>. <article-title>Deep learning and optimization-based methods for skin lesions segmentation: a review</article-title>. <source>IEEE Access</source> (<year>2023</year>) <volume>11</volume>:<fpage>85467</fpage>&#x2013;<lpage>88</lpage>. <pub-id pub-id-type="doi">10.1109/access.2023.3303961</pub-id>
</citation>
</ref>
<ref id="B13">
<label>13.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Nam</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>MH</given-names>
</name>
<name>
<surname>Sheikh</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>Strand-accurate multi-view hair capture</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>; <conf-date>June 16 2019 to June 17 2019</conf-date>; <conf-loc>Long Beach, CA, USA</conf-loc> (<year>2019</year>). p. <fpage>155</fpage>&#x2013;<lpage>64</lpage>.</citation>
</ref>
<ref id="B14">
<label>14.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cu&#xe9;llar</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Puig</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Kolm</surname>
<given-names>I</given-names>
</name>
<name>
<surname>Puig-Butille</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Zaballos</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Mart&#xed;-Laborda</surname>
<given-names>R</given-names>
</name>
<etal/>
</person-group> <article-title>Dermoscopic features of melanomas associated with mc1r variants in Spanish cdkn2a mutation carriers</article-title>. <source>Br J Dermatol</source> (<year>2009</year>) <volume>160</volume>:<fpage>48</fpage>&#x2013;<lpage>53</lpage>. <pub-id pub-id-type="doi">10.1111/j.1365-2133.2008.08826.x</pub-id>
</citation>
</ref>
<ref id="B15">
<label>15.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tosti</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Torres</surname>
<given-names>F</given-names>
</name>
</person-group>. <article-title>Dermoscopy in the diagnosis of hair and scalp disorders</article-title>. <source>Actas dermo-sifiliogr&#xe1;ficas</source> (<year>2009</year>) <volume>100</volume>:<fpage>114</fpage>&#x2013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.1016/s0001-7310(09)73176-x</pub-id>
</citation>
</ref>
<ref id="B16">
<label>16.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pirmez</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Tosti</surname>
<given-names>A</given-names>
</name>
</person-group>. <article-title>Trichoscopy tips</article-title>. <source>Dermatol Clin</source> (<year>2018</year>) <volume>36</volume>:<fpage>413</fpage>&#x2013;<lpage>20</lpage>. <pub-id pub-id-type="doi">10.1016/j.det.2018.05.008</pub-id>
</citation>
</ref>
<ref id="B17">
<label>17.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Van Camp</surname>
<given-names>YP</given-names>
</name>
<name>
<surname>Van Rompaey</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Elseviers</surname>
<given-names>MM</given-names>
</name>
</person-group>. <article-title>Nurse-led interventions to enhance adherence to chronic medication: systematic review and meta-analysis of randomised controlled trials</article-title>. <source>Eur J Clin Pharmacol</source> (<year>2013</year>) <volume>69</volume>:<fpage>761</fpage>&#x2013;<lpage>70</lpage>. <pub-id pub-id-type="doi">10.1007/s00228-012-1419-y</pub-id>
</citation>
</ref>
<ref id="B18">
<label>18.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shen</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>RX</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>CB</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>CX</given-names>
</name>
<name>
<surname>Jing</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>YJ</given-names>
</name>
<etal/>
</person-group> <article-title>Dermoscopy in China: current status and future prospective</article-title>. <source>Chin Med J</source> (<year>2019</year>) <volume>132</volume>:<fpage>2096</fpage>&#x2013;<lpage>104</lpage>. <pub-id pub-id-type="doi">10.1097/cm9.0000000000000396</pub-id>
</citation>
</ref>
<ref id="B19">
<label>19.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>He</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Cong</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>Brain tumor segmentation based on the fusion of deep semantics and edge information in multimodal mri</article-title>. <source>Inf Fusion</source> (<year>2023</year>) <volume>91</volume>:<fpage>376</fpage>&#x2013;<lpage>87</lpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2022.10.022</pub-id>
</citation>
</ref>
<ref id="B20">
<label>20.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Cong</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>L</given-names>
</name>
</person-group>. <article-title>Medical image segmentation method based on multi-feature interaction and fusion over cloud computing</article-title>. <source>Simulation Model Pract Theor</source> (<year>2023</year>) <volume>126</volume>:<fpage>102769</fpage>. <pub-id pub-id-type="doi">10.1016/j.simpat.2023.102769</pub-id>
</citation>
</ref>
<ref id="B21">
<label>21.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Nie</surname>
<given-names>Z</given-names>
</name>
</person-group>. <article-title>Safety helmet detection based on yolov5</article-title>. In: <conf-name>2021 IEEE International conference on power electronics, computer applications (ICPECA) (IEEE)</conf-name>; <conf-date>January 22-24, 2021</conf-date>; <conf-loc>Shenyang, China</conf-loc> (<year>2021</year>). p. <fpage>6</fpage>&#x2013;<lpage>11</lpage>.</citation>
</ref>
<ref id="B22">
<label>22.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Fu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Fu</surname>
<given-names>T</given-names>
</name>
<etal/>
</person-group> <article-title>An improved method for ship target detection based on yolov4</article-title>. <source>Appl Sci</source> (<year>2023</year>) <volume>13</volume>:<fpage>1302</fpage>. <pub-id pub-id-type="doi">10.3390/app13031302</pub-id>
</citation>
</ref>
<ref id="B23">
<label>23.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qi</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Haner</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Weng</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Z</given-names>
</name>
</person-group>. <article-title>Convolutional neural network based detection and judgement of environmental obstacle in vehicle operation</article-title>. <source>CAAI Trans Intelligence Tech</source> (<year>2019</year>) <volume>4</volume>:<fpage>80</fpage>&#x2013;<lpage>91</lpage>. <pub-id pub-id-type="doi">10.1049/trit.2018.1045</pub-id>
</citation>
</ref>
<ref id="B24">
<label>24.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qi</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Q</given-names>
</name>
<name>
<surname>Zeng</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Z</given-names>
</name>
</person-group>. <article-title>Multi-focus image fusion via morphological similarity-based dictionary construction and sparse representation</article-title>. <source>CAAI Trans Intelligence Tech</source> (<year>2018</year>) <volume>3</volume>:<fpage>83</fpage>&#x2013;<lpage>94</lpage>. <pub-id pub-id-type="doi">10.1049/trit.2018.0011</pub-id>
</citation>
</ref>
<ref id="B25">
<label>25.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Yin</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>X-net: a dual encoding&#x2013;decoding method in medical image segmentation</article-title>. <source>Vis Comp</source> (<year>2021</year>) <volume>39</volume>:<fpage>2223</fpage>&#x2013;<lpage>33</lpage>. <pub-id pub-id-type="doi">10.1007/s00371-021-02328-7</pub-id>
</citation>
</ref>
<ref id="B26">
<label>26.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lama</surname>
<given-names>N</given-names>
</name>
<name>
<surname>Kasmi</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Hagerty</surname>
<given-names>JR</given-names>
</name>
<name>
<surname>Stanley</surname>
<given-names>RJ</given-names>
</name>
<name>
<surname>Young</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Miinch</surname>
<given-names>J</given-names>
</name>
<etal/>
</person-group> <article-title>Chimeranet: U-net for hair detection in dermoscopic skin lesion images</article-title>. <source>J Digital Imaging</source> (<year>2023</year>) <volume>36</volume>:<fpage>526</fpage>&#x2013;<lpage>35</lpage>. <pub-id pub-id-type="doi">10.1007/s10278-022-00740-6</pub-id>
</citation>
</ref>
<ref id="B27">
<label>27.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sacha</surname>
<given-names>JP</given-names>
</name>
<name>
<surname>Caterino</surname>
<given-names>TL</given-names>
</name>
<name>
<surname>Fisher</surname>
<given-names>BK</given-names>
</name>
<name>
<surname>Carr</surname>
<given-names>GJ</given-names>
</name>
<name>
<surname>Youngquist</surname>
<given-names>RS</given-names>
</name>
<name>
<surname>D&#x2019;Alessandro</surname>
<given-names>BM</given-names>
</name>
<etal/>
</person-group> <article-title>Development and qualification of a machine learning algorithm for automated hair counting</article-title>. <source>Int J Cosmet Sci</source> (<year>2021</year>) <volume>43</volume>:<fpage>S34</fpage>&#x2013;<lpage>S41</lpage>. <pub-id pub-id-type="doi">10.1111/ics.12735</pub-id>
</citation>
</ref>
<ref id="B28">
<label>28.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yoon</surname>
<given-names>HS</given-names>
</name>
<name>
<surname>Park</surname>
<given-names>SW</given-names>
</name>
<name>
<surname>Yoo</surname>
<given-names>JH</given-names>
</name>
</person-group>. <article-title>Real-time hair segmentation using mobile-unet</article-title>. <source>Electronics</source> (<year>2021</year>) <volume>10</volume>:<fpage>99</fpage>. <pub-id pub-id-type="doi">10.3390/electronics10020099</pub-id>
</citation>
</ref>
<ref id="B29">
<label>29.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Long</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Z</given-names>
</name>
<etal/>
</person-group> <article-title>Application of local fully convolutional neural network combined with yolo v5 algorithm in small target detection of remote sensing image</article-title>. <source>PloS one</source> (<year>2021</year>) <volume>16</volume>:<fpage>e0259283</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0259283</pub-id>
</citation>
</ref>
<ref id="B30">
<label>30.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Hu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>G</given-names>
</name>
</person-group>. <article-title>Squeeze-and-excitation networks</article-title>. In: <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>; <conf-date>June 18 2018 to June 23 2018</conf-date>; <conf-loc>Salt Lake City, UT, USA</conf-loc> (<year>2018</year>). <fpage>7132</fpage>&#x2013;<lpage>41</lpage>.</citation>
</ref>
<ref id="B31">
<label>31.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Dai</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Xiao</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Yuan</surname>
<given-names>L</given-names>
</name>
<etal/>
</person-group> <article-title>Dynamic head: unifying object detection heads with attentions</article-title>. <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>; <conf-date>June 20 2021 to June 25 2021</conf-date>; <conf-loc>Nashville, TN, USA</conf-loc> (<year>2021</year>). <fpage>7373</fpage>&#x2013;<lpage>82</lpage>.</citation>
</ref>
<ref id="B32">
<label>32.</label>
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Redmon</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Farhadi</surname>
<given-names>A</given-names>
</name>
</person-group>. <article-title>Yolov3: an incremental improvement</article-title> (<year>2018</year>). <comment>arXiv preprint arXiv:1804.02767. Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/pdf/1804.02767.pdf">https://arxiv.org/pdf/1804.02767.pdf</ext-link>.</comment>
</citation>
</ref>
<ref id="B33">
<label>33.</label>
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Bochkovskiy</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>CY</given-names>
</name>
<name>
<surname>Liao</surname>
<given-names>HYM</given-names>
</name>
</person-group>. <article-title>Yolov4: optimal speed and accuracy of object detection</article-title> (<year>2020</year>). <comment>arXiv preprint arXiv:2004.10934. Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2004.10934">https://arxiv.org/abs/2004.10934</ext-link>.</comment>
</citation>
</ref>
</ref-list>
</back>
</article>