<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2024.1278161</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Plant Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Automatic detection of standing dead trees based on improved YOLOv7 from airborne remote sensing imagery</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Zhou</surname>
<given-names>Hongwei</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1962353"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Wu</surname>
<given-names>Shangxin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2410082"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Xu</surname>
<given-names>Zihan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Sun</surname>
<given-names>Hong</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>College of Computer and Control Engineering, Northeast Forestry University</institution>, <addr-line>Harbin</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Key Laboratory of National Forestry and Grassland Administration on Forest and Grassland Pest Monitoring and Warning, Center for Biological Disaster Prevention and Control, National Forestry and Grassland Administration</institution>, <addr-line>Shenyang</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Yunchao Tang, Guangxi University, China</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Guoxiong Zhou, Central South University Forestry and Technology, China</p>
<p>Jun Liu, Shandong Provincial University Laboratory for Protected Horticulture, China</p>
<p>Parvathaneni Naga Srinivasu, Prasad V. Potluri Siddhartha Institute of Technology, India</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Shangxin Wu, <email xlink:href="mailto:wsx@nefu.edu.cn">wsx@nefu.edu.cn</email>
</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>22</day>
<month>01</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>15</volume>
<elocation-id>1278161</elocation-id>
<history>
<date date-type="received">
<day>15</day>
<month>08</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>05</day>
<month>01</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2024 Zhou, Wu, Xu and Sun</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Zhou, Wu, Xu and Sun</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Detecting and localizing standing dead trees (SDTs) is crucial for effective forest management and conservation. Due to challenges posed by mountainous terrain and road conditions, conducting a swift and comprehensive survey of SDTs through traditional manual inventory methods is considerably difficult. In recent years, advancements in deep learning and remote sensing technology have facilitated real-time and efficient detection of dead trees. Nevertheless, challenges persist in identifying individual dead trees in airborne remote sensing images, attributed to factors such as small target size, mutual occlusion and complex backgrounds. These aspects collectively contribute to the increased difficulty of detecting dead trees at a single-tree scale. To address this issue, the paper introduces an improved You Only Look Once version 7 (YOLOv7) model that incorporates the Simple Parameter-Free Attention Module (SimAM), an unparameterized attention mechanism. This improvement aims to enhance the network&#x2019;s feature extraction capabilities and increase the model&#x2019;s sensitivity to small target dead trees. To validate the superiority of SimAM_YOLOv7, we compared it with four widely adopted attention mechanisms. Additionally, a method to enhance model robustness is presented, involving the replacement of the Complete Intersection over Union (CIoU) loss in the original YOLOv7 model with the Wise-IoU (WIoU) loss function. Following these, we evaluated detection accuracy using a self-developed dataset of SDTs in forests. The results indicate that the improved YOLOv7 model can effectively identify dead trees in airborne remote sensing images, achieving precision, recall and mAP@0.5 values of 94.31%, 93.13% and 98.03%, respectively. These values are 3.67%, 2.28% and 1.56% higher than those of the original YOLOv7 model. This improvement model provides a convenient solution for forest management.</p>
</abstract>
<kwd-group>
<kwd>standing dead trees</kwd>
<kwd>deep learning</kwd>
<kwd>attention mechanism</kwd>
<kwd>Wise-IoU loss function</kwd>
<kwd>airborne remote sensing imagery</kwd>
</kwd-group>
<counts>
<fig-count count="13"/>
<table-count count="6"/>
<equation-count count="18"/>
<ref-count count="52"/>
<page-count count="18"/>
<word-count count="8525"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Sustainable and Intelligent Phytoprotection</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Trees are essential for maintaining the ecological balance within forest ecosystems (<xref ref-type="bibr" rid="B26">Manning et&#xa0;al., 2006</xref>; <xref ref-type="bibr" rid="B30">Nadrowski et&#xa0;al., 2010</xref>). Diseases and pests are significant factors contributing to the widespread death of trees (<xref ref-type="bibr" rid="B1">Bernal et&#xa0;al., 2023</xref>; <xref ref-type="bibr" rid="B25">Luo et&#xa0;al., 2023</xref>; <xref ref-type="bibr" rid="B41">Wang J. et&#xa0;al., 2023</xref>). Regularly inspecting standing dead trees (SDTs) in the forest to determine the causes of their death facilitates early detection and the mitigation of potential pest and disease issues. Therefore, it is essential to accurately and efficiently identify and monitor dead trees in forest areas. Traditional SDTs inventory methods often rely on rangers collecting coordinate location information in the field. However, this approach is hindered by challenging mountainous terrain and road conditions. Field trekking for inventory purposes becomes difficult, costly, and time-consuming (<xref ref-type="bibr" rid="B2">Butler and Schlaepfer, 2004</xref>).</p>
<p>To complement field trekking, low- and medium-resolution satellite remote sensing images have been used to detect the extent of forest infestation in localized areas (<xref ref-type="bibr" rid="B8">Eklundh et&#xa0;al., 2009</xref>; <xref ref-type="bibr" rid="B7">Coops et&#xa0;al., 2010</xref>; <xref ref-type="bibr" rid="B28">Meng et&#xa0;al., 2016</xref>). However, these studies have primarily focused on area-based detection, lacking the ability to identify disease-infected dead trees at the single-tree scale. With advancements in remote sensing platforms and technologies, the use of high-resolution satellite remote sensing images (e.g., QuickBird, IKONOS) and aerial images has made it possible to detect single trees. By combining these images with canopy detection methods, more accurate identification of dead trees, even in mountainous areas with challenging terrain and rugged roads, has become achievable (<xref ref-type="bibr" rid="B13">Hicke and Logan, 2009</xref>; <xref ref-type="bibr" rid="B44">Wang et&#xa0;al., 2015</xref>; <xref ref-type="bibr" rid="B45">Windrim et&#xa0;al., 2020</xref>). While Light Detection and Ranging (LiDAR) technology can provide precise information on the location and height of individual SDTs in forests, it comes with a high cost for data collection (<xref ref-type="bibr" rid="B5">Chen et&#xa0;al., 2011</xref>). On the other hand, high-resolution optical remote sensing images offer several advantages, such as easy data collection and wide application, making them a prominent focus for research on single-tree identification techniques (<xref ref-type="bibr" rid="B11">Han et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B20">Lee et&#xa0;al., 2023</xref>; <xref ref-type="bibr" rid="B52">Zheng et&#xa0;al., 2023</xref>).</p>
<p>With the proposal and development of machine learning methods, they have been utilized by scholars to detect dead trees using high-resolution remote sensing images (<xref ref-type="bibr" rid="B27">Maxwell et&#xa0;al., 2018</xref>). These methods involve the application of various machine learning algorithms such as Support Vector Machine (SVM), Random Forest (RF), k-Nearest Neighbor Algorithm (K-NN), Clustering Algorithm and Artificial Neural Network (ANN) for SDTs detection (<xref ref-type="bibr" rid="B3">Celik, 2009</xref>; <xref ref-type="bibr" rid="B21">Lee et&#xa0;al., 2019</xref>; <xref ref-type="bibr" rid="B29">Miltiadou et&#xa0;al., 2020</xref>). However, the existing methods often rely on manual design and extraction of image features, which can limit their accuracy and robustness, particularly in complex scenarios such as tree occlusion or when dealing with similar colors of features. Accurately detecting dead trees using machine learning methods becomes challenging due to these limitations.</p>
<p>In recent years, deep learning has made significant advancements, leading to the development of powerful object detection models based on convolutional neural networks (CNN). Compared to traditional machine learning methods, deep learning models have the ability to automatically learn image features during the detection of SDTs (<xref ref-type="bibr" rid="B9">Farias et&#xa0;al., 2018</xref>). They can also synthesize contextual information and semantic relationships within the image, enhancing detection accuracy (<xref ref-type="bibr" rid="B22">Lei et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B50">Zhang et&#xa0;al., 2022</xref>). The end-to-end training approach simplifies the dead trees detection system and has the potential to improve overall performance and efficiency. Deep learning algorithms based on CNN have demonstrated advantages over other methods, leading researchers across various fields to explore their application (<xref ref-type="bibr" rid="B39">Voulodimos et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B31">Naga Srinivasu et&#xa0;al., 2023</xref>). In the field of forestry, deep learning has been widely used for tasks such as forest resource management, soil analysis, tree detection and classification (<xref ref-type="bibr" rid="B37">Srivastava et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B42">Wang et&#xa0;al., 2021</xref>). Scholars have conducted research on dead tree detection based on deep learning (<xref ref-type="bibr" rid="B6">Chiang et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B24">Li et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B43">Wang et&#xa0;al., 2022</xref>). They primarily focus on detecting larger or densely packed targets, with limited studies addressing the detection of individual dead trees at a smaller scale. 
However, there are several challenges in achieving SDTs detection at the single-tree scale: 1) Multi-scale problem: SDTs exhibit variations in size dimensions and shapes, making accurate localization challenging. The detection models need to account for these multi-scale variations to accurately identify dead trees. 2) Occlusion problem: In remote sensing images, the presence of living trees can obscure SDTs, making it difficult to distinguish their boundaries and features, which could lead to missed detections. 3) Background complexity: Remote sensing images may contain complex backgrounds, including houses, land, bare rocks or other elements. This complexity can result in misidentifications, where the background is mistakenly detected as SDTs.</p>
<p>Deep learning-based object detection algorithms can be broadly categorized into two types based on the presence of a candidate region extraction step: two-stage algorithms, exemplified by Faster R-CNN, and single-stage algorithms, represented by You Only Look Once (YOLO). While two-stage algorithms, like Faster R-CNN, typically exhibit slower detection speeds due to their two-step nature, they often achieve higher detection accuracy. On the other hand, YOLO, with its continuous improvements in network architecture, demonstrates advanced performance in both detection accuracy and speed (<xref ref-type="bibr" rid="B36">Sirisha et&#xa0;al., 2023</xref>). This paper presents an improved algorithm based on the You Only Look Once version 7 (YOLOv7) model to address the challenges of detecting small targets, mutual occlusion and complex backgrounds in optical remote sensing images for automated SDTs detection. The contributions of this paper can be summarized as follows:</p>
<list list-type="order">
<list-item>
<p>Introducing the Simple Parameter-Free Attention Module (SimAM) to enhance the model&#x2019;s feature extraction capabilities for small target dead standing trees.</p>
</list-item>
<list-item>
<p>Replacing the Complete Intersection over Union (CIoU) loss with Wise-IoU (WIoU) improves the robustness and detection accuracy of the model by focusing on ordinary quality bounding box.</p>
</list-item>
<list-item>
<p>Analyzing the performance metrics of the proposed improved model, including precision, recall, mAP@0.5 and Frames Per Second (FPS), against a benchmark model.</p>
</list-item>
<list-item>
<p>Discussing the research advancements in SDTs detection using deep learning methods, along with the limitations of this study and future research directions.</p>
</list-item>
</list>
<p>The paper is structured as follows: the introduction offers the research area, outlines the authors&#x2019; contributions and elucidates the workflow of the model detection. Section 2 details the experimental materials and introduces the proposed model for SDTs detection. Section 3 showcases the experimental results. Discussion and conclusions are presented in Section 4 and Section 5.</p>
<p>The workflow diagram of SDTs detection model based on the improved YOLOv7 algorithm is shown in <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref>. First, the collected remote sensing images are preprocessed, including three parts: image cropping, screening and labeling, and data enhancement. Then the images are fed into the improved YOLOv7 network for training to obtain the training model. The SDTs in the test set of images are detected with the training model, and finally the model detection effect is comprehensively evaluated by combining various evaluation indexes and visualization results.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>The workflow diagram of SDTs detection model.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1278161-g001.tif"/>
</fig>
</sec>
<sec id="s2" sec-type="materials|methods">
<label>2</label>
<title>Materials and methods</title>
<sec id="s2_1">
<label>2.1</label>
<title>Study areas and dataset</title>
<p>The study area for this experiment is the Experimental Forestry Farm of Mao&#x2019;er Mountain, located in the southeastern part of Heilongjiang Province, China. It is situated in the northwestern part of Shangzhi City, on the western slope of Zhang Guangcailing, within the geographic coordinates of 45&#xb0;16&#x2032; to 45&#xb0;24&#x2032;N and 127&#xb0;30&#x2032; to 127&#xb0;40&#x2032;E (<xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref>). The Forestry Farm is divided into ten sizing zones, characterized by numerous mountainous hills with gentle slopes and elevations ranging from 200 to 600 meters above sea level. The climate in Mao&#x2019;er Mountain belongs to the temperate continental monsoon climate. It experiences long and dry winters, short and warm summers, concentrated rainfall, frequent spring droughts, and occasional fall freezes. The area has an annual frost-free period of approximately 125 days and an average annual precipitation of around 700mm. Being the source of the Ash River and the Ujimi River, the area is fertile, and the soil primarily consists of dark brown loam. The soil is rich in trace elements and organic matter, accounting for approximately 68.18% of the forest area. Since its establishment, the Mao&#x2019;er Mountain Experimental Forestry Farm has developed significant forestry land, covering an area of 26,000 hectares, with a forest coverage rate of 83.29%. It possesses abundant forest resources, playing a crucial role in maintaining the ecological balance of the region.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Location of the study area; <bold>(A)</bold> Map of Heilongjiang Province, China, with red dots representing the study area. <bold>(B)</bold> Geographic location of the study area.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1278161-g002.tif"/>
</fig>
<p>Due to the limited scope of existing research on SDTs and the absence of publicly available datasets, we conducted this study using a self-built dataset. In this study, the study area was photographed and scanned using a high-resolution CCD sensor (DigiCAM-60) carried by the LiCHy system to obtain 27.4G of raw remote sensing image data. The flight altitude was 1000m and the day of shooting was clear and cloudless. <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref> shows the detailed parameters of the CCD sensor.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Main parameters of CCD sensors.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Main parameters</th>
<th valign="top" align="center">Parameter value</th>
<th valign="top" align="center">Main parameters</th>
<th valign="top" align="center">Parameter value</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">Frame Size</td>
<td valign="top" align="center">8956*6708</td>
<td valign="top" align="center">Pixel Size</td>
<td valign="top" align="center">0.25m*0.25m</td>
</tr>
<tr>
<td valign="top" align="center">Imaging Sensor Size</td>
<td valign="top" align="center">3ns</td>
<td valign="top" align="center">Bit Depth</td>
<td valign="top" align="center">16bits</td>
</tr>
<tr>
<td valign="top" align="center">FOV</td>
<td valign="top" align="center">56.2&#xb0;</td>
<td valign="top" align="center">Focal Length</td>
<td valign="top" align="center">50mm</td>
</tr>
<tr>
<td valign="top" align="center">Ground Resolution</td>
<td valign="top" align="center">0.12m</td>
<td valign="top" align="center"/>
<td valign="top" align="center"/>
</tr>
</tbody>
</table>
</table-wrap>
<p>To prepare the original RGB images from the CCD sensor for target recognition, a series of preprocessing steps are performed. First, considering that the mismatch of aspect ratio may affect the training effect of the model, the original image is cropped to 10824 RGB images of 3*1024*1024 uniform size using ArcGIS software.</p>
<p>Secondly, not every image in the dataset contains SDTs, and images with dead trees have significantly fewer instances compared to healthy trees. Considering the balance of samples in the object detection task and the limited computational resources, in order to allow the model to focus more on the core objective, which is the detection of dead trees, we need to filter out images containing SDTs. From the entire dataset, 1928 images containing dead trees are selected. Using the labelimg software, we annotated dead trees in the images to create labels in VOC format for subsequent comparative experiments. These VOC format labels were then converted into YOLO format. The study primarily focused on detecting dead tree crowns using RGB images. During annotation, the focus was on distinguishing dead trees from the background, requiring marking only the crowns of dead trees. By combining visual interpretation and on-site surveys, specific rules were established: the crown of an individual tree was the target, and if all its tree tops showed signs of death, it was labeled as a dead tree (category &#x201c;0&#x201d;). The labeling result is shown in the <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref>, the annotation file of image is used to represent the label category and the coordinates of the rectangular marking box.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Data annotation; <bold>(A)</bold> label of SDTs; <bold>(B)</bold> annotations files of image.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1278161-g003.tif"/>
</fig>
<p>To enhance the model&#x2019;s robustness and generalization ability while avoiding overfitting, data augmentation techniques are applied. The dataset is expanded through random flipping, mirroring, and luminance adjustments. This augmentation process generates a total of 9640 dataset samples. <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref> showcases some of the samples after data enhancement. The labeled dataset is then divided into training, validation, and test sets in a ratio of 6:2:2. This division results in a total of 5784 training set samples, 1928 validation set samples, and 1928 test set samples for this experiment.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Some samples after data enhancement.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1278161-g004.tif"/>
</fig>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Related work</title>
<sec id="s2_2_1">
<label>2.2.1</label>
<title>YOLOv7</title>
<p>This paper presents an algorithm based on the YOLOv7 for detecting dead standing trees in large forests. The algorithm aims to enable rangers to accurately locate dead standing trees quickly, which is crucial for the maintenance of forest resources and biodiversity. Given the requirements for accuracy and real-time performance in SDTs detection, the YOLOv7 model is chosen as the foundation for detecting and locating dead trees.</p>
<p>The YOLOv7 (<xref ref-type="bibr" rid="B40">Wang C. et&#xa0;al., 2023</xref>) comprises three basic models with increasing parameter counts: YOLOv7-tiny for edge GPU, YOLOv7 for normal GPU, and YOLOv7-w6 for cloud GPU. Additionally, there are four extended models based on the basic models, namely YOLOv7-X, YOLOv7-E6, YOLOv7-D6, and YOLOv7-E6E. The model structure of YOLOv7 is illustrated in <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref>. The overall detection logic of YOLOv7 is similar to that of YOLOv4 and YOLOv5.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Structure of YOLOv7 network.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1278161-g005.tif"/>
</fig>
<p>The YOLOv7 model consists of four main components: input, backbone, neck and head. The input component performs preprocessing operations such as online data enhancement and resizing on the original image to obtain a 640*640*3 RGB image. In <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref>, we use the abbreviation &#x2018;Cat&#x2019; to represent &#x2018;concatenate,&#x2019; which is employed to concatenate the outputs of multiple feature maps or branches along the channel axis. This is done to provide a richer feature representation and is commonly used for multi-scale information fusion. The backbone component is responsible for feature extraction from the input RGB image. In the YOLOv7 model, the backbone utilizes the ELAN module, which controls the shortest and longest gradient paths to achieve more effective learning and convergence. It generates three feature maps that serve as inputs to the neck.</p>
<p>The neck component is responsible for multi-scale feature fusion. It introduces the SPPCSPC module and optimizes the PAN module. The SPPCSPC module combines the Spatial Pyramid Pooling (SPP) structure with the Cross Stage Partial (CSP) structure. The role of the SPP structure is to obtain different receptive fields through max-pooling, and the role of the CSP structure is to divide the input feature map into two parts. Each part is separately processed through a subnetwork and in the subsequent layer. The two parts of the feature map are then concatenated as the input for the next layer. The SPPCSPC module combines the advantages of both structures by parallel processing the features into two parts. Only one part undergoes SPP structure processing, and the final step involves concatenating the two parts to reduce model computation and improve training speed. The PAN module further enhances the model&#x2019;s learning capability by introducing an ELAN-W structure, which is similar to that used in the backbone. This structure improves learning without changing the gradient paths. The PAN module efficiently fuses multi-scale feature maps, enabling the model to learn and capture information at different scales effectively.</p>
<p>The head component is responsible for predicting image features. It incorporates the RepConv design, which utilizes a heavily parameterized convolutional architecture to enrich the gradient diversity of feature maps at different scales. This reduces model complexity, enhances the model&#x2019;s prediction ability, and predicts the bounding box location and confidence information of SDTs using three feature maps.</p>
</sec>
<sec id="s2_2_2">
<label>2.2.2</label>
<title>Attention mechanisms</title>
<p>When performing a visual task, human vision will quickly focus towards important regions and prioritize limited attention to process the critical part of the task, researchers propose to process data more efficiently by incorporating an attention mechanism based on this characteristic of human vision. In recent years, the attention mechanism, as a plug-and-play and very effective module, has been widely used in a variety of deep learning tasks such as natural language processing, computer vision and data prediction (<xref ref-type="bibr" rid="B32">Niu et&#xa0;al., 2021</xref>).</p>
<p>The combination of attention mechanism and convolutional neural network is the focus of research in the field of computer vision, and the addition of the attention mechanism enables the model to focus its attention on the object region of the image (<xref ref-type="bibr" rid="B18">Kim and Verghese, 2012</xref>), differentiating from processing the whole image, focusing on extracting the object region features, and effectively improving the model performance. In terms of the object detection task in the field of computer vision, the introduction of the attention mechanism can make the object feature extraction more adequate, reduce the interference of the background image and negative samples (<xref ref-type="bibr" rid="B4">Chai et&#xa0;al., 2023</xref>), and realize the effective improvement of the model detection performance.</p>
<p>In this paper, several experiments are conducted with the YOLOv7 model, and it is found that the model is not suitable for feature extraction of small targets in remote sensing images, which exhibits issues of leakage and misdetection when detecting some SDTs. Therefore, the attention module is added to the YOLOv7 model to improve its characterization ability and further improve the model detection accuracy.</p>
</sec>
<sec id="s2_2_3">
<label>2.2.3</label>
<title>Bounding box regression loss function</title>
<p>Traditional target localization usually uses the Mean Square Error (MSE) loss function to compute the coordinates of the predicted bounding box centroid as well as the loss of width and height (<xref ref-type="bibr" rid="B33">Redmon et&#xa0;al., 2016</xref>), which directly estimates the offset and is susceptible to the interference of outliers and poor robustness. To address the limitations of traditional BBR methods, researchers have proposed several improved loss functions. Ross Girshick introduced the Intersection over Union (IoU) loss (<xref ref-type="bibr" rid="B10">Girshick, 2015</xref>), which calculates the intersection and concurrency ratio between the predicted and true bounding boxes. This loss function reduces the impact of large-scale bounding boxes on the model&#x2019;s loss. However, it lacks attention to the non-overlapping area between the two bounding boxes. To overcome this, the Generalized-IoU (GIoU) loss (<xref ref-type="bibr" rid="B35">Rezatofighi et&#xa0;al., 2019</xref>) was proposed, which uses the area of the smallest bounding box that encloses both boxes as the denominator, providing a better measure of overlap. The Distance-IoU (DIoU) loss incorporates the distance between the centroids of the two bounding boxes into the loss function, further improving the detection performance. Building upon the DIoU loss, the Complete-IoU (CIoU) loss incorporates the aspect ratio into the calculation of the loss function (<xref ref-type="bibr" rid="B51">Zheng et&#xa0;al., 2022</xref>). This enhancement improves the convergence speed of model training and the accuracy of bounding box detection.</p>
<p>In the YOLOv7 model, the CIoU loss is used for BBR. However, it does not fully consider the balance between high and low-quality examples. To reduce the impact of low-quality example regression on the detection performance, this paper adopts a more balanced gradient allocation method. By focusing the loss function mainly on ordinary quality bounding boxes, the detection performance of the model is further improved.</p>
</sec>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Improved YOLOv7 SDTs detection model</title>
<sec id="s2_3_1">
<label>2.3.1</label>
<title>SimAM attention mechanism</title>
<p>SDTs detection is challenging due to the complexity and variability of target scale and image background. Remote sensing images often contain irrelevant features such as roads and houses, which can interfere with dead tree detection. Moreover, the distribution of dead trees in the images can be diverse, requiring high-performance detection and localization by the model.</p>
<p>Taking inspiration from Li et&#xa0;al. (<xref ref-type="bibr" rid="B23">Li et&#xa0;al., 2023</xref>), who proposed the Attention-YOLOv4 algorithm to reduce background interference in detecting small target traffic signs, this paper proposes the introduction of the SimAM module (<xref ref-type="bibr" rid="B49">Yang et&#xa0;al., 2021</xref>) to improve the model&#x2019;s anti-interference ability in dead tree detection. The SimAM module, based on visual neuroscience theory, optimizes the design of the energy function to compute different neuron weights. It provides a fast closed-form solution for the optimized energy function, enhancing the feature extraction capability of the model without introducing additional parameters or increasing computational complexity.</p>
<p>The minimum energy function effectively reduces the computational cost of calculating the weight corresponding to each neuron in the dead tree feature map and discriminates the linear separability among neurons. The minimum energy function can be expressed by <xref ref-type="disp-formula" rid="eq1">Equations 1</xref>&#x2013;<xref ref-type="disp-formula" rid="eq3">3</xref>:</p>
<disp-formula id="eq1">
<label>(1)</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:msubsup>
<mml:mi>e</mml:mi>
<mml:mi>t</mml:mi>
<mml:mo>*</mml:mo>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>4</mml:mn>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mover accent="true">
<mml:mi>&#x3c3;</mml:mi>
<mml:mo>^</mml:mo>
</mml:mover>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mover accent="true">
<mml:mi>&#x3bc;</mml:mi>
<mml:mo>^</mml:mo>
</mml:mover>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:mn>2</mml:mn>
<mml:msup>
<mml:mover accent="true">
<mml:mi>&#x3c3;</mml:mi>
<mml:mo>^</mml:mo>
</mml:mover>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:mn>2</mml:mn>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq2">
<label>(2)</label>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>&#x3bc;</mml:mi>
<mml:mo>^</mml:mo>
</mml:mover>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mi>M</mml:mi>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>M</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq3">
<label>(3)</label>
<mml:math display="block" id="M3">
<mml:mrow>
<mml:msup>
<mml:mover accent="true">
<mml:mi>&#x3c3;</mml:mi>
<mml:mo>^</mml:mo>
</mml:mover>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mi>M</mml:mi>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>M</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mover accent="true">
<mml:mi>&#x3bc;</mml:mi>
<mml:mo>^</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<p>In the equation, <inline-formula>
<mml:math display="inline" id="im1">
<mml:mi>t</mml:mi>
</mml:math>
</inline-formula> represents the target neuron in a single channel of the feature map, <inline-formula>
<mml:math display="inline" id="im2">
<mml:mi>x</mml:mi>
</mml:math>
</inline-formula> represents the other neurons, <inline-formula>
<mml:math display="inline" id="im3">
<mml:mi>M</mml:mi>
</mml:math>
</inline-formula> represents the number of neurons in a single channel, <inline-formula>
<mml:math display="inline" id="im4">
<mml:mi>&#x3bb;</mml:mi>
</mml:math>
</inline-formula> represents the canonical term, and <inline-formula>
<mml:math display="inline" id="im5">
<mml:mover accent="true">
<mml:mi>&#x3bc;</mml:mi>
<mml:mo>^</mml:mo>
</mml:mover>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im6">
<mml:mrow>
<mml:msup>
<mml:mover accent="true">
<mml:mi>&#x3c3;</mml:mi>
<mml:mo>^</mml:mo>
</mml:mover>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> represent the mean and variance of the other neurons in a single channel, respectively. According to the above equations, it can be seen that the smaller the minimum capability <inline-formula>
<mml:math display="inline" id="im7">
<mml:mrow>
<mml:msubsup>
<mml:mi>e</mml:mi>
<mml:mi>t</mml:mi>
<mml:mo>*</mml:mo>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> is, the more linearly separable the target neuron is from other neurons in a single channel, and the more critical it is for model feature extraction. The weights corresponding to each neuron in the feature map can be obtained from <inline-formula>
<mml:math display="inline" id="im8">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo stretchy="false">/</mml:mo>
<mml:msubsup>
<mml:mi>e</mml:mi>
<mml:mi>t</mml:mi>
<mml:mo>*</mml:mo>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>. Finally, the model undergoes overall refinement through the scaling operator (<xref ref-type="disp-formula" rid="eq4">Equation 4</xref>).</p>
<disp-formula id="eq4">
<label>(4)</label>
<mml:math display="block" id="M4">
<mml:mrow>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mover accent="true">
<mml:mi>X</mml:mi>
<mml:mo>&#x2dc;</mml:mo>
</mml:mover>
</mml:mstyle>
<mml:mo>=</mml:mo>
<mml:mtext>sigmoid</mml:mtext>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>E</mml:mi>
</mml:mstyle>
</mml:mfrac>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2609;</mml:mo>
<mml:mstyle mathvariant="bold" mathsize="normal">
<mml:mi>X</mml:mi>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Where E groups the minimum energies of all neurons, and the sigmoid function is used to prevent the value of 1/E from becoming too large. The structure of SimAM is shown in <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6</bold>
</xref>. The feature map is fed into the SimAM attention mechanism to get the weights of each neuron and then normalized. Then each neuron of the original feature map is multiplied by the corresponding weights to obtain the output feature map.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>SimAM attention mechanism.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1278161-g006.tif"/>
</fig>
</sec>
<sec id="s2_3_2">
<label>2.3.2</label>
<title>Improved YOLOv7 model with introduction of SimAM attention mechanism</title>
<p>SimAM is a plug-and-play module that enhances the network&#x2019;s representational ability by computing 3D weights, unlike channel and spatial attention mechanisms that treat each neuron equally. In the YOLOv7 network, the SimAM attention mechanism is incorporated into the backbone and neck feature extraction networks. This module aims to focus more on the detailed features of SDTs and improve the model&#x2019;s detection performance. The structure of the YOLOv7 network with the SimAM attention mechanism is depicted in <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7</bold>
</xref>.</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>SimAM module embedded design.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1278161-g007.tif"/>
</fig>
</sec>
<sec id="s2_3_3">
<label>2.3.3</label>
<title>Wise-IoU loss function</title>
<p>For SDTs detection using the YOLOv7 model, the CIoU is utilized to calculate the BBR loss of the target. <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref> shows the parameter information when the true and predicted bounding boxes are overlapped, and <xref ref-type="disp-formula" rid="eq5">Equation 5</xref> is used to calculate the IoU loss. <xref ref-type="disp-formula" rid="eq6">Equation 6</xref> is constructed to represent the BBR loss, where the penalty term <inline-formula>
<mml:math display="inline" id="im10">
<mml:mrow>
<mml:msub>
<mml:mi>R</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is used to measure the effect of geometric factors on the BBR loss. Zheng et&#xa0;al. (<xref ref-type="bibr" rid="B51">Zheng et al., 2022</xref>) simultaneously considered three geometric elements, namely the intersection-over-union ratio of the two bounding boxes, the distance between their centroids, and the aspect ratio. They constructed both <italic>L</italic>
<sub>
<italic>CIoU</italic>
</sub> and <italic>R</italic>
<sub>
<italic>CIoU</italic>
</sub> (as illustrated in <xref ref-type="disp-formula" rid="eq7">Equations 7</xref>, <xref ref-type="disp-formula" rid="eq8">8</xref>). The equations are shown as follows.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Parameter information when the true and predicted bounding boxes overlap; the union area can be expressed as <inline-formula>
<mml:math display="inline" id="im9">
<mml:mrow>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mi>u</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>w</mml:mi>
<mml:mi>h</mml:mi>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1278161-g008.tif"/>
</fig>
<disp-formula id="eq5">
<label>(5)</label>
<mml:math display="block" id="M5">
<mml:mrow>
<mml:msub>
<mml:mi>&#x2112;</mml:mi>
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mi>u</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq6">
<label>(6)</label>
<mml:math display="block" id="M6">
<mml:mrow>
<mml:msub>
<mml:mi>&#x2112;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>&#x2112;</mml:mi>
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>&#x211b;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq7">
<label>(7)</label>
<mml:math display="block" id="M7">
<mml:mrow>
<mml:msub>
<mml:mi>&#x2112;</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>&#x2112;</mml:mi>
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>&#x211b;</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq8">
<label>(8)</label>
<mml:math display="block" id="M8">
<mml:mrow>
<mml:msub>
<mml:mi>&#x211b;</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mi>W</mml:mi>
<mml:mi>g</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>+</mml:mo>
<mml:msubsup>
<mml:mi>H</mml:mi>
<mml:mi>g</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:mfrac>
<mml:mo>+</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq9">
<label>(9)</label>
<mml:math display="block" id="M9">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>&#x2112;</mml:mi>
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mtext>&#xa0;</mml:mtext>
<mml:mi>v</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>4</mml:mn>
<mml:mrow>
<mml:msup>
<mml:mi>&#x3c0;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfrac>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>tan</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mfrac>
<mml:mi>w</mml:mi>
<mml:mi>h</mml:mi>
</mml:mfrac>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mtext>tan</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</disp-formula>
<p>In <xref ref-type="disp-formula" rid="eq9">Equation 9</xref>, <italic>&#x3b1;</italic> denotes the balance parameter and <inline-formula>
<mml:math display="inline" id="im12">
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is used to measure the consistency of the aspect ratio. Labeling inaccuracies inevitably occur when annotating a large amount of data, resulting in some low-quality examples; when geometric factors are taken into account, the localization loss cannot reduce the competitiveness of these low-quality examples, which affects the generalization performance of the model. To mitigate the impact of low-quality examples, this paper replaces the CIoU loss function in the YOLOv7 network with the <inline-formula>
<mml:math display="inline" id="im13">
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
<mml:mi>v</mml:mi>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> loss function (<xref ref-type="bibr" rid="B38">Tong et&#xa0;al., 2023</xref>). By using <inline-formula>
<mml:math display="inline" id="im14">
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
<mml:mi>v</mml:mi>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, the interference caused by low-quality examples during model training is reduced, improving the overall performance of the model. The following equations show the calculation of the <inline-formula>
<mml:math display="inline" id="im15">
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
<mml:mi>v</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> loss based on attention (as illustrated in <xref ref-type="disp-formula" rid="eq10">Equations 10</xref>, <xref ref-type="disp-formula" rid="eq11">11</xref>).</p>
<disp-formula id="eq10">
<label>(10)</label>
<mml:math display="block" id="M10">
<mml:mrow>
<mml:msub>
<mml:mi>&#x2112;</mml:mi>
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
<mml:mi>v</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>&#x211b;</mml:mi>
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>&#x2112;</mml:mi>
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq11">
<label>(11)</label>
<mml:math display="block" id="M11">
<mml:mrow>
<mml:msub>
<mml:mi>&#x211b;</mml:mi>
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mtext>exp</mml:mtext>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>W</mml:mi>
<mml:mi>g</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>+</mml:mo>
<mml:msubsup>
<mml:mi>H</mml:mi>
<mml:mi>g</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>*</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Among them, <inline-formula>
<mml:math display="inline" id="im16">
<mml:mrow>
<mml:msub>
<mml:mi>&#x211b;</mml:mi>
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> can increase the model&#x2019;s focus on common examples, and * denotes the detachment of the width <inline-formula>
<mml:math display="inline" id="im17">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>g</mml:mi>
</mml:msub>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and height <inline-formula>
<mml:math display="inline" id="im18">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mi>g</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> of the enclosing box from the computational graph, which effectively prevents gradients that hinder the convergence of the <inline-formula>
<mml:math display="inline" id="im19">
<mml:mrow>
<mml:msub>
<mml:mi>&#x211b;</mml:mi>
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> even if geometrical factors such as aspect ratios are not taken into account.</p>
<p>Based on the dynamic non-monotonic focusing mechanism, the degree of difference of the anchor box is denoted by <inline-formula>
<mml:math display="inline" id="im20">
<mml:mi>&#x3b2;</mml:mi>
</mml:math>
</inline-formula> (as illustrated in <xref ref-type="disp-formula" rid="eq12">Equation 12</xref>). The mechanism primarily focuses on prioritizing common examples, reducing the gradient gain allocated to high- and low-quality examples, and preventing larger gradients of low-quality examples from interfering with the BBR. The coefficient <inline-formula>
<mml:math display="inline" id="im21">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is set to construct the <inline-formula>
<mml:math display="inline" id="im22">
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
<mml:mi>v</mml:mi>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> loss function (as illustrated in <xref ref-type="disp-formula" rid="eq13">Equation 13</xref>).</p>
<disp-formula id="eq12">
<label>(12)</label>
<mml:math display="block" id="M12">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mi>&#x2112;</mml:mi>
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
<mml:mo>*</mml:mo>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mi>&#x2112;</mml:mi>
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>+</mml:mo>
<mml:mi>&#x221e;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq13">
<label>(13)</label>
<mml:math display="block" id="M13">
<mml:mrow>
<mml:msub>
<mml:mi>&#x2112;</mml:mi>
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
<mml:mi>v</mml:mi>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>r</mml:mi>
<mml:msub>
<mml:mi>&#x2112;</mml:mi>
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
<mml:mi>v</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>r</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mi>&#x3b2;</mml:mi>
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
<mml:msup>
<mml:mi>&#x3b1;</mml:mi>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>When <inline-formula>
<mml:math display="inline" id="im23">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>=</mml:mo>
<mml:mi>&#x3b4;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math display="inline" id="im24">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. <inline-formula>
<mml:math display="inline" id="im25">
<mml:mi>&#x3b2;</mml:mi>
</mml:math>
</inline-formula> varies with <inline-formula>
<mml:math display="inline" id="im26">
<mml:mrow>
<mml:msub>
<mml:mi>&#x2112;</mml:mi>
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, allowing the gradient gain assignment criterion to adapt accordingly. This ensures that <inline-formula>
<mml:math display="inline" id="im27">
<mml:mrow>
<mml:mi>W</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
<mml:mi>v</mml:mi>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> can dynamically adjust the gradient gain assignment, giving greater attention to common examples in a timely manner.</p>
<p>Setting the momentum <inline-formula>
<mml:math display="inline" id="im28">
<mml:mi>m</mml:mi>
</mml:math>
</inline-formula> effectively improves the focus on common examples early in model training (as illustrated in <xref ref-type="disp-formula" rid="eq14">Equation 14</xref>), <inline-formula>
<mml:math display="inline" id="im29">
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mtext>&#xa0;</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> represents the total number of batches during training, and <inline-formula>
<mml:math display="inline" id="im30">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mtext>&#xa0;</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> denotes the number of epochs at which the IoU loss approaches convergence.</p>
<disp-formula id="eq14">
<label>(14)</label>
<mml:math display="block" id="M14">
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mroot>
<mml:mrow>
<mml:mn>0.05</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:mroot>
</mml:mrow>
</mml:math>
</disp-formula>
<p>By increasing the model&#x2019;s focus on average-quality examples, the interference caused by low-quality examples to the BBR is reduced, resulting in a more rapid and smooth BBR convergence to enhance the model&#x2019;s detection performance.</p>
</sec>
</sec>
<sec id="s2_4">
<label>2.4</label>
<title>Experimental environment and training parameter</title>
<p>The experimental environment and parameter settings used to train the model during the experiment are shown in the <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Experimental environment and training parameter.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="center">Name</th>
<th valign="top" align="center">Specification</th>
<th valign="top" align="center">Name</th>
<th valign="top" align="center">Value</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">CPU</td>
<td valign="middle" align="center">Intel(R) Xeon(R) Gold 6354</td>
<td valign="middle" align="center">Optimizer</td>
<td valign="middle" align="center">Adam</td>
</tr>
<tr>
<td valign="middle" align="center">GPU</td>
<td valign="middle" align="center">NVIDIA GeForce RTX 3090</td>
<td valign="middle" align="center">Epochs</td>
<td valign="middle" align="center">300</td>
</tr>
<tr>
<td valign="middle" align="center">Operating System</td>
<td valign="middle" align="center">Ubuntu 18.04</td>
<td valign="middle" align="center">Learning Rate</td>
<td valign="middle" align="center">0.001</td>
</tr>
<tr>
<td valign="middle" align="center">Computing Platform</td>
<td valign="middle" align="center">CUDA 11.1</td>
<td valign="middle" align="center">Weight Decay</td>
<td valign="middle" align="center">0.0005</td>
</tr>
<tr>
<td valign="middle" align="center">Framework</td>
<td valign="middle" align="center">Pytorch 1.8.2</td>
<td valign="middle" align="center">Momentum</td>
<td valign="middle" align="center">0.937</td>
</tr>
<tr>
<td valign="middle" align="center">Language</td>
<td valign="middle" align="center">Python 3.8</td>
<td valign="middle" align="center">Batch Size</td>
<td valign="middle" align="center">16</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Training the model from scratch would lead to slow convergence and poor results. In this paper, pretrained weights were used to accelerate the convergence of the model when experiments were conducted on the SDTs dataset. The initial learning rate is set to 0.001, a cosine annealing strategy is used to adjust the learning rate, the input images are adaptively resized to 640&#xd7;640, and 300 epochs of training are performed.</p>
</sec>
<sec id="s2_5">
<label>2.5</label>
<title>Evaluation metrics</title>
<p>The model detection performance was evaluated by comparing the magnitude of precision (P), recall (R), mean average precision (mAP) and frames per second (FPS) for detecting SDTs images before and after the model improvement, while ensuring that the experimental environments were the same. The precision represents the proportion of positive targets among all targets predicted by the model, and the recall represents the proportion of positive targets among all ground-truth targets predicted by the model (as illustrated in <xref ref-type="disp-formula" rid="eq15"><bold>Equations 15</bold></xref>, <xref ref-type="disp-formula" rid="eq16"><bold>16</bold></xref>).</p>
<disp-formula id="eq15">
<label>(15)</label>
<mml:math display="block" id="M15">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq16">
<label>(16)</label>
<mml:math display="block" id="M16">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>According to the true target bounding boxes and predicted target bounding boxes, they are categorized into true positive cases, false positive cases, false negative cases, and true negative cases, and their corresponding numbers of detection boxes are denoted by TP, FP, FN, and TN, respectively. Neither precision nor recall metrics alone can show the detection capability of the model.</p>
<p>In order to comprehensively evaluate the detection performance of the model, the P-R curve is drawn with R as the horizontal coordinate and P (the maximum P value is taken when R is the same) as the vertical coordinate, and the area enclosed by the curve and the coordinate axes is recorded as the average precision (AP) of single-category object detection. The mean Average Precision (mAP) represents the mean of the AP of each category, which is calculated by <xref ref-type="disp-formula" rid="eq17">Equations 17</xref>, <xref ref-type="disp-formula" rid="eq18">18</xref>:</p>
<disp-formula id="eq17">
<label>(17)</label>
<mml:math display="block" id="M17">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>=</mml:mo>
<mml:mstyle displaystyle="true">
<mml:mrow>
<mml:msubsup>
<mml:mo>&#x222b;</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq18">
<label>(18)</label>
<mml:math display="block" id="M18">
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>N</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Where N is the number of target categories, in this paper, the detection target category is only SDTs, so AP = mAP in the following.</p>
</sec>
</sec>
<sec id="s3" sec-type="results">
<label>3</label>
<title>Results</title>
<sec id="s3_1">
<label>3.1</label>
<title>Comparison of different attention mechanisms</title>
<p>To validate the effectiveness of the improved algorithm with the introduction of the attention mechanism, this paper employed the SimAM attention mechanism and compared it with another simple yet effective module, the Parameter-Free Average Attention Module (PfAAM)(<xref ref-type="bibr" rid="B19">K&#xf6;rber, 2022</xref>). We further compared these mechanisms to the Squeeze-and-Excitation Networks (SE) channel attention mechanism (<xref ref-type="bibr" rid="B15">Hu et&#xa0;al., 2018</xref>), the hybrid attention mechanism Convolutional Block Attention Module (CBAM) (<xref ref-type="bibr" rid="B47">Woo et&#xa0;al., 2018</xref>), and Coordinate Attention (CA) (<xref ref-type="bibr" rid="B14">Hou et&#xa0;al., 2021</xref>), which incorporates location information into channel attention. In order to ensure a rigorous and effective comparison, different attention mechanisms are added to the same network position while keeping the rest of the network structure unchanged. The experimental environment and model training parameters are kept consistent, and the weights are loaded for testing and comparison after training. The experimental results are presented in the <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Comparison of detection results of different attention mechanisms.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" colspan="5" align="center">Attention Mechanisms</th>
<th valign="middle" rowspan="2" align="center">Parameters (M)</th>
<th valign="middle" rowspan="2" align="center">
<italic>P</italic>(%)</th>
<th valign="middle" rowspan="2" align="center">
<italic>R</italic>(%)</th>
<th valign="middle" rowspan="2" align="center">
<italic>mAP</italic>
<break/>@0.5 (%)</th>
<th valign="middle" rowspan="2" align="center">
<italic>mAP</italic>
<break/>@0.5:0.95(%)</th>
<th valign="middle" rowspan="2" align="center">FPS</th>
</tr>
<tr>
<th valign="middle" align="center">SE</th>
<th valign="middle" align="center">CBAM</th>
<th valign="middle" align="center">CA</th>
<th valign="middle" align="center">PfAAM</th>
<th valign="middle" align="center">SimAM</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="top" align="center"/>
<td valign="top" align="center"/>
<td valign="middle" align="center">35.47</td>
<td valign="middle" align="center">90.64</td>
<td valign="middle" align="center">90.85</td>
<td valign="middle" align="center">96.47</td>
<td valign="middle" align="center">73.17</td>
<td valign="middle" align="center">122</td>
</tr>
<tr>
<td valign="middle" align="center">&#x221a;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="top" align="center"/>
<td valign="top" align="center"/>
<td valign="middle" align="center">35.8</td>
<td valign="middle" align="center">91.65</td>
<td valign="middle" align="center">91.53</td>
<td valign="middle" align="center">97.03</td>
<td valign="middle" align="center">73.58</td>
<td valign="middle" align="center">105</td>
</tr>
<tr>
<td valign="middle" align="center"/>
<td valign="middle" align="center">&#x221a;</td>
<td valign="middle" align="center"/>
<td valign="top" align="center"/>
<td valign="top" align="center"/>
<td valign="middle" align="center">35.89</td>
<td valign="middle" align="center">91.53</td>
<td valign="middle" align="center">89.63</td>
<td valign="middle" align="center">96.36</td>
<td valign="middle" align="center">72.62</td>
<td valign="middle" align="center">79</td>
</tr>
<tr>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">&#x221a;</td>
<td valign="top" align="center"/>
<td valign="top" align="center"/>
<td valign="middle" align="center">35.76</td>
<td valign="middle" align="center">91.46</td>
<td valign="middle" align="center">92.01</td>
<td valign="middle" align="center">97.13</td>
<td valign="middle" align="center">74.03</td>
<td valign="middle" align="center">93</td>
</tr>
<tr>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="top" align="center">&#x221a;</td>
<td valign="top" align="center"/>
<td valign="middle" align="center">35.47</td>
<td valign="middle" align="center">91.64</td>
<td valign="middle" align="center">92.73</td>
<td valign="middle" align="center">97.29</td>
<td valign="middle" align="center">74.08</td>
<td valign="middle" align="center">109</td>
</tr>
<tr>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center"/>
<td valign="top" align="center"/>
<td valign="top" align="center">&#x221a;</td>
<td valign="middle" align="center">35.47</td>
<td valign="middle" align="center">92.89</td>
<td valign="middle" align="center">92.03</td>
<td valign="middle" align="center">97.48</td>
<td valign="middle" align="center">74.14</td>
<td valign="middle" align="center">112</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>From <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>, it can be observed that the CBAM_YOLOv7 model has the highest number of parameters compared to the YOLOv7 model, with an increase of 0.42M. It shows a slight improvement of 0.89% in precision, a decrease of 1.22% in recall, a decrease of 0.11% in mAP@0.5, and a significant reduction in detection speed by 43 FPS. The SE_YOLOv7 and CA_YOLOv7 model exhibits improved performance in all metrics except for a decrease in detection speed and an increase in the number of parameters. The SE_YOLOv7 model shows a 1.01% increase in precision value, a 0.68% increase in recall value, and a 0.56% increase in mAP@0.5 value compared to the YOLOv7 model. The CA_YOLOv7 model shows a 0.82% increase in precision value, a 1.16% increase in recall value, and a 0.66% increase in mAP@0.5 value compared to the YOLOv7 model. Without adding more parameters, PfAAM_YOLOv7 demonstrated improvements of 1% in precision value, 1.88% in recall value, and 0.82% in mAP@0.5 value compared to YOLOv7. The SimAM_YOLOv7 model demonstrates optimal performance in all metrics, enhancing precision by 2.25%, recall by 1.18%, and mAP@0.5 by 1.01%. Additionally, the SimAM_YOLOv7 model achieves the highest detection speed among the five models with attention mechanisms, reaching 112 FPS.</p>
<p>Through the comparison and analysis of the experimental results, it can be concluded that the SimAM_YOLOv7 model exhibits better detection performance compared to the original YOLOv7 model, as well as the SE_YOLOv7, CBAM_YOLOv7, CA_YOLOv7 and PfAAM_YOLOv7 models, except for a slightly lower detection speed compared to the original YOLOv7 model. Furthermore, to further verify the impact of introducing different attention mechanisms on the model&#x2019;s detection performance, the detection results are visualized and compared by loading images from the test set for each model. Some detection results are shown in <xref ref-type="fig" rid="f9">
<bold>Figure&#xa0;9</bold>
</xref>.</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>Ground-truth and prediction result.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1278161-g009.tif"/>
</fig>
<p>From the images in Group (A), it can be observed that for small target images, YOLOv7, SE_YOLOv7, CBAM_YOLOv7 and CA_YOLOv7 all exhibit instances of leakage detection, with one target missed detection. CBAM_YOLOv7 mistakenly identifies a tree with a similar color to the target as a dead standing tree, while PfAAM_YOLOv7 incorrectly identifies one dead tree as two separate instances. In Group (B), YOLOv7 misses two targets, and the visualization results indicate that even after the introduction of SE, CBAM and CA attention mechanisms, there are still instances of leakage when detecting small targets, with two targets missed detection, respectively. After the introduction of the PfAAM attention mechanism, three targets were missed. However, when using the SimAM_YOLOv7 model to detect SDTs, it successfully detects all ground-truth targets in the two test images. This suggests that it is capable of achieving comprehensive and accurate detection for images with complex backgrounds, similar colors and small targets.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Comparison of loss functions</title>
<p>The impact of different BBR loss functions on model convergence is evaluated under identical experimental conditions and model training parameters. In this paper&#x2019;s experiments, the CIoU and WIoU losses are introduced into the YOLOv7 model for comparison. The YOLOv7 model with the CIoU loss is denoted as YOLOv7-CIoU, while the YOLOv7 model with the improved loss function is referred to as YOLOv7-WIoU. The change curves of the two types of bounding box localization losses during the training process are depicted in <xref ref-type="fig" rid="f10">
<bold>Figure&#xa0;10</bold>
</xref>.</p>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>Loss function iteration comparison.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1278161-g010.tif"/>
</fig>
<p>From the <xref ref-type="fig" rid="f10">
<bold>Figure&#xa0;10</bold>
</xref>, it is evident that both the CIoU loss and the WIoU loss reach convergence before 10 epochs during the training process. However, the WIoU loss converges faster and exhibits greater stability compared to the CIoU loss. Starting from the 87th epoch, the BBR loss of YOLOv7-WIoU becomes lower than that of YOLOv7-CIoU, and the discrepancy between the two loss values further amplifies in subsequent training. Eventually, at the end of training, the BBR loss values for YOLOv7-WIoU and YOLOv7-CIoU are 0.07537 and 0.07556, respectively, indicating a reduction of 0.00019 in the WIoU loss compared to the CIoU loss. In summary, this experiment validates the effectiveness of utilizing the WIoU loss and demonstrates its significance in enhancing the model&#x2019;s performance.</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Ablation experiments</title>
<p>The improvement algorithm proposed in this paper focuses on two key enhancements. To further evaluate the algorithm&#x2019;s performance, we integrated SimAM into the backbone and neck of the YOLOv7 model for separate comparisons. Ablation experiments are conducted by loading the weights of the trained model with different improvement points into the network. The experiments are divided into six groups, controlling variables while ensuring that the experimental environment and training parameters remain unchanged. The resulting experimental outcomes are presented in the <xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref>.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Results of ablation experiments.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="center">YOLOv7</th>
<th valign="middle" colspan="3" align="center">SimAM</th>
<th valign="middle" rowspan="2" align="center">WIoU</th>
<th valign="middle" rowspan="2" align="center">
<italic>P</italic>(%)</th>
<th valign="middle" rowspan="2" align="center">
<italic>R</italic>(%)</th>
<th valign="middle" rowspan="2" align="center">
<italic>mAP</italic>
<break/>@0.5(%)</th>
<th valign="middle" rowspan="2" align="center">
<italic>mAP</italic>
<break/>@0.5:0.95(%)</th>
<th valign="middle" rowspan="2" align="center">FPS</th>
</tr>
<tr>
<th valign="middle" align="center">Backbone</th>
<th valign="middle" colspan="2" align="center">Neck</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">&#x221a;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center" colspan="2"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">90.64</td>
<td valign="middle" align="center">90.85</td>
<td valign="middle" align="center">96.47</td>
<td valign="middle" align="center">73.17</td>
<td valign="middle" align="center">122</td>
</tr>
<tr>
<td valign="middle" align="center">&#x221a;</td>
<td valign="middle" align="center">&#x221a;</td>
<td valign="middle" align="center" colspan="2"/>
<td valign="middle" align="center"/>
<td valign="middle" align="center">92.73</td>
<td valign="middle" align="center">91.94</td>
<td valign="middle" align="center">97.16</td>
<td valign="middle" align="center">73.38</td>
<td valign="middle" align="center">115</td>
</tr>
<tr>
<td valign="middle" align="center">&#x221a;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center" colspan="2">&#x221a;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">92.12</td>
<td valign="middle" align="center">91.84</td>
<td valign="middle" align="center">96.95</td>
<td valign="middle" align="center">73.20</td>
<td valign="middle" align="center">114</td>
</tr>
<tr>
<td valign="middle" align="center">&#x221a;</td>
<td valign="middle" align="center">&#x221a;</td>
<td valign="middle" align="center" colspan="2">&#x221a;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center">92.89</td>
<td valign="middle" align="center">92.03</td>
<td valign="middle" align="center">97.48</td>
<td valign="middle" align="center">74.14</td>
<td valign="middle" align="center">112</td>
</tr>
<tr>
<td valign="middle" align="center">&#x221a;</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center" colspan="2"/>
<td valign="middle" align="center">&#x221a;</td>
<td valign="middle" align="center">93.26</td>
<td valign="middle" align="center">91.6</td>
<td valign="middle" align="center">97.34</td>
<td valign="middle" align="center">74.77</td>
<td valign="middle" align="center">123</td>
</tr>
<tr>
<td valign="middle" align="center">&#x221a;</td>
<td valign="middle" align="center">&#x221a;</td>
<td valign="middle" align="center" colspan="2">&#x221a;</td>
<td valign="middle" align="center">&#x221a;</td>
<td valign="middle" align="center">94.31</td>
<td valign="middle" align="center">93.13</td>
<td valign="middle" align="center">98.03</td>
<td valign="middle" align="center">74.94</td>
<td valign="middle" align="center">108</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The experimental results of integrating SimAM into the YOLOv7 backbone and neck, respectively, indicate that both embedding methods enhance the model&#x2019;s detection performance. The addition of SimAM to the backbone significantly enhanced the model&#x2019;s detection accuracy. Specifically, the precision, recall and mAP@0.5 values increased by 0.61%, 0.1%, and 0.21% respectively, compared to the improvements observed in the neck. However, it is clear that embedding SimAM in both backbone and neck is more effective in improving the detection performance.</p>
<p>The SimAM_YOLOv7 model, integrating SimAM into both the backbone and neck of the YOLOv7 architecture, improves performance across all metrics compared to the original YOLOv7 model, except for a reduction in detection speed by 10FPS. It achieves this improvement by extracting feature information through calculating the corresponding weights of each neuron in the feature map. By replacing the CIoU loss with the WIoU loss in YOLOv7, the convergence speed of the model is improved, and all performance indexes show improvement compared to the original YOLOv7 model. Specifically, the precision, recall and mAP@0.5 values are enhanced by 2.62%, 0.75%, and 0.87% respectively, while the detection speed improves by 1 FPS.</p>
<p>Furthermore, when both the SimAM parameter-free attention mechanism and the WIoU loss are introduced, the proposed algorithm demonstrates significantly superior performance compared to the other models. Compared to the original YOLOv7 model, the proposed algorithm shows improvements of 3.67% in precision value, 2.28% in recall value, and 1.56% in mAP@0.5 value, while the detection speed decreases by 14 FPS. These results effectively meet the real-time and accuracy requirements for SDTs detection. <xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11</bold>
</xref> presents some detection results from the YOLOv7 model and the proposed model in this paper on the SDTs dataset.</p>
<fig id="f11" position="float">
<label>Figure&#xa0;11</label>
<caption>
<p>YOLOv7 model and improved model prediction results; <bold>(A)</bold> Ground-truth. <bold>(B)</bold> Predicted by YOLOv7. <bold>(C)</bold> Predicted by ours.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1278161-g011.tif"/>
</fig>
<p>According to the <xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11</bold>
</xref>, when using the YOLOv7 model to detect dead trees, there are instances of leakage and misdetection. For example, in group (B) images, the targets on the right edge of the first and second images are not detected, and in the third picture, the model mistakenly detects similarly colored land and healthy standing trees as SDTs.</p>
<p>However, when comparing the results of the improved model proposed in this paper in group (C) images, it shows better detection performance for targets at the image edges and effectively improves the confidence level of detecting dead trees. The improved model is less affected by complex backgrounds and similar standing tree colors. The visualization results in <xref ref-type="fig" rid="f11">
<bold>Figure&#xa0;11</bold>
</xref> demonstrate that introducing the attentional mechanism and the WIoU loss in this paper without increasing model parameters enhances the overall performance of the model, despite a slight reduction in detection speed. The proposal of the improved model is of great significance for further research on SDTs detection and forest resource preservation.</p>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Comparison experiments</title>
<p>In order to substantiate the superiority of the proposed enhanced model, we conducted a comparative analysis with other commonly used algorithmic models under identical experimental conditions and dataset. The <xref ref-type="fig" rid="f12">
<bold>Figure&#xa0;12</bold>
</xref> illustrates the mAP@0.5-value change curve for each model during the training process, while <xref ref-type="table" rid="T5">
<bold>Table&#xa0;5</bold>
</xref> presents the results of the comparative experiments.</p>
<fig id="f12" position="float">
<label>Figure&#xa0;12</label>
<caption>
<p>Different model training processes.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1278161-g012.tif"/>
</fig>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>Results of comparison experiments.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">Backbone</th>
<th valign="middle" align="center">
<italic>mAP</italic>
<break/>@0.5(%)</th>
<th valign="middle" align="center">
<italic>mAP</italic>
<break/>@0.5:0.95(%)</th>
<th valign="middle" align="center">FPS</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">YOLOv8s</td>
<td valign="middle" align="center">Darknet53</td>
<td valign="middle" align="center">97.58</td>
<td valign="middle" align="center">74.62</td>
<td valign="middle" align="center">127</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv7</td>
<td valign="middle" align="center">Darknet53</td>
<td valign="middle" align="center">96.47</td>
<td valign="middle" align="center">73.17</td>
<td valign="middle" align="center">122</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv5s</td>
<td valign="middle" align="center">Darknet53</td>
<td valign="middle" align="center">94.59</td>
<td valign="middle" align="center">65.18</td>
<td valign="middle" align="center">112</td>
</tr>
<tr>
<td valign="middle" align="center">YOLOv4</td>
<td valign="middle" align="center">Darknet53</td>
<td valign="middle" align="center">79.15</td>
<td valign="middle" align="center">49.06</td>
<td valign="middle" align="center">50</td>
</tr>
<tr>
<td valign="middle" align="center">Faster R-CNN</td>
<td valign="middle" align="center">ResNet50_FPN</td>
<td valign="middle" align="center">89.89</td>
<td valign="middle" align="center">54.59</td>
<td valign="middle" align="center">31</td>
</tr>
<tr>
<td valign="middle" align="center">Ours</td>
<td valign="middle" align="center">SimAM_<break/>Darknet53</td>
<td valign="middle" align="center">98.03</td>
<td valign="middle" align="center">74.94</td>
<td valign="middle" align="center">108</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The figure illustrates that in this experiment, all models were trained for 300 epochs. It&#x2019;s evident that throughout the training process, the detection accuracy of the models (typically denoted by mAP@0.5) steadily enhances, ultimately reaching a stable state. This signifies the convergence of the model training process. Faster R-CNN demonstrates the quickest increase in mAP@0.5 compared to other models and stabilizes first. The final mAP@0.5 values for each model align with the results presented in <xref ref-type="table" rid="T5">
<bold>Table&#xa0;5</bold>
</xref>.</p>
<p>From the <xref ref-type="table" rid="T5">
<bold>Table&#xa0;5</bold>
</xref>, it is evident that the proposed model in this paper achieves a higher mAP@0.5 value of 98.03% on the SDTs dataset, outperforming other mainstream models. Only YOLOv5s, YOLOv7 and YOLOv8s exhibit detection accuracies above 90%, which are lower by 3.44%, 1.56% and 0.45% respectively, compared to the improved model proposed in this paper. Among the other models, the Faster R-CNN model with ResNet50+FPN as the backbone network demonstrates the highest mAP@0.5 value of 89.89%, while the detection mAP@0.5 value of the YOLOv4 models does not exceed 80%. These results verify the effectiveness and superiority of the improved model proposed in this paper in terms of dead trees detection accuracy.</p>
<p>In terms of model detection speed, specifically real-time detection performance, only four models, including the proposed model, achieve a speed higher than 100 FPS. The proposed model in this paper exhibits a detection speed of 108 FPS, which is slightly lower than YOLOv8s, YOLOv7 and YOLOv5s models by 19, 14 and 4 FPS, respectively. However, it still holds a significant advantage over other models and fully satisfies the real-time demand for SDTs detection. Considering the detection mAP@0.5 and speed across various models, single-stage YOLO models, including YOLOv5 and subsequent versions, exhibit significant advantages over two-stage Faster R-CNN models. Our proposed improved model, in particular, surpasses other models in terms of detection performance. The potential reasons for the superior performance of our proposed model are twofold: Firstly, the SimAM attention mechanism significantly enhances the model&#x2019;s ability to extract individual features of standing dead trees without introducing additional parameters. This reduction in interference from complex backgrounds and variations in the scale of dead trees during the detection process alleviates issues of both missed detections and false positives, thereby improving the accuracy of small target detection. Secondly, the replacement of the CIoU loss function with WIoU enhances the model&#x2019;s robustness by focusing on bounding boxes of ordinary quality. This improvement accelerates the model&#x2019;s convergence speed, further enhancing the accuracy of automatic detection of standing dead trees in high-resolution aerial remote sensing images.</p>
</sec>
<sec id="s3_5">
<label>3.5</label>
<title>Impact of different sized datasets on model performance</title>
<p>To verify the effectiveness of the data augmentation method employed in this study, 1928 images were taken as the basis, and the data volume was expanded by 2 times and 4 times, respectively. The three datasets were then used to train the improved YOLOv7 model. The experimental results are summarized in the <xref ref-type="table" rid="T6">
<bold>Table&#xa0;6</bold>
</xref>.</p>
<table-wrap id="T6" position="float">
<label>Table&#xa0;6</label>
<caption>
<p>Comparison of different data volumes.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Number</th>
<th valign="middle" align="center">
<italic>P</italic>(%)</th>
<th valign="middle" align="center">
<italic>R</italic>(%)</th>
<th valign="middle" align="center">
<italic>mAP</italic>
<break/>@0.5(%)</th>
<th valign="middle" align="center">
<italic>mAP</italic>
<break/>@0.5:0.95(%)</th>
<th valign="middle" align="center">FPS</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="center">1928</td>
<td valign="top" align="center">72.48</td>
<td valign="top" align="center">69.40</td>
<td valign="top" align="center">76.70</td>
<td valign="top" align="center">44.52</td>
<td valign="top" align="center">108.40</td>
</tr>
<tr>
<td valign="top" align="center">5784</td>
<td valign="top" align="center">85.46</td>
<td valign="top" align="center">86.15</td>
<td valign="top" align="center">92.47</td>
<td valign="top" align="center">64.86</td>
<td valign="top" align="center">108.28</td>
</tr>
<tr>
<td valign="top" align="center">9640</td>
<td valign="middle" align="center">94.31</td>
<td valign="middle" align="center">93.13</td>
<td valign="middle" align="center">98.03</td>
<td valign="top" align="center">74.94</td>
<td valign="top" align="center">108.53</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The <xref ref-type="table" rid="T6">
<bold>Table&#xa0;6</bold>
</xref> shows that as the amount of data input into the model increases, the values of precision, recall, and mAP@0.5 gradually increase, but the growth rate slows down. When the dataset size reaches 9640 images, which was prepared for this study, the indicators reach their maximum values: precision at 94.31%, recall at 93.13%, and mAP@0.5 at 98.03%. The <xref ref-type="fig" rid="f13">
<bold>Figure&#xa0;13</bold>
</xref> illustrates the corresponding precision, recall, and mAP@0.5 metrics for the three datasets, along with their trends of change.</p>
<fig id="f13" position="float">
<label>Figure&#xa0;13</label>
<caption>
<p>The detection results and metric trends for datasets of different sizes.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1278161-g013.tif"/>
</fig>
</sec>
</sec>
<sec id="s4" sec-type="discussion">
<label>4</label>
<title>Discussion</title>
<p>Efficient and accurate automated identification of SDTs in forests is crucial for safeguarding forest resources and biodiversity. Conventional dead trees detection methods often encounter challenges such as difficulty, high expenses, and limited timeliness. To address these issues, this paper combines airborne remote sensing and deep learning techniques to achieve real-time and efficient automated identification of individual SDTs.</p>
<p>In this study, we utilized airborne remote sensing images with a ground resolution of 0.12 m, captured from an altitude of 1,000 m, as the data source. To meet the accuracy and real-time requirements, we proposed the improved YOLOv7 model for automated identification of dead trees. The model achieved precision, recall, mAP@0.5 and FPS values of 94.31%, 93.13%, 98.03%, 108 respectively. In similar studies of dead tree detection, Chiang et&#xa0;al. applied transfer learning to the Mask RCNN network for automated detection of SDTs in aerial images (<xref ref-type="bibr" rid="B6">Chiang et&#xa0;al., 2020</xref>). Li et&#xa0;al. proposed the LLAM-MDCNet method based on the MDCN network to detect clusters of dead trees in aerial images, aiming to reduce the interference from complex backgrounds and variable target scales by introducing the LIAM attention module (<xref ref-type="bibr" rid="B24">Li et&#xa0;al., 2022</xref>). However, their methods did not achieve the recognition of individual dead trees. To address the issue, Jiang et&#xa0;al. improved the Faster R-CNN algorithm based on the Swin-Transformer to enhance the learning of global information, enabling the recognition of individual SDTs in UAV images (<xref ref-type="bibr" rid="B16">Jiang et&#xa0;al., 2023</xref>). However, due to the two-stage nature of the Faster R-CNN algorithm, involving candidate region extraction, target classification and bounding box regression (BBR), the detection speed is slower, making it challenging to meet real-time requirements for SDTs detection (<xref ref-type="bibr" rid="B34">Ren et&#xa0;al., 2017</xref>). In an effort to improve the trade-off between accuracy and efficiency, Wang et&#xa0;al. proposed the LDS-YOLO method, which reduced the number of model parameters by enhancing the backbone network and introducing the SoftPool method into the SPP module. 
The accuracy of dead trees detection achieved with LDS-YOLO was reported as 89.11%, meeting the real-time requirements for automated dead trees detection (<xref ref-type="bibr" rid="B43">Wang et&#xa0;al., 2022</xref>). However, there is room for improvement in terms of the accuracy of detecting small targets. Compared to previous studies, the model proposed in this paper exhibits superior detection performance in both accuracy and speed when identifying small-target dead trees at the individual tree scale. After screening the dataset, it became evident that the sample size was insufficient. To enhance the model&#x2019;s robustness and detection performance, data augmentation techniques such as random flipping, mirroring, and brightness adjustments were applied. Comparative experiments were conducted using the improved YOLOv7 model with varying data volumes. The results indicate that the mAP@0.5 value is positively correlated with the data volume, especially when the data volume is limited. The provided sample dataset in this paper reveals a dense distribution of trees in the Forestry Farm, with small and scattered canopies of dead trees. Challenges arise when recognizing dead trees, especially when shadows cast by tall trees obscure them, and in complex backgrounds where distinguishing dead trees becomes difficult. Visualizations in <xref ref-type="fig" rid="f9">
<bold>Figures&#xa0;9</bold>
</xref> and <xref ref-type="fig" rid="f11">
<bold>11</bold>
</xref> indicate omissions and misdetections when using the YOLOv7 model, suggesting that the model has not fully learned the distinctive features of dead trees. This study addresses this issue by introducing SimAM and WIoU, which extract more detailed features from the limited dataset and improve the model&#x2019;s focus on bounding boxes of average quality. Consequently, the detection accuracy of the model is significantly enhanced. Furthermore, the improved YOLOv7 model exhibits faster detection speed compared to other mainstream models. This capability is crucial for rangers to monitor forest dynamics in real-time, and to effectively manage and protect forest resources.</p>
<p>This study has several limitations. Firstly, the improved model may face occlusion by healthy trees when recognizing dead trees, leading to misdetection and reducing the accuracy of the model. To address this issue, referring to Hell et&#xa0;al. (<xref ref-type="bibr" rid="B12">2022</xref>) and Wing et&#xa0;al. (<xref ref-type="bibr" rid="B46">2015</xref>) for the detection of dead trees from LiDAR data, we can consider combining LiDAR data with optical remote sensing data. LiDAR data can provide information on tree height, shape, and point-cloud density, while optical remote sensing data can capture texture, color, and spectral features of the trees. By fusing the two types of features, we can achieve more accurate detection and localization of individual dead trees, thereby enhancing the accuracy of dead trees detection. Moreover, researchers have explored diseased and dead trees detection using multi-temporal multispectral images, ALS data and CIR images (<xref ref-type="bibr" rid="B17">Kaminska et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B48">Wu et&#xa0;al., 2023</xref>), which could serve as valuable references for our future research.</p>
<p>Another limitation of this study is the insufficient consideration of external environmental factors that may interfere with the experiment. Variations in lighting conditions and resolution can cause changes in the color and texture of SDTs, leading to detection interference. To mitigate the impact of these factors, we can use multiple image views or collect multi-temporal remote sensing images under different environmental conditions. This approach will provide more comprehensive tree contrast information and improve the robustness of the detection model.</p>
<p>To sum up, our proposed automatic SDTs detection model holds promising applications in forest protection and disaster prevention. In the future, we will explore and develop more real-time and efficient dead trees detection methods by incorporating multiple data sources, thereby further enhancing the accuracy and applicability of the model.</p>
</sec>
<sec id="s5" sec-type="conclusions">
<label>5</label>
<title>Conclusions</title>
<p>This study demonstrates the potential of deep learning algorithms in detecting SDTs from airborne remote sensing images. To overcome the limitations of traditional manual inventory methods, we propose an automatic detection model based on an improved YOLOv7 for efficient identification and localization of dead trees in remote sensing images. The model, built upon YOLOv7, addresses challenges posed by dense canopies and complex forest backgrounds by embedding the SimAM attention mechanism module in the backbone and neck. Compared to embedding the other four attention mechanisms in YOLOv7, the SimAM_YOLOv7 model has a smaller number of parameters and achieves higher detection accuracy. Additionally, the WIoU loss function is employed instead of the CIoU loss to enhance the model&#x2019;s focus on ordinary labeled samples, improving convergence speed and detection accuracy. The experimental results reveal that the improved model achieves precision, recall, and mAP@0.5 values of 94.31%, 93.13%, and 98.03%, respectively, representing a 3.67%, 2.28%, and 1.56% improvement over the original YOLOv7 model. Furthermore, the model outperforms other mainstream models in terms of the combined performance of detection accuracy and speed. The proposed model holds practical applications in forestry management, offering a convenient solution for forest resource and biodiversity conservation.</p>
</sec>
<sec id="s6" sec-type="data-availability">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec id="s7" sec-type="author-contributions">
<title>Author contributions</title>
<p>SW: Conceptualization, Formal analysis, Methodology, Project administration, Writing &#x2013; review &amp; editing. HZ: Conceptualization, Formal analysis, Methodology, Validation, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. ZX: Conceptualization, Project administration, Writing &#x2013; review &amp; editing. HS: Conceptualization, Formal analysis, Project administration, Writing &#x2013; review &amp; editing.</p>
</sec>
</body>
<back>
<sec id="s8" sec-type="funding-information">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. This research was funded by the Forestry Science and Technology Promotion Demonstration Project of the Central Government, Grant Number Hei (2022)TG21, and the Fundamental Research Funds for the Central Universities, Grant Number 2572022DP04.</p>
</sec>
<sec id="s9" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s10" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bernal</surname> <given-names>A. A.</given-names>
</name>
<name>
<surname>Kane</surname> <given-names>J. M.</given-names>
</name>
<name>
<surname>Knapp</surname> <given-names>E. E.</given-names>
</name>
<name>
<surname>Zald</surname> <given-names>H. S. J.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Tree resistance to drought and bark beetle-associated mortality following thinning and prescribed fire treatments</article-title>. <source>For. Ecol. Manage.</source> <volume>530</volume>, <elocation-id>120758</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.foreco.2022.120758</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Butler</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Schlaepfer</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2004</year>). <article-title>Spruce snag quantification by coupling colour infrared aerial photos and a GIS</article-title>. <source>For. Ecol. Manage.</source> <volume>195</volume>, <fpage>325</fpage>&#x2013;<lpage>339</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.foreco.2004.02.042</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Celik</surname> <given-names>T.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Unsupervised change detection in satellite images using principal component analysis and k-means clustering</article-title>. <source>IEEE Geosci. Remote Sens. Lett.</source> <volume>6</volume>, <fpage>772</fpage>&#x2013;<lpage>776</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/LGRS.2009.2025059</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chai</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Nie</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Gao</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Jia</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Qiao</surname> <given-names>Q.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Remote sensing images background noise processing method for ship objects in instance segmentation</article-title>. <source>J. Indian Soc. Remote Sens.</source> <volume>51</volume>, <fpage>647</fpage>&#x2013;<lpage>659</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s12524-022-01631-7</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Hay</surname> <given-names>G. J.</given-names>
</name>
<name>
<surname>Castilla</surname> <given-names>G.</given-names>
</name>
<name>
<surname>St-Onge</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Powers</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>A multiscale geographic object-based image analysis to estimate lidar-measured forest canopy height using Quickbird imagery</article-title>. <source>Int. J. Geogr. Inf. Sci.</source> <volume>25</volume>, <fpage>877</fpage>&#x2013;<lpage>893</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1080/13658816.2010.496729</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chiang</surname> <given-names>C.-Y.</given-names>
</name>
<name>
<surname>Barnes</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Angelov</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Deep learning-based automated forest health diagnosis from aerial images</article-title>. <source>IEEE Access</source> <volume>8</volume>, <fpage>144064</fpage>&#x2013;<lpage>144076</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ACCESS.2020.3012417</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Coops</surname> <given-names>N. C.</given-names>
</name>
<name>
<surname>Gillanders</surname> <given-names>S. N.</given-names>
</name>
<name>
<surname>Wulder</surname> <given-names>M. A.</given-names>
</name>
<name>
<surname>Gergel</surname> <given-names>S. E.</given-names>
</name>
<name>
<surname>Nelson</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Goodwin</surname> <given-names>N. R.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Assessing changes in forest fragmentation following infestation using time series Landsat imagery</article-title>. <source>For. Ecol. Manage.</source> <volume>259</volume>, <fpage>2355</fpage>&#x2013;<lpage>2365</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.foreco.2010.03.008</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Eklundh</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Johansson</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Solberg</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Mapping insect defoliation in Scots pine with MODIS time-series data</article-title>. <source>Remote Sens. Environ.</source> <volume>113</volume>, <fpage>1566</fpage>&#x2013;<lpage>1573</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.rse.2009.03.008</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Farias</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Fabregas</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Dormido-Canto</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Vega</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Vergara</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Dormido Bencomo</surname> <given-names>S.</given-names>
</name>
<etal/>
</person-group>. (<year>2018</year>). <article-title>Applying deep learning for improving image classification in nuclear fusion devices</article-title>. <source>IEEE Access</source> <volume>6</volume>, <fpage>72345</fpage>&#x2013;<lpage>72356</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ACCESS.2018.2881832</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201d;<article-title>Fast R-CNN</article-title>,&#x201d; in <conf-name>2015 IEEE International Conference on Computer Vision (ICCV)</conf-name>, <conf-loc>Santiago, Chile</conf-loc>, <volume>2015</volume>, <fpage>1440</fpage>&#x2013;<lpage>1448</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1504.08083</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Han</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Peng</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Lin</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>J.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>Detection of standing dead trees after pine wilt disease outbreak with airborne remote sensing imagery by multi-scale spatial attention deep learning and gaussian kernel approach</article-title>. <source>Remote Sens.</source> <volume>14</volume>, <elocation-id>3075</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/rs14133075</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hell</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Brandmeier</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Briechle</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Krzystek</surname> <given-names>P.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Classification of tree species and standing dead trees with lidar point clouds using two deep neural networks: pointCNN and 3DmFV-net</article-title>. <source>PFG-J. Photogramm. Remote Sens. Geoinf. Sci.</source> <volume>90</volume>, <fpage>103</fpage>&#x2013;<lpage>121</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s41064-022-00200-4</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hicke</surname> <given-names>J. A.</given-names>
</name>
<name>
<surname>Logan</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Mapping whitebark pine mortality caused by a mountain pine beetle outbreak with high spatial resolution satellite imagery</article-title>. <source>Int. J. Remote Sens.</source> <volume>30</volume>, <fpage>4427</fpage>&#x2013;<lpage>4441</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1080/01431160802566439</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Hou</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Feng</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Coordinate attention for efficient mobile network design</article-title>,&#x201d; in <conf-name>2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>Nashville, TN, USA</conf-loc>, <volume>2021</volume>, <fpage>13708</fpage>&#x2013;<lpage>13717</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR46437.2021.01350</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Hu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Shen</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>G.</given-names>
</name>
</person-group> (<year>2018</year>). <source>Squeeze-and-excitation networks</source>. Available at: <uri xlink:href="https://openaccess.thecvf.com/content_cvpr_2018/html/Hu_Squeeze-and-Excitation_Networks_CVPR_2018_paper.html">https://openaccess.thecvf.com/content_cvpr_2018/html/Hu_Squeeze-and-Excitation_Networks_CVPR_2018_paper.html</uri> (Accessed <access-date>July 14, 2023</access-date>).</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jiang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Han</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Yan</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>A multi-scale approach to detecting standing dead trees in UAV RGB images based on improved faster R-CNN</article-title>. <source>PloS One</source> <volume>18</volume>, <elocation-id>e0281084</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1371/journal.pone.0281084</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kaminska</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Lisiewicz</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Sterenczak</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Kraszewski</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Sadkowski</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Species-related single dead tree detection using multi-temporal ALS data and CIR imagery</article-title>. <source>Remote Sens. Environ.</source> <volume>219</volume>, <fpage>31</fpage>&#x2013;<lpage>43</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.rse.2018.10.005</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kim</surname> <given-names>Y.-J.</given-names>
</name>
<name>
<surname>Verghese</surname> <given-names>P.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>The selectivity of task-dependent attention varies with surrounding context</article-title>. <source>J. Neurosci.</source> <volume>32</volume>, <fpage>12180</fpage>&#x2013;<lpage>12191</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1523/JNEUROSCI.5992-11.2012</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>K&#xf6;rber</surname> <given-names>N.</given-names>
</name>
</person-group> (<year>2022</year>). <source>Parameter-free average attention improves convolutional neural network performance (Almost) free of charge</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2210.07828</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lee</surname> <given-names>C. K. F.</given-names>
</name>
<name>
<surname>Song</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Muller-Landau</surname> <given-names>H. C.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Wright</surname> <given-names>S. J.</given-names>
</name>
<name>
<surname>Cushman</surname> <given-names>K. C.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). <article-title>Cost-effective and accurate monitoring of flowering across multiple tropical tree species over two years with a time series of high-resolution drone imagery and deep learning</article-title>. <source>ISPRS-J. Photogramm. Remote Sens.</source> <volume>201</volume>, <fpage>92</fpage>&#x2013;<lpage>103</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.isprsjprs.2023.05.022</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lee</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Park</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Baek</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Kim</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>C.-W.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Detection of damaged pine tree by the pine wilt disease using UAV image</article-title>. <source>Korean J. Remote Sens.</source> <volume>35</volume>, <fpage>359</fpage>&#x2013;<lpage>373</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.7780/KJRS.2019.35.3.2</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lei</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Lv</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Du</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Nandi</surname> <given-names>A. K.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Multi-modality and multi-scale attention fusion network for land cover classification from VHR remote sensing images</article-title>. <source>Remote Sens.</source> <volume>13</volume>, <elocation-id>3771</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/rs13183771</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Meng</surname> <given-names>P.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Attention-YOLOV4: a real-time and high-accurate traffic sign detection algorithm</article-title>. <source>Multimed. Tools Appl.</source> <volume>82</volume>, <fpage>7567</fpage>&#x2013;<lpage>7582</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11042-022-13251-x</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Cai</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Xue</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>LLAM-MDCNet for detecting remote sensing images of dead tree clusters</article-title>. <source>Remote Sens.</source> <volume>14</volume>, <elocation-id>3684</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/rs14153684</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Luo</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Roques</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Early monitoring of forest wood-boring pests with remote sensing</article-title>. <source>Annu. Rev. Entomol.</source> <volume>68</volume>, <fpage>277</fpage>&#x2013;<lpage>298</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1146/annurev-ento-120220-125410</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Manning</surname> <given-names>A. D.</given-names>
</name>
<name>
<surname>Fischer</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Lindenmayer</surname> <given-names>D. B.</given-names>
</name>
</person-group> (<year>2006</year>). <article-title>Scattered trees are keystone structures - Implications for conservation</article-title>. <source>Biol. Conserv.</source> <volume>132</volume>, <fpage>311</fpage>&#x2013;<lpage>321</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.biocon.2006.04.023</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Maxwell</surname> <given-names>A. E.</given-names>
</name>
<name>
<surname>Warner</surname> <given-names>T. A.</given-names>
</name>
<name>
<surname>Fang</surname> <given-names>F.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Implementation of machine-learning classification in remote sensing: an applied review</article-title>. <source>Int. J. Remote Sens.</source> <volume>39</volume>, <fpage>2784</fpage>&#x2013;<lpage>2817</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1080/01431161.2018.1433343</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Meng</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Xie</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>W.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Mapping forest health using spectral and textural information extracted from SPOT-5 satellite images</article-title>. <source>Remote Sens.</source> <volume>8</volume>, <elocation-id>719</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/rs8090719</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Miltiadou</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Agapiou</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Gonzalez Aracil</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Hadjimitsis</surname> <given-names>D. G.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Detecting dead standing eucalypt trees from voxelised full-waveform lidar using multi-scale 3D-windows for tackling height and size variations</article-title>. <source>Forests</source> <volume>11</volume>, <elocation-id>161</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/f11020161</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nadrowski</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Wirth</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Scherer-Lorenzen</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Is forest diversity driving ecosystem function and service</article-title>? <source>Curr. Opin. Environ. Sustain.</source> <volume>2</volume>, <fpage>75</fpage>&#x2013;<lpage>79</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.cosust.2010.02.003</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Naga Srinivasu</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Krishna</surname> <given-names>T. B.</given-names>
</name>
<name>
<surname>Ahmed</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Almusallam</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Khaled Alarfaj</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Allheeib</surname> <given-names>N.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Variational autoencoders-basedSelf-learning model for tumor identification and impact analysis from 2-D MRI images</article-title>. <source>J. Healthcare Eng.</source> <volume>2023</volume>, <fpage>1</fpage>&#x2013;<lpage>17</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1155/2023/1566123</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Niu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Zhong</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>H.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A review on the attention mechanism of deep learning</article-title>. <source>Neurocomputing</source> <volume>452</volume>, <fpage>48</fpage>&#x2013;<lpage>62</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.neucom.2021.03.091</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Redmon</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Divvala</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Farhadi</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>You only look once: unified, real-time object detection</article-title>,&#x201d; in <conf-name>2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>Las Vegas, NV, USA</conf-loc>, <volume>2016</volume>, <fpage>779</fpage>&#x2013;<lpage>788</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1506.02640</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ren</surname> <given-names>S.</given-names>
</name>
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Faster R-CNN: towards real-time object detection with region proposal networks</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>39</volume>, <fpage>1137</fpage>&#x2013;<lpage>1149</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2016.2577031</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Rezatofighi</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Tsoi</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Gwak</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Sadeghian</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Reid</surname> <given-names>I.</given-names>
</name>
<name>
<surname>Savarese</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Generalized intersection over union: A metric and a loss for bounding box regression</article-title>,&#x201d; in <conf-name>2019 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>Long Beach, CA, USA</conf-loc>, <volume>2019</volume>, <fpage>658</fpage>&#x2013;<lpage>666</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2019.00075</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sirisha</surname> <given-names>U.</given-names>
</name>
<name>
<surname>Praveen</surname> <given-names>S. P.</given-names>
</name>
<name>
<surname>Srinivasu</surname> <given-names>P. N.</given-names>
</name>
<name>
<surname>Barsocchi</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Bhoi</surname> <given-names>A. K.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Statistical analysis of design aspects of various YOLO-based deep learning models for object detection</article-title>. <source>Int. J. Comput. Intell. Syst.</source> <volume>16</volume>, <fpage>126</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s44196-023-00302-w</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Srivastava</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Shukla</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Bansal</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A comprehensive review on soil classification using deep learning and computer vision techniques</article-title>. <source>Multimed. Tools Appl.</source> <volume>80</volume>, <fpage>14887</fpage>&#x2013;<lpage>14914</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11042-021-10544-5</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Tong</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>R.</given-names>
</name>
</person-group> (<year>2023</year>). <source>Wise-IoU: bounding box regression loss with dynamic focusing mechanism</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2301.10051</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Voulodimos</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Doulamis</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Doulamis</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Protopapadakis</surname> <given-names>E.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Deep learning for computer vision: A brief review</article-title>. <source>Comput. Intell. Neurosci.</source> <volume>2018</volume>, <fpage>1</fpage>&#x2013;<lpage>13</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1155/2018/7068349</pub-id>
</citation>
</ref>
<ref id="B40">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>C.-Y.</given-names>
</name>
<name>
<surname>Bochkovskiy</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Liao</surname> <given-names>H.-Y. M.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors</article-title>,&#x201d; in <conf-name>2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>Vancouver, BC, Canada</conf-loc>, <volume>2023</volume>, <fpage>7464</fpage>&#x2013;<lpage>7475</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2207.02696</pub-id>
</citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Deng</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Yan</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Zheng</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Habitat suitability of pine wilt disease in northeast China under climate change scenario</article-title>. <source>Forests</source> <volume>14</volume>, <elocation-id>1687</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/f14081687</pub-id>
</citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Gao</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Jin</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Recent advances in the application of deep learning methods to forestry</article-title>. <source>Wood Sci. Technol.</source> <volume>55</volume>, <fpage>1171</fpage>&#x2013;<lpage>1202</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s00226-021-01309-2</pub-id>
</citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Zheng</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Yuan</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Yuan</surname> <given-names>P.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>LDS-YOLO: A lightweight small object detection method for dead trees from shelter forest</article-title>. <source>Comput. Electron. Agric.</source> <volume>198</volume>, <elocation-id>107035</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2022.107035</pub-id>
</citation>
</ref>
<ref id="B44">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Pu</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Z.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Mapping robinia pseudoacacia forest health conditions by using combined spectral, spatial, and textural information extracted from IKONOS imagery and random forest classifier</article-title>. <source>Remote Sens.</source> <volume>7</volume>, <fpage>9020</fpage>&#x2013;<lpage>9044</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/rs70709020</pub-id>
</citation>
</ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Windrim</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Carnegie</surname> <given-names>A. J.</given-names>
</name>
<name>
<surname>Webster</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Bryson</surname> <given-names>M.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Tree detection and health monitoring in multispectral aerial imagery and photogrammetric pointclouds using machine learning</article-title>. <source>IEEE J. Sel. Top. Appl. Earth Observ. Remote Sens.</source> <volume>13</volume>, <fpage>2554</fpage>&#x2013;<lpage>2572</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/JSTARS.2020.2995391</pub-id>
</citation>
</ref>
<ref id="B46">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wing</surname> <given-names>B. M.</given-names>
</name>
<name>
<surname>Ritchie</surname> <given-names>M. W.</given-names>
</name>
<name>
<surname>Boston</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Cohen</surname> <given-names>W. B.</given-names>
</name>
<name>
<surname>Olsen</surname> <given-names>M. J.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Individual snag detection using neighborhood attribute filtered airborne lidar data</article-title>. <source>Remote Sens. Environ.</source> <volume>163</volume>, <fpage>165</fpage>&#x2013;<lpage>179</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.rse.2015.03.013</pub-id>
</citation>
</ref>
<ref id="B47">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Woo</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Park</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>J.-Y.</given-names>
</name>
<name>
<surname>Kweon</surname> <given-names>I. S.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>CBAM: convolutional block attention module</article-title>,&#x201d; in <conf-name>2018 European Conference on Computer Vision (ECCV)</conf-name>, <conf-loc>Munich, Germany</conf-loc>, <volume>2018</volume>, <fpage>3</fpage>&#x2013;<lpage>19</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/978-3-030-01234-2_1</pub-id>
</citation>
</ref>
<ref id="B48">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>X.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). <article-title>Detection of the monitoring window for pine wilt disease using multi-temporal UAV-based multispectral imagery and machine learning algorithms</article-title>. <source>Remote Sens.</source> <volume>15</volume>, <elocation-id>444</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/rs15020444</pub-id>
</citation>
</ref>
<ref id="B49">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Yang</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>R.-Y.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Xie</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>SimAM: A simple, parameter-free attention module for convolutional neural networks</article-title>,&#x201d; in <conf-name>Proceedings of the 38th International Conference on Machine Learning (ICML)</conf-name>, <conf-loc>Vienna, Austria</conf-loc>, <volume>2021</volume>, <fpage>11863</fpage>&#x2013;<lpage>11874</lpage>. Available at: <uri xlink:href="https://proceedings.mlr.press/v139/yang21o.html">https://proceedings.mlr.press/v139/yang21o.html</uri>.</citation>
</ref>
<ref id="B50">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Song</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Lei</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>G.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>MKLM: a multiknowledge learning module for object detection in remote sensing images</article-title>. <source>Int. J. Remote Sens.</source> <volume>43</volume>, <fpage>2244</fpage>&#x2013;<lpage>2267</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1080/01431161.2022.2061316</pub-id>
</citation>
</ref>
<ref id="B51">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zheng</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Ye</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>Q.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>Enhancing geometric factors in model learning and inference for object detection and instance segmentation</article-title>. <source>IEEE T. Cybern.</source> <volume>52</volume>, <fpage>8574</fpage>&#x2013;<lpage>8586</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TCYB.2021.3095305</pub-id>
</citation>
</ref>
<ref id="B52">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zheng</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Yuan</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Fu</surname> <given-names>H.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). <article-title>Surveying coconut trees using high-resolution satellite imagery in remote atolls of the Pacific Ocean</article-title>. <source>Remote Sens. Environ.</source> <volume>287</volume>, <elocation-id>113485</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.rse.2023.113485</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>