<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2024.1327237</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Plant Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Pest recognition in microstates state: an improvement of YOLOv7 based on Spatial and Channel Reconstruction Convolution for feature redundancy and vision transformer with Bi-Level Routing Attention</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>He</surname>
<given-names>Junjie</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2409672"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Zhang</surname>
<given-names>Shihao</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yang</surname>
<given-names>Chunhua</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2409648"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Houqiao</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2557072"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Gao</surname>
<given-names>Jun</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2409686"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Huang</surname>
<given-names>Wei</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Qiaomei</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Xinghua</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yuan</surname>
<given-names>Wenxia</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2409969"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wu</surname>
<given-names>Yamin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2409778"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Lei</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2409805"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Xu</surname>
<given-names>Jiayi</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Zejun</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Rukui</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Wang</surname>
<given-names>Baijuan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2407657"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>College of Tea Science, Yunnan Agricultural University</institution>, <addr-line>Kunming</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Key Laboratory of Intelligent Organic Tea Garden Construction in University of Yunnan Province, Yunnan Agricultural University</institution>, <addr-line>Kunming</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Guoxiong Zhou, Central South University Forestry and Technology, China</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Sijia Yu, Rutgers, The State University of New Jersey, United States</p>
<p>Yunchao Tang, Guangxi University, China</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Baijuan Wang, <email xlink:href="mailto:wangbaijuan2023@163.com">wangbaijuan2023@163.com</email>
</p>
</fn>
<fn fn-type="equal" id="fn003">
<p>&#x2020;These authors have contributed equally to this work</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>05</day>
<month>02</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>15</volume>
<elocation-id>1327237</elocation-id>
<history>
<date date-type="received">
<day>24</day>
<month>10</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>12</day>
<month>01</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2024 He, Zhang, Yang, Wang, Gao, Huang, Wang, Wang, Yuan, Wu, Li, Xu, Wang, Zhang and Wang</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>He, Zhang, Yang, Wang, Gao, Huang, Wang, Wang, Yuan, Wu, Li, Xu, Wang, Zhang and Wang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>In order to solve the problem of precise identification and counting of tea pests, this study has proposed a novel tea pest identification method based on an improved YOLOv7 network.</p>
</sec>
<sec>
<title>Methods</title>
<p>This method used MPDIoU to optimize the original loss function, which improved the convergence speed of the model and simplified the calculation process. Part of the network structure of the original model was replaced using Spatial and Channel Reconstruction Convolution to reduce redundant features, lower the complexity of the model, and reduce computational costs. The Vision Transformer with Bi-Level Routing Attention was incorporated to enhance the flexibility of model calculation allocation and content perception.</p>
</sec>
<sec>
<title>Results</title>
<p>The experimental results revealed that the enhanced YOLOv7 model significantly boosted Precision, Recall, F1, and mAP by 5.68%, 5.14%, 5.41%, and 2.58% respectively, compared to the original YOLOv7. Furthermore, when compared to deep learning networks such as SSD, Faster Region-based Convolutional Neural Network (RCNN), and the original YOLOv7, this method proves to be superior while being externally validated. It exhibited a noticeable improvement in the FPS rates, with increments of 5.75 Hz, 34.42 Hz, and 25.44 Hz respectively. Moreover, the mAP for actual detection experienced significant enhancements, with respective increases of 2.49%, 12.26%, and 7.26%. Additionally, the parameter size is reduced by 1.39 G relative to the original model.</p>
</sec>
<sec>
<title>Discussion</title>
<p>The improved model can not only identify and count tea pests efficiently and accurately, but also has the characteristics of high recognition rate, low parameters and high detection speed. It is of great significance to realize the intelligent and precise prevention and control of tea pests.</p>
</sec>
</abstract>
<abstract abstract-type="graphical">
<title>Graphical Abstract</title>
<p>
<graphic xlink:href="fpls-15-1327237-g010.tif" position="anchor"/>
</p>
</abstract>
<kwd-group>
<kwd>pest identification</kwd>
<kwd>improved YOLOv7</kwd>
<kwd>MPDIoU</kwd>
<kwd>Spatial and Channel Reconstruction Convolution</kwd>
<kwd>vision transformer with Bi-Level Routing Attention</kwd>
</kwd-group>
<contract-sponsor id="cn001">Yunnan Provincial Science and Technology Department<named-content content-type="fundref-id">10.13039/501100008871</named-content>
</contract-sponsor>
<contract-sponsor id="cn002">Key Technologies Research and Development Program<named-content content-type="fundref-id">10.13039/501100012165</named-content>
</contract-sponsor>
<counts>
<fig-count count="9"/>
<table-count count="1"/>
<equation-count count="25"/>
<ref-count count="49"/>
<page-count count="14"/>
<word-count count="5801"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Sustainable and Intelligent Phytoprotection</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>The Yunnan tea-producing area is situated in a transitional zone between the tropical and subtropical regions. This region boasts an ample amount of rainfall, high temperatures, and a multitude of diverse landforms. These favorable conditions foster the growth and preservation of a wide array of resources, particularly the bountiful population of large-leaved tea trees (<xref ref-type="bibr" rid="B40">Yawen et&#xa0;al., 2001</xref>; <xref ref-type="bibr" rid="B1">Chen et&#xa0;al., 2005</xref>). However, it also creates favorable conditions for the growth and propagation of tea pests, and traditional pest monitoring and management methods were insufficient to meet the current demands of Yunnan tea gardens in terms of efficiency, coverage, and cost-effectiveness (<xref ref-type="bibr" rid="B42">Yunchao et&#xa0;al., 2023</xref>), resulting in the prevalence of multiple types and rapid proliferation of these pests. Additionally, this circumstance results in a reduction of both tea yield and quality (<xref ref-type="bibr" rid="B10">Hazarika et&#xa0;al., 2009</xref>). Therefore, there is an urgent need for intelligent and precise pest control in Yunnan&#x2019;s tea plantation management.</p>
<p>To achieve intelligent and precise pest prevention and control, the foremost challenge to address is the accurate identification and precise positioning of pests (<xref ref-type="bibr" rid="B33">Teske et&#xa0;al., 2019</xref>; <xref ref-type="bibr" rid="B31">Tang et&#xa0;al., 2023</xref>). The conventional target recognition algorithm primarily relies on analyzing the distribution attributes of pixels, such as color, texture, and edges within an image, to establish a comprehensive visual feature expression model. However, traditional image processing methods have limited capabilities in feature representation, only allowing for shallow vision expression. In addition, they suffer from issues such as poor generalization ability and lack of robustness, so their applicability in complex scenarios has been constrained (<xref ref-type="bibr" rid="B6">Fengyun et&#xa0;al., 2023</xref>), making it impossible to achieve rapid and accurate identification of tea pests (<xref ref-type="bibr" rid="B4">Cheng et&#xa0;al., 2017</xref>; <xref ref-type="bibr" rid="B17">Kasinathan and Uyyala, 2021</xref>).</p>
<p>In recent years, the field of pest identification has experienced significant advancements thanks to the rapid development of machine vision, deep learning, and related technologies. Consequently (<xref ref-type="bibr" rid="B14">Hill et&#xa0;al., 1994</xref>; <xref ref-type="bibr" rid="B18">Kriegeskorte and Golan, 2019</xref>), neural network models have become widely popular and accepted in this domain. Xu Lijia et&#xa0;al. optimized the YOLOX network model by introducing a lightweight feature extraction network and combining the high-efficiency channel attention mechanism. The established pest detection model of Papilionidae has a recognition rate of up to 95% (<xref ref-type="bibr" rid="B38">Xu et&#xa0;al., 2023</xref>). Gong He et&#xa0;al., based on Fully Convolutional Networks, introduced a new DenseNet framework of Efficient Channel Attention, and established a rice pest detection model with a recognition rate of 98.28% (<xref ref-type="bibr" rid="B11">Gong et&#xa0;al., 2023</xref>). Qiang Jun et&#xa0;al. used the improved SSD (Single Shot Multibox Detector) model of the dual backbone network to detect citrus pests with an accuracy of 86.01% (<xref ref-type="bibr" rid="B25">Qiang et&#xa0;al., 2023</xref>). Jia-Hsin Huang et&#xa0;al. implemented a termite classification system based on the deep learning model MobileNetV2, and the detection accuracy of soldiers and workers reached 94.7% and 94.6%, respectively. Despite the high accuracy demonstrated in the aforementioned research on pest identification, notable challenges persist, including the extensive computational requirements and associated costs (<xref ref-type="bibr" rid="B15">Huang et&#xa0;al.,&#xa0;2021</xref>). The existing pest identification mainly focuses on large-sized and easy-to-identify pests. Most of the current research on small pests still uses a large-area pest identification method. 
However, there are only small variations in appearance among different types of pests, such as <italic>Empoasca pirisuga Matumura</italic> and <italic>Arboridia apicalis</italic>. On the other hand, there are substantial differences in appearance between different growth stages of the same types of pests, for example, <italic>Toxoptera aurantia</italic> larvae and adults. Consequently, the recognition accuracy of tea micro-insects is quite low.</p>
<p>Based on the aforementioned issues, this study focuses on the identification of tea pests as the primary objective and enhances the existing model by incorporating the YOLOv7 network to achieve faster and more accurate detection (<xref ref-type="bibr" rid="B34">Wang et&#xa0;al., 2023</xref>). To enhance the efficiency of the calculation process and accelerate the convergence speed of the model, MPDIoU was utilized for optimizing the initial loss function (<xref ref-type="bibr" rid="B27">Siliang and Yong, 2023</xref>; <xref ref-type="bibr" rid="B37">Xing et&#xa0;al., 2023</xref>). Additionally, to maximize the model&#x2019;s efficiency by minimizing redundant features and reducing complexity and computational costs, we introduced Spatial and Channel Reconstruction Convolution. This method replaced a portion of the network structure in the original model (<xref ref-type="bibr" rid="B23">Ma et&#xa0;al., 2019</xref>; <xref ref-type="bibr" rid="B22">Liu et&#xa0;al., 2023</xref>). At the same time, vision transformer with Bi-Level Routing Attention was further added to make the model calculation allocation and content perception more flexible, so as to enhance the recognition efficiency of body-impaired pests (<xref ref-type="bibr" rid="B49">Zhu et&#xa0;al., 2023</xref>).</p>
</sec>
<sec id="s2" sec-type="materials|methods">
<label>2</label>
<title>Materials and methods</title>
<sec id="s2_1">
<label>2.1</label>
<title>Image acquisition</title>
<p>The images used in this study were collected at the Hekai base of Yuecheng Technology Co., Ltd., Menghai County, Xishuangbanna Prefecture, Yunnan Province (Latitude 21.5, Longitude 100.28). The image acquisition equipment is a macro lens with 200X magnification, a lens structure of 4 elements in four groups, multilayer coating, and a 5V/1A input. During the image acquisition stage, we employed additional measures to address the challenge of capturing small pests. In conjunction with collecting pest images on leaves, we pre-hung yellow pest boards on tea trees to effectively attract pests. When the insect board attracted a large number of pests, they were captured in photographs using a macro lens attached to a mobile device. To ensure accuracy in the recognition model, this study employed various mobile devices like the iPhone 14 Pro Max and Redmi K50 for data collection.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Image preprocessing</title>
<p>In the original images provided, we have classified images of four different pests: <italic>Empoasca pirisuga Matumura </italic>(<xref ref-type="bibr" rid="B41">Yin et&#xa0;al., 2021</xref>), <italic>Toxoptera aurantii </italic>(<xref ref-type="bibr" rid="B19">Li et&#xa0;al., 2019</xref>), <italic>Xyleborus fornicatus Eichhoff</italic> (<xref ref-type="bibr" rid="B28">Sivapalan, 1977</xref>), and <italic>Arboridia apicalis</italic> (<xref ref-type="bibr" rid="B48">Zhou et&#xa0;al., 2018</xref>). Among them, a set of high-quality images was selected as the initial dataset, including 112 images of <italic>Empoasca pirisuga Matumura</italic>, 115 images of <italic>Toxoptera aurantii</italic>, 92 images of <italic>Xyleborus fornicatus Eichhoff</italic>, and 98 images of <italic>Arboridia apicalis</italic>.</p>
<p>To address the problem of overfitting in the network caused by a limited number of training images, this study utilized image enhancement technology to augment the original data. By employing techniques like cropping (<xref ref-type="bibr" rid="B43">Zhang et&#xa0;al., 2005</xref>), rotation (<xref ref-type="bibr" rid="B30">Sun et&#xa0;al., 2019</xref>), local enlargement (<xref ref-type="bibr" rid="B32">Taniai et&#xa0;al., 2017</xref>), exposure adjustment (<xref ref-type="bibr" rid="B8">Graham-Bermann and Perkins, 2010</xref>), and adding Gaussian noise (<xref ref-type="bibr" rid="B24">Nataraj et&#xa0;al., 2009</xref>), the original dataset was expanded by a factor of 11, resulting in a total of 4,587 images. The specific operations conducted can be observed in <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref>. Subsequently, we deleted 501 low-quality images (insects accounting for less than 20% of the image, extremely blurred, etc.) that were generated during the image enhancement process. Finally, a total of 1,008 images of <italic>Empoasca pirisuga Matumura</italic>, 1,033 images of <italic>Toxoptera aurantii</italic>, 1,024 images of <italic>Xyleborus fornicatus Eichhoff</italic>, and 1,021 images of <italic>Arboridia apicalis</italic> were successfully obtained. These images served as the essential datasets utilized in the present study.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Image enhancement results.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1327237-g001.tif"/>
</fig>
<p>In this study, the Labeling tool was utilized to accurately label the images in the dataset. <italic>Empoasca pirisuga Matumura</italic> was assigned the label &#x201c;A,&#x201d; <italic>Toxoptera aurantii</italic> was assigned the label &#x201c;B,&#x201d; <italic>Xyleborus fornicatus Eichhoff</italic> was assigned the label &#x201c;C,&#x201d; and <italic>Arboridia apicalis</italic> was assigned the label &#x201c;D.&#x201d; After completing the annotation process, the TXT and XML files were generated. These files include the name and size of the pest, as well as the location information of the pest within the image. The image dataset was constructed as a training set, a test set and a verification set in a ratio of 6:2:2, and the specific division is shown in <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref>.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Dataset partitioning.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Pest name</th>
<th valign="middle" align="center">Training sets</th>
<th valign="middle" align="center">Testing sets</th>
<th valign="middle" align="center">Validation sets</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">
<italic>Empoasca pirisuga Matumura</italic>
</td>
<td valign="middle" align="center">605</td>
<td valign="middle" align="center">202</td>
<td valign="middle" align="center">201</td>
</tr>
<tr>
<td valign="middle" align="center">
<italic>Toxoptera aurantii</italic>
</td>
<td valign="middle" align="center">620</td>
<td valign="middle" align="center">207</td>
<td valign="middle" align="center">206</td>
</tr>
<tr>
<td valign="middle" align="center">
<italic>Xyleborus fornicatus Eichhoff</italic>
</td>
<td valign="middle" align="center">614</td>
<td valign="middle" align="center">205</td>
<td valign="middle" align="center">204</td>
</tr>
<tr>
<td valign="middle" align="center">
<italic>Arboridia apicalis</italic>
</td>
<td valign="middle" align="center">613</td>
<td valign="middle" align="center">204</td>
<td valign="middle" align="center">204</td>
</tr>
<tr>
<td valign="middle" align="center">Total</td>
<td valign="middle" align="center">2452</td>
<td valign="middle" align="center">818</td>
<td valign="middle" align="center">815</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Improvement of YOLOv7 algorithm</title>
<p>To enhance the convergence speed of the model, streamline the calculation process, diminish redundancy, decrease complexity, and minimize computational expense, the present study has made advancements to the YOLOv7 network. These improvements aim to facilitate greater flexibility in model calculation distribution and content perception. In this study, MPDIoU was used to optimize the original loss function and Spatial and Channel Reconstruction Convolution was used to replace part of the network structure of the original model, and vision transformer with Bi-Level Routing Attention was further added. The improved network structure is shown in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref>.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Improved YOLOv7 network structure diagram.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1327237-g002.tif"/>
</fig>
<sec id="s3_1">
<label>3.1</label>
<title>YOLOv7 network</title>
<p>YOLOv7 implemented a streamlined network architecture comprising Input (<xref ref-type="bibr" rid="B16">Jiang et&#xa0;al., 2022</xref>), Backbone, Neck, and Head components. This lightweight structure enables efficient and effective object detection and recognition. The Input layer plays a critical role in data preprocessing, encompassing various tasks such as data enhancement, image size scaling, and predefined candidate box size calculation. The Neck layer is a neck network that connects feature layers of different scales and performs feature fusion, while the Head layer is a head network, and the regression loss value is calculated by the loss function. The network effectively utilizes parameters and computational resources, resulting in decreased parameter count, improved inference speed, and heightened detection accuracy (<xref ref-type="bibr" rid="B5">Fan et&#xa0;al., 2023</xref>).</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Improvement of loss function</title>
<p>IoU (Intersection over Union) is a simple function to calculate the location loss (<xref ref-type="bibr" rid="B2">Cheng et&#xa0;al., 2021</xref>), and the overlap degree of the two bounding boxes is evaluated by calculating the intersection over union. Currently, several enhanced versions of the location loss calculation method have emerged, namely, GIoU (<xref ref-type="bibr" rid="B26">Rezatofighi et&#xa0;al., 2019</xref>), DIoU (<xref ref-type="bibr" rid="B46">Zheng et&#xa0;al., 2020</xref>), and CIoU (<xref ref-type="bibr" rid="B35">Wang and Song, 2021</xref>). The original YOLOv7 algorithm uses the CIoU function to calculate the positioning loss. The expression of CIoU is shown in <xref ref-type="disp-formula" rid="eq1">Equation (1)</xref>:</p>
<disp-formula id="eq1">
<label>(1)</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>O</mml:mi>
<mml:mi>S</mml:mi>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
<mml:mo>+</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mi mathvariant="italic">&#x3c1;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>b</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mi>c</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfrac>
<mml:mo>+</mml:mo>
<mml:mtext>&#x3b1;</mml:mtext>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im1">
<mml:mi>b</mml:mi>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im2">
<mml:mrow>
<mml:msup>
<mml:mi>b</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> are the predicted box and the ground truth box, <inline-formula>
<mml:math display="inline" id="im3">
<mml:mrow>
<mml:msup>
<mml:mi>&#x3c1;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>b</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> represents the Euclidean distance between the two, and <italic>c</italic> denotes the diagonal distance of the minimum closure region that can contain both the prediction box and the true box. <italic>V</italic> and &#x3b1; are the evaluation parameters and the balance factor of the length-width ratio, respectively. The formulas are shown in <xref ref-type="disp-formula" rid="eq2">Equations (2</xref>, <xref ref-type="disp-formula" rid="eq3">3)</xref>:</p>
<disp-formula id="eq2">
<label>(2)</label>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:mi>v</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>4</mml:mn>
<mml:mrow>
<mml:msup>
<mml:mtext>&#x3c0;</mml:mtext>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfrac>
<mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mi>w</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>a</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mfrac>
<mml:mi>w</mml:mi>
<mml:mi>h</mml:mi>
</mml:mfrac>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq3">
<label>(3)</label>
<mml:math display="block" id="M3">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Although CIoU considered the intersection area of the bounding box, the distance from the center point, and the aspect ratio of the bounding box, it used a relative measure of the length-width ratio instead of the real difference between width and height, which reduces the convergence speed of the model. Based on this, the study applies the latest MPDIoU loss function to enhance the original loss function. The structure of the improved loss function is illustrated in <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref>. To simultaneously address the regression of overlapping and non-overlapping bounding boxes, while considering the center point distance and the deviation of width and height, the authors adopted an approach called MPDIoU. This method utilizes a bounding box similarity measure based on the minimum point distance. By implementing this technique, the calculation process is simplified to a certain extent, the model&#x2019;s convergence speed is enhanced, and the regression results are more accurate. Its expression is shown in <xref ref-type="disp-formula" rid="eq4">Equations (4</xref>&#x2013;<xref ref-type="disp-formula" rid="eq7">7)</xref>:</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>MPDIoU structure diagram.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1327237-g003.tif"/>
</fig>
<disp-formula id="eq4">
<label>(4)</label>
<mml:math display="block" id="M4">
<mml:mrow>
<mml:msub>
<mml:mi>&#x2112;</mml:mi>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>D</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>M</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>D</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq5">
<label>(5)</label>
<mml:math display="block" id="M5">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>D</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mo>&#x2229;</mml:mo>
<mml:mi>B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mo>&#x222a;</mml:mo>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mi>d</mml:mi>
<mml:mn>1</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mi>w</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:msup>
<mml:mi>h</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mi>d</mml:mi>
<mml:mn>2</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mi>w</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:msup>
<mml:mi>h</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq6">
<label>(6)</label>
<mml:math display="block" id="M6">
<mml:mrow>
<mml:msubsup>
<mml:mi>d</mml:mi>
<mml:mn>1</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mn>1</mml:mn>
<mml:mi>B</mml:mi>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mn>1</mml:mn>
<mml:mi>A</mml:mi>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>y</mml:mi>
<mml:mn>1</mml:mn>
<mml:mi>B</mml:mi>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mi>y</mml:mi>
<mml:mn>1</mml:mn>
<mml:mi>A</mml:mi>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq7">
<label>(7)</label>
<mml:math display="block" id="M7">
<mml:mrow>
<mml:msubsup>
<mml:mi>d</mml:mi>
<mml:mn>2</mml:mn>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>B</mml:mi>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>A</mml:mi>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>y</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>B</mml:mi>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mi>y</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>A</mml:mi>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where A and B denote the prediction box and the true box, <inline-formula>
<mml:math display="inline" id="im4">
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mn>1</mml:mn>
<mml:mi>A</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>y</mml:mi>
<mml:mn>1</mml:mn>
<mml:mi>A</mml:mi>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mtext>and</mml:mtext>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>A</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>y</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>A</mml:mi>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denote the upper left and lower right corner coordinates of bounding box A, respectively. <inline-formula>
<mml:math display="inline" id="im5">
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mn>1</mml:mn>
<mml:mi>B</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>y</mml:mi>
<mml:mn>1</mml:mn>
<mml:mi>B</mml:mi>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im6">
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>B</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>y</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>B</mml:mi>
</mml:msubsup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> denote the upper left and lower right corner coordinates of bounding box B.</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Spatial and Channel Reconstruction Convolution</title>
<p>In order to diminish redundant features and reduce the complexity and computational cost of the model, this study implemented Spatial and Channel Reconstruction Convolution to replace a portion of the original YOLOv7 network structure. The Spatial and Channel Reconstruction Convolution consists of two components, SRU (Spatial Reconstruction Unit) and CRU (Channel Reconstruction Unit) (<xref ref-type="bibr" rid="B20">Li et&#xa0;al., 2023</xref>). The core of SRU is to suppress the spatial redundancy of feature map by means of separation&#x2013;reconstruction, while CRU further reduces the channel redundancy of feature map by means of segmentation&#x2013;conversion&#x2013;fusion.</p>
<p>The structure of Spatial and Channel Reconstruction Convolution, SRU, and CRU is shown in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref>. For the input feature map, the Spatial and Channel Reconstruction Convolution first adjusts the number of channels through the convolution of 1 <inline-formula>
<mml:math display="inline" id="im7">
<mml:mo>&#xd7;</mml:mo>
</mml:math>
</inline-formula> 1 and then uses SRU to operate the intermediate input features in the bottleneck residual block to generate spatial refinement features. Next, CRU is used to operate the spatial refinement features to generate channel refinement features. Finally, the number of channels in the feature map is restored by a 1 <inline-formula>
<mml:math display="inline" id="im8">
<mml:mo>&#xd7;</mml:mo>
</mml:math>
</inline-formula> 1 convolution and the residual operation is performed.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Spatial and Channel Reconstruction Convolution overall structure diagram.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1327237-g004.tif"/>
</fig>
<p>The separation operation of SRU primarily utilizes the scaling factor of Group Normalization to assess the information content of the feature map (<xref ref-type="bibr" rid="B36">Wu and He, 2018</xref>). This allows for improved separation of feature maps with varying levels of information, ensuring the retention of feature maps with rich information and filtering out those with lesser information. Its expression is shown in <xref ref-type="disp-formula" rid="eq8">Equation (8)</xref>. The reconstruction operation is founded on the cross-reconstruction technique, which aims to merge the informative and less informative features. This is accomplished by enhancing the information flow between the two, resulting in the generation of more comprehensive information features while conserving space. Its expression is shown in <xref ref-type="disp-formula" rid="eq9">Equation (9)</xref>.</p>
<disp-formula id="eq8">
<label>(8)</label>
<mml:math display="block" id="M8">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>G</mml:mi>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>X</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mi>&#x3b3;</mml:mi>
<mml:mfrac>
<mml:mrow>
<mml:mi>X</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mtext>&#x3bc;</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:msup>
<mml:mtext>&#x3c3;</mml:mtext>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:mtext>&#x3f5;</mml:mtext>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
<mml:mo>+</mml:mo>
<mml:mtext>&#x3b2;</mml:mtext>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq9">
<label>(9)</label>
<mml:math display="block" id="M9">
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mrow>
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msubsup>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
<mml:mi>w</mml:mi>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2297;</mml:mo>
<mml:mi>X</mml:mi>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msubsup>
<mml:mi>X</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>w</mml:mi>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x2297;</mml:mo>
<mml:mi>X</mml:mi>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:msubsup>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mn>11</mml:mn>
</mml:mrow>
<mml:mi>w</mml:mi>
</mml:msubsup>
<mml:mo>&#x2295;</mml:mo>
<mml:msubsup>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mn>22</mml:mn>
</mml:mrow>
<mml:mi>w</mml:mi>
</mml:msubsup>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>=</mml:mo>
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:msubsup>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mn>21</mml:mn>
</mml:mrow>
<mml:mi>w</mml:mi>
</mml:msubsup>
<mml:mo>&#x2295;</mml:mo>
<mml:msubsup>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mn>12</mml:mn>
</mml:mrow>
<mml:mi>w</mml:mi>
</mml:msubsup>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>=</mml:mo>
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x222a;</mml:mo>
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mi>w</mml:mi>
</mml:msup>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Among them, <inline-formula>
<mml:math display="inline" id="im9">
<mml:mo>&#x2297;</mml:mo>
</mml:math>
</inline-formula> represents element-by-element multiplication, <inline-formula>
<mml:math display="inline" id="im10">
<mml:mo>&#x2295;</mml:mo>
</mml:math>
</inline-formula> represents element-by-element addition, <inline-formula>
<mml:math display="inline" id="im11">
<mml:mrow>
<mml:mo>&#x222a;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> represents splicing, <inline-formula>
<mml:math display="inline" id="im12">
<mml:mtext>&#x3bc;</mml:mtext>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im13">
<mml:mtext>&#x3c3;</mml:mtext>
</mml:math>
</inline-formula> are the mean and standard deviation of X, respectively. <italic>&#x3b5;</italic> is a small positive number added to stabilize division. <inline-formula>
<mml:math display="inline" id="im14">
<mml:mtext>&#x3b3;</mml:mtext>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im15">
<mml:mtext>&#x3b2;</mml:mtext>
</mml:math>
</inline-formula> are trainable affine transformations, <inline-formula>
<mml:math display="inline" id="im16">
<mml:mi>W</mml:mi>
</mml:math>
</inline-formula> is the weight value of the feature map, <inline-formula>
<mml:math display="inline" id="im17">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the weight with rich information, and <inline-formula>
<mml:math display="inline" id="im18">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the weight with not rich information.</p>
<p>The Split operation of CRU is to improve the computational efficiency of the model by dividing the spatial refinement features generated by SRU into two parts: <inline-formula>
<mml:math display="inline" id="im19">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im20">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and using 1 <inline-formula>
<mml:math display="inline" id="im21">
<mml:mo>&#xd7;</mml:mo>
</mml:math>
</inline-formula> 1 convolution to compress them respectively. The Transform operation uses different convolutions to extract the features of <inline-formula>
<mml:math display="inline" id="im22">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im23">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> obtained by the segmentation operation, so as to obtain two sets of feature maps with different information richness. The expressions are shown in <xref ref-type="disp-formula" rid="eq10">Equations (10</xref>&#x2013;<xref ref-type="disp-formula" rid="eq11">11)</xref>. The fusion operation is to extract the spatial channel information of the feature maps <inline-formula>
<mml:math display="inline" id="im24">
<mml:mrow>
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im25">
<mml:mrow>
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> by Pooling, and merge the features <inline-formula>
<mml:math display="inline" id="im26">
<mml:mrow>
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im27">
<mml:mrow>
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in the form of channels to generate Channel-Refined Feature Y. Its expression is shown as <xref ref-type="disp-formula" rid="eq12">Equations (12</xref>&#x2013;<xref ref-type="disp-formula" rid="eq14">14)</xref>.</p>
<disp-formula id="eq10">
<label>(10)</label>
<mml:math display="block" id="M10">
<mml:mrow>
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msup>
<mml:mi>M</mml:mi>
<mml:mi>G</mml:mi>
</mml:msup>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msup>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msup>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq11">
<label>(11)</label>
<mml:math display="block" id="M11">
<mml:mrow>
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msup>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msup>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x222a;</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq12">
<label>(12)</label>
<mml:math display="block" id="M12">
<mml:mrow>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mi>m</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>P</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>g</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mi>m</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>H</mml:mi>
</mml:munderover>
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>W</mml:mi>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq13">
<label>(13)</label>
<mml:math display="block" id="M13">
<mml:mrow>
<mml:msub>
<mml:mtext>&#x3b2;</mml:mtext>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mtext>&#x3b2;</mml:mtext>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mtext>&#x3b2;</mml:mtext>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mtext>&#x3b2;</mml:mtext>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq14">
<label>(14)</label>
<mml:math display="block" id="M14">
<mml:mrow>
<mml:mi>Y</mml:mi>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mtext>&#x3b2;</mml:mtext>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mtext>&#x3b2;</mml:mtext>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>Y</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Among them, <inline-formula>
<mml:math display="inline" id="im28">
<mml:mrow>
<mml:msup>
<mml:mi>M</mml:mi>
<mml:mi>G</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math display="inline" id="im29">
<mml:mrow>
<mml:msup>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im30">
<mml:mrow>
<mml:msup>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> are learnable weight matrices in convolution operations, and <inline-formula>
<mml:math display="inline" id="im31">
<mml:mrow>
<mml:msub>
<mml:mtext>&#x3b2;</mml:mtext>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im32">
<mml:mrow>
<mml:msub>
<mml:mtext>&#x3b2;</mml:mtext>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are feature importance vectors.</p>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Vision transformer with Bi-Level Routing Attention</title>
<p>Attention is a fundamental element of the visual converter and a crucial tool for capturing long-term dependencies (<xref ref-type="bibr" rid="B9">Han et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B47">Zhou et&#xa0;al., 2021</xref>). In this study, it was observed that YOLOv7, when employed for pest recognition training, did not exhibit satisfactory performance in identifying images of body-impaired pests. Therefore, this study has enhanced the YOLOv7 network by incorporating vision transformer with Bi-Level Routing Attention. This integration has aimed to facilitate better computing allocation and enhance content perception, resulting in improved flexibility. The image has been divided into S <inline-formula>
<mml:math display="inline" id="im33">
<mml:mo>&#xd7;</mml:mo>
</mml:math>
</inline-formula> S non-overlapping regions by vision transformer with Bi-Level Routing Attention, and the region-level features have been calculated by average pooling. Then,&#xa0;perform coarse-grained region-level routing to calculate and retrieve the inter-region affinity. Next, perform public key normalization and aggregate the tensor of key-value pairs. Finally, during the collection and dispersion of key-value pairs, perform fine-grained token-to-token attention calculation, and the structure is depicted in <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref>.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Vision transformer with Bi-Level Routing Attention Structure Diagram.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1327237-g005.tif"/>
</fig>
<p>After the pest image is divided into S <inline-formula>
<mml:math display="inline" id="im34">
<mml:mo>&#xd7;</mml:mo>
</mml:math>
</inline-formula> S non-overlapping regions, the feature vector contained in each region is <inline-formula>
<mml:math display="inline" id="im35">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mi>S</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula>. Here, H is the height of the original image, W is the width of the original image, and Q, K, V are obtained by linear mapping of the feature vectors. Its expression is as shown in <xref ref-type="disp-formula" rid="eq15">Equation (15)</xref>, where <inline-formula>
<mml:math display="inline" id="im36">
<mml:mrow>
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mi>r</mml:mi>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211d;</mml:mi>
<mml:mrow>
<mml:msup>
<mml:mi>S</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mi>S</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math display="inline" id="im37">
<mml:mrow>
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mi>r</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> denotes the input image after segmentation, <inline-formula>
<mml:math display="inline" id="im38">
<mml:mrow>
<mml:msup>
<mml:mi>W</mml:mi>
<mml:mi>q</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math display="inline" id="im39">
<mml:mrow>
<mml:msup>
<mml:mi>W</mml:mi>
<mml:mi>k</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula>
<mml:math display="inline" id="im40">
<mml:mrow>
<mml:msup>
<mml:mi>W</mml:mi>
<mml:mi>v</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> denote the weight projection of query, key, and value, respectively. The region-level features are calculated by average pooling, and the average value of each region is calculated. <inline-formula>
<mml:math display="inline" id="im41">
<mml:mrow>
<mml:msup>
<mml:mi>Q</mml:mi>
<mml:mi>r</mml:mi>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>K</mml:mi>
<mml:mi>r</mml:mi>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211d;</mml:mi>
<mml:mrow>
<mml:msup>
<mml:mi>S</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, and the adjacency matrix of the inter-regional correlation between <inline-formula>
<mml:math display="inline" id="im42">
<mml:mrow>
<mml:msup>
<mml:mi>Q</mml:mi>
<mml:mi>r</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im43">
<mml:mrow>
<mml:msup>
<mml:mi>K</mml:mi>
<mml:mi>r</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is calculated. The expression is shown in <xref ref-type="disp-formula" rid="eq16">Equation (16)</xref>, where <inline-formula>
<mml:math display="inline" id="im44">
<mml:mrow>
<mml:msup>
<mml:mi>A</mml:mi>
<mml:mi>r</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> represents the adjacency matrix of the correlation, <inline-formula>
<mml:math display="inline" id="im45">
<mml:mrow>
<mml:msup>
<mml:mi>Q</mml:mi>
<mml:mi>r</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> represents the region-level query, <inline-formula>
<mml:math display="inline" id="im46">
<mml:mrow>
<mml:msup>
<mml:mi>K</mml:mi>
<mml:mi>r</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> represents the region-level key, and <inline-formula>
<mml:math display="inline" id="im47">
<mml:mi>T</mml:mi>
</mml:math>
</inline-formula> represents the transpose operation. The&#xa0;coarse-grained region-level routing calculation uses the routing index matrix <inline-formula>
<mml:math display="inline" id="im48">
<mml:mrow>
<mml:msup>
<mml:mi>I</mml:mi>
<mml:mi>r</mml:mi>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x2115;</mml:mi>
<mml:mrow>
<mml:msup>
<mml:mi>S</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> to save the index of the first k links row by row, so that only the first <italic>k</italic> connections of each region are used when pruning the correlation graph. The expression is shown in <xref ref-type="disp-formula" rid="eq17">Equation (17)</xref>. The public key normalization operation is to aggregate the tensors of key and value, and the aggregation formula is shown in <xref ref-type="disp-formula" rid="eq18">Equations (18</xref>, <xref ref-type="disp-formula" rid="eq19">19)</xref>. Among them, <inline-formula>
<mml:math display="inline" id="im49">
<mml:mrow>
<mml:msup>
<mml:mi>K</mml:mi>
<mml:mi>g</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> represents the tensor after the key aggregation, <inline-formula>
<mml:math display="inline" id="im50">
<mml:mi>K</mml:mi>
</mml:math>
</inline-formula> represents the key, <inline-formula>
<mml:math display="inline" id="im51">
<mml:mrow>
<mml:msup>
<mml:mi>I</mml:mi>
<mml:mi>r</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> represents the routing index matrix, <inline-formula>
<mml:math display="inline" id="im52">
<mml:mrow>
<mml:msup>
<mml:mi>V</mml:mi>
<mml:mi>g</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> represents the tensor after the value aggregation, and <inline-formula>
<mml:math display="inline" id="im53">
<mml:mi>V</mml:mi>
</mml:math>
</inline-formula> represents the value. Collecting the scattered key-value pairs is to use the attention operation on the aggregated K&#x2013;V pairs to perform fine-grained token-to-token attention calculation, and its expression is shown in <xref ref-type="disp-formula" rid="eq20">Equation (20)</xref>. Here, O represents fine-grained token-to-token attention, and LCE (V) represents local context enhancement.</p>
<disp-formula id="eq15">
<label>(15)</label>
<mml:math display="block" id="M15">
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:mo>=</mml:mo>
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mi>r</mml:mi>
</mml:msup>
<mml:msup>
<mml:mi>W</mml:mi>
<mml:mi>q</mml:mi>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mi>K</mml:mi>
<mml:mo>=</mml:mo>
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mi>r</mml:mi>
</mml:msup>
<mml:msup>
<mml:mi>W</mml:mi>
<mml:mi>k</mml:mi>
</mml:msup>
<mml:mo>,</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mo>=</mml:mo>
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mi>r</mml:mi>
</mml:msup>
<mml:msup>
<mml:mi>W</mml:mi>
<mml:mi>v</mml:mi>
</mml:msup>
<mml:mo>&#xa0;</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq16">
<label>(16)</label>
<mml:math display="block" id="M16">
<mml:mrow>
<mml:msup>
<mml:mi>A</mml:mi>
<mml:mi>r</mml:mi>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:msup>
<mml:mi>Q</mml:mi>
<mml:mi>r</mml:mi>
</mml:msup>
<mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi>K</mml:mi>
<mml:mi>r</mml:mi>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mi>T</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq17">
<label>(17)</label>
<mml:math display="block" id="M17">
<mml:mrow>
<mml:msup>
<mml:mi>I</mml:mi>
<mml:mi>r</mml:mi>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:mi>t</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>k</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi>A</mml:mi>
<mml:mi>r</mml:mi>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq18">
<label>(18)</label>
<mml:math display="block" id="M18">
<mml:mrow>
<mml:msup>
<mml:mi>K</mml:mi>
<mml:mi>g</mml:mi>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:mi>g</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>I</mml:mi>
<mml:mi>r</mml:mi>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq19">
<label>(19)</label>
<mml:math display="block" id="M19">
<mml:mrow>
<mml:msup>
<mml:mi>V</mml:mi>
<mml:mi>g</mml:mi>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:mi>g</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>I</mml:mi>
<mml:mi>r</mml:mi>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq20">
<label>(20)</label>
<mml:math display="block" id="M20">
<mml:mrow>
<mml:mi>O</mml:mi>
<mml:mo>=</mml:mo>
<mml:mi>A</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>K</mml:mi>
<mml:mi>g</mml:mi>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>V</mml:mi>
<mml:mi>g</mml:mi>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:mi>L</mml:mi>
<mml:mi>C</mml:mi>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>V</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Model training and result analysis</title>
<p>To assess the detection capabilities of the enhanced YOLOv7 algorithm on microscopic tea pests, this study established three groups of comparative experiments. Four networks, namely, improved YOLOv7, original YOLOv7, faster-RCNN (<xref ref-type="bibr" rid="B3">Cheng&#xa0;et&#xa0;al.,&#xa0;2018</xref>), and SSD (<xref ref-type="bibr" rid="B21">Liu et&#xa0;al., 2016</xref>), were employed to train and evaluate the model using various datasets. To ensure the scientific rigor of the model test results, the hardware equipment and software environment employed in this study are identical. The model was trained using the Windows 11 operating system. The running host was configured with a 12th Gen Intel (R) Core (TM) i7-12700H 2.30 GHz processor, 512 GB solid-state drive and NVIDIA GeForce RTX 3070 laptop GPU graphics card, 16 GB RAM, NVIDIA 528.24 driver, CUDA 1.3.1 version, and network development was performed using Python 3.7 and PyCharm 2017.</p>
<sec id="s4_1">
<label>4.1</label>
<title>Training results and analysis</title>
<p>The loss function serves as an indicator for quantifying the disparity between the predicted and actual outcomes of a model (<xref ref-type="bibr" rid="B44">Zhao et&#xa0;al., 2015</xref>; <xref ref-type="bibr" rid="B45">Zhao et&#xa0;al., 2016</xref>). It is of paramount importance as it enables evaluation of the model&#x2019;s performance. The lower the loss function value is, the closer the model prediction result is to the actual result, and the better the model performance is. As depicted in <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6</bold>
</xref>, it can be observed that the gradient descent rate of the loss function was significantly accelerated during the initial phase of model training in the improved YOLOv7 model. However, as the training progresses to the 100th round, the rate at which the loss function decreased started to slow down considerably. Additionally, the curve exhibited a distinct oscillation pattern, becoming notably prominent. As the training progressed, the curve observed a gradual stabilization phase after 200 rounds. Moreover, the loss function started to converge, resulting in the final total loss stabilizing below 3.4%. By comparing the loss function change curves between the original YOLOv7 and the improved version, we could observe a considerable decrease in the prediction box position loss, prediction box confidence loss, and classification loss in the improved YOLOv7. Among them, the position loss of the prediction box decreased most significantly, with a decrease of more than 15% on the training set and the test set.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Loss function curve change.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1327237-g006.tif"/>
</fig>
<p>In order to comprehensively evaluate the detection accuracy of the enhanced model, this study incorporated several evaluation metrics including Precision (<xref ref-type="bibr" rid="B29">Streiner and Norman, 2006</xref>), Recall (<xref ref-type="bibr" rid="B7">Gillund and Shiffrin, 1984</xref>), F1 (<xref ref-type="bibr" rid="B39">Yacouby and Axman, 2020</xref>), AP (average precision) (<xref ref-type="bibr" rid="B12">He et&#xa0;al., 2018</xref>), and mAP (mean average precision) (<xref ref-type="bibr" rid="B13">Henderson and Ferrari, 2017</xref>). The corresponding expressions are presented as <xref ref-type="disp-formula" rid="eq21">Equations (21</xref>&#x2013;<xref ref-type="disp-formula" rid="eq25">25)</xref>.</p>
<disp-formula id="eq21">
<label>(21)</label>
<mml:math display="block" id="M21">
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq22">
<label>(22)</label>
<mml:math display="block" id="M22">
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq23">
<label>(23)</label>
<mml:math display="block" id="M23">
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>=</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq24">
<label>(24)</label>
<mml:math display="block" id="M24">
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:mi>A</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>=</mml:mo>
<mml:mstyle displaystyle="true">
<mml:msubsup>
<mml:mo>&#x222b;</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mstyle>
<mml:mo>&#xa0;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq25">
<label>(25)</label>
<mml:math display="block" id="M25">
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:mi>m</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:msubsup>
<mml:mi>A</mml:mi>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mi>C</mml:mi>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Among them, <inline-formula>
<mml:math display="inline" id="im56">
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the number of correct recognition, <inline-formula>
<mml:math display="inline" id="im57">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the number of recognition errors, <inline-formula>
<mml:math display="inline" id="im58">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the number of undetected, and <italic>C</italic> is the number of detected categories.</p>
<p>From a predictive standpoint, accuracy serves as a statistical indicator. It represents the proportion of samples that are correctly classified, that is, they are predicted to belong to a certain classification and indeed do. The recall rate is a vital indicator that measures the model&#x2019;s proficiency in accurately retrieving samples from the entire set of classifications. The balanced score is derived from a comprehensive evaluation of both accuracy and recall rate, combining them through the use of harmonic average. As shown in <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7</bold>
</xref>, compared with the original YOLOv7 model, the improved YOLOv7 in this study made significant progress in the detection effect. After improvement, the Precision metric exhibited an increase of 5.68%, while the Recall metric showed an increase of 5.14%. Additionally, the F1 metric witnessed an increase of 5.41%.</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Precision, Recall, F1 Score Curve. Light blue represents <italic>Empoasca pirisuga Matsumura</italic>, orange represents <italic>Toxoptera aurantia</italic>, green represents <italic>Xyleborus fornicatus</italic>, red represents <italic>Arboridia apicalis</italic>, and dark blue represents all types of pests.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1327237-g007.tif"/>
</fig>
<p>AP is a widely employed metric for evaluating positioning accuracy and prediction accuracy. The AP value is determined based on the Precision and Recall of the model. By drawing the PR curve, Precision is set as the horizontal axis, and Recall is set as the vertical axis. The AP value can be obtained by measuring the area under the PR curve, and mAP is the average value of all kinds of AP. According to <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref>, the improved model utilized in this study demonstrated advancements in recognizing <italic>Empoasca pirisuga Matsumura</italic> when compared to the original YOLOv7, faster RCNN, and SSD. Specifically, there was a notable improvement of 2.26% when compared to the original YOLOv7, a significant enhancement of 9.23% as compared to faster RCNN, and a substantial progress of 5.68% in contrast to SSD. In terms of <italic>Toxoptera aurantii</italic> identification, the AP improvement was 2.72%, 9.4%, and 5.63%, respectively. For the identification of <italic>Xyleborus fornicatus Eichhoff</italic>, the AP improvement was 2.07%, 9.34%, and 7.93%, respectively. For the identification of <italic>Arboridia apicalis</italic>, there was an increase in AP of 3.26%, 10.27%, and 8.04%, respectively. The final mAP increases were 2.58%, 9.26%, and 6.82%, respectively.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Different model AP and mAP comparison.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1327237-g008.tif"/>
</fig>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Model detection experiment</title>
<p>In this study, the improved model&#x2019;s advantages were further verified through the detection and identification of <italic>Empoasca pirisuga Matsumura</italic>, <italic>Toxoptera aurantii, Xyleborus fornicatus Eichhoff</italic>, and <italic>Arboridia apicalis</italic> pest images with single-target and multi-target limb impairments, under varying light intensities. In order to guarantee the reliability of the results, the external verification sets used in the training and testing of the improved YOLOv7, YOLOv7, faster RCNN, and SSD networks were the same, and the training platform configuration was also consistent. The final comparison results are shown in <xref ref-type="fig" rid="f9">
<bold>Figure&#xa0;9</bold>
</xref>. A represents <italic>Empoasca pirisuga Matsumura</italic>, B represents <italic>Toxoptera aurantii</italic>, C represents <italic>Xyleborus fornicatus Eichhoff</italic>, and D represents <italic>Arboridia apicalis</italic>.</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>Different model detection results comparison.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-15-1327237-g009.tif"/>
</fig>
<p>The experimental results showed that the model tested in this study could successfully detect single targets and multiple targets when the pest&#x2019;s body in the detection image was complete, and there was sufficient lighting. Notably, the improved YOLOv7 exhibited the highest confidence in its detection results, while the Faster-RCNN showed the lowest confidence. Moreover, the improved YOLOv7 exhibited an average confidence increase of over 2% when compared to the original YOLOv7. When the insect&#x2019;s body in the image remained undamaged but the light intensity was low, both the improved YOLOv7 and the original YOLOv7 algorithms could still produce detection results with the highest confidence. However, the average confidence level of the improved YOLOv7 model was considerably lower compared to the original YOLOv7. When the degree of physical disability of the detected pest was less than 50%, the tested model could still perform single-target and multi-target detection, but the confidence level was significantly reduced; among them, the improved YOLOv7 maintained the highest detection confidence; compared to the original YOLOv7, the confidence was augmented by 7.8%. When the degree of physical disability of the detected pests was greater than 50%, the improved YOLOv7 was still capable of detecting targets and had high detection confidence, while other models except improved YOLOv7 exhibited significant omission and recognition errors.</p>
<p>In the external verification of the model, the improved YOLOv7 showed significant advancements compared to the original YOLOv7, faster RCNN, and SSD. The improved YOLOv7 achieved an increase in frames per second by 5.75 Hz, 34.42 Hz, and 25.44 Hz, respectively, compared to the other models. Additionally, the mAP in actual detection improved by 2.49%, 12.26%, and 7.26%, respectively. Furthermore, the improved YOLOv7 managed to reduce the parameters by 1.39 G, building upon the foundation of the original YOLOv7. After conducting a comprehensive comparison, it was evident that the enhanced YOLOv7 utilized in this study surpassed the original YOLOv7 in terms of both detection accuracy and speed. Consequently, this improvement made it more advantageous for deploying the latter model on mobile terminals.</p>
</sec>
</sec>
<sec id="s5" sec-type="discussion">
<label>5</label>
<title>Discussion</title>
<sec id="s5_1">
<label>5.1</label>
<title>Effect of loss function improvement on YOLOv7 network</title>
<p>The loss function in machine learning plays a crucial role in evaluating the discrepancy between the predicted value and the actual value. An enhanced loss function can effectively enhance the precision and robustness of the model, subsequently influencing the&#xa0;training and detection performance of the YOLOv7 network. The MPDIoU employed a bounding box similarity measurement that builds upon the minimum point distance concept, thereby yielding a faster convergence speed in comparison to the CIoU within the YOLOv7 network. This approach not only simplified the calculation process to a certain degree but also improved the model&#x2019;s convergence speed while producing more accurate regression results.</p>
</sec>
<sec id="s5_2">
<label>5.2</label>
<title>The impact of Spatial and Channel reconstruction Convolution on YOLOv7 network</title>
<p>Currently, existing deep learning algorithms used for tea pest identification suffer from issues of complexity and high computational cost, leading to an abundance of redundant features. However, through the implementation of the Spatial and Channel Reconstruction Convolution, these redundant features within the feature map can be effectively mitigated. This can be achieved through the utilization of two key components: the SRU and the CRU. By incorporating these components, the complexity and computational cost of the model can be significantly reduced. Notably, this study successfully diminishes the complexity and computational expenses of the YOLOv7 network model by introducing the Spatial and Channel Reconstruction Convolution. This development holds immense importance for future implementation on mobile devices.</p>
</sec>
<sec id="s5_3">
<label>5.3</label>
<title>The impact of vision transformer with Bi-Level Routing Attention on YOLOv7 network</title>
<p>The incomplete limbs lead to the loss of crucial information about the target pests, hindering the deep learning model from obtaining a complete understanding of the pest characteristics and resulting in recognition errors and omissions. In this study, we found that the vision transformer with Bi-Level Routing Attention offered a superior recognition effect on limb-impaired pests. Additionally, it provided more flexible allocation of computational resources and improved content perception. Moreover, the memory occupancy rate and computation requirements were lower compared to the traditional self-attention mechanism. The inclusion of vision transformer with Bi-Level Routing Attention in this study significantly enhanced the confidence in assessing the degree of physical disability among detected pests, regardless of whether it was below or above 50%.</p>
<p>Although the visual recognition algorithm of this study can accurately identify tea pests, the collected area during the data acquisition process is relatively small, consisting of samples from only one base in Menghai County, Xishuangbanna Dai Autonomous Prefecture, Yunnan Province. Additionally, due to the diverse climate in Yunnan Province, the appearance of tea pests may vary. Therefore, in the future, our team will further expand the collection, no longer limited to one location, and collect pest data from different periods and more types to construct a network model with a wider applicability. In future work, we will also further train and deploy the improved YOLOv7 network model on edge devices and apply it to the production and management of Yunnan tea gardens, enabling accurate and fast identification and treatment of tea pests.</p>
</sec>
</sec>
<sec id="s6" sec-type="conclusions">
<label>6</label>
<title>Conclusion</title>
<p>This study achieved further optimization of the original loss function by employing MPDIou, which accelerated the convergence speed of the model, simplified the computational process, and improved the regression accuracy. The replacement of certain network structures with Spatial and Channel reconstruction Convolution reduced the redundant features of the model, decreased its complexity, and computational cost. The incorporation of vision transformer with Bi-Level Routing Attention enabled more flexible computational allocation and content awareness. The experimental results demonstrated that the improved YOLOv7 network performed well on the tea pest dataset.</p>
<p>The final total loss of the improved YOLOv7 network stabilized below 3.4%, a decrease of 0.8% compared to the original YOLOv7 network. Furthermore, the improved YOLOv7 model exhibited significant decreases in bounding box position loss, bounding box confidence loss, and classification loss, with the most remarkable decrease in bounding box position loss, which exceeded 15% on both the training and testing sets. Compared to the original YOLOv7 model, the improved YOLOv7 in this study showed significant progress in detection effectiveness, with a precision improvement of 5.68%, recall improvement of 5.14%, F1 improvement of 5.41%, and ultimately an mAP improvement of 2.58%. Additionally, when detecting limb-deficient pests, the improved YOLOv7 model still maintained higher detection accuracy and confidence compared to traditional deep learning models such as YOLOv7, faster RCNN, and SSD.</p>
<p>This study provided a feasible research method and important reference for addressing key issues in tea pest recognition, such as small datasets and difficulty in extracting pest features.</p>
</sec>
<sec id="s7" sec-type="data-availability">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec id="s8" sec-type="ethics-statement">
<title>Ethics statement</title>
<p>The manuscript presents research on animals that do not require ethical approval for their study.</p>
</sec>
<sec id="s9" sec-type="author-contributions">
<title>Author contributions</title>
<p>JH: Conceptualization, Validation, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. SZ: Methodology, Software, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. CY: Investigation, Methodology, Writing &#x2013; review &amp; editing. HW: Data curation, Validation, Writing &#x2013; review &amp; editing. JG: Methodology, Writing &#x2013; review &amp; editing. WH: Project administration, Writing &#x2013; review &amp; editing. QW: Investigation, Writing &#x2013; review &amp; editing. XW: Resources, Visualization, Writing &#x2013; review &amp; editing. WY: Supervision, Writing &#x2013; review &amp; editing. YW: Investigation, Writing &#x2013; review &amp; editing. L.L: Investigation, Writing &#x2013; review &amp; editing. JX: Investigation, Writing &#x2013; review &amp; editing. ZW: Data curation, Writing &#x2013; review &amp; editing. RZ: Validation, Writing &#x2013; review &amp; editing. BW: Funding acquisition, Project administration, Resources, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing.</p>
</sec>
</body>
<back>
<sec id="s10" sec-type="funding-information">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. This research was funded by Development and demonstration of intelligent agricultural data sensing technology and equipment in plateau mountainous areas (202302AE09002001), Study on the screening mechanism of phenotypic plasticity characteristics of large-leaf tea plants in Yunnan driven by AI based on data fusion (202301AS070083), Integration and Demonstration of Key Technologies for Improving Quality and Efficiency of the Tea Industry in Lvchun County under the National Key R&amp;D Project (2022YFD1601803), Yunnan Menghai County Smart Tea Industry Science and Technology Mission (202304Bl090013) and National Natural Science Foundation (32060702).</p>
</sec>
<sec id="s11" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s12" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Xia</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Pei</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2005</year>). <article-title>Genetic diversity and differentiation of camellia sinensis l.(Cultivated tea) and its wild relatives in yunnan province of China, revealed by morphology, biochemistry and allozyme studies</article-title>. <source>Genet. Resour. Crop Evol.</source> <volume>52</volume>, <fpage>41</fpage>&#x2013;<lpage>52</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s10722-005-0285-1</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Cheng</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Girshick</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Doll&#xe1;r</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Berg</surname> <given-names>A. C.</given-names>
</name>
<name>
<surname>Kirillov</surname> <given-names>A.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Boundary iou: improving object-centric image segmentation evaluation</article-title>,&#x201d; in <conf-name>2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>Nashville, TN, USA</conf-loc>. pp. <fpage>15329</fpage>&#x2013;<lpage>15337</lpage>. doi: <pub-id pub-id-type="doi">10.1109/CVPR46437.2021.01508</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Cheng</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Wei</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Shi</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Feris</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Xiong</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>T.</given-names>
</name>
<etal/>
</person-group>. (<year>2018</year>). &#x201c;<article-title>Revisiting rcnn: on awakening the classification power of faster rcnn</article-title>,&#x201d; In: <person-group person-group-type="editor">
<name>
<surname>Ferrari</surname> <given-names>V.</given-names>
</name>
<name>
<surname>Hebert</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Sminchisescu</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Weiss</surname> <given-names>Y.</given-names>
</name>
</person-group> (eds) <source>Computer Vision &#x2013; ECCV 2018</source>. <publisher-loc>Springer, Cham</publisher-loc>: ECCV 2018. Lecture Notes in Computer Science, vol <volume>11219</volume>. doi: <pub-id pub-id-type="doi">10.1007/978-3-030-01267-0_28</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cheng</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Yue</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Pest identification via deep residual learning in complex background</article-title>. <source>Comput. Electron. Agric.</source> <volume>141</volume>, <fpage>351</fpage>&#x2013;<lpage>356</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2017.08.005</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fan</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Jinhui</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Yunqi</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Shaojun</surname> <given-names>Q.</given-names>
</name>
<name>
<surname>Yunchao</surname> <given-names>T.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Transforming unmanned pineapple picking with spatio-temporal convolutional neural networks</article-title>. <source>Comput. Electron. Agric.</source> <volume>214</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2023.108298</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fengyun</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Zhou</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Xingkang</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Zihao</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Wei</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Jieli</surname> <given-names>D.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). <article-title>Detection and counting of banana bunches by integrating deep learning and classic image-processing algorithms</article-title>. <source>Comput. Electron. Agric.</source> <volume>209</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2023.107827</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gillund</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Shiffrin</surname> <given-names>R. M.</given-names>
</name>
</person-group> (<year>1984</year>). <article-title>A retrieval model for both recognition and recall</article-title>. <source>Psychol. Rev.</source> <volume>91</volume>, <fpage>1</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1037//0033-295X.91.1.1</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Graham-Bermann</surname> <given-names>S. A.</given-names>
</name>
<name>
<surname>Perkins</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Effects of early exposure and lifetime exposure to intimate partner violence (ipv) on child adjustment</article-title>. <source>Violence Victims</source> <volume>25</volume>, <fpage>427</fpage>&#x2013;<lpage>439</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1891/0886-6708.25.4.427</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Han</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Xiao</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Wu</surname> <given-names>E.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Xu</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Transformer in transformer</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>34</volume>, <fpage>15908</fpage>&#x2013;<lpage>15919</lpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2103.00112</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hazarika</surname> <given-names>L. K.</given-names>
</name>
<name>
<surname>Bhuyan</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Hazarika</surname> <given-names>B. N.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Insect pests of tea and their management</article-title>. <source>Annu. Rev. Entomology</source> <volume>54</volume>, <fpage>267</fpage>&#x2013;<lpage>284</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1146/annurev.ento.53.103106.093359</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>He</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Tonghe</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Tianye</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Jie</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Ruilong</surname> <given-names>F.</given-names>
</name>
<name>
<surname>Ji</surname> <given-names>L.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). <article-title>Based on fcn and densenet framework for the research of rice pest identification methods</article-title>. <source>Agronomy</source> <volume>13</volume>, <elocation-id>410</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agronomy13020410</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Lu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Sclaroff</surname> <given-names>S.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Local descriptors optimized for average precision</article-title>,&#x201d; in <conf-name>IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>, <conf-loc>Salt Lake City, UT, USA</conf-loc>. pp. <fpage>596</fpage>&#x2013;<lpage>605</lpage>. doi: <pub-id pub-id-type="doi">10.1109/CVPR.2018.00069</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Henderson</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Ferrari</surname> <given-names>V.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>End-to-end training of object class detectors for mean average precision</article-title>,&#x201d; In: <person-group person-group-type="editor">
<name>
<surname>Lai</surname> <given-names>S. H.</given-names>
</name>
<name>
<surname>Lepetit</surname> <given-names>V.</given-names>
</name>
<name>
<surname>Nishino</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Sato</surname> <given-names>Y.</given-names>
</name>
</person-group> (eds) <source>Computer Vision &#x2013; ACCV 2016</source>, <publisher-loc>Springer, Cham</publisher-loc>: ACCV 2016. Lecture Notes in Computer Science. vol <volume>10115</volume>. doi: <pub-id pub-id-type="doi">10.1007/978-3-319-54193-8_13</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hill</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Marquez</surname> <given-names>L.</given-names>
</name>
<name>
<surname>O'Connor</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Remus</surname> <given-names>W.</given-names>
</name>
</person-group> (<year>1994</year>). <article-title>Artificial neural network models for forecasting and decision making</article-title>. <source>Int. J. Forecasting</source> <volume>10</volume>, <fpage>5</fpage>&#x2013;<lpage>15</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/0169-2070(94)90045-0</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Ni</surname> <given-names>H. C.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Tsai</surname> <given-names>H.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>Termite pest identification method based on deep convolution neural networks</article-title>. <source>J. Econ. Entomol.</source> <volume>114</volume>, <fpage>2452</fpage>&#x2013;<lpage>2459</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/jee/toab162</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jiang</surname> <given-names>K.</given-names>
</name>
<name>
<surname>Xie</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Yan</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Wen</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Jiang</surname> <given-names>H.</given-names>
</name>
<etal/>
</person-group>. (<year>2022</year>). <article-title>An attention mechanism-improved yolov7 object detection algorithm for hemp duck count estimation</article-title>. <source>Agriculture</source> <volume>12</volume>, <elocation-id>1659</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/agriculture12101659</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kasinathan</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Uyyala</surname> <given-names>S. R.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Machine learning ensemble with image processing for pest identification and classification in field crops</article-title>. <source>Neural Computing Appl.</source> <volume>33</volume>, <fpage>7491</fpage>&#x2013;<lpage>7504</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/S00521-020-05497-Z</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kriegeskorte</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Golan</surname> <given-names>T.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Neural network models and deep learning</article-title>. <source>Curr. Biol.</source> <volume>29</volume>, <fpage>R231</fpage>&#x2013;<lpage>R236</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.cub.2019.02.034</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Pokharel</surname> <given-names>S. S.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Parajulee</surname> <given-names>M. N.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>F.</given-names>
</name>
<etal/>
</person-group>. (<year>2019</year>). <article-title>Effects of elevated co2 on foliar soluble nutrients and functional components of tea, and population dynamics of tea aphid, <italic>toxoptera aurantii</italic>
</article-title>. <source>Plant Physiol. Biochem.</source> <volume>145</volume>, <fpage>84</fpage>&#x2013;<lpage>94</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.plaphy.2019.10.023</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Li</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Wen</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>He</surname> <given-names>L.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Scconv: spatial and channel reconstruction convolution for feature redundancy</article-title>,&#x201d; in <conf-name>2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>Vancouver, BC, Canada</conf-loc>. pp. <fpage>6153</fpage>&#x2013;<lpage>6162</lpage>. doi: <pub-id pub-id-type="doi">10.1109/CVPR52729.2023.00596</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Anguelov</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Erhan</surname> <given-names>D.</given-names>
</name>
<name>
<surname>Szegedy</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Reed</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Fu</surname> <given-names>C</given-names>
</name>
<etal/>
</person-group>. (<year>2016</year>). &#x201c;<article-title>Ssd: single shot multibox detector</article-title>,&#x201d; in <source>Computer Vision &#x2013; ECCV 2016</source>. <publisher-loc>Springer, Cham</publisher-loc>: ECCV 2016. Lecture Notes in Computer Science. vol <volume>9905</volume>. doi: <pub-id pub-id-type="doi">10.1007/978-3-319-46448-0_2</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Ding</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Wen</surname> <given-names>X.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). &#x201c;<article-title>Reconstructed convolution module based look-up tables for efficient image super-resolution</article-title>,&#x201d; in <conf-name>2023 IEEE/CVF International Conference on Computer Vision (ICCV)</conf-name>, <conf-loc>Paris, France</conf-loc>. pp. <fpage>12183</fpage>&#x2013;<lpage>12192</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ICCV51070.2023.01122</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Yi</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Z.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Scscn: a separated channel-spatial convolution net with attention for single-view reconstruction</article-title>. <source>IEEE Trans. On Ind. Electron.</source> <volume>67</volume>, <fpage>8649</fpage>&#x2013;<lpage>8658</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TIE.2019.2950866</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Nataraj</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Sarkar</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Manjunath</surname> <given-names>B. S.</given-names>
</name>
</person-group> (<year>2009</year>). &#x201c;<article-title>Adding gaussian noise to &#x201c;denoise&#x201d; jpeg for detecting image resizing</article-title>,&#x201d; in <conf-name>2009 16th IEEE International Conference on Image Processing (ICIP)</conf-name>, (IEEE).</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qiang</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Guan</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Du</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>B.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). <article-title>Detection of citrus pests in double backbone network based on single shot multibox detector</article-title>. <source>Comput. Electron. Agric.</source> <volume>212</volume>, <elocation-id>108158</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2023.108158</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Rezatofighi</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Tsoi</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Gwak</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Sadeghian</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Reid</surname> <given-names>I.</given-names>
</name>
<name>
<surname>Savarese</surname> <given-names>S.</given-names>
</name>
<etal/>
</person-group>. (<year>2019</year>). &#x201c;<article-title>Generalized intersection over union: a metric and a loss for bounding box regression</article-title>,&#x201d; in <conf-name>IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>Long Beach, CA, USA</conf-loc>. pp. <fpage>658</fpage>&#x2013;<lpage>666</lpage>. doi: <pub-id pub-id-type="doi">10.1109/CVPR.2019.00075</pub-id></citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Siliang</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Yong</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Mpdiou: a loss for efficient and accurate bounding box regression</article-title>. <source>Arxiv Preprint Arxiv</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2307.07662</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sivapalan</surname> <given-names>P.</given-names>
</name>
</person-group> (<year>1977</year>). <article-title>Population dynamics of xyleborus fornicatus eichhoff (coleoptera: scolytidae) in relation to yield trends in tea</article-title>. <source>Bull. Entomological Res.</source> <volume>67</volume>, <fpage>329</fpage>&#x2013;<lpage>335</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1017/S0007485300011159</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Streiner</surname> <given-names>D. L.</given-names>
</name>
<name>
<surname>Norman</surname> <given-names>G. R.</given-names>
</name>
</person-group> (<year>2006</year>). <article-title>&#x201c;precision&#x201d; and &#x201c;accuracy&#x201d;: two terms that are neither</article-title>. <source>J. Clin. Epidemiol.</source> <volume>59</volume>, <fpage>327</fpage>&#x2013;<lpage>330</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.jclinepi.2005.09.005</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Deng</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Nie</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Rotate: knowledge graph embedding by relational rotation in complex space</article-title>. <source>Arxiv Preprint Arxiv</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1902.10197</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tang</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Leite</surname> <given-names>A. C.</given-names>
</name>
<name>
<surname>Xiong</surname> <given-names>Y.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Precision control technology and application in agricultural pest and disease control</article-title>. <source>Front. Plant Sci.</source> <volume>14</volume>, <elocation-id>1163839</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2023.1163839</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Taniai</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Matsushita</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Sato</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Naemura</surname> <given-names>T.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Continuous 3d label stereo matching using local expansion moves</article-title>. <source>IEEE Trans. On Pattern Anal. Mach. Intell.</source> <volume>40</volume>, <fpage>2725</fpage>&#x2013;<lpage>2739</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2017.2766072</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Teske</surname> <given-names>A. L.</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Nansen</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Kong</surname> <given-names>Z.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Optimised dispensing of predatory mites by multirotor uavs in wind: a distribution pattern modelling approach for precision pest management</article-title>. <source>Biosyst. Eng.</source> <volume>187</volume>, <fpage>226</fpage>&#x2013;<lpage>238</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.biosystemseng.2019.09.009</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Bochkovskiy</surname> <given-names>A.</given-names>
</name>
<name>
<surname>Liao</surname> <given-names>H. M.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Yolov7: trainable bag-of-freebies sets new state-of-the-art for real-time object detectors</article-title>,&#x201d; in <conf-name>IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>Vancouver, BC, Canada</conf-loc>. pp. <fpage>7464</fpage>&#x2013;<lpage>7475</lpage>. doi: <pub-id pub-id-type="doi">10.1109/CVPR52729.2023.00721</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Song</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>ICIoU: improved loss based on complete intersection over union for bounding box regression</article-title>. <source>IEEE Access</source> <volume>9</volume>, <fpage>105686</fpage>&#x2013;<lpage>105695</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/ACCESS.2021.3100414</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wu</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>He</surname> <given-names>K.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Group normalization</article-title>,&#x201d; in <conf-name>Proceedings of the European conference on computer vision (ECCV)</conf-name> <volume>128</volume>, <fpage>742</fpage>&#x2013;<lpage>755</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11263-019-01198-w</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xing</surname> <given-names>B.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Qian</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Pan</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Le</surname> <given-names>Q.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>A lightweight model for real-time monitoring of ships</article-title>. <source>Electronics</source> <volume>12</volume>, <elocation-id>3804</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/electronics12183804</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Shi</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Tang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>He</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>N.</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>W.</given-names>
</name>
<etal/>
</person-group>. (<year>2023</year>). <article-title>Asfl-yolox: an adaptive spatial feature fusion and lightweight detection method for insect pests of the papilionidae family</article-title>. <source>Front. Plant Sci.</source> <volume>14</volume>, <elocation-id>1176300</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2023.1176300</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Yacouby</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Axman</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Probabilistic extension of precision, recall, and f1 score for more thorough evaluation of classification models</article-title>,&#x201d; in <conf-name>Proceedings of the first workshop on evaluation and comparison of NLP systems</conf-name>. <fpage>79</fpage>&#x2013;<lpage>91</lpage>. doi: <pub-id pub-id-type="doi">10.18653/v1/2020.eval4nlp-1.9</pub-id>
</citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yawen</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Jianjun</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Zongyi</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Shiquan</surname> <given-names>S.</given-names>
</name>
<name>
<surname>Lihua</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Xiaoyan</surname> <given-names>C.</given-names>
</name>
<etal/>
</person-group>. (<year>2001</year>). <article-title>The diversity and sustainable development of crop genetic resources in the lancang river valley</article-title>. <source>Genet. Resour. Crop Evol.</source> <volume>48</volume>, <fpage>297</fpage>&#x2013;<lpage>306</lpage>. doi: <pub-id pub-id-type="doi">10.1023/A:1011257700607</pub-id>
</citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yin</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Dai</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>X.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). <article-title>Residue pattern of chlorpyrifos and its metabolite in tea from cultivation to consumption</article-title>. <source>J. Sci. Food Agric.</source> <volume>101</volume>, <fpage>4134</fpage>&#x2013;<lpage>4141</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1002/jsfa.11049</pub-id>
</citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yunchao</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Chao</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Candea</surname> <given-names>A. L.</given-names>
</name>
<name>
<surname>Ya</surname> <given-names>X.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Editorial: Precision control technology and application in agricultural pest and disease control</article-title>. <source>Front. Plant Sci.</source> <volume>14</volume>, <elocation-id>1163839</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2023.1163839</pub-id>
</citation>
</ref>
<ref id="B43">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname> <given-names>M.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Feng</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Ma</surname> <given-names>W.</given-names>
</name>
</person-group> (<year>2005</year>). &#x201c;<article-title>Auto cropping for digital photographs</article-title>,&#x201d; In: <conf-name>IEEE International Conference on Multimedia and Expo</conf-name>, <conf-loc>Amsterdam</conf-loc>. p. <fpage>4</fpage>. doi: <pub-id pub-id-type="doi">10.1109/ICME.2005.1521454</pub-id>
</citation>
</ref>
<ref id="B44">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Gallo</surname> <given-names>O.</given-names>
</name>
<name>
<surname>Frosio</surname> <given-names>I.</given-names>
</name>
<name>
<surname>Kautz</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Loss functions for neural networks for image processing</article-title>. <source>Arxiv Preprint Arxiv</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1511.08861</pub-id>
</citation>
</ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Gallo</surname> <given-names>O.</given-names>
</name>
<name>
<surname>Frosio</surname> <given-names>I.</given-names>
</name>
<name>
<surname>Kautz</surname> <given-names>J.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Loss functions for image restoration with neural networks</article-title>. <source>IEEE Trans. On Comput. Imaging</source> <volume>3</volume>, <fpage>47</fpage>&#x2013;<lpage>57</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TCI.2016.2644865</pub-id>
</citation>
</ref>
<ref id="B46">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zheng</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>P.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Li</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Ye</surname> <given-names>R.</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Distance-iou loss: faster and better learning for bounding box regression</article-title>. <source>Proceedings of the AAAI Conference on Artificial Intelligence</source> <volume>34</volume> (<issue>07</issue>), <fpage>12993</fpage>&#x2013;<lpage>13000</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1609/aaai.v34i07.6999</pub-id>
</citation>
</ref>
<ref id="B47">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname> <given-names>Y.</given-names>
</name>
<name>
<surname>Ren</surname> <given-names>T.</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Sun</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Liu</surname> <given-names>J.</given-names>
</name>
<name>
<surname>Ding</surname> <given-names>X.</given-names>
</name>
<etal/>
</person-group>. (<year>2021</year>). &#x201c;<article-title>TRAR: routing the attention spans in transformer for visual question answering</article-title>,&#x201d; in <conf-name>IEEE/CVF International Conference on Computer Vision (ICCV)</conf-name>, <conf-loc>Montreal, QC, Canada</conf-loc>. pp. <fpage>2054</fpage>&#x2013;<lpage>2064</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ICCV48922.2021.00208</pub-id>
</citation>
</ref>
<ref id="B48">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname> <given-names>C.</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>H.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Long</surname> <given-names>G.</given-names>
</name>
<name>
<surname>Jin</surname> <given-names>D.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Comparative transcriptome analysis of <italic>Sogatella furcifera</italic> (Horv&#xe1;th) exposed to different insecticides</article-title>. <source>Sci. Rep.</source> <volume>8</volume>, <fpage>8773</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41598-018-27062-4</pub-id>
</citation>
</ref>
<ref id="B49">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname> <given-names>L.</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>X.</given-names>
</name>
<name>
<surname>Ke</surname> <given-names>Z.</given-names>
</name>
<name>
<surname>Zhang</surname> <given-names>W.</given-names>
</name>
<name>
<surname>Lau</surname> <given-names>R. W.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>BiFormer: vision transformer with bi-level routing attention</article-title>,&#x201d; in <conf-name>2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-loc>Vancouver, BC, Canada</conf-loc>. pp. <fpage>10323</fpage>&#x2013;<lpage>10333</lpage>. doi: <pub-id pub-id-type="doi">10.1109/CVPR52729.2023.00995</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>