<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2022.1030021</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Plant Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Improved YOLOv4 recognition algorithm for pitaya based on coordinate attention and combinational convolution</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Fu</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1894168"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Cao</surname>
<given-names>Weihua</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Shunqing</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Cui</surname>
<given-names>Xiahua</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Yang</surname>
<given-names>Ning</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1107972"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Xinyue</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Xiaodong</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1885769"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Fu</surname>
<given-names>Sanling</given-names>
</name>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
</contrib>
</contrib-group>    <aff id="aff1">
<sup>1</sup>
<institution>College of Agricultural Equipment Engineering, Henan University of Science and Technology</institution>, <addr-line>Luoyang</addr-line>, <country>China</country>
</aff>    <aff id="aff2">
<sup>2</sup>
<institution>Collaborative Innovation Center of Machinery Equipment Advanced Manufacturing of Henan Province, Henan University of Science and Technology</institution>, <addr-line>Luoyang</addr-line>, <country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>School of Electrical and Information Engineering, Jiangsu University</institution>, <addr-line>Zhenjiang</addr-line>, <country>China</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>Key Laboratory of Modern Agricultural Equipment and Technology of Ministry of Education, Jiangsu University</institution>, <addr-line>Zhenjiang</addr-line>, <country>China</country>
</aff>
<aff id="aff5">
<sup>5</sup>
<institution>College of Physical Engineering, Henan University of Science and Technology</institution>, <addr-line>Luoyang</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: Yongliang Qiao, The University of Sydney, Australia</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: Salvador Valle-Guadarrama, Chapingo Autonomous University, Mexico; Jieli Duan, South China Agricultural University, China</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Ning Yang, <email xlink:href="mailto:yangn@ujs.edu.cn">yangn@ujs.edu.cn</email>; Sanling Fu, <email xlink:href="mailto:fusanling@126.com">fusanling@126.com</email>
</p>
</fn>
<fn fn-type="other" id="fn002">
<p>This article was submitted to Sustainable and Intelligent Phytoprotection, a section of the journal Frontiers in Plant Science</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>18</day>
<month>10</month>
<year>2022</year>
</pub-date>
<pub-date pub-type="collection">
<year>2022</year>
</pub-date>
<volume>13</volume>
<elocation-id>1030021</elocation-id>
<history>
<date date-type="received">
<day>28</day>
<month>08</month>
<year>2022</year>
</date>
<date date-type="accepted">
<day>29</day>
<month>09</month>
<year>2022</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2022 Zhang, Cao, Wang, Cui, Yang, Wang, Zhang and Fu</copyright-statement>
<copyright-year>2022</copyright-year>
<copyright-holder>Zhang, Cao, Wang, Cui, Yang, Wang, Zhang and Fu</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>An accurate method for recognizing pitaya in the natural environment provides technical support for automatic picking. To address the intricate spatial position relationship between pitaya fruits and branches, a pitaya recognition method based on an improved YOLOv4 was proposed. The GhostNet feature extraction network was used instead of CSPDarkNet53 as the backbone network of YOLOv4. Its structure generates a large number of feature maps through a small amount of computation and obtains the redundant information in the feature layers at lower computational cost, which reduces the number of parameters and the computation of the model. Coordinate attention was introduced to enhance the extraction of fine-grained target features. An improved combinational convolution module was designed to save computing power, prevent the loss of effective features, and improve recognition accuracy. The Ghost Module was adopted in the Yolo Head to improve computing speed and reduce latency. Precision, Recall, F1, AP, detection speed and weight size were selected as performance evaluation indexes of the recognition model. 8800 images of pitaya fruit in different environments were used as the dataset and randomly divided into training, validation and test sets at a ratio of 7:1:2. The results show that the recognition precision of the improved YOLOv4 model for pitaya fruit is 99.23%, and the Recall, F1&#xa0;and AP are 95.10%, 98% and 98.94%, respectively. The detection speed is 37.2 frames&#xb7;s<sup>-1</sup>, and the weight size is 59.4 MB. The improved YOLOv4 recognition algorithm meets the accuracy and speed requirements for pitaya fruit recognition in the natural environment, ensuring the rapid and accurate operation of the picking robot.</p>
</abstract>
<kwd-group>
<kwd>improved YOLOv4</kwd>
<kwd>GhostNet</kwd>
<kwd>coordinate attention</kwd>
<kwd>improved combinational convolution module</kwd>
<kwd>target recognition</kwd>
</kwd-group>
<counts>
<fig-count count="10"/>
<table-count count="3"/>
<equation-count count="15"/>
<ref-count count="37"/>
<page-count count="12"/>
<word-count count="4704"/>
</counts>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<title>1 Introduction</title>
<p>Pitaya belongs to the cactus family; the plant has many extending branches whose edges are winged and wavy or crenellated, which makes the harvesting process time-consuming and labor-intensive. Rapid and accurate recognition of pitaya fruit is a prerequisite for automatic picking by agricultural robots. Improving operation efficiency therefore has important research significance and application value (<xref ref-type="bibr" rid="B19">Tang et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B9">Huang et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B28">Ye et&#xa0;al., 2021</xref>).</p>
<p>To date, scholars have carried out research on the recognition of target fruits and vegetables based on traditional image processing technology (<xref ref-type="bibr" rid="B35">Zeng et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B16">Miao et&#xa0;al., 2021</xref>). <xref ref-type="bibr" rid="B6">Han et&#xa0;al. (2021)</xref> established an automatic quantification system based on the HSV space model for the segmentation of amygdalus mira seeds, with an accuracy rate of 99.7%. <xref ref-type="bibr" rid="B32">Zhang et&#xa0;al. (2019)</xref> converted RGB images into the Lab space model and used the Hough circle transform to count fruits, with a recognition accuracy of 94.01%. <xref ref-type="bibr" rid="B33">Zhang et&#xa0;al. (2020)</xref> proposed a pomegranate fruit recognition and classification method based on a support vector machine and multi-feature fusion, with a classification accuracy of 75%. <xref ref-type="bibr" rid="B14">Liu et&#xa0;al. (2019)</xref> extracted the color and shape features of ripe apples to realize apple recognition, with a Recall of 89.8%. <xref ref-type="bibr" rid="B3">Chu et&#xa0;al. (2019)</xref> proposed a machine-vision method for identifying spherical fruits, with a recognition accuracy of over 95%. The above methods achieve fruit recognition by extracting color, shape and texture features. However, they suffer from long detection times, poor robustness and low recognition accuracy, which makes it difficult to recognize target fruits accurately in intricate environments (<xref ref-type="bibr" rid="B13">Liu et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B21">Tan et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B26">Xue et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B36">Zhao et&#xa0;al., 2020</xref>).</p>
<p>In recent years, the convolutional neural network (CNN) has been widely used in target recognition and detection (<xref ref-type="bibr" rid="B15">Lv et&#xa0;al., 2019</xref>; <xref ref-type="bibr" rid="B2">Cao et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B31">Zhang K. et&#xa0;al., 2021</xref>), and such methods fall into two main categories. One is the two-stage target detection method represented by region-CNN (RCNN) (<xref ref-type="bibr" rid="B5">Girshick et&#xa0;al., 2014</xref>), Fast RCNN (<xref ref-type="bibr" rid="B4">Girshick, 2015</xref>), Faster RCNN (<xref ref-type="bibr" rid="B18">Ren et&#xa0;al., 2015</xref>), etc., which first obtains target proposal boxes and then classifies the contents of each proposal box. <xref ref-type="bibr" rid="B37">Zhu et&#xa0;al. (2020)</xref> proposed an improved Faster RCNN algorithm based on the botanical characteristics of the lycium barbarum flowering period and fruit ripening period, reaching an average accuracy of 74%. <xref ref-type="bibr" rid="B27">Yan et&#xa0;al. (2019)</xref> proposed an improved Faster RCNN algorithm to identify prickly pears, reaching an average recognition accuracy of 92.01%. <xref ref-type="bibr" rid="B34">Zhang W. et&#xa0;al. (2021)</xref> proposed an improved Faster RCNN algorithm to identify tomatoes, reaching an average recognition accuracy of 95.2%. Such algorithms have long training times and slow detection speeds. The other is the one-stage target detection method represented by SSD (<xref ref-type="bibr" rid="B12">Liu et&#xa0;al., 2016</xref>), YOLO (<xref ref-type="bibr" rid="B17">Redmon et&#xa0;al., 2016</xref>), etc., which completes target proposal and classification in the same network. <xref ref-type="bibr" rid="B29">Yi et&#xa0;al. (2021)</xref> proposed a YOLOv4 model based on feature recursive fusion to identify citrus, reaching a detection accuracy of 94.6%. <xref ref-type="bibr" rid="B22">Wang et&#xa0;al. (2021)</xref> proposed the I-YOLOv4-Tiny target detection network with a convolutional attention module to identify blueberry fruit in different environments, reaching an average accuracy of 97.30%. <xref ref-type="bibr" rid="B30">Zhang F. et&#xa0;al. (2021)</xref> proposed an improved YOLOv4-LITE target detection algorithm to identify cherry tomatoes, reaching an average accuracy of 99.15%. <xref ref-type="bibr" rid="B10">Li et&#xa0;al. (2021)</xref> used MobileNetV2 as the YOLOv3 backbone network and introduced the M-Res2Net module to identify grapes, reaching an average accuracy of 81.2%. <xref ref-type="bibr" rid="B25">Xiong et&#xa0;al. (2020)</xref> proposed the Des-YOLOv3 target detection network to identify ripe citrus at night, reaching an average accuracy of 90.75%. Wu et&#xa0;al. proposed an improved YOLOv3 model based on clustering optimization and a new YOLOv5-B model to obtain target information and improve the accuracy and speed of small target detection (<xref ref-type="bibr" rid="B24">Wu et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B23">Wu et&#xa0;al., 2022</xref>). <xref ref-type="bibr" rid="B20">Tang et&#xa0;al. (2022)</xref> proposed YOLO-Oleifera, a Camellia oleifera fruit detection model based on the YOLOv4-tiny model, which learns the characteristic information of Camellia oleifera fruit. <xref ref-type="bibr" rid="B11">Li et&#xa0;al. (2022)</xref> built a new agricultural machinery intelligent design system integrating image processing and knowledge reasoning, providing a reference for intelligent design to guide actual production. In summary, the above studies involve complex calculation and large resource consumption, which make it difficult for a picking robot to operate rapidly and accurately on target fruits in an intricate environment. Therefore, recognition accuracy and speed still need to be improved.</p>
<p>To address the difficulty of identifying and picking pitaya fruit in the natural environment, this paper proposes an improved YOLOv4 recognition algorithm that integrates coordinate attention and combinational convolution to improve the recognition speed and accuracy of pitaya fruit.</p>
</sec>
<sec id="s2" sec-type="materials|methods">
<title>2 Test materials and methods</title>
<sec id="s2_1">
<title>2.1 Test data acquisition</title>
<p>The pitaya fruit images were collected under natural light on rainy and sunny days in the greenhouse of the Taiwan Fuhao farm, Mengjin district, Luoyang city, Henan province. The image acquisition device was a Canon EOS 750D single-lens reflex camera; the image resolution was 6000 &#xd7; 4000 pixels, and the format was JPG. To simulate the recognition system of a picking robot, images were shot from five angles: front, left, right, up and down. A total of 1,280 original images of pitaya fruit were collected, and 1,100 images of the pitaya growing environment under various natural conditions, including smooth light, backlight, overlap, occlusion and adhesion, were selected, as shown in <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref>. In this research, the red-heart &#x201c;soft branch big red&#x201d; pitaya was used, which is suitable for planting in areas where the minimum temperature in January stays above 0&#xb0;C. The plants need to be tied and pruned at appropriate times; when the seedlings grow to the square (circle) position, they can be allowed to droop for early flowering and fruiting.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Some images of pitayas in greenhouse environment: <bold>(A)</bold> Mature pitayas in sunny days; <bold>(B)</bold> Mature pitayas in rainy days; <bold>(C)</bold> Occlusion of pitayas; <bold>(D)</bold> Adhesive pitayas; <bold>(E)</bold> Short distance pitayas; <bold>(F)</bold> Long distance pitayas.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-13-1030021-g001.tif"/>
</fig>
</sec>
<sec id="s2_2">
<title>2.2 Data augmentation</title>
<p>Training a deep learning model requires a large amount of data, and too small a dataset will lead to overfitting of the neural network. Therefore, data augmentation was used to expand the number of samples. In this paper, methods such as translation transformation, random rotation, mirror flip, horizontal flip, brightness adjustment and salt-and-pepper noise were used for data enhancement, yielding a total of 8800 images as the dataset.</p>
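<p>As an illustration of the augmentation pipeline described above, the following minimal Python sketch applies mirror flipping, random rotation, brightness adjustment and salt-and-pepper noise to a single image. The transformation parameters (rotation range, brightness factors, noise density) are illustrative assumptions, since the paper does not list them; for detection data, geometric transforms would also require the corresponding bounding boxes to be updated.</p>
<preformat>import numpy as np
from PIL import Image, ImageEnhance

def augment(image, seed=0):
    """Return several augmented copies of one pitaya image (a sketch)."""
    rng = np.random.default_rng(seed)
    out = [image.transpose(Image.FLIP_LEFT_RIGHT)]               # mirror flip
    out.append(image.rotate(rng.uniform(-30, 30), expand=True))  # random rotation (range assumed)
    out.append(ImageEnhance.Brightness(image).enhance(rng.uniform(0.6, 1.4)))  # brightness
    arr = np.asarray(image).copy()                               # salt-and-pepper noise
    mask = rng.random(arr.shape[:2])
    arr[mask &lt; 0.01] = 0                                         # pepper (density assumed)
    arr[mask &gt; 0.99] = 255                                       # salt
    out.append(Image.fromarray(arr))
    return out</preformat>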
</sec>
<sec id="s2_3">
<title>2.3 Dataset preparation</title>
<p>We used the LabelImg image annotation software (written in Python) to manually mark a rectangular box around each pitaya fruit in the images. A fully exposed pitaya fruit was marked inside its rectangular box; for an occluded or conglutinated pitaya fruit, only the exposed part was marked; and pitaya fruit occupying less than 10% of their visible extent in the image were not marked. The target category was set to &#x201c;pitaya&#x201d;, and after all pitaya fruits in an image were labeled, the annotations were saved as an .xml file.</p>
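<p>A sketch of how such annotations can be consumed is given below: it parses LabelImg&#x2019;s Pascal VOC-style .xml files and performs the 7:1:2 random split into training, validation and test sets described in the abstract. The directory layout is an assumption for illustration.</p>
<preformat>import glob
import random
import xml.etree.ElementTree as ET

def read_boxes(xml_path):
    """Return [(name, xmin, ymin, xmax, ymax), ...] from one LabelImg .xml file."""
    root = ET.parse(xml_path).getroot()
    boxes = []
    for obj in root.iter("object"):
        b = obj.find("bndbox")
        boxes.append((obj.findtext("name"),                  # "pitaya"
                      int(b.findtext("xmin")), int(b.findtext("ymin")),
                      int(b.findtext("xmax")), int(b.findtext("ymax"))))
    return boxes

files = sorted(glob.glob("annotations/*.xml"))               # assumed directory
random.seed(0)
random.shuffle(files)
n = len(files)
train, val, test = (files[:int(0.7 * n)],                    # 7 : 1 : 2 split
                    files[int(0.7 * n):int(0.8 * n)],
                    files[int(0.8 * n):])</preformat>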
</sec>
</sec>
<sec id="s3">
<title>3 Pitaya fruit recognition network</title>
<sec id="s3_1">
<title>3.1 YOLOv4 network model</title>
<p>YOLO is a target recognition and localization algorithm based on deep neural networks that achieves end-to-end prediction. YOLOv4 (<xref ref-type="bibr" rid="B1">Bochkovskiy et&#xa0;al., 2020</xref>) is an efficient and powerful target detection model that combines a large number of techniques from previous research with combined innovation. The YOLOv4 structure is shown in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref>.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>YOLOv4 network structure diagram. * means repeat the operation.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-13-1030021-g002.tif"/>
</fig>
<p>The loss function of YOLOv4 consists of the positive sample coordinate loss, positive sample confidence loss, negative sample confidence loss and positive sample classification loss. This paper uses Complete Intersection over Union (CIoU, which takes into account the distance, overlap, scale and a penalty term between the target and the box, making target box regression more stable) as the loss function. CIoU makes the prediction box more consistent with the real box, so that the target box can be positioned accurately. It also avoids the problem that the loss based on the Intersection over Union (IoU, used to measure the degree of overlap between the prediction box and the real box in target detection) is 0 when the prediction box does not intersect the real box. The expression of CIoU is:</p>
<disp-formula>
<label>(1)</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
<mml:mo>=</mml:mo>
<mml:mi>I</mml:mi>
<mml:mtext>o</mml:mtext>
<mml:mi>U</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mi>&#x3c1;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>b</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>b</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mi>c</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Among them, <italic>&#x3c1;</italic>
<sup>2</sup>(<italic>b</italic>,<italic>b</italic>
<sup>
<italic>gt</italic>
</sup>) represents the squared Euclidean distance between the center points of the prediction box and the real box, <italic>c</italic> represents the diagonal length of the smallest rectangular box enclosing both the prediction box and the real box, <italic>v</italic> is a measure of aspect-ratio consistency, and <italic>&#x3b2;</italic> is the trade-off parameter. The YOLOv4 loss function expression is:</p>
<disp-formula>
<label>(2)</label>
<mml:math display="block" id="M2">
<mml:mtable>
<mml:mtr>
<mml:mtd>
<mml:mi>L</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:munderover>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>M</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:msubsup>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:mstyle>
<mml:mo stretchy="false">(</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>O</mml:mi>
<mml:mi>U</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:munderover>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>M</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:msubsup>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:mstyle>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mo>&#x2212;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:munderover>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>M</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:msubsup>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:mstyle>
<mml:mo stretchy="false">[</mml:mo>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="true">^</mml:mo>
</mml:mover>
<mml:mi>log</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>+</mml:mo>
<mml:mo stretchy="false">(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="true">^</mml:mo>
</mml:mover>
<mml:mo stretchy="false">)</mml:mo>
<mml:mi>log</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">]</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:munderover>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>M</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:msubsup>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:mstyle>
<mml:mo stretchy="false">[</mml:mo>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="true">^</mml:mo>
</mml:mover>
<mml:mi>log</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>+</mml:mo>
<mml:mo stretchy="false">(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="true">^</mml:mo>
</mml:mover>
<mml:mo stretchy="false">)</mml:mo>
<mml:mi>log</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">]</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mo>&#x2212;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:munderover>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>M</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:msubsup>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:mstyle>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>c</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="true">^</mml:mo>
</mml:mover>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>c</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mi>log</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>c</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>+</mml:mo>
<mml:mo stretchy="false">(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="true">^</mml:mo>
</mml:mover>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>c</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
<mml:mi>log</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>c</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mstyle>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:math>
</disp-formula>
<p>Among them, <italic>&#x3bb;</italic>
<sub>
<italic>coord</italic>
</sub> is the weight coefficient of positive samples, <inline-formula>
<mml:math display="inline" id="im1">
<mml:mrow>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>M</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> represents traversing all prediction boxes, <inline-formula>
<mml:math display="inline" id="im2">
<mml:mrow>
<mml:msubsup>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula>
<mml:math display="inline" id="im3">
<mml:mrow>
<mml:msubsup>
<mml:mi>I</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> indicate whether the sample is positive: 1 for positive samples and 0 otherwise. <italic>w</italic>
<sub>
<italic>i</italic>
</sub> is the width of the prediction box, <italic>h</italic>
<sub>
<italic>i</italic>
</sub> is the height of the prediction box, <inline-formula>
<mml:math display="inline" id="im4">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="true">^</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> is the ground-truth value, <italic>C</italic>
<sub>
<italic>i</italic>
</sub> is the predicted value, and <italic>&#x3bb;</italic>
<sub>
<italic>noobj</italic>
</sub> is the weight coefficient of negative samples.</p>
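<p>To make Eq. (1) concrete, the following sketch computes CIoU for two axis-aligned boxes given as (x1, y1, x2, y2), using the standard CIoU definition that the paper adopts; the box coordinates in the example call are illustrative inputs.</p>
<preformat>import math

def ciou(box, gt):
    x1, y1, x2, y2 = box
    g1, g2, g3, g4 = gt
    # IoU: intersection area over union area
    iw = max(0.0, min(x2, g3) - max(x1, g1))
    ih = max(0.0, min(y2, g4) - max(y1, g2))
    inter = iw * ih
    union = (x2 - x1) * (y2 - y1) + (g3 - g1) * (g4 - g2) - inter
    iou = inter / union if union else 0.0
    # rho^2: squared distance between the box centers
    rho2 = ((x1 + x2 - g1 - g3) / 2) ** 2 + ((y1 + y2 - g2 - g4) / 2) ** 2
    # c^2: squared diagonal of the smallest enclosing box
    c2 = (max(x2, g3) - min(x1, g1)) ** 2 + (max(y2, g4) - min(y1, g2)) ** 2
    # v: aspect-ratio consistency; beta: trade-off weight
    v = (4 / math.pi ** 2) * (math.atan((g3 - g1) / (g4 - g2))
                              - math.atan((x2 - x1) / (y2 - y1))) ** 2
    beta = v / ((1 - iou) + v + 1e-9)
    return iou - rho2 / c2 - beta * v

print(ciou((0, 0, 4, 4), (1, 1, 5, 5)))   # example call</preformat>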
</sec>
<sec id="s3_2">
<title>3.2 Improved YOLOv4 network model</title>
<p>YOLOv4 uses the CSPDarkNet53 backbone network. Although it extracts object features well, its network parameters are numerous, which slows the recognition speed of YOLOv4; in addition, its computation is complex and requires a large amount of memory. Accordingly, a fast, accurate and lightweight recognition model is proposed in this paper. Based on the traditional YOLOv4, GhostNet was used as the backbone network for feature extraction to reduce the computational complexity of the model, generate more feature maps and achieve rapid recognition of targets. To save computing power, learn more features and process more data in a shorter time, an improved combinational convolution module was used to replace the traditional combinational convolution at feature fusion, and coordinate attention (CA, an attention mechanism that embeds location information into channel attention) was introduced. To compress the model, improve computing speed and reduce latency, the Ghost Module was adopted in the Yolo Head.</p>
<sec id="s3_2_1">
<title>3.2.1 GhostNet backbone network</title>
<p>GhostNet (<xref ref-type="bibr" rid="B7">Han et&#xa0;al., 2020</xref>) proposes the Ghost Module, a structure that generates a large number of feature maps with only a small amount of computation. It first produces feature maps by convolution, called intrinsic feature maps, and then derives additional feature maps from them through a series of cheap linear operations; these derived maps are called Ghost feature maps. The conventional convolution module is shown in <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3A</bold>
</xref>, and the Ghost Module is shown in <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3B</bold>
</xref>.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Comparison diagram between the convolutional layer and the Ghost Module: <bold>(A)</bold> The convolutional layer; <bold>(B)</bold> The Ghost Module.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-13-1030021-g003.tif"/>
</fig>
<p>If the input feature map size is <italic>h</italic>
<sub>1</sub>&#xd7;<italic>w</italic>
<sub>1</sub>&#xd7;<italic>c</italic>, the output feature map size is <italic>h</italic>
<sub>2</sub>&#xd7;<italic>w</italic>
<sub>2</sub>&#xd7;<italic>n</italic>, the convolution kernel size is <italic>k</italic>&#xd7;<italic>k</italic>, and the stride is <italic>s</italic>, then the FLOPs (floating-point operations, a measure of model complexity) of the conventional convolution and the Ghost Module are:</p>
<disp-formula>
<label>(3)</label>
<mml:math display="block" id="M3">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>O</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:mi>n</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>c</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(4)</label>
<mml:math display="block" id="M4">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>O</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>b</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mi>n</mml:mi>
<mml:mi>s</mml:mi>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>c</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>+</mml:mo>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>s</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mi>n</mml:mi>
<mml:mi>s</mml:mi>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:math>
</disp-formula>
<p>The ratio of the two is:</p>
<disp-formula>
<label>(5)</label>
<mml:math display="block" id="M5">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>O</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>O</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>b</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mfrac>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfrac>
<mml:mi>n</mml:mi>
<mml:mi>s</mml:mi>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>c</mml:mi>
<mml:mo>+</mml:mo>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>s</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mi>n</mml:mi>
<mml:mi>s</mml:mi>
</mml:mfrac>
</mml:mrow>
</mml:mfrac>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>s</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2248;</mml:mo>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:math>
</disp-formula>
<p>It can be seen that the FLOPs of the Ghost Module are reduced to roughly 1/<italic>s</italic> of those of the conventional convolution, which reduces the complexity of the model.</p>
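<p>The following PyTorch sketch illustrates the Ghost Module of <bold>Figure&#xa0;3B</bold>: a primary convolution produces the n/s intrinsic feature maps, and cheap depthwise (linear) operations generate the remaining (s-1)n/s Ghost maps, matching the FLOPs counting in Eqs. (3)-(5). The kernel sizes and activation follow the GhostNet paper&#x2019;s defaults and are assumptions with respect to this article.</p>
<preformat>import torch
import torch.nn as nn

class GhostModule(nn.Module):
    def __init__(self, c_in, n_out, k=1, s=2, dw_k=3):
        super().__init__()
        intrinsic = n_out // s                    # n/s intrinsic maps
        ghost = intrinsic * (s - 1)               # (s-1)*n/s ghost maps
        self.primary = nn.Sequential(
            nn.Conv2d(c_in, intrinsic, k, padding=k // 2, bias=False),
            nn.BatchNorm2d(intrinsic), nn.ReLU(inplace=True))
        self.cheap = nn.Sequential(               # cheap depthwise "linear" operations
            nn.Conv2d(intrinsic, ghost, dw_k, padding=dw_k // 2,
                      groups=intrinsic, bias=False),
            nn.BatchNorm2d(ghost), nn.ReLU(inplace=True))

    def forward(self, x):
        y = self.primary(x)                          # intrinsic feature maps
        return torch.cat([y, self.cheap(y)], dim=1)  # n/s + (s-1)*n/s = n maps</preformat>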
</sec>
<sec id="s3_2_2">
<title>3.2.2 Improved combinational convolution-CA module</title>
<p>The five-convolution combination at feature fusion was improved to generate an improved combinational convolution module; in other words, separable convolution and a residual edge structure were introduced. The separable convolution reduces the computational complexity of the network and runs faster. The residual edge improves the learning effect of the model, prevents the loss of effective features, and effectively alleviates the problem of vanishing gradients. On this basis, two conventional convolutions of the traditional combinational convolution were replaced by separable convolutions, and the residual edge was added alongside the first two convolutions to obtain the improved combinational convolution module. To help the model locate and identify objects of interest more accurately, CA was introduced. As a result, an improved combinational convolution-CA module was obtained, as shown in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref>.</p>
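<p>A minimal sketch of the two building blocks named above follows, assuming a standard depthwise-separable convolution and an identity shortcut for the residual edge; the channel widths and activation are illustrative assumptions, since the paper does not specify them.</p>
<preformat>import torch.nn as nn

class SeparableConv(nn.Module):
    """Depthwise 3x3 followed by pointwise 1x1: far cheaper than a dense 3x3."""
    def __init__(self, c_in, c_out):
        super().__init__()
        self.dw = nn.Conv2d(c_in, c_in, 3, padding=1, groups=c_in, bias=False)
        self.pw = nn.Conv2d(c_in, c_out, 1, bias=False)
        self.bn = nn.BatchNorm2d(c_out)
        self.act = nn.LeakyReLU(0.1, inplace=True)

    def forward(self, x):
        return self.act(self.bn(self.pw(self.dw(x))))

class ResidualSeparableBlock(nn.Module):
    """Two separable convolutions wrapped by a residual edge."""
    def __init__(self, channels):
        super().__init__()
        self.conv1 = SeparableConv(channels, channels)
        self.conv2 = SeparableConv(channels, channels)

    def forward(self, x):
        # The residual edge preserves effective features and eases gradient flow.
        return x + self.conv2(self.conv1(x))</preformat>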
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>Structure diagram of improved combinational convolution-CA. * means repeat the operation.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-13-1030021-g004.tif"/>
</fig>
<p>CA (<xref ref-type="bibr" rid="B8">Hou et&#xa0;al., 2021</xref>) embeds location information into channel attention, which reduces attention to secondary information and enhances the extraction of fine-grained features of targets, improving model accuracy and generalization performance. CA consists of coordinate information embedding and coordinate attention generation. The structure is shown in <xref ref-type="fig" rid="f5">
<bold>Figure&#xa0;5</bold>
</xref>.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>CA structure diagram.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-13-1030021-g005.tif"/>
</fig>
<p>The coordinate information embedding operation corresponds to the X Avg Pool and Y Avg Pool in the figure. For the input <italic>X</italic>, pooling kernels of dimensions (<italic>H</italic>, 1) and (1, <italic>W</italic>) are used to encode each channel along the horizontal and vertical coordinate directions, and the output of the <italic>c</italic>th channel at height <italic>h</italic> is:</p>
<disp-formula>
<label>(6)</label>
<mml:math display="block" id="M6">
<mml:mrow>
<mml:msubsup>
<mml:mi>z</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>h</mml:mi>
</mml:msubsup>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mi>W</mml:mi>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&lt;</mml:mo>
<mml:mi>W</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Similarly, the output of the <italic>c</italic>th channel at width <italic>w</italic> is:</p>
<disp-formula>
<label>(7)</label>    
<mml:math display="block" id="M7">
<mml:mrow>
<mml:msubsup>
<mml:mi>z</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>w</mml:mi>
</mml:msubsup>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>w</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mi>H</mml:mi>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>&lt;</mml:mo>
<mml:mi>H</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>w</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
</disp-formula>
<p>For the CA generation operation, the two feature maps generated by the previous step are first concatenated and then transformed by a shared 1&#xd7;1 convolution <italic>F</italic>
<sub>1</sub>, and the expression is:</p>
<disp-formula>
<label>(8)</label>
<mml:math display="block" id="M8">
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>=</mml:mo>
<mml:mi>&#x3b4;</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mo stretchy="false">[</mml:mo>
<mml:msup>
<mml:mi>z</mml:mi>
<mml:mi>h</mml:mi>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>z</mml:mi>
<mml:mi>w</mml:mi>
</mml:msup>
<mml:mo stretchy="false">]</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>The generated <italic>f</italic>&#x2208;<italic>R</italic>
<sup>
<italic>C</italic>/<italic>r</italic>&#xd7;(<italic>H</italic>+<italic>W</italic>)</sup> is the intermediate feature map of spatial information in horizontal and vertical directions, and <italic>r</italic> represents the down-sampling ratio to control the block size. Slice <italic>f</italic> into two separate tensors <italic>f</italic>
<sup>
<italic>h</italic>
</sup>&#x2208;<italic>R</italic>
<sup>
<italic>C</italic>/<italic>r</italic>&#xd7;<italic>H</italic>
</sup> and <italic>f</italic>
<sup>
<italic>w</italic>
</sup>&#x2208;<italic>R</italic>
<sup>
<italic>C</italic>/<italic>r</italic>&#xd7;<italic>W</italic>
</sup> along the spatial dimension, and then use two 1&#xd7;1 convolutions <italic>F</italic>
<sub>
<italic>h</italic>
</sub> and <italic>F</italic>
<sub>
<italic>w</italic>
</sub> to transform the feature map <italic>f</italic>
<sup>
<italic>h</italic>
</sup> and <italic>f</italic>
<sup>
<italic>w</italic>
</sup> to the same number of channels as the input <italic>X</italic>, and get the following result:</p><disp-formula>
<label>(9)</label>
<mml:math display="block" id="M9">
<mml:mrow>
<mml:msup>
<mml:mi>g</mml:mi>
<mml:mi>h</mml:mi>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>h</mml:mi>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:msup>
<mml:mtext>f</mml:mtext>
<mml:mi>h</mml:mi>
</mml:msup>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(10)</label>
<mml:math display="block" id="M10">
<mml:mrow>
<mml:msup>
<mml:mi>g</mml:mi>
<mml:mi>w</mml:mi>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>w</mml:mi>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:msup>
<mml:mtext>f</mml:mtext>
<mml:mi>w</mml:mi>
</mml:msup>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p><italic>g</italic>
<sup>
<italic>h</italic>
</sup> and <italic>g</italic>
<sup>
<italic>w</italic>
</sup> are then expanded and used as attention weights. The CA output is:</p><disp-formula>
<label>(11)</label>
<mml:math display="block" id="M11">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>&#xd7;</mml:mo>
<mml:msubsup>
<mml:mi>g</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>h</mml:mi>
</mml:msubsup>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
<mml:mo>&#xd7;</mml:mo>
<mml:msubsup>
<mml:mi>g</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>w</mml:mi>
</mml:msubsup>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>This calculation captures precise positional relationships and locates the object of interest more accurately, which helps the model recognize targets better.</p>
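<p>The following PyTorch sketch traces Eqs. (6)-(11) end to end, with <italic>r</italic> as the down-sampling ratio. The normalization and activation choices follow the CA paper (Hou et&#xa0;al., 2021) and are assumptions with respect to this article.</p>
<preformat>import torch
import torch.nn as nn

class CoordinateAttention(nn.Module):
    def __init__(self, channels, r=32):
        super().__init__()
        mid = max(8, channels // r)                     # block size controlled by r
        self.pool_h = nn.AdaptiveAvgPool2d((None, 1))   # Eq. (6): pool along width
        self.pool_w = nn.AdaptiveAvgPool2d((1, None))   # Eq. (7): pool along height
        self.f1 = nn.Sequential(nn.Conv2d(channels, mid, 1),
                                nn.BatchNorm2d(mid),
                                nn.ReLU(inplace=True))  # Eq. (8): shared 1x1 conv
        self.f_h = nn.Conv2d(mid, channels, 1)          # Eq. (9)
        self.f_w = nn.Conv2d(mid, channels, 1)          # Eq. (10)

    def forward(self, x):
        b, c, h, w = x.shape
        z_h = self.pool_h(x)                            # B x C x H x 1
        z_w = self.pool_w(x).permute(0, 1, 3, 2)        # B x C x W x 1
        f = self.f1(torch.cat([z_h, z_w], dim=2))       # concatenate along space
        f_h, f_w = torch.split(f, [h, w], dim=2)        # slice back into two tensors
        g_h = torch.sigmoid(self.f_h(f_h))              # B x C x H x 1
        g_w = torch.sigmoid(self.f_w(f_w.permute(0, 1, 3, 2)))  # B x C x 1 x W
        return x * g_h * g_w                            # Eq. (11)</preformat>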
</sec>
<sec id="s3_2_3">
<title>3.2.3 Improved combinational convolution-CA module at feature fusion</title>
<p>The improved combinational convolution-CA module was applied to the feature fusion section (positions a, b, c, d): it was used at positions ac, bd and abcd respectively, replacing the original CBL modules, as shown in <xref ref-type="fig" rid="f6">
<bold>Figure&#xa0;6</bold>
</xref>.</p>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>Improved combinational convolution-CA module at fusion.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-13-1030021-g006.tif"/>
</fig>
</sec>
</sec>
<sec id="s3_3">
<title>3.3 Model training and testing</title>
<sec id="s3_3_1">
<title>3.3.1 Test platform</title>
<p>This study used PyTorch to implement the improved YOLOv4. The graphics processing unit (GPU) was an NVIDIA Quadro P2200 16 G, and the central processing unit (CPU) was an Intel(R) Xeon(R) Silver 4210R. Training and improvement of the YOLOv4 model were carried out on the Windows 10 operating system. The momentum of the momentum optimizer in the network was set to 0.9, the initial learning rate of the weights to 0.001, the attenuation coefficient to 0.0005, and the number of training iterations to 100. The loss curves for the training and test sets are shown in <xref ref-type="fig" rid="f7">
<bold>Figure&#xa0;7</bold>
</xref>.</p>
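<p>Expressed as a PyTorch configuration, the training settings above look roughly as follows; the network constructor is a placeholder assumption, since only the hyperparameters are given in the text.</p>
<preformat>import torch
import torch.nn as nn

model = nn.Conv2d(3, 16, 3)      # placeholder; substitute the improved YOLOv4 network
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=0.001,                    # initial learning rate
    momentum=0.9,                # momentum of the momentum optimizer
    weight_decay=0.0005,         # attenuation coefficient
)
epochs = 100                     # number of training iterations</preformat>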
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>Loss value change curve of the training set and the test set.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-13-1030021-g007.tif"/>
</fig>
</sec>
<sec id="s3_3_2">
<title>3.3.2 Pitaya fruit recognition network training</title>
<p>The flow chart of target detection network is shown in <xref ref-type="fig" rid="f8">
<bold>Figure&#xa0;8</bold>
</xref>. The effects of the different improved models are verified by comparison on the same validation set.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>Flow chart of pitaya targets detection network.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-13-1030021-g008.tif"/>
</fig>
</sec>
<sec id="s3_3_3">
<title>3.3.3 Model evaluation indicators</title>
<p>In this paper, Precision (P), Recall (R), F1, AP, detection speed and weight size are selected as model evaluation indexes. Since only the pitaya fruit in the image needs to be identified, pitaya fruit are regarded as positive samples and all other objects as negative samples.</p><disp-formula>
<label>(12)</label>
<mml:math display="block" id="M12">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(13)</label>
<mml:math display="block" id="M13">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(14)</label>
<mml:math display="block" id="M14">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>R</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula>
<label>(15)</label>
<mml:math display="block" id="M15">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>=</mml:mo>
<mml:mstyle displaystyle="true">
<mml:mrow>
<mml:munderover>
<mml:mo>&#x222b;</mml:mo>
<mml:mn>0</mml:mn>
<mml:mn>1</mml:mn>
</mml:munderover>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>R</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow> </mml:mstyle>
<mml:mi>d</mml:mi>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Among them, the meaning of TP, FP and FN are as follows:</p><list list-type="simple">
<list-item>
<p>TP: the number of positive samples correctly identified (true positives).</p>
</list-item>
<list-item>
<p>FP: the number of negative samples incorrectly identified as positive (false detections).</p>
</list-item>
<list-item>
<p>FN: the number of positive samples that were missed (missed detections).</p>
</list-item>
</list>
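<p>A short sketch of Eqs. (12)-(15) is given below. Precision, Recall and F1 follow directly from the TP, FP and FN counts, and AP numerically integrates the precision-recall curve over Recall in [0, 1]; the input counts and curves are illustrative assumptions.</p>
<preformat>import numpy as np

def precision_recall_f1(tp, fp, fn):
    p = tp / (tp + fp) if tp + fp else 0.0          # Eq. (12)
    r = tp / (tp + fn) if tp + fn else 0.0          # Eq. (13)
    f1 = 2 * p * r / (p + r) if p + r else 0.0      # Eq. (14)
    return p, r, f1

def average_precision(recalls, precisions):
    order = np.argsort(recalls)                     # Eq. (15): integrate P over R
    return float(np.trapz(np.asarray(precisions)[order],
                          np.asarray(recalls)[order]))</preformat>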
</sec>
</sec>
</sec>
<sec id="s4" sec-type="results">
<title>4 Results and analysis</title>
<sec id="s4_1">
<title>4.1 Comparison of detection results of different backbone networks</title>
<p>To demonstrate the superiority of the improved model under the GhostNet framework, comparative experiments were conducted on the traditional YOLOv4 with different backbone networks. CSPDarkNet53, GhostNet, ShuffleNetV2 and EfficientNet were used as backbone networks, and the Precision, Recall, F1, AP, detection speed and weight size for pitaya fruit targets were measured. The comparison results of the detection performance of the different backbone networks are shown in <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref>. As can be seen from <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref>, although the Precision, Recall, F1 and AP of the models with GhostNet, ShuffleNetV2 and EfficientNet as backbone networks decrease, the detection speed improves and the weight size is reduced significantly. Among them, the weight sizes of the models with GhostNet and ShuffleNetV2 as the backbone network are similar, but in terms of detection speed the model with GhostNet as the backbone network is the fastest, and its comprehensive effect is the best.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Comparison results of detection performance of different backbone networks.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Backbonenetworks</th>
<th valign="top" align="center">IoU score</th>
<th valign="top" align="center">Precision/%</th>
<th valign="top" align="center">Recall/%</th>
<th valign="top" align="center">F1 score/%</th>
<th valign="top" align="center">Average precision/%</th>
<th valign="top" align="center">Detection speed/frames&#xb7;s<sup>-1</sup>
</th>
<th valign="top" align="center">Weight size/MB</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" rowspan="2" align="left">CSPDarkNet53</td>
<td valign="top" align="center">0.50</td>
<td valign="top" align="center">92.43</td>
<td valign="top" align="center">92.22</td>
<td valign="top" align="center">92</td>
<td valign="top" align="center">94.34</td>
<td valign="top" rowspan="2" align="center">27.4</td>
<td valign="top" rowspan="2" align="center">244</td>
</tr>
<tr>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">62.16</td>
<td valign="top" align="center">59.67</td>
<td valign="top" align="center">61</td>
<td valign="top" align="center">54.76</td>
</tr>
<tr>
<td valign="top" rowspan="2" align="left">GhostNet</td>
<td valign="top" align="center">0.50</td>
<td valign="top" align="center">91.32</td>
<td valign="top" align="center">87.30</td>
<td valign="top" align="center">88</td>
<td valign="top" align="center">92.10</td>
<td valign="top" rowspan="2" align="center">32.6</td>
<td valign="top" rowspan="2" align="center">152</td>
</tr>
<tr>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">60.47</td>
<td valign="top" align="center">59.03</td>
<td valign="top" align="center">58</td>
<td valign="top" align="center">52.05</td>
</tr>
<tr>
<td valign="top" rowspan="2" align="left">ShuffleNetV2</td>
<td valign="top" align="center">0.50</td>
<td valign="top" align="center">91.38</td>
<td valign="top" align="center">87.50</td>
<td valign="top" align="center">89</td>
<td valign="top" align="center">92.16</td>
<td valign="top" rowspan="2" align="center">30.2</td>
<td valign="top" rowspan="2" align="center">151</td>
</tr>
<tr>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">60.85</td>
<td valign="top" align="center">59.07</td>
<td valign="top" align="center">59</td>
<td valign="top" align="center">52.11</td>
</tr>
<tr>
<td valign="top" rowspan="2" align="left">EfficientNet</td>
<td valign="top" align="center">0.50</td>
<td valign="top" align="center">90.86</td>
<td valign="top" align="center">86.91</td>
<td valign="top" align="center">86</td>
<td valign="top" align="center">91.72</td>
<td valign="top" rowspan="2" align="center">29.8</td>
<td valign="top" rowspan="2" align="center">163</td>
</tr>
<tr>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">60.03</td>
<td valign="top" align="center">58.40</td>
<td valign="top" align="center">57</td>
<td valign="top" align="center">51.78</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4_2">
<title>4.2 Analysis of pitaya fruit identification results</title>
<p>The network structure of this paper was based on YOLOv4: it used GhostNet as the feature extraction backbone network, applied the improved combinational convolution-CA module at feature fusion, and adopted the Ghost Module in the Yolo Head. To demonstrate the superiority of the improved network based on YOLOv4, the performance of the detection network before and after the improvement was compared and analyzed. The pitaya fruit recognition experiment was performed on YOLOv4 and the improved YOLOv4 networks with the pitaya fruit dataset. The specific improvements were as follows: &#x2460; replacement of the backbone network, with GhostNet used as the backbone; &#x2461; the improved combinational convolution-CA module used at feature fusion ac; &#x2462; the improved combinational convolution-CA module used at feature fusion bd; &#x2463; the improved combinational convolution-CA module used at feature fusion abcd; &#x2464; the Ghost Module adopted in the Yolo Head. The effects of the five improved methods on the recognition of pitaya fruit in different natural environments are shown in <xref ref-type="fig" rid="f9">
<bold>Figure&#xa0;9</bold>
</xref>. Comprehensive comparison showed that GhostNet was used as the backbone network, the improved combinational convolution-CA module was used at feature fusion ac, and the improved algorithm of Ghost Module was referenced in Yolo Head to detect pitaya fruit in rainy days, occlusion and backlight conditions. It had high recognition ac-curacy, while the other four target detection networks had missed detection and false detection, and the recognition accuracy was lower than that of the YOLOv4+&#x2460;+&#x2461;+&#x2464; net-work structure model. Therefore, the improved algorithm in this paper has strong robustness and can adapt to different situations in natural environment.</p>
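<p>For reference, the Ghost Module used in improvements &#x2460; and &#x2464; can be sketched as follows. This is a minimal PyTorch sketch of the standard Ghost module from the GhostNet paper (Han et al., 2020): a primary pointwise convolution generates a reduced set of intrinsic feature maps, and cheap depthwise convolutions generate the remaining &#x201c;ghost&#x201d; maps. The kernel sizes and the ratio below are illustrative assumptions, not necessarily the exact configuration used in this paper, and the combinational convolution-CA variant described earlier is not reproduced here.</p>
<preformat preformat-type="code">
import math
import torch
import torch.nn as nn

class GhostModule(nn.Module):
    """Minimal sketch of the Ghost module (Han et al., 2020)."""

    def __init__(self, inp, oup, kernel_size=1, ratio=2, dw_size=3):
        super().__init__()
        self.oup = oup
        init_channels = math.ceil(oup / ratio)      # intrinsic maps
        new_channels = init_channels * (ratio - 1)  # "ghost" maps
        # Primary convolution: an ordinary (pointwise) convolution
        # producing only a fraction of the output channels.
        self.primary_conv = nn.Sequential(
            nn.Conv2d(inp, init_channels, kernel_size,
                      padding=kernel_size // 2, bias=False),
            nn.BatchNorm2d(init_channels),
            nn.ReLU(inplace=True),
        )
        # Cheap operation: a depthwise convolution that generates
        # the ghost feature maps from the intrinsic ones.
        self.cheap_operation = nn.Sequential(
            nn.Conv2d(init_channels, new_channels, dw_size,
                      padding=dw_size // 2, groups=init_channels,
                      bias=False),
            nn.BatchNorm2d(new_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        x1 = self.primary_conv(x)
        x2 = self.cheap_operation(x1)
        # Concatenate intrinsic and ghost maps, trim to oup channels.
        return torch.cat([x1, x2], dim=1)[:, :self.oup, :, :]
</preformat>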
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>Different recognition algorithms for pitayas in three scenes: <bold>(A)</bold> Rainy weather; <bold>(B)</bold> Occlusion; <bold>(C)</bold> Backlighting.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-13-1030021-g009.tif"/>
</fig>
<p>The comparison results of the detection performance of five different improved algorithms are shown in <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>. It can be seen from <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref> that the Precision, Recall, F1, AP and detection speed of the YOLOv4+&#x2460;+&#x2461;+&#x2464; network structure model are higher than those of the other four target detection networks. At an IoU of 0.50, these indexes were 99.23%, 95.10%, 98%, 98.94% and 37.2 frames&#xb7;s<sup>-1</sup>, respectively, and the weight size, 59.4 MB, was the smallest, indicating that the model was significantly better than the other four network structures. Compared with traditional YOLOv4 at an IoU of 0.50, Precision increased by 6.8 percentage points, Recall by 2.88 percentage points, F1 by 6 percentage points, AP by 4.6 percentage points and detection speed by 9.8 frames&#xb7;s<sup>-1</sup>, while the weight size was reduced by 184.6 MB.</p>
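<p>For clarity about how the indexes above are defined, the short sketch below computes Precision, Recall and F1 from true-positive, false-positive and false-negative counts at a fixed IoU threshold. The counts in the usage line are hypothetical and are not taken from this paper&#x2019;s data; AP additionally integrates precision over recall across confidence thresholds and is omitted here for brevity.</p>
<preformat preformat-type="code">
def detection_metrics(tp, fp, fn):
    """Precision, Recall and F1 from detection counts at a fixed
    IoU threshold (e.g. 0.50 or 0.75)."""
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1

# Hypothetical counts, for illustration only.
p, r, f1 = detection_metrics(tp=95, fp=5, fn=10)
print(f"Precision={p:.4f}  Recall={r:.4f}  F1={f1:.4f}")
</preformat>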
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Comparison results of detection performance of different improved networks.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Algorithms</th>
<th valign="top" align="center">IoU score</th>
<th valign="top" align="center">Precision/%</th>
<th valign="top" align="center">Recall/%</th>
<th valign="top" align="center">F1 score/%</th>
<th valign="top" align="center">Average&#xa0;precision/%</th>
<th valign="top" align="center">Detection speed/frames&#xb7;s<sup>-1</sup>
</th>
<th valign="top" align="center">Weight size/MB</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" rowspan="2" align="left">YOLOv4+&#x2460;+&#x2461;</td>
<td valign="top" align="center">0.50</td>
<td valign="top" align="center">97.15</td>
<td valign="top" align="center">92.86</td>
<td valign="top" align="center">95</td>
<td valign="top" align="center">97.38</td>
<td valign="top" rowspan="2" align="center">36.4</td>
<td valign="top" rowspan="2" align="center">60.7</td>
</tr>
<tr>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">79.47</td>
<td valign="top" align="center">76.18</td>
<td valign="top" align="center">77</td>
<td valign="top" align="center">74.50</td>
</tr>
<tr>
<td valign="top" rowspan="2" align="left">YOLOv4+&#x2460;+&#x2462;</td>
<td valign="top" align="center">0.50</td>
<td valign="top" align="center">96.40</td>
<td valign="top" align="center">89.72</td>
<td valign="top" align="center">93</td>
<td valign="top" align="center">96.88</td>
<td valign="top" rowspan="2" align="center">35.9</td>
<td valign="top" rowspan="2" align="center">81.1</td>
</tr>
<tr>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">80.01</td>
<td valign="top" align="center">75.51</td>
<td valign="top" align="center">77</td>
<td valign="top" align="center">72.64</td>
</tr>
<tr>
<td valign="top" rowspan="2" align="left">YOLOv4+&#x2460;+&#x2463;</td>
<td valign="top" align="center">0.50</td>
<td valign="top" align="center">96.35</td>
<td valign="top" align="center">89.75</td>
<td valign="top" align="center">93</td>
<td valign="top" align="center">96.86</td>
<td valign="top" rowspan="2" align="center">35.7</td>
<td valign="top" rowspan="2" align="center">99.4</td>
</tr>
<tr>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">78.50</td>
<td valign="top" align="center">74.81</td>
<td valign="top" align="center">76</td>
<td valign="top" align="center">72.58</td>
</tr>
<tr>
<td valign="top" rowspan="2" align="left">YOLOv4+&#x2460;+&#x2461;+&#x2464;</td>
<td valign="top" align="center">0.50</td>
<td valign="top" align="center">99.23</td>
<td valign="top" align="center">95.10</td>
<td valign="top" align="center">98</td>
<td valign="top" align="center">98.94</td>
<td valign="top" rowspan="2" align="center">37.2</td>
<td valign="top" rowspan="2" align="center">59.4</td>
</tr>
<tr>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">80.00</td>
<td valign="top" align="center">76.19</td>
<td valign="top" align="center">78</td>
<td valign="top" align="center">74.62</td>
</tr>
<tr>
<td valign="top" rowspan="2" align="left">YOLOv4+&#x2460;+&#x2462;+&#x2464;</td>
<td valign="top" align="center">0.50</td>
<td valign="top" align="center">97.17</td>
<td valign="top" align="center">92.48</td>
<td valign="top" align="center">95</td>
<td valign="top" align="center">97.39</td>
<td valign="top" rowspan="2" align="center">36.8</td>
<td valign="top" rowspan="2" align="center">79.9</td>
</tr>
<tr>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">79.51</td>
<td valign="top" align="center">76.53</td>
<td valign="top" align="center">78</td>
<td valign="top" align="center">74.56</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Parameters, FLOPs and MAC are also frequently used to evaluate the size and computational complexity of deep learning models. Parameters denote the total number of parameters inside the model and measure its size; FLOPs, the number of floating-point operations, measures the model&#x2019;s computational complexity; and MAC, the memory access cost, evaluates the model&#x2019;s memory usage at runtime. The comparison of traditional YOLOv4 with the five improved algorithms above is shown in <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>. It can be seen from <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref> that the YOLOv4+&#x2460;+&#x2461;+&#x2464; network structure model has the fewest total parameters, the lowest computational complexity and the smallest runtime memory footprint.</p>
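<p>Quantities of this kind can be measured for any PyTorch model along the following lines. The parameter count uses only standard PyTorch; the FLOPs sketch assumes the third-party thop package, and the model constructor and input size shown are placeholders rather than this paper&#x2019;s code.</p>
<preformat preformat-type="code">
import torch
import torch.nn as nn

def count_parameters(model):
    """Total parameter count (the Parameters column in Table 3)."""
    return sum(p.numel() for p in model.parameters())

# FLOPs can be estimated with the third-party thop package, whose
# profile() returns multiply-accumulate and parameter counts; the
# constructor and the 416x416 input below are placeholders only:
#
#   from thop import profile
#   model = build_improved_yolov4()        # hypothetical constructor
#   dummy = torch.randn(1, 3, 416, 416)    # typical YOLOv4 input
#   macs, params = profile(model, inputs=(dummy,))
</preformat>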
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Comparison results of different improved networks.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="top" align="left">Algorithms</th>
<th valign="top" align="center">Parameters</th>
<th valign="top" align="center">Flops/G</th>
<th valign="top" align="center">MAC/MB</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">YOLOv4</td>
<td valign="top" align="center">64040001</td>
<td valign="top" align="center">29.95</td>
<td valign="top" align="center">606.54</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv4+&#x2460;+&#x2461;</td>
<td valign="top" align="center">15839686</td>
<td valign="top" align="center">13.88</td>
<td valign="top" align="center">578.49</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv4+&#x2460;+&#x2462;</td>
<td valign="top" align="center">21204446</td>
<td valign="top" align="center">13.88</td>
<td valign="top" align="center">582.72</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv4+&#x2460;+&#x2463;</td>
<td valign="top" align="center">25976382</td>
<td valign="top" align="center">20.7</td>
<td valign="top" align="center">600.26</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv4+&#x2460;+&#x2461;+&#x2464;</td>
<td valign="top" align="center">15503686</td>
<td valign="top" align="center">13.61</td>
<td valign="top" align="center">578.49</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv4+&#x2460;+&#x2462;+&#x2464;</td>
<td valign="top" align="center">20868446</td>
<td valign="top" align="center">13.61</td>
<td valign="top" align="center">582.72</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In conclusion, the improved YOLOv4 structure proposed in this paper is shown in <xref ref-type="fig" rid="f10">
<bold>Figure&#xa0;10</bold>
</xref>; it can effectively identify pitaya fruit in the natural environment and meets the requirements for target recognition accuracy and speed, achieving the best overall performance.</p>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>The improved YOLOv4 network structure diagram. * means repeat the operation.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-13-1030021-g010.tif"/>
</fig>
</sec>
</sec>
<sec id="s5" sec-type="conclusions">
<title>5 Conclusions and discussion</title>
<p>This paper proposed an improved YOLOv4 target recognition algorithm that uses GhostNet as the backbone network, applies the improved combinational convolution-CA module in feature fusion, and introduces the Ghost Module into Yolo Head. The algorithm is lighter than traditional YOLOv4, with faster detection speed and higher recognition accuracy, and it maintains high recognition accuracy when detecting pitaya fruit on rainy days and under occlusion and backlight conditions.</p>
<p>Compared with traditional YOLOv4 at an IoU of 0.50, the accuracy of the improved YOLOv4 target recognition algorithm proposed in this paper reaches 99.23% on the augmented dataset, and the weight size is about one quarter of that of traditional YOLOv4. The average precision is improved by nearly 5 percentage points, and the detection speed by nearly 10 frames&#xb7;s<sup>-1</sup>. These experiments prove that the YOLOv4 recognition algorithm proposed in this study, which combines CA with the improved combinational convolution, has significant advantages.</p>
<p>The method is also suitable for recognizing other fruits and even other kinds of objects. In future work, the algorithm will be deployed on a robot platform whose chassis is equipped with the corresponding robotic arm, manipulator and binocular camera, so as to complete the entire picking process.</p>
</sec>
<sec id="s6" sec-type="data-availability">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec id="s7" sec-type="author-contributions">
<title>Author contributions</title>
<p>Conceptualization, FZ and WC. Methodology, WC and SW. Software, WC and SW. Validation, FZ, WC and SW. Formal analysis, FZ. Investigation, WC, XC and XW. Resources, WC and SW. Data curation, FZ and WC. Writing&#x2014;original draft preparation, FZ and WC. Writing&#x2014;review and editing, FZ and WC. Visualization, FZ and WC. Supervision, FZ, NY and SF. Project administration, FZ and XZ. Funding acquisition, FZ and XZ. All authors contributed to the article and approved the submitted version.</p>
</sec>
<sec id="s8" sec-type="funding-information">
<title>Funding</title>
<p>This research was funded by the Scientific and Technological Project of Henan Province (No. 212102110029), the National Natural Science Foundation of China (No. 61771224), the High-tech Key Laboratory of Agricultural Equipment and Intelligence of Jiangsu Province (No. JNZ201901), and the Colleges and Universities of Henan Province Youth Backbone Teacher Training Program (No. 2017GGJS062).</p>
</sec>    <sec id="s9" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s10" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
</body>
<back>
</back>
</article>