<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Phys.</journal-id>
<journal-title>Frontiers in Physics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Phys.</abbrev-journal-title>
<issn pub-type="epub">2296-424X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1297828</article-id>
<article-id pub-id-type="doi">10.3389/fphy.2023.1297828</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Physics</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>YOLOv5-TS: Detecting traffic signs in real-time</article-title>
<alt-title alt-title-type="left-running-head">Shen et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fphy.2023.1297828">10.3389/fphy.2023.1297828</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Shen</surname>
<given-names>Jiquan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Ziyang</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2419324/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Luo</surname>
<given-names>Junwei</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/832518/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Zhang</surname>
<given-names>Xiaohong</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1034461/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>School of Software</institution>, <institution>Henan Polytechnic University</institution>, <addr-line>Jiaozuo</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Anyang Institute of Technology</institution>, <addr-line>Anyang</addr-line>, <country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>School of Computer Science and Technology</institution>, <institution>Henan Polytechnic University</institution>, <addr-line>Jiaozuo</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2042472/overview">Chun-Hui He</ext-link>, Xi&#x2019;an University of Architecture and Technology, China</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1706638/overview">Zhekang Dong</ext-link>, Hangzhou Dianzi University, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1512622/overview">Bo Huang</ext-link>, Shanghai University of Engineering Sciences, China</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Xiaohong Zhang, <email>xh.zhang@hpu.edu.cn</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>24</day>
<month>11</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>11</volume>
<elocation-id>1297828</elocation-id>
<history>
<date date-type="received">
<day>20</day>
<month>09</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>13</day>
<month>11</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2023 Shen, Zhang, Luo and Zhang.</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Shen, Zhang, Luo and Zhang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Traffic sign detection plays a vital role in assisted and autonomous driving. YOLOv5, a one-stage object detection solution, is well suited to traffic sign detection. However, it suffers from false detections and missed detections of small objects. To address this issue, we improve YOLOv5 and introduce YOLOv5-TS in this work. In YOLOv5-TS, a spatial pyramid with depth-wise convolution is proposed by replacing the maximum pooling operations in spatial pyramid pooling with depth-wise convolutions. It is applied to the backbone to extract multi-scale features while preventing feature loss. A Multiple Feature Fusion Module is proposed to fuse multi-scale feature maps multiple times, enhancing both the semantic and the detail expression ability of feature maps. To improve the accuracy in detecting small and even extra-small objects, a specialized detection layer built on the highest-resolution feature map is introduced. In addition, a new method based on k-means&#x2b;&#x2b; is proposed to generate stable anchor boxes. Experiments on two datasets verify the usefulness and effectiveness of our work.</p>
</abstract>
<kwd-group>
<kwd>multi-scale feature fusion</kwd>
<kwd>YOLOv5</kwd>
<kwd>object detection</kwd>
<kwd>traffic sign detection</kwd>
<kwd>k-means&#x2b;&#x2b;</kwd>
</kwd-group>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Interdisciplinary Physics</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1 Introduction</title>
<p>Traffic signs convey vital information such as speed limits, lane changes, pedestrian crossings, and potential hazards. With the ever-increasing volume of vehicles on the roads, accurately detecting and interpreting traffic signs is essential for assisting drivers in making informed decisions and complying with traffic regulations. It also plays a vital role in providing critical information to autonomous vehicles, allowing them to understand road regulations, make informed decisions, and navigate complex traffic scenarios.</p>
<p>Traditional approaches to traffic sign detection relied on color or shape template matching. However, these methods often struggle with variability in lighting conditions, shooting angles, and other factors. In recent years, deep learning has shown significant advantages in object detection [<xref ref-type="bibr" rid="B1">1</xref>], attracting the attention of researchers. Several studies [<xref ref-type="bibr" rid="B2">2</xref>&#x2013;<xref ref-type="bibr" rid="B4">4</xref>] have utilized Faster R-CNN [<xref ref-type="bibr" rid="B5">5</xref>] for traffic sign detection. R-CNN-based [<xref ref-type="bibr" rid="B6">6</xref>] methods detect objects in two stages, which limits detection speed and makes them less suitable for real-time traffic sign detection scenarios. In contrast to R-CNN-based methods, You Only Look Once (YOLO) [<xref ref-type="bibr" rid="B7">7</xref>] detects objects in one stage, offering faster detection speeds.</p>
<p>YOLOv5 is one of the variants of YOLO. It emphasizes both detection speed and accuracy and is therefore well suited to real-time traffic sign detection. However, it suffers from false detections and missed detections of small and even extra-small objects. To improve the performance of YOLOv5 in detecting such objects, Zhang et al. [<xref ref-type="bibr" rid="B8">8</xref>] introduced a new layer for detecting small objects, while [<xref ref-type="bibr" rid="B9">9</xref>,<xref ref-type="bibr" rid="B10">10</xref>] separately tried to reduce feature loss and to alleviate the impact of feature loss in the process of feature extraction. Although many efforts have been devoted to improving detection speed and accuracy, how to improve the performance in detecting small and even extra-small objects is still an open problem.</p>
<p>In this paper, we make several improvements to YOLOv5 and propose YOLOv5-TS to improve the performance in detecting small and even extra-small objects. We propose a spatial pyramid with depth-wise convolution (SPDC) and combine it with a group of parallel strip convolution blocks to construct a Multiple Feature Fusion Module (MFFM). MFFM is designed to extract and fuse multi-scale feature maps. Based on MFFM, we construct a special detection layer for small and even extra-small objects. We also improve the method for generating anchor boxes by exploiting the k-means&#x2b;&#x2b; algorithm.</p>
<p>The main contributions of this work are described as follows:<list list-type="simple">
<list-item>
<p>(1) A spatial pyramid with depth-wise convolution is designed to extract multi-scale features without feature loss.</p>
</list-item>
<list-item>
<p>(2) A Multiple Feature Fusion Module is proposed to fuse multi-scale feature maps and enhance their semantic and detail expression capabilities.</p>
</list-item>
<list-item>
<p>(3) A new detection layer is constructed specifically for detecting small and even extra-small objects.</p>
</list-item>
<list-item>
<p>(4) A new method based on the k-means&#x2b;&#x2b; algorithm is proposed to generate stable anchor boxes.</p>
</list-item>
</list>
</p>
<p>The rest of the paper is organized as follows. <xref ref-type="sec" rid="s2">Section 2</xref> provides an overview of related work in the field of traffic sign detection. <xref ref-type="sec" rid="s3">Section 3</xref> describes the theoretical basis. <xref ref-type="sec" rid="s4">Section 4</xref> details our proposed framework for traffic sign detection. <xref ref-type="sec" rid="s5">Section 5</xref> presents experimental results and performance evaluation. Finally, <xref ref-type="sec" rid="s6">Section 6</xref> concludes the paper and discusses avenues for future research in this domain.</p>
</sec>
<sec id="s2">
<title>2 Related work</title>
<p>Traditional traffic sign detection methods identify traffic signs by matching predefined color [<xref ref-type="bibr" rid="B11">11</xref>&#x2013;<xref ref-type="bibr" rid="B13">13</xref>] or shape [<xref ref-type="bibr" rid="B14">14</xref>,<xref ref-type="bibr" rid="B15">15</xref>] templates. These methods are sensitive to lighting conditions and shooting angles, making it difficult to achieve stable detection results. Additionally, these methods detect traffic signs at low speeds and hence cannot work in real-time scenarios [<xref ref-type="bibr" rid="B16">16</xref>].</p>
<p>Deep learning has shown distinct advantages since its emergence [<xref ref-type="bibr" rid="B17">17</xref>&#x2013;<xref ref-type="bibr" rid="B19">19</xref>]. It has been utilized to detect traffic signs [<xref ref-type="bibr" rid="B20">20</xref>,<xref ref-type="bibr" rid="B21">21</xref>]. Some researchers [<xref ref-type="bibr" rid="B2">2</xref>&#x2013;<xref ref-type="bibr" rid="B4">4</xref>] detect traffic signs with R-CNN. However, R-CNN belongs to the category of two-stage object detection solutions. Although it can detect objects with high accuracy, it suffers from low detection speed. Therefore, it is not suitable for real-time traffic sign detection scenarios.</p>
<p>Different from R-CNN, the You Only Look Once (YOLO) algorithm belongs to the category of one-stage detection solutions. It can detect objects at a high speed. YOLO has several versions, and some of them have been applied to traffic sign detection [<xref ref-type="bibr" rid="B22">22</xref>&#x2013;<xref ref-type="bibr" rid="B24">24</xref>]. YOLOv5 is the version that emphasizes both detection accuracy and detection speed. Therefore, it is more suitable for real-time traffic sign detection than other solutions. Many efforts have been devoted to improving the performance of YOLOv5 in detecting traffic signs. To improve the detection speed of YOLOv5, Li et al. [<xref ref-type="bibr" rid="B25">25</xref>] used ghost convolution [<xref ref-type="bibr" rid="B26">26</xref>], depth-wise convolution [<xref ref-type="bibr" rid="B27">27</xref>], and channel attention [<xref ref-type="bibr" rid="B28">28</xref>] to construct a lightweight backbone. Zhao et al. [<xref ref-type="bibr" rid="B29">29</xref>] applied GSConv [<xref ref-type="bibr" rid="B30">30</xref>] to the feature fusion layer to reduce computational complexity. To improve detection accuracy, Bai et al. [<xref ref-type="bibr" rid="B31">31</xref>] utilized a transformer structure to replace SPP. Wan et al. [<xref ref-type="bibr" rid="B32">32</xref>] improved the backbone with MixConv [<xref ref-type="bibr" rid="B33">33</xref>] and the neck with integrated attentional feature fusion [<xref ref-type="bibr" rid="B34">34</xref>].</p>
<p>Considering the impact of detection delay on real-time decision-making, detection should be conducted at a relatively long distance from traffic signs. Therefore, the detection targets, that is, the traffic signs, are relatively small. However, YOLOv5 suffers from false detections and missed detections of small objects. To improve the accuracy of YOLOv5 in detecting small objects, Zhang et al. [<xref ref-type="bibr" rid="B8">8</xref>] constructed an additional detection layer for small objects. Mahaur and Mishra [<xref ref-type="bibr" rid="B9">9</xref>] replaced the pooling layers in the SPP module with dilated convolutions to capture the multi-scale features that are important for detecting small objects. Wang et al. [<xref ref-type="bibr" rid="B10">10</xref>] utilized an adaptive attention module and a feature enhancement module to alleviate the loss of features of objects, especially small objects.</p>
<p>Although a lot of work has been devoted to improving the performance of YOLOv5 in detecting small objects, how to improve the accuracy of YOLOv5 in detecting small and even extra-small traffic signs in real time is still an open problem.</p>
</sec>
<sec id="s3">
<title>3 Theoretical basis</title>
<sec id="s3-1">
<title>3.1 YOLOv5</title>
<p>YOLOv5 has a series of versions, namely, YOLOv5n, YOLOv5s, YOLOv5m, YOLOv5l, and YOLOv5x. In this work, we exploit YOLOv5s to detect traffic signs since YOLOv5s strikes a remarkable balance between speed and accuracy. <xref ref-type="fig" rid="F1">Figure 1</xref> shows the structure of YOLOv5s. According to the figure, YOLOv5s consists of a backbone, a neck, and a head. The backbone is primarily composed of CBS and CSP [<xref ref-type="bibr" rid="B35">35</xref>] blocks and extracts features from the input data. The neck consists of FPN [<xref ref-type="bibr" rid="B36">36</xref>] and PAN [<xref ref-type="bibr" rid="B37">37</xref>] and aims to enrich the features in each feature map. The head performs regression predictions according to the feature maps output by the neck.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Structure of YOLOv5.</p>
</caption>
<graphic xlink:href="fphy-11-1297828-g001.tif"/>
</fig>
</sec>
<sec id="s3-2">
<title>3.2 SPPF</title>
<p>In YOLOv5, SPPF (Spatial Pyramid Pooling - Fast) is employed to capture information at different scales from an input feature map. It first utilizes a convolution layer to reduce the channels of the input feature map. Then, it applies a max pooling layer repeatedly to generate feature maps of different scales. After that, it concatenates these feature maps along the channel dimension. Finally, it processes the concatenated feature map with a convolution operation to generate a feature map with rich features.</p>
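<p>For illustration, the following PyTorch sketch captures the SPPF computation described above. It is a minimal version under stated assumptions: the CBS blocks of YOLOv5 are simplified to plain convolutions, and the channel sizes are illustrative.</p>
<preformat>
import torch
import torch.nn as nn

class SPPF(nn.Module):
    # Spatial Pyramid Pooling - Fast: one 5x5 max pool applied three times
    # in series reproduces the receptive fields of 5x5, 9x9, and 13x13 pools.
    def __init__(self, c_in, c_out, k=5):
        super().__init__()
        c_mid = c_in // 2                          # channel reduction
        self.cv1 = nn.Conv2d(c_in, c_mid, 1)       # 1x1 conv reduces channels
        self.cv2 = nn.Conv2d(c_mid * 4, c_out, 1)  # fuses the concatenated maps
        self.pool = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)

    def forward(self, x):
        x = self.cv1(x)
        y1 = self.pool(x)      # receptive field 5x5
        y2 = self.pool(y1)     # equivalent to a 9x9 pool
        y3 = self.pool(y2)     # equivalent to a 13x13 pool
        return self.cv2(torch.cat([x, y1, y2, y3], dim=1))
</preformat>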
</sec>
<sec id="s3-3">
<title>3.3 The k-means&#x2b;&#x2b; algorithm</title>
<p>The k-means&#x2b;&#x2b; clustering algorithm is an improved version of the k-means clustering algorithm. It is designed to optimize the selection of initial cluster centers. The k-means algorithm selects all initial cluster centers randomly, which may lead to a local optimum. Different from the k-means algorithm, the k-means&#x2b;&#x2b; algorithm selects only the first initial cluster center randomly. After that, it selects the other initial cluster centers according to the distances to the existing cluster centers. The k-means&#x2b;&#x2b; algorithm not only converges faster but is also less likely to be trapped in a local optimum.</p>
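<p>A minimal NumPy sketch of the k-means&#x2b;&#x2b; seeding rule described above follows; sampling each new center with probability proportional to the squared distance to its nearest chosen center is the standard formulation.</p>
<preformat>
import numpy as np

def kmeanspp_init(points, k, seed=0):
    """Select k initial cluster centers following the k-means++ rule.

    points: (n, d) array; returns a (k, d) array of initial centers.
    """
    rng = np.random.default_rng(seed)
    centers = [points[rng.integers(len(points))]]  # first center: uniform random
    for _ in range(k - 1):
        # squared distance from each point to its nearest chosen center
        d2 = np.min([((points - c) ** 2).sum(axis=1) for c in centers], axis=0)
        # points far from all existing centers are more likely to be picked
        centers.append(points[rng.choice(len(points), p=d2 / d2.sum())])
    return np.array(centers)
</preformat>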
</sec>
</sec>
<sec id="s4">
<title>4 Proposed method</title>
<p>In order to improve the performance of YOLOv5 in detecting traffic signs, we improve YOLOv5 and propose YOLOv5-TS. In this section, we introduce the details of the improvements.</p>
<sec id="s4-1">
<title>4.1 YOLOv5-TS</title>
<p>YOLOv5-TS is proposed based on the following improvements to YOLOv5. First, we propose SPDC by combining SPPF and depth-wise convolution. SPDC utilizes depth-wise convolutions to replace the maximum pooling operations, thus avoiding the feature loss caused by the latter. It stacks multiple depth-wise convolutions to extract the multi-scale features of objects, which helps to capture the overall structures and local details of objects and finally strengthens the expression ability of the fused feature map. Second, we propose a Multiple Feature Fusion Module (MFFM) based on SPDC and a group of parallel strip convolution blocks. MFFM utilizes SPDC and the group of parallel strip convolution blocks to extract multi-scale feature maps, and exploits convolution and matrix operations to fuse those feature maps so as to enhance the semantic and detail expression capabilities of feature maps. Third, we introduce a new 160 &#xd7; 160 detection layer to improve the performance in detecting extra-small objects. In addition, we remove the layer for detecting large objects since such objects are not common in traffic sign detection scenarios. Finally, we optimize the method for generating anchor boxes by replacing the k-means algorithm with the k-means&#x2b;&#x2b; algorithm. <xref ref-type="fig" rid="F2">Figure 2</xref> shows the structure of YOLOv5-TS.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Structure of YOLOv5-TS. The modules circled by the red dotted rectangle construct the new detection layer. The orange cuboid represents the proposed MFFM.</p>
</caption>
<graphic xlink:href="fphy-11-1297828-g002.tif"/>
</fig>
</sec>
<sec id="s4-2">
<title>4.2 Spatial pyramid involving depth-wise convolution</title>
<p>A convolutional neural network (CNN) processes images only at specific scales, whereas in reality the scale of images is arbitrary. These images must be cropped or warped to a specific scale before being fed to CNNs [<xref ref-type="bibr" rid="B38">38</xref>]. However, cropping results in content loss and warping causes geometric distortion, both of which degrade detection accuracy. Spatial Pyramid Pooling (SPP) eliminates the limitation of deep convolutional neural networks on the scale of input images by using multi-scale pooling, thereby avoiding the loss of features caused by cropping and the distortion caused by warping, and improving detection accuracy. Compared with SPP, Spatial Pyramid Pooling - Fast (SPPF) acquires feature maps of different receptive fields by stacking pooling layers with smaller kernels, enabling detection at a higher speed.</p>
<p>SPP and SPPF utilize maximum pooling operations to extract features. The maximum pooling operation retains only the maximum value in each region and discards all other values in the same region, which can lose critical information of targets. Compared with the maximum pooling operation, a trained depth-wise convolution is more sensitive to the features of objects. It has the ability to retain the critical features of targets, which helps to improve detection accuracy. Based on the above analysis, we propose a spatial pyramid with depth-wise convolution (SPDC). SPDC utilizes depth-wise convolutions to replace the multiple maximum pooling operations, thus avoiding the feature loss caused by the latter. It stacks multiple depth-wise convolutions to extract the multi-scale features of objects, which helps to capture the overall structures and local details of objects and finally strengthens the expression ability of the fused feature map.</p>
<p>
<xref ref-type="fig" rid="F3">Figure 3</xref> shows the structure of SPDC. According to the map, SPDC first utilizes CBS to reduce the channels of an input feature map and then uses three tandem depth-wise convolutions to extract three feature maps of different scales. After that, it concatenates all the feature maps generated in the previous steps and utilizes CBS to generate the output feature map. Given an input feature map-<italic>F</italic>, <italic>f</italic>
<sub>
<italic>SPDC</italic>
</sub>(<italic>F</italic>) is utilized to represent the corresponding output of SPDC, and calculated according to Equation <xref ref-type="disp-formula" rid="e1">(1)</xref>. In the equation, <italic>f</italic>
<sub>
<italic>CBS</italic>
</sub>(&#x22c5;) and <italic>f</italic>
<sub>
<italic>con</italic>
</sub> (&#x22c5;) separately describe the functions related to CBS and concat. <italic>F</italic>
<sub>1</sub>, <italic>F</italic>
<sub>2</sub>, <italic>F</italic>
<sub>3</sub>, and <italic>F</italic>
<sub>4</sub> describe the feature maps extracted by the first CBS and the three tandem depth-wise convolutions, respectively. They are calculated according to Eqs. <xref ref-type="disp-formula" rid="e2">(2)</xref> &#x223c; <xref ref-type="disp-formula" rid="e5">(5)</xref>. In those equations, <inline-formula id="inf1">
<mml:math id="m1">
<mml:msubsup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>5</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:math>
</inline-formula> denotes the depth-wise convolution with a kernel size of 5.<disp-formula id="e1">
<mml:math id="m2">
<mml:msub>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">SPDC</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">CBS</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">con</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:math>
<label>(1)</label>
</disp-formula>
<disp-formula id="e2">
<mml:math id="m3">
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>5</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
<label>(2)</label>
</disp-formula>
<disp-formula id="e3">
<mml:math id="m4">
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>5</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
<label>(3)</label>
</disp-formula>
<disp-formula id="e4">
<mml:math id="m5">
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>5</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>5</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
<label>(4)</label>
</disp-formula>
<disp-formula id="e5">
<mml:math id="m6">
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">CBS</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:math>
<label>(5)</label>
</disp-formula>
</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Structure of SPDC. DWConv denotes a depth-wise convolution, while <italic>k</italic>
<sub>
<italic>a</italic>
</sub>, <italic>s</italic>
<sub>
<italic>b</italic>
</sub>, <italic>p</italic>
<sub>
<italic>c</italic>
</sub> indicate that the kernel size, stride, and padding of the depth-wise convolution are <italic>a</italic>, <italic>b</italic>, <italic>c</italic>, respectively.</p>
</caption>
<graphic xlink:href="fphy-11-1297828-g003.tif"/>
</fig>
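<p>A PyTorch sketch of SPDC following Eqs. <xref ref-type="disp-formula" rid="e1">(1)</xref>&#x2013;<xref ref-type="disp-formula" rid="e5">(5)</xref> is given below. The halving of the channels in the first CBS is an assumption carried over from the SPPF convention; it is not prescribed by the equations.</p>
<preformat>
import torch
import torch.nn as nn

class CBS(nn.Module):
    """Conv + BatchNorm + SiLU, the basic block used throughout YOLOv5."""
    def __init__(self, c_in, c_out, k=1):
        super().__init__()
        self.conv = nn.Conv2d(c_in, c_out, k, padding=k // 2, bias=False)
        self.bn = nn.BatchNorm2d(c_out)
        self.act = nn.SiLU()

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

class SPDC(nn.Module):
    """Spatial pyramid with depth-wise convolution: three tandem 5x5
    depth-wise convolutions replace the max pooling operations of SPPF."""
    def __init__(self, c_in, c_out):
        super().__init__()
        c_mid = c_in // 2
        self.cbs1 = CBS(c_in, c_mid)
        # depth-wise: groups equals channels; k5 / s1 / p2 keeps the spatial size
        self.dw = nn.ModuleList(
            nn.Conv2d(c_mid, c_mid, 5, stride=1, padding=2, groups=c_mid)
            for _ in range(3))
        self.cbs2 = CBS(4 * c_mid, c_out)

    def forward(self, x):
        f1 = self.cbs1(x)     # Eq. (5)
        f2 = self.dw[0](f1)   # Eq. (4)
        f3 = self.dw[1](f2)   # Eq. (3)
        f4 = self.dw[2](f3)   # Eq. (2)
        return self.cbs2(torch.cat([f1, f2, f3, f4], dim=1))  # Eq. (1)
</preformat>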
</sec>
<sec id="s4-3">
<title>4.3 Multiple feature fusion module</title>
<p>Large-scale feature maps are generated in shallow networks. They often contain rich details, such as color and texture. These details are conducive to capturing the subtle features and local structures of objects, which benefits classification. Small-scale feature maps are generated by deep networks. After passing through multiple convolution layers, they lose some details but obtain rich semantic information, which is conducive to capturing the overall shapes and locations of objects. If a small-scale feature map and a large-scale feature map are fused, a feature map containing both rich semantic information and rich detailed information is produced, which is conducive to improving the detection accuracy and generalization ability of the model. Based on the above analysis, a Multiple Feature Fusion Module (MFFM) is proposed and applied to the backbone network of the improved YOLOv5.</p>
<p>MFFM includes three feature fusion operations. The first two feature fusion operations are designed to fuse multi-scale feature maps, while the last fusion operation is exploited to fuse the output feature maps of the former two fusion operations and the input feature map of MFFM to generate a new feature map with stronger feature expression ability. The first fusion operation is in SPDC. It is marked by the rectangle with dashed lines in <xref ref-type="fig" rid="F3">Figure 3</xref>. The multi-scale feature maps input to this fusion operation are extracted by the three tandem depth-wise convolutions in SPDC. The multi-scale feature maps input to the second fusion operation are extracted by a group of parallel strip convolution blocks, where each block is composed of two different strip convolutions. They are fused with the input feature map of MFFM and the output feature map of the first feature fusion operation. The third feature fusion is implemented by performing matrix multiplication on the input feature map of MFFM and the output feature map of the second feature fusion. <xref ref-type="fig" rid="F4">Figure 4</xref> describes the structure of MFFM.<disp-formula id="e6">
<mml:math id="m7">
<mml:msubsup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:math>
<label>(6)</label>
</disp-formula>
<disp-formula id="e7">
<mml:math id="m8">
<mml:msubsup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2295;</mml:mo>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:math>
<label>(7)</label>
</disp-formula>
<disp-formula id="e8">
<mml:math id="m9">
<mml:msubsup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2033;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2297;</mml:mo>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2033;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2297;</mml:mo>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mi>F</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:math>
<label>(8)</label>
</disp-formula>
</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Structure of MFFM. &#x2295; and &#x2297; denote matrix addition and matrix multiplication, respectively. Conv 1 &#xd7; 1 denotes a 1 &#xd7; 1 convolution. The operations surrounded by the gray rectangle are utilized to extract multi-scale feature maps.</p>
</caption>
<graphic xlink:href="fphy-11-1297828-g004.tif"/>
</fig>
<p>Given an input feature map, <italic>F</italic>, the output of the first fusion operation is denoted as <inline-formula id="inf2">
<mml:math id="m10">
<mml:msubsup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. According to <xref ref-type="fig" rid="F3">Figure 3</xref>, <inline-formula id="inf3">
<mml:math id="m11">
<mml:msubsup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is equal to <italic>f</italic>
<sub>
<italic>SPDC</italic>
</sub>(<italic>F</italic>), and can be calculated by Equation <xref ref-type="disp-formula" rid="e1">(1)</xref>. Taking <italic>F</italic>&#x2032; to describe the input of the group of parallel strip convolution blocks, the output of the <italic>i</italic>th block can be denoted as <inline-formula id="inf4">
<mml:math id="m12">
<mml:msubsup>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> calculated according to Equation <xref ref-type="disp-formula" rid="e6">(6)</xref>. It is used as the input of the second fusion operation together with the input of MFFM, the output of the first fusion operation, and the output of the group of parallel strip convolution blocks. Eq. <xref ref-type="disp-formula" rid="e7">(7)</xref> shows the calculation of the output of the second fusion operation. Based on the above equations, the output of the third fusion operation, that is, the output of MFFM, is calculated by Equation <xref ref-type="disp-formula" rid="e8">(8)</xref>.</p>
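<p>The following PyTorch sketch traces Eqs. <xref ref-type="disp-formula" rid="e6">(6)</xref>&#x2013;<xref ref-type="disp-formula" rid="e8">(8)</xref>. Two points are assumptions rather than details taken from the text: the strip convolutions are implemented as depth-wise convolutions, and the &#x2297; of Eq. <xref ref-type="disp-formula" rid="e8">(8)</xref> is realized as an element-wise (Hadamard) product, the usual choice in attention-style fusion modules. The kernel sizes (7, 11, 21) follow the ablation settings of <xref ref-type="sec" rid="s5-3">Section 5.3</xref>.</p>
<preformat>
import torch
import torch.nn as nn

class StripBlock(nn.Module):
    """One parallel branch of Eq. (6): a 1xk followed by a kx1 depth-wise
    convolution, approximating a kxk receptive field at lower cost."""
    def __init__(self, c, k):
        super().__init__()
        self.h = nn.Conv2d(c, c, (1, k), padding=(0, k // 2), groups=c)
        self.v = nn.Conv2d(c, c, (k, 1), padding=(k // 2, 0), groups=c)

    def forward(self, x):
        return self.v(self.h(x))

class MFFM(nn.Module):
    """Multiple Feature Fusion Module; spdc is the module of Section 4.2
    and must preserve the channel count c."""
    def __init__(self, c, spdc, ks=(7, 11, 21)):
        super().__init__()
        self.spdc = spdc
        self.strips = nn.ModuleList(StripBlock(c, k) for k in ks)
        self.fuse = nn.Conv2d(c, c, 1)   # the 1x1 convolution of Eq. (7)

    def forward(self, f):
        f1 = self.spdc(f)                            # first fusion, inside SPDC
        s = sum(strip(f1) for strip in self.strips)  # parallel strip branches
        f2 = self.fuse(f + f1 + s)                   # second fusion, Eq. (7)
        return f * f2                                # third fusion, Eq. (8)
</preformat>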
</sec>
<sec id="s4-4">
<title>4.4 Multi-scale detection layers</title>
<p>YOLOv5 includes three detection layers. These detection layers utilize three feature maps to detect objects of different scales, respectively. The first layer is constructed with the 80 &#xd7; 80 feature map, in which each pixel can be mapped to an 8 &#xd7; 8 area of the input image. Therefore, it is suitable for detecting small objects. The second layer is constructed with the 40 &#xd7; 40 feature map, of which each pixel corresponds to a 16 &#xd7; 16 region of the input image, and hence it is responsible for detecting medium objects. The third detection layer is utilized to detect large objects since each pixel in the corresponding feature map, i.e., the 20 &#xd7; 20 feature map, is related to a 32 &#xd7; 32 area. However, considering the delay of detection and the real-time requirements of decision-making, detection should be carried out at a distance from objects, which indicates that the objects to be detected are usually small or even extra small. Therefore, improving the detection performance on small and even extra-small objects is essential to improving the overall detection performance for traffic signs.</p>
<p>In order to improve the performance of traffic sign detection, we introduce a special detection layer for extra-small objects. This detection layer is constructed based on a 160 &#xd7; 160 feature map extracted by the backbone network. Each pixel in this feature map corresponds to a 4 &#xd7; 4 area of the input image. The feature map is processed by MFFM to enhance its feature expression ability. After being processed by the neck network, it is used by the introduced layer to predict extra-small objects. Considering that large objects are relatively uncommon in traffic sign detection scenarios and that the feature map used to detect large objects contains noise, we remove the 20 &#xd7; 20 detection layer. Finally, the improved solution includes three detection layers used to detect extra-small objects, small objects, and medium objects, respectively.</p>
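<p>The pixel-to-region mappings above imply a 640 &#xd7; 640 input resolution; the short sketch below makes the resulting strides of the three YOLOv5-TS heads explicit.</p>
<preformat>
# Mapping between detection feature maps and the image region each pixel
# covers, for the 640x640 input implied by the mappings in the text.
img_size = 640
for feat in (160, 80, 40):        # YOLOv5-TS heads; the 20x20 head is removed
    stride = img_size // feat     # 4, 8, and 16, respectively
    print(f"{feat}x{feat} map: each pixel covers a {stride}x{stride} region")
</preformat>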
</sec>
<sec id="s4-5">
<title>4.5 Anchor box generation with k-means&#x2b;&#x2b;</title>
<p>Object detection algorithms typically define some bounding boxes in advance as anchor boxes. They set up multiple anchor boxes at each point, generate multiple prediction boxes according to these anchor boxes, and finally filter out qualified prediction boxes as detection results using indicators such as confidence. It is obvious that the selection of anchor boxes has a direct impact on detection results.</p>
<p>YOLOv5 exploits the k-means algorithm to select anchor boxes. However, the k-means algorithm initializes the center points of all k clusters in a random way, which can result in unstable clustering results. Different from the k-means algorithm, the k-means&#x2b;&#x2b; algorithm initializes the center point of only one cluster in a random way. It then initializes the center points of the remaining (k-1) clusters according to the shortest distances from each non-center point to all existing center points, which alleviates the instability caused by random initialization. The clustering results of the two algorithms on the TT100K-23 dataset are shown in <xref ref-type="table" rid="T1">Table 1</xref>.</p>
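<p>As a sketch of this step, the snippet below clusters the ground-truth box sizes of the training set into nine anchors using scikit-learn&#x2019;s k-means&#x2b;&#x2b; initialization. The Euclidean distance on (width, height) pairs is an assumption; YOLO implementations often use an IoU-based distance instead.</p>
<preformat>
import numpy as np
from sklearn.cluster import KMeans

def generate_anchors(wh, n=9, seed=0):
    """Cluster ground-truth box sizes into n anchors with k-means++ seeding.

    wh: (m, 2) array of (width, height) pairs from the training labels.
    Returns the anchors sorted by area, from small to large.
    """
    km = KMeans(n_clusters=n, init="k-means++", n_init=10, random_state=seed)
    km.fit(wh)
    anchors = km.cluster_centers_
    return anchors[np.argsort(anchors.prod(axis=1))]

# The first three anchors would go to the 160x160 layer, the next three to
# the 80x80 layer, and the last three to the 40x40 layer (cf. Table 1).
</preformat>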
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Anchor boxes clustering results.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">Detection layers</th>
<th colspan="2" align="center">Anchor boxes</th>
</tr>
<tr>
<th align="center">Anchor boxes (k-means algorithm)</th>
<th align="center">Anchor boxes (k-means&#x2b;&#x2b; algorithm)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">(160 &#xd7; 160)</td>
<td align="center">(5, 6), (6, 7), (8, 9)</td>
<td align="center">(5, 6), (7, 8), (9, 10)</td>
</tr>
<tr>
<td align="center">(80 &#xd7; 80)</td>
<td align="center">(10, 11), (14, 14), (18, 19)</td>
<td align="center">(12, 13), (16, 17), (20, 22)</td>
</tr>
<tr>
<td align="center">(40 &#xd7; 40)</td>
<td align="center">(24, 26), (33, 34), (61, 55)</td>
<td align="center">(27, 29), (34, 36), (49, 54)</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s5">
<title>5 Experiments</title>
<p>In this section, we present a detailed evaluation of YOLOv5-TS. First, we describe the experimental environments and evaluation metrics. Then, we describe the datasets used in this work. Finally, we discuss the results of the ablation experiments and the comparison experiments.</p>
<sec id="s5-1">
<title>5.1 Experimental setup</title>
<p>
<bold>Environments.</bold> All experiments are conducted on the same server equipped with an Intel Xeon Platinum 8260 processor @ 2.30&#xa0;GHz, an NVIDIA RTX 3090 GPU, and 376&#xa0;GB of memory. The server runs Ubuntu 20.04.4 with Torch 1.12.1, CUDA 11.3, and Python 3.8.</p>
<p>
<bold>Evaluation metrics.</bold> We utilize five metrics, namely, Precision (P), Recall (R), mean Average Precision (mAP), F1-score, and Frames Per Second (FPS), to evaluate the performance of YOLOv5-TS. The first four metrics are calculated based on Eqs. <xref ref-type="disp-formula" rid="e9">(9)</xref> &#x223c; <xref ref-type="disp-formula" rid="e13">(13)</xref>, where TP, FP, and FN represent the true positive samples, the false positive samples, and the false negative samples, respectively. In Equation <xref ref-type="disp-formula" rid="e11">(11)</xref>, AP represents the average precision, and P(R) describes the precision when the recall is R. These four metrics are employed to assess the detection accuracy of YOLOv5-TS. The last metric, FPS, is used to evaluate detection speed.<disp-formula id="e9">
<mml:math id="m13">
<mml:mi>P</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(9)</label>
</disp-formula>
<disp-formula id="e10">
<mml:math id="m14">
<mml:mi>R</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(10)</label>
</disp-formula>
<disp-formula id="e11">
<mml:math id="m15">
<mml:mi>A</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mo>&#x222b;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mi>P</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mi>d</mml:mi>
<mml:mi>R</mml:mi>
</mml:math>
<label>(11)</label>
</disp-formula>
<disp-formula id="e12">
<mml:math id="m16">
<mml:mi>m</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mi>A</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(12)</label>
</disp-formula>
<disp-formula id="e13">
<mml:math id="m17">
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(13)</label>
</disp-formula>
</p>
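<p>For reference, a compact NumPy sketch of these metric definitions follows; the trapezoidal rule is one common discretization of the integral in Eq. <xref ref-type="disp-formula" rid="e11">(11)</xref>.</p>
<preformat>
import numpy as np

def precision_recall_f1(tp, fp, fn):
    """Eqs. (9), (10), and (13)."""
    p = tp / (tp + fp)
    r = tp / (tp + fn)
    return p, r, 2 * p * r / (p + r)

def average_precision(recall, precision):
    """Eq. (11): area under the P(R) curve, computed here by the
    trapezoidal rule over a monotonically increasing recall array."""
    return np.trapz(precision, recall)

def mean_average_precision(ap_per_class):
    """Eq. (12): mean of the per-class AP values."""
    return float(np.mean(ap_per_class))
</preformat>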
</sec>
<sec id="s5-2">
<title>5.2 Dataset</title>
<p>We use two datasets, namely, TT100K-23 and CCTSDB2021, to evaluate our solution. Sample images from the two datasets are shown in <xref ref-type="fig" rid="F5">Figure 5</xref>.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Image samples from TT100K-23 <bold>(A)</bold> and <bold>(B)</bold>, and CCTSDB2021 <bold>(C)</bold> and <bold>(D)</bold>.</p>
</caption>
<graphic xlink:href="fphy-11-1297828-g005.tif"/>
</fig>
<p>
<bold>TT100K-23</bold>: TT100K [<xref ref-type="bibr" rid="B39">39</xref>] comprises a total of 100,000 images, of which only 10,000 have been labeled. These labeled images contain 30,000 traffic signs distributed among approximately 200 different classes. To create TT100K-23, we carefully chose 6,229 images covering the 23 categories with the highest numbers of instances. TT100K-23 is randomly divided into a training set and a test set at a ratio of 9:1.</p>
<p>
<bold>CCTSDB2021</bold>: This dataset [<xref ref-type="bibr" rid="B40">40</xref>] contains 17,856 images with a total of 27,072 traffic signs. It is also divided into a training set and a test set in the ratio of 9:1. These traffic signs are divided into three classes: prohibitory, mandatory, and warning.</p>
</sec>
<sec id="s5-3">
<title>5.3 Ablation study</title>
<p>To analyze the effectiveness of the improvements in this work, we perform ablation experiments on the TT100K-23 dataset and describe the results in <xref ref-type="table" rid="T2">Table 2</xref>. In this table, Model0 denotes the original version of YOLOv5s, while Model1, Model2, and Model3 denote the variant with MFFM, the variant with the new detection layer, and the variant with the anchor box generation method based on the k-means&#x2b;&#x2b; algorithm, respectively. According to the experimental results, Model1, Model2, and Model3 all achieved performance improvements compared to Model0, indicating that the introduction of MFFM, the new detection layer, and the anchor box generation method based on k-means&#x2b;&#x2b; helped to improve the performance of YOLOv5s in detecting traffic signs. Model4 represents the variant with both MFFM and the 160 &#xd7; 160 detection layer. It obtained performance improvements compared to Model0. In addition, in the experiments, the values of <italic>k</italic>
<sub>1</sub>, <italic>k</italic>
<sub>2</sub>, and <italic>k</italic>
<sub>3</sub> in Equation <xref ref-type="disp-formula" rid="e6">(6)</xref> are set to 7, 11, and 21, respectively.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Ablation results.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Models</th>
<th align="center">MFFM</th>
<th align="center">Detection layer</th>
<th align="center">k-means&#x2b;&#x2b;</th>
<th align="center">P (%)</th>
<th align="center">R (%)</th>
<th align="center">mAP (%)</th>
<th align="center">F1-score</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Model0</td>
<td align="center">&#xd7;</td>
<td align="center">&#xd7;</td>
<td align="center">&#xd7;</td>
<td align="center">83.8</td>
<td align="center">87.5</td>
<td align="center">90.3</td>
<td align="center">85.6</td>
</tr>
<tr>
<td align="center">Model1</td>
<td align="center">
<italic>&#x2713;</italic>
</td>
<td align="center">&#xd7;</td>
<td align="center">&#xd7;</td>
<td align="center">91.4</td>
<td align="center">84.5</td>
<td align="center">91.5</td>
<td align="center">87.8</td>
</tr>
<tr>
<td align="center">Model2</td>
<td align="center">&#xd7;</td>
<td align="center">
<italic>&#x2713;</italic>
</td>
<td align="center">&#xd7;</td>
<td align="center">89.7</td>
<td align="center">85.9</td>
<td align="center">91.6</td>
<td align="center">87.7</td>
</tr>
<tr>
<td align="center">Model3</td>
<td align="center">&#xd7;</td>
<td align="center">&#xd7;</td>
<td align="center">
<italic>&#x2713;</italic>
</td>
<td align="center">89.1</td>
<td align="center">86.7</td>
<td align="center">91.6</td>
<td align="center">87.8</td>
</tr>
<tr>
<td align="center">Model4</td>
<td align="center">
<italic>&#x2713;</italic>
</td>
<td align="center">
<italic>&#x2713;</italic>
</td>
<td align="center">&#xd7;</td>
<td align="center">91.3</td>
<td align="center">87.1</td>
<td align="center">92.6</td>
<td align="center">89.1</td>
</tr>
<tr>
<td align="center">Model5</td>
<td align="center">&#xd7;</td>
<td align="center">
<italic>&#x2713;</italic>
</td>
<td align="center">
<italic>&#x2713;</italic>
</td>
<td align="center">92</td>
<td align="center">85.9</td>
<td align="center">92.6</td>
<td align="center">89</td>
</tr>
<tr>
<td align="center">Model6</td>
<td align="center">
<italic>&#x2713;</italic>
</td>
<td align="center">&#xd7;</td>
<td align="center">
<italic>&#x2713;</italic>
</td>
<td align="center">91.1</td>
<td align="center">87.2</td>
<td align="center">92.5</td>
<td align="center">89.1</td>
</tr>
<tr>
<td align="center">Model7</td>
<td align="center">
<italic>&#x2713;</italic>
</td>
<td align="center">
<italic>&#x2713;</italic>
</td>
<td align="center">
<italic>&#x2713;</italic>
</td>
<td align="center">92.5</td>
<td align="center">86.8</td>
<td align="center">93.7</td>
<td align="center">89.5</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s5-4">
<title>5.4 Performance comparison</title>
<p>We conduct experiments to evaluate the detection accuracy and speed of YOLOv5-TS by comparing it to several important object detection solutions, including Faster-RCNN [<xref ref-type="bibr" rid="B5">5</xref>], RetinaNet [<xref ref-type="bibr" rid="B41">41</xref>], CenterNet [<xref ref-type="bibr" rid="B42">42</xref>], SSD [<xref ref-type="bibr" rid="B43">43</xref>], YOLOv3 [<xref ref-type="bibr" rid="B44">44</xref>], YOLOv4 [<xref ref-type="bibr" rid="B45">45</xref>], YOLOv5n, YOLOv5s, YOLOv5m [<xref ref-type="bibr" rid="B46">46</xref>], YOLOX [<xref ref-type="bibr" rid="B47">47</xref>], YOLOv7 [<xref ref-type="bibr" rid="B48">48</xref>], YOLOv8n, and YOLOv8s [<xref ref-type="bibr" rid="B49">49</xref>]. Faster-RCNN is a two-stage algorithm, while all the other algorithms are one-stage algorithms. To ensure fair training, the batch size and the number of iterations are set to 32 and 800, respectively, while all the other training parameters are kept at their default values.</p>
<p>We use P, R, mAP, and F1-score to evaluate detection accuracy and record the corresponding results in <xref ref-type="table" rid="T3">Table 3</xref> and <xref ref-type="table" rid="T4">Table 4</xref>.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Detection accuracy comparison on TT100K-23 dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Models</th>
<th align="center">P (%)</th>
<th align="center">R (%)</th>
<th align="center">mAP (%)</th>
<th align="center">F1-score</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Faster-RCNN</td>
<td align="center">61.1</td>
<td align="center">60.7</td>
<td align="center">62.3</td>
<td align="center">60.8</td>
</tr>
<tr>
<td align="center">RetinaNet</td>
<td align="center">45.4</td>
<td align="center">59.9</td>
<td align="center">44.9</td>
<td align="center">51.6</td>
</tr>
<tr>
<td align="center">CenterNet</td>
<td align="center">82.4</td>
<td align="center">66</td>
<td align="center">71.8</td>
<td align="center">73.2</td>
</tr>
<tr>
<td align="center">SSD</td>
<td align="center">97.3</td>
<td align="center">24.2</td>
<td align="center">85.8</td>
<td align="center">38.7</td>
</tr>
<tr>
<td align="center">YOLOv3</td>
<td align="center">90.1</td>
<td align="center">78.6</td>
<td align="center">85.3</td>
<td align="center">83.9</td>
</tr>
<tr>
<td align="center">YOLOv4-tiny</td>
<td align="center">79.2</td>
<td align="center">78</td>
<td align="center">79.8</td>
<td align="center">78.4</td>
</tr>
<tr>
<td align="center">YOLOv5n</td>
<td align="center">88.1</td>
<td align="center">81.4</td>
<td align="center">88.2</td>
<td align="center">84.6</td>
</tr>
<tr>
<td align="center">YOLOv5s</td>
<td align="center">83.8</td>
<td align="center">87.5</td>
<td align="center">90.3</td>
<td align="center">85.6</td>
</tr>
<tr>
<td align="center">YOLOv5m</td>
<td align="center">88.4</td>
<td align="center">86.4</td>
<td align="center">90.6</td>
<td align="center">87.3</td>
</tr>
<tr>
<td align="center">YOLOX</td>
<td align="center">89.7</td>
<td align="center">85.8</td>
<td align="center">89.2</td>
<td align="center">87.7</td>
</tr>
<tr>
<td align="center">YOLOv7</td>
<td align="center">93.2</td>
<td align="center">88.1</td>
<td align="center">94.4</td>
<td align="center">90.5</td>
</tr>
<tr>
<td align="center">YOLOv8n</td>
<td align="center">90</td>
<td align="center">77.8</td>
<td align="center">89.2</td>
<td align="center">83.4</td>
</tr>
<tr>
<td align="center">YOLOv8s</td>
<td align="center">89.2</td>
<td align="center">80.9</td>
<td align="center">90.9</td>
<td align="center">84.8</td>
</tr>
<tr>
<td align="center">YOLOv5-TS</td>
<td align="center">92.5</td>
<td align="center">86.8</td>
<td align="center">93.7</td>
<td align="center">89.5</td>
</tr>
<tr>
<td align="center">solution[<xref ref-type="bibr" rid="B25">25</xref>]</td>
<td align="center">90.9</td>
<td align="center">85.2</td>
<td align="center">91.3</td>
<td align="center">87.9</td>
</tr>
<tr>
<td align="center">solution[<xref ref-type="bibr" rid="B51">51</xref>]</td>
<td align="center">86.4</td>
<td align="center">87.4</td>
<td align="center">92</td>
<td align="center">86.8</td>
</tr>
<tr>
<td align="center">solution[<xref ref-type="bibr" rid="B50">50</xref>]</td>
<td align="center">91.5</td>
<td align="center">84.3</td>
<td align="center">91.5</td>
<td align="center">87.7</td>
</tr>
<tr>
<td align="center">solution[<xref ref-type="bibr" rid="B9">9</xref>]</td>
<td align="center">87.5</td>
<td align="center">86</td>
<td align="center">90.3</td>
<td align="center">86.7</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Detection accuracy comparison on CCTSDB2021 dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Models</th>
<th align="center">P (%)</th>
<th align="center">R (%)</th>
<th align="center">mAP (%)</th>
<th align="center">F1-score</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Faster-RCNN</td>
<td align="center">95.6</td>
<td align="center">65.2</td>
<td align="center">95.6</td>
<td align="center">77.5</td>
</tr>
<tr>
<td align="center">RetinaNet</td>
<td align="center">94.1</td>
<td align="center">66.4</td>
<td align="center">94</td>
<td align="center">77.8</td>
</tr>
<tr>
<td align="center">CenterNet</td>
<td align="center">93.8</td>
<td align="center">82.8</td>
<td align="center">87.4</td>
<td align="center">87.9</td>
</tr>
<tr>
<td align="center">SSD</td>
<td align="center">98.1</td>
<td align="center">39.5</td>
<td align="center">94.1</td>
<td align="center">56.3</td>
</tr>
<tr>
<td align="center">YOLOv3</td>
<td align="center">97.2</td>
<td align="center">94.8</td>
<td align="center">98</td>
<td align="center">95.9</td>
</tr>
<tr>
<td align="center">YOLOv4-tiny</td>
<td align="center">91.7</td>
<td align="center">89.1</td>
<td align="center">92.9</td>
<td align="center">90.3</td>
</tr>
<tr>
<td align="center">YOLOv5n</td>
<td align="center">97.4</td>
<td align="center">94.2</td>
<td align="center">97.8</td>
<td align="center">95.7</td>
</tr>
<tr>
<td align="center">YOLOv5s</td>
<td align="center">96.1</td>
<td align="center">95.7</td>
<td align="center">97.9</td>
<td align="center">95.8</td>
</tr>
<tr>
<td align="center">YOLOv5m</td>
<td align="center">97.8</td>
<td align="center">97.8</td>
<td align="center">98.8</td>
<td align="center">97.8</td>
</tr>
<tr>
<td align="center">YOLOX</td>
<td align="center">96.7</td>
<td align="center">97.6</td>
<td align="center">98.3</td>
<td align="center">97.1</td>
</tr>
<tr>
<td align="center">YOLOv7</td>
<td align="center">94.8</td>
<td align="center">96.4</td>
<td align="center">97.1</td>
<td align="center">95.5</td>
</tr>
<tr>
<td align="center">YOLOv8n</td>
<td align="center">97.1</td>
<td align="center">96.3</td>
<td align="center">98.5</td>
<td align="center">96.6</td>
</tr>
<tr>
<td align="center">YOLOv8s</td>
<td align="center">97.3</td>
<td align="center">96.8</td>
<td align="center">98.8</td>
<td align="center">97</td>
</tr>
<tr>
<td align="center">YOLOv5-TS</td>
<td align="center">97.6</td>
<td align="center">98.4</td>
<td align="center">99.1</td>
<td align="center">98</td>
</tr>
<tr>
<td align="center">solution[<xref ref-type="bibr" rid="B25">25</xref>]</td>
<td align="center">97</td>
<td align="center">97.2</td>
<td align="center">98.6</td>
<td align="center">97.1</td>
</tr>
<tr>
<td align="center">solution[<xref ref-type="bibr" rid="B51">51</xref>]</td>
<td align="center">96.3</td>
<td align="center">97.5</td>
<td align="center">98.9</td>
<td align="center">96.8</td>
</tr>
<tr>
<td align="center">solution[<xref ref-type="bibr" rid="B50">50</xref>]</td>
<td align="center">97.5</td>
<td align="center">97.6</td>
<td align="center">98.9</td>
<td align="center">97.5</td>
</tr>
<tr>
<td align="center">solution[<xref ref-type="bibr" rid="B9">9</xref>]</td>
<td align="center">96.9</td>
<td align="center">96.6</td>
<td align="center">98.5</td>
<td align="center">96.7</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>
<xref ref-type="table" rid="T3">Table 3</xref> shows the detection results on TT100K-23 dataset. According to the results, YOLOv5-TS obtained the highest P, R, mAP, and F1-score compared with all the other variants of YOLO except YOLOv5s and YOLOv7. Although YOLOV5-TS had a lower R than YOLOv5s, it obtained a higher P, mAP and F1-score, which indicates that YOLOv5-TS performed better than YOLOv5s.</p>
<p>
<xref ref-type="table" rid="T4">Table 4</xref> shows the detection results on CCTSDB2021 dataset. According to the results, YOLOv5-TS obtained the highest P, R, mAP, and F1-score compared with all the other variants of YOLO except YOLOv5m. Although YOLOv5m outperformed YOLOv5-TS on the p metric, it is surpassed by YOLOv5-TS on the other three metrics. Therefore, we think YOLOv5-Ts performs better than YOLOv5m on CCTSDB2021.</p>
<p>The results on TT100K-23 show that YOLOv7 had a clear accuracy advantage over YOLOv5-TS. However, this advantage disappeared on CCTSDB2021, as shown in <xref ref-type="table" rid="T4">Table 4</xref>. To further compare YOLOv5-TS and YOLOv7, we also evaluated the detection speeds of the different solutions, since traffic sign detection is predominantly applied in real-time scenarios that demand not only high detection accuracy but also swift detection speed. <xref ref-type="table" rid="T5">Table 5</xref> and <xref ref-type="table" rid="T6">Table 6</xref> show the corresponding results. According to <xref ref-type="table" rid="T5">Table 5</xref>, YOLOv5-TS processed 67 frames per second, whereas YOLOv7 handled only 23 frames per second. This suggests that YOLOv5-TS is far better suited to real-time traffic sign detection than YOLOv7.</p>
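<p>For reference, FPS figures of this kind are typically obtained by timing the detector over many frames and dividing the frame count by the elapsed wall-clock time. The following minimal PyTorch-style sketch illustrates the procedure under stated assumptions: <monospace>model</monospace> and <monospace>frames</monospace> are hypothetical stand-ins for a loaded detector and a list of preprocessed image tensors, and the exact protocol behind <xref ref-type="table" rid="T5">Table 5</xref> and <xref ref-type="table" rid="T6">Table 6</xref> may differ in details such as warm-up handling and input resolution.</p>
<preformat>
import time
import torch

@torch.no_grad()
def measure_fps(model, frames, warmup: int = 10) -> float:
    """Average frames per second over a list of preprocessed image tensors."""
    model.eval()
    for x in frames[:warmup]:      # warm-up runs, excluded from timing
        model(x)
    if torch.cuda.is_available():
        torch.cuda.synchronize()   # finish queued GPU work before timing
    start = time.perf_counter()
    for x in frames:
        model(x)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return len(frames) / (time.perf_counter() - start)
</preformat>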
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>Detection speed comparison on the TT100K-23 dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Models</th>
<th align="center">FPS(f/s)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Faster-RCNN</td>
<td align="center">22</td>
</tr>
<tr>
<td align="center">RetinaNet</td>
<td align="center">20</td>
</tr>
<tr>
<td align="center">CenterNet</td>
<td align="center">32</td>
</tr>
<tr>
<td align="center">SSD</td>
<td align="center">47</td>
</tr>
<tr>
<td align="center">YOLOv3</td>
<td align="center">21</td>
</tr>
<tr>
<td align="center">YOLOv4-tiny</td>
<td align="center">51</td>
</tr>
<tr>
<td align="center">YOLOv5n</td>
<td align="center">75</td>
</tr>
<tr>
<td align="center">YOLOv5s</td>
<td align="center">70</td>
</tr>
<tr>
<td align="center">YOLOv5m</td>
<td align="center">54</td>
</tr>
<tr>
<td align="center">YOLOX</td>
<td align="center">25</td>
</tr>
<tr>
<td align="center">YOLOv7</td>
<td align="center">23</td>
</tr>
<tr>
<td align="center">YOLOv8n</td>
<td align="center">71</td>
</tr>
<tr>
<td align="center">YOLOv8s</td>
<td align="center">63</td>
</tr>
<tr>
<td align="center">YOLOv5-TS</td>
<td align="center">67</td>
</tr>
<tr>
<td align="center">solution[<xref ref-type="bibr" rid="B25">25</xref>]</td>
<td align="center">65</td>
</tr>
<tr>
<td align="center">solution[<xref ref-type="bibr" rid="B51">51</xref>]</td>
<td align="center">65</td>
</tr>
<tr>
<td align="center">solution[<xref ref-type="bibr" rid="B50">50</xref>]</td>
<td align="center">64</td>
</tr>
<tr>
<td align="center">solution[<xref ref-type="bibr" rid="B9">9</xref>]</td>
<td align="center">69</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T6" position="float">
<label>TABLE 6</label>
<caption>
<p>Detection speed comparison on the CCTSDB2021 dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Models</th>
<th align="center">FPS(f/s)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Faster-RCNN</td>
<td align="center">25</td>
</tr>
<tr>
<td align="center">RetinaNet</td>
<td align="center">21</td>
</tr>
<tr>
<td align="center">CenterNet</td>
<td align="center">34</td>
</tr>
<tr>
<td align="center">SSD</td>
<td align="center">61</td>
</tr>
<tr>
<td align="center">YOLOv3</td>
<td align="center">27</td>
</tr>
<tr>
<td align="center">YOLOv4-tiny</td>
<td align="center">65</td>
</tr>
<tr>
<td align="center">YOLOv5n</td>
<td align="center">77</td>
</tr>
<tr>
<td align="center">YOLOv5s</td>
<td align="center">74</td>
</tr>
<tr>
<td align="center">YOLOv5m</td>
<td align="center">59</td>
</tr>
<tr>
<td align="center">YOLOX</td>
<td align="center">25</td>
</tr>
<tr>
<td align="center">YOLOv7</td>
<td align="center">26</td>
</tr>
<tr>
<td align="center">YOLOv8n</td>
<td align="center">73</td>
</tr>
<tr>
<td align="center">YOLOv8s</td>
<td align="center">63</td>
</tr>
<tr>
<td align="center">YOLOv5-TS</td>
<td align="center">71</td>
</tr>
<tr>
<td align="center">solution[<xref ref-type="bibr" rid="B25">25</xref>]</td>
<td align="center">64</td>
</tr>
<tr>
<td align="center">solution[<xref ref-type="bibr" rid="B51">51</xref>]</td>
<td align="center">67</td>
</tr>
<tr>
<td align="center">solution[<xref ref-type="bibr" rid="B50">50</xref>]</td>
<td align="center">64</td>
</tr>
<tr>
<td align="center">solution[<xref ref-type="bibr" rid="B9">9</xref>]</td>
<td align="center">57</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>SSD is a one-stage solution. According to the results in <xref ref-type="table" rid="T3">Table 3</xref> and <xref ref-type="table" rid="T4">Table 4</xref>, SSD surpassed YOLOv5-TS on the P metric but lagged behind it on the other three metrics. According to the results in <xref ref-type="table" rid="T5">Table 5</xref> and <xref ref-type="table" rid="T6">Table 6</xref>, SSD also detected traffic signs much more slowly than YOLOv5-TS. Considering these results together, we conclude that YOLOv5-TS performs better than SSD. RetinaNet and CenterNet are also one-stage algorithms; according to the results in <xref ref-type="table" rid="T3">Tables 3</xref>&#x2013;<xref ref-type="table" rid="T6">6</xref>, our solution outperformed both of them. The results in <xref ref-type="table" rid="T3">Tables 3</xref>&#x2013;<xref ref-type="table" rid="T6">6</xref> also indicate that our solution outperformed Faster-RCNN, which is a two-stage algorithm.</p>
<p>To further evaluate YOLOv5-TS, we compared it with four other solutions [<xref ref-type="bibr" rid="B9">9</xref>, <xref ref-type="bibr" rid="B25">25</xref>, <xref ref-type="bibr" rid="B50">50</xref>, <xref ref-type="bibr" rid="B51">51</xref>], all of which improved YOLOv5 for traffic sign detection and obtained performance gains. The corresponding results are recorded in <xref ref-type="table" rid="T3">Tables 3</xref>&#x2013;<xref ref-type="table" rid="T6">6</xref>. According to these four tables, YOLOv5-TS outperformed all four solutions on both the TT100K-23 and CCTSDB2021 datasets, regardless of which of the four evaluation metrics was used.</p>
<p>
<xref ref-type="fig" rid="F6">Figure 6</xref> shows the detected results of YOLOv5-Ts on the images captured at different distances, light conditions and shooting angles. According to the results, YOLOv5-TS correctly recognized all the small-size traffic signs in all the images.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Detection performance of the YOLOv5-TS model trained on the TT100K dataset. The detection results of the target are magnified and displayed at the bottom of the image.</p>
</caption>
<graphic xlink:href="fphy-11-1297828-g006.tif"/>
</fig>
</sec>
</sec>
<sec sec-type="conclusion" id="s6">
<title>6 Conclusion</title>
<p>In this work, we analyzed the performance problems of YOLOv5 in real-time traffic sign detection and proposed several enhancements to address them. First, we introduced a spatial pyramid with depth-wise convolution to reduce the feature loss in the SPPF module and extract multi-scale features more effectively. Second, we proposed a multiple feature fusion module to further extract and fuse multi-scale features, enhancing feature representation. Third, we introduced a specialized detection layer to improve the accuracy of detecting small and even extra-small traffic signs. Finally, we incorporated the k-means&#x2b;&#x2b; clustering algorithm to obtain anchor boxes better suited to the datasets. Experimental results demonstrate that the improved model effectively enhances accuracy without significantly increasing model complexity. In the future, we will apply the improvements described in this work to YOLOv8.</p>
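<p>To make the anchor-box step concrete, the sketch below clusters ground-truth box sizes with k-means&#x2b;&#x2b; initialization via scikit-learn. It is an illustration rather than our exact pipeline: <monospace>box_wh</monospace> is a hypothetical (N, 2) array of annotated box widths and heights in pixels, nine clusters match YOLOv5&#x2019;s convention of three anchors per detection scale, and some YOLO implementations cluster with an IoU-based distance instead of the Euclidean distance used here.</p>
<preformat>
import numpy as np
from sklearn.cluster import KMeans

def anchor_boxes(box_wh: np.ndarray, n_anchors: int = 9) -> np.ndarray:
    """Cluster (width, height) pairs into anchor boxes via k-means++."""
    km = KMeans(n_clusters=n_anchors, init="k-means++", n_init=10,
                random_state=0)
    km.fit(box_wh)
    centers = km.cluster_centers_
    # Sort anchors by area so they map to detection scales small -> large.
    return centers[np.argsort(centers.prod(axis=1))]

# box_wh would be gathered from the dataset annotations, e.g.:
# box_wh = np.array([[w1, h1], [w2, h2], ...], dtype=float)
</preformat>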
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s7">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/Supplementary Material; further inquiries can be directed to the corresponding author.</p>
</sec>
<sec id="s8">
<title>Author contributions</title>
<p>JS: Writing&#x2013;original draft, Writing&#x2013;review and editing. ZZ: Writing&#x2013;original draft, Writing&#x2013;review and editing. JL: Writing&#x2013;original draft. XZ: Writing&#x2013;original draft.</p>
</sec>
<sec id="s9">
<title>Funding</title>
<p>The authors declare financial support was received for the research, authorship, and/or publication of this article. This work has been supported in part by the National Natural Science Foundation of China under Grant No. 61972134 and by the Innovative and Scientific Research Team of Henan Polytechnic University under Grant No. T2021-3.</p>
</sec>
<sec sec-type="COI-statement" id="s10">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tulbure</surname>
<given-names>A-A</given-names>
</name>
<name>
<surname>Tulbure</surname>
<given-names>A-A</given-names>
</name>
<name>
<surname>Dulf</surname>
<given-names>E-H</given-names>
</name>
</person-group>. <article-title>A review on modern defect detection models using dcnns&#x2013;deep convolutional neural networks</article-title>. <source>J Adv Res</source> (<year>2022</year>) <volume>35</volume>:<fpage>33</fpage>&#x2013;<lpage>48</lpage>. <pub-id pub-id-type="doi">10.1016/j.jare.2021.03.015</pub-id>
</citation>
</ref>
<ref id="B2">
<label>2.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Pi</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>Traffic sign detection based on improved faster r-cnn for autonomous driving</article-title>. <source>The J Supercomputing</source> (<year>2022</year>) <volume>78</volume>:<fpage>7982</fpage>&#x2013;<lpage>8002</lpage>. <pub-id pub-id-type="doi">10.1007/s11227-021-04230-4</pub-id>
</citation>
</ref>
<ref id="B3">
<label>3.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Han</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>Real-time small traffic sign detection with revised faster-rcnn</article-title>. <source>Multimedia Tools Appl</source> (<year>2019</year>) <volume>78</volume>:<fpage>13263</fpage>&#x2013;<lpage>78</lpage>. <pub-id pub-id-type="doi">10.1007/s11042-018-6428-0</pub-id>
</citation>
</ref>
<ref id="B4">
<label>4.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Song</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Tong</surname>
<given-names>R</given-names>
</name>
</person-group>. <article-title>A three-stage real-time detector for traffic signs in large panoramas</article-title>. <source>Comput Vis Media</source> (<year>2019</year>) <volume>5</volume>:<fpage>403</fpage>&#x2013;<lpage>16</lpage>. <pub-id pub-id-type="doi">10.1007/s41095-019-0152-1</pub-id>
</citation>
</ref>
<ref id="B5">
<label>5.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ren</surname>
<given-names>S</given-names>
</name>
<name>
<surname>He</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Girshick</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Faster r-cnn: towards real-time object detection with region proposal networks</article-title>. <source>Adv Neural Inf Process Syst</source> (<year>2015</year>) <volume>28</volume>.</citation>
</ref>
<ref id="B6">
<label>6.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Girshick</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Donahue</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Darrell</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Malik</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Rich feature hierarchies for accurate object detection and semantic segmentation</article-title>. In: <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>; <conf-date>June 2014</conf-date>; <conf-loc>Columbus, OH, USA</conf-loc> (<year>2014</year>). p. <fpage>580</fpage>&#x2013;<lpage>7</lpage>.</citation>
</ref>
<ref id="B7">
<label>7.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Redmon</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Divvala</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Girshick</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Farhadi</surname>
<given-names>A</given-names>
</name>
</person-group>. <article-title>You only look once: unified, real-time object detection</article-title>. In: <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>; <conf-date>June 2016</conf-date>; <conf-loc>Las Vegas, Nevada, USA</conf-loc> (<year>2016</year>). p. <fpage>779</fpage>&#x2013;<lpage>88</lpage>.</citation>
</ref>
<ref id="B8">
<label>8.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Che</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>A real-time and lightweight traffic sign detection method based on ghost-yolo</article-title>. <source>Multimedia Tools Appl</source> (<year>2023</year>) <volume>82</volume>:<fpage>26063</fpage>&#x2013;<lpage>87</lpage>. <pub-id pub-id-type="doi">10.1007/s11042-023-14342-z</pub-id>
</citation>
</ref>
<ref id="B9">
<label>9.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mahaur</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Mishra</surname>
<given-names>K</given-names>
</name>
</person-group>. <article-title>Small-object detection based on yolov5 in autonomous driving systems</article-title>. <source>Pattern Recognition Lett</source> (<year>2023</year>) <volume>168</volume>:<fpage>115</fpage>&#x2013;<lpage>22</lpage>. <pub-id pub-id-type="doi">10.1016/j.patrec.2023.03.009</pub-id>
</citation>
</ref>
<ref id="B10">
<label>10.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Dong</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>M</given-names>
</name>
</person-group>. <article-title>Improved yolov5 network for real-time multi-scale traffic sign detection</article-title>. <source>Neural Comput Appl</source> (<year>2023</year>) <volume>35</volume>:<fpage>7853</fpage>&#x2013;<lpage>65</lpage>. <pub-id pub-id-type="doi">10.1007/s00521-022-08077-5</pub-id>
</citation>
</ref>
<ref id="B11">
<label>11.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Sheng</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Automatic detection of road traffic signs from natural scene images based on pixel vector and central projected shape feature</article-title>. <source>IET Intell Transport Syst</source> (<year>2012</year>) <volume>6</volume>:<fpage>282</fpage>&#x2013;<lpage>91</lpage>. <pub-id pub-id-type="doi">10.1049/iet-its.2011.0105</pub-id>
</citation>
</ref>
<ref id="B12">
<label>12.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>G&#xf3;mez-Moreno</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Maldonado-Basc&#xf3;n</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Gil-Jim&#xe9;nez</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Lafuente-Arroyo</surname>
<given-names>S</given-names>
</name>
</person-group>. <article-title>Goal evaluation of segmentation algorithms for traffic sign recognition</article-title>. <source>IEEE Trans Intell Transportation Syst</source> (<year>2010</year>) <volume>11</volume>:<fpage>917</fpage>&#x2013;<lpage>30</lpage>. <pub-id pub-id-type="doi">10.1109/tits.2010.2054084</pub-id>
</citation>
</ref>
<ref id="B13">
<label>13.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Salti</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Petrelli</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Tombari</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Fioraio</surname>
<given-names>N</given-names>
</name>
<name>
<surname>Di Stefano</surname>
<given-names>L</given-names>
</name>
</person-group>. <article-title>Traffic sign detection via interest region extraction</article-title>. <source>Pattern Recognition</source> (<year>2015</year>) <volume>48</volume>:<fpage>1039</fpage>&#x2013;<lpage>49</lpage>. <pub-id pub-id-type="doi">10.1016/j.patcog.2014.05.017</pub-id>
</citation>
</ref>
<ref id="B14">
<label>14.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Barnes</surname>
<given-names>N</given-names>
</name>
<name>
<surname>Zelinsky</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Fletcher</surname>
<given-names>LS</given-names>
</name>
</person-group>. <article-title>Real-time speed sign detection using the radial symmetry detector</article-title>. <source>IEEE Trans Intell Transportation Syst</source> (<year>2008</year>) <volume>9</volume>:<fpage>322</fpage>&#x2013;<lpage>32</lpage>. <pub-id pub-id-type="doi">10.1109/tits.2008.922935</pub-id>
</citation>
</ref>
<ref id="B15">
<label>15.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fang</surname>
<given-names>C-Y</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>S-W</given-names>
</name>
<name>
<surname>Fuh</surname>
<given-names>C-S</given-names>
</name>
</person-group>. <article-title>Road-sign detection and tracking</article-title>. <source>IEEE Trans vehicular Technol</source> (<year>2003</year>) <volume>52</volume>:<fpage>1329</fpage>&#x2013;<lpage>41</lpage>. <pub-id pub-id-type="doi">10.1109/TVT.2003.810999</pub-id>
</citation>
</ref>
<ref id="B16">
<label>16.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Abbas</surname>
<given-names>Q</given-names>
</name>
<name>
<surname>Ibrahim</surname>
<given-names>ME</given-names>
</name>
<name>
<surname>Jaffar</surname>
<given-names>MA</given-names>
</name>
</person-group>. <article-title>A comprehensive review of recent advances on deep vision systems</article-title>. <source>Artif Intelligence Rev</source> (<year>2019</year>) <volume>52</volume>:<fpage>39</fpage>&#x2013;<lpage>76</lpage>. <pub-id pub-id-type="doi">10.1007/s10462-018-9633-3</pub-id>
</citation>
</ref>
<ref id="B17">
<label>17.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dong</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Ji</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>D</given-names>
</name>
</person-group>. <article-title>Multimodal neuromorphic sensory-processing system with memristor circuits for smart home applications</article-title>. <source>IEEE Trans Industry Appl</source> (<year>2022</year>) <volume>59</volume>:<fpage>47</fpage>&#x2013;<lpage>58</lpage>. <pub-id pub-id-type="doi">10.1109/tia.2022.3188749</pub-id>
</citation>
</ref>
<ref id="B18">
<label>18.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dong</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Lai</surname>
<given-names>CS</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Duan</surname>
<given-names>S</given-names>
</name>
</person-group>. <article-title>Neuromorphic extreme learning machines with bimodal memristive synapses</article-title>. <source>Neurocomputing</source> (<year>2021</year>) <volume>453</volume>:<fpage>38</fpage>&#x2013;<lpage>49</lpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2021.04.049</pub-id>
</citation>
</ref>
<ref id="B19">
<label>19.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dong</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Ji</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Lai</surname>
<given-names>CS</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Lai</surname>
<given-names>LL</given-names>
</name>
</person-group>. <article-title>Memristor-based hierarchical attention network for multimodal affective computing in mental health monitoring</article-title>. <source>IEEE Consumer Elect Mag</source> (<year>2022</year>) <volume>12</volume>:<fpage>94</fpage>&#x2013;<lpage>106</lpage>. <pub-id pub-id-type="doi">10.1109/mce.2022.3159350</pub-id>
</citation>
</ref>
<ref id="B20">
<label>20.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wali</surname>
<given-names>SB</given-names>
</name>
<name>
<surname>Abdullah</surname>
<given-names>MA</given-names>
</name>
<name>
<surname>Hannan</surname>
<given-names>MA</given-names>
</name>
<name>
<surname>Hussain</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Samad</surname>
<given-names>SA</given-names>
</name>
<name>
<surname>Ker</surname>
<given-names>PJ</given-names>
</name>
<etal/>
</person-group> <article-title>Vision-based traffic sign detection and recognition systems: current trends and challenges</article-title>. <source>Sensors</source> (<year>2019</year>) <volume>19</volume>:<fpage>2093</fpage>. <pub-id pub-id-type="doi">10.3390/s19092093</pub-id>
</citation>
</ref>
<ref id="B21">
<label>21.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dong</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Abbas</surname>
<given-names>K</given-names>
</name>
</person-group>. <article-title>A survey on deep learning and its applications</article-title>. <source>Comp Sci Rev</source> (<year>2021</year>) <volume>40</volume>:<fpage>100379</fpage>. <pub-id pub-id-type="doi">10.1016/j.cosrev.2021.100379</pub-id>
</citation>
</ref>
<ref id="B22">
<label>22.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yao</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Du</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Traffic sign detection algorithm based on improved yolov4-tiny</article-title>. <source>Signal Processing: Image Commun</source> (<year>2022</year>) <volume>107</volume>:<fpage>116783</fpage>. <pub-id pub-id-type="doi">10.1016/j.image.2022.116783</pub-id>
</citation>
</ref>
<ref id="B23">
<label>23.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Ye</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Jin</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Real-time traffic sign detection based on multiscale attention and spatial information aggregator</article-title>. <source>J Real-Time Image Process</source> (<year>2022</year>) <volume>19</volume>:<fpage>1155</fpage>&#x2013;<lpage>67</lpage>. <pub-id pub-id-type="doi">10.1007/s11554-022-01252-w</pub-id>
</citation>
</ref>
<ref id="B24">
<label>24.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Meng</surname>
<given-names>P</given-names>
</name>
</person-group>. <article-title>Attention-yolov4: a real-time and high-accurate traffic sign detection algorithm</article-title>. <source>Multimedia Tools Appl</source> (<year>2023</year>) <volume>82</volume>:<fpage>7567</fpage>&#x2013;<lpage>82</lpage>. <pub-id pub-id-type="doi">10.1007/s11042-022-13251-x</pub-id>
</citation>
</ref>
<ref id="B25">
<label>25.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>L</given-names>
</name>
</person-group>. <article-title>A novel lightweight traffic sign recognition model based on yolov5</article-title>. <source>J transportation Eng A: Syst</source> (<year>2023</year>) <volume>149</volume>:<fpage>04023025</fpage>. <pub-id pub-id-type="doi">10.1061/jtepbs.teeng-7461</pub-id>
</citation>
</ref>
<ref id="B26">
<label>26.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Han</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Tian</surname>
<given-names>Q</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>C</given-names>
</name>
</person-group>. <article-title>Ghostnet: more features from cheap operations</article-title>. In: <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>; <conf-date>June 2020</conf-date>; <conf-loc>Seattle, WA, USA</conf-loc> (<year>2020</year>). p. <fpage>1580</fpage>&#x2013;<lpage>9</lpage>.</citation>
</ref>
<ref id="B27">
<label>27.</label>
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Howard</surname>
<given-names>AG</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Kalenichenko</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Weyand</surname>
<given-names>T</given-names>
</name>
<etal/>
</person-group> <article-title>Mobilenets: efficient convolutional neural networks for mobile vision applications</article-title> (<year>2017</year>). <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1704.04861">https://arxiv.org/abs/1704.04861</ext-link>.</citation>
</ref>
<ref id="B28">
<label>28.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Hou</surname>
<given-names>Q</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Coordinate attention for efficient mobile network design</article-title>. In: <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>; <conf-date>June 2021</conf-date>; <conf-loc>Nashville, TN, USA</conf-loc> (<year>2021</year>). p. <fpage>13713</fpage>&#x2013;<lpage>22</lpage>.</citation>
</ref>
<ref id="B29">
<label>29.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Jin</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Sedg-yolov5: a lightweight traffic sign detection model based on knowledge distillation</article-title>. <source>Electronics</source> (<year>2023</year>) <volume>12</volume>:<fpage>305</fpage>. <pub-id pub-id-type="doi">10.3390/electronics12020305</pub-id>
</citation>
</ref>
<ref id="B30">
<label>30.</label>
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Zhan</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>Q</given-names>
</name>
</person-group>. <article-title>Slim-neck by gsconv: a better design paradigm of detector architectures for autonomous vehicles</article-title> (<year>2022</year>). <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2206.02424">https://arxiv.org/abs/2206.02424</ext-link>.</citation>
</ref>
<ref id="B31">
<label>31.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bai</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Dai</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Ji</surname>
<given-names>Z</given-names>
</name>
<etal/>
</person-group> <article-title>Two novel models for traffic sign detection based on yolov5s</article-title>. <source>Axioms</source> (<year>2023</year>) <volume>12</volume>:<fpage>160</fpage>. <pub-id pub-id-type="doi">10.3390/axioms12020160</pub-id>
</citation>
</ref>
<ref id="B32">
<label>32.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wan</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Su</surname>
<given-names>M</given-names>
</name>
<name>
<surname>You</surname>
<given-names>Q</given-names>
</name>
<name>
<surname>Qu</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>Q</given-names>
</name>
</person-group>. <article-title>A novel neural network model for traffic sign detection and recognition under extreme conditions</article-title>. <source>J Sensors</source> (<year>2021</year>) <volume>2021</volume>:<fpage>1</fpage>&#x2013;<lpage>16</lpage>. <pub-id pub-id-type="doi">10.1155/2021/9984787</pub-id>
</citation>
</ref>
<ref id="B33">
<label>33.</label>
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Tan</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Le</surname>
<given-names>QV</given-names>
</name>
</person-group>. <article-title>Mixconv: mixed depthwise convolutional kernels</article-title> (<year>2019</year>). <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1907.09595">https://arxiv.org/abs/1907.09595</ext-link>.</citation>
</ref>
<ref id="B34">
<label>34.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Dai</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Gieseke</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Oehmcke</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Barnard</surname>
<given-names>K</given-names>
</name>
</person-group>. <article-title>Attentional feature fusion</article-title>. In: <conf-name>Proceedings of the IEEE/CVF winter conference on applications of computer vision</conf-name>; <conf-date>January 2021</conf-date>; <conf-loc>Waikoloa, HI, USA</conf-loc> (<year>2021</year>). p. <fpage>3560</fpage>&#x2013;<lpage>9</lpage>.</citation>
</ref>
<ref id="B35">
<label>35.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>C-Y</given-names>
</name>
<name>
<surname>Liao</surname>
<given-names>H-YM</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Y-H</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>P-Y</given-names>
</name>
<name>
<surname>Hsieh</surname>
<given-names>J-W</given-names>
</name>
<name>
<surname>Yeh</surname>
<given-names>I-H</given-names>
</name>
</person-group>. <article-title>Cspnet: a new backbone that can enhance learning capability of cnn</article-title>. In: <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition workshops</conf-name>; <conf-date>June 2020</conf-date>; <conf-loc>Seattle, WA, USA</conf-loc> (<year>2020</year>). p. <fpage>390</fpage>&#x2013;<lpage>1</lpage>.</citation>
</ref>
<ref id="B36">
<label>36.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Lin</surname>
<given-names>T-Y</given-names>
</name>
<name>
<surname>Doll&#xe1;r</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Girshick</surname>
<given-names>R</given-names>
</name>
<name>
<surname>He</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Hariharan</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Belongie</surname>
<given-names>S</given-names>
</name>
</person-group>. <article-title>Feature pyramid networks for object detection</article-title>. In: <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>; <conf-date>July, 2017</conf-date>; <conf-loc>Honolulu, HI, USA</conf-loc> (<year>2017</year>). p. <fpage>2117</fpage>&#x2013;<lpage>25</lpage>.</citation>
</ref>
<ref id="B37">
<label>37.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Qin</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Jia</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Path aggregation network for instance segmentation</article-title>. In: <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>; <conf-date>June 2018</conf-date>; <conf-loc>Salt Lake City, UT, USA</conf-loc> (<year>2018</year>). p. <fpage>8759</fpage>&#x2013;<lpage>68</lpage>.</citation>
</ref>
<ref id="B38">
<label>38.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>LeCun</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Bottou</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Bengio</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Haffner</surname>
<given-names>P</given-names>
</name>
</person-group>. <article-title>Gradient-based learning applied to document recognition</article-title>. <source>Proc IEEE</source> (<year>1998</year>) <volume>86</volume>:<fpage>2278</fpage>&#x2013;<lpage>324</lpage>. <pub-id pub-id-type="doi">10.1109/5.726791</pub-id>
</citation>
</ref>
<ref id="B39">
<label>39.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>S</given-names>
</name>
</person-group>. <article-title>Traffic-sign detection and classification in the wild</article-title>. In: <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>; <conf-date>June 2016</conf-date>; <conf-loc>Las Vegas, NV, USA</conf-loc> (<year>2016</year>). p. <fpage>2110</fpage>&#x2013;<lpage>8</lpage>.</citation>
</ref>
<ref id="B40">
<label>40.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Kuang</surname>
<given-names>L-D</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Sherratt</surname>
<given-names>RS</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Cctsdb 2021: a more comprehensive traffic sign detection benchmark</article-title>. <source>Human-centric Comput Inf Sci</source> (<year>2022</year>) <volume>12</volume>. <pub-id pub-id-type="doi">10.22967/HCIS.2022.12.023</pub-id>
</citation>
</ref>
<ref id="B41">
<label>41.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lin</surname>
<given-names>T-Y</given-names>
</name>
<name>
<surname>Goyal</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Girshick</surname>
<given-names>R</given-names>
</name>
<name>
<surname>He</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Doll&#xe1;r</surname>
<given-names>P</given-names>
</name>
</person-group>. <article-title>Focal loss for dense object detection</article-title>. <source>IEEE Trans Pattern Anal Machine Intelligence</source> (<year>2017</year>) <fpage>2980</fpage>&#x2013;<lpage>8</lpage>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1708.02002">https://arxiv.org/abs/1708.02002</ext-link>
</comment> (<comment>Accessed February 7, 2018</comment>).</citation>
</ref>
<ref id="B42">
<label>42.</label>
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Kr&#xe4;henb&#xfc;hl</surname>
<given-names>P</given-names>
</name>
</person-group>. <article-title>Objects as points</article-title> (<year>2019</year>). <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1904.07850">https://arxiv.org/abs/1904.07850</ext-link>.</citation>
</ref>
<ref id="B43">
<label>43.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Anguelov</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Erhan</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Szegedy</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Reed</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Fu</surname>
<given-names>C-Y</given-names>
</name>
<etal/>
</person-group> <article-title>Ssd: single shot multibox detector</article-title>. In: <conf-name>Proceedings of the Computer Vision&#x2013;ECCV 2016: 14th European Conference</conf-name>; <conf-date>October 2016</conf-date>; <conf-loc>Amsterdam, The Netherlands</conf-loc>. <publisher-name>Springer</publisher-name> (<year>2016</year>). p. <fpage>21</fpage>&#x2013;<lpage>37</lpage>.</citation>
</ref>
<ref id="B44">
<label>44.</label>
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Redmon</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Farhadi</surname>
<given-names>A</given-names>
</name>
</person-group>. <article-title>Yolov3: an incremental improvement</article-title> (<year>2018</year>). <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1804.02767">https://arxiv.org/abs/1804.02767</ext-link>.</citation>
</ref>
<ref id="B45">
<label>45.</label>
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Bochkovskiy</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>C-Y</given-names>
</name>
<name>
<surname>Liao</surname>
<given-names>H-YM</given-names>
</name>
</person-group>. <article-title>Yolov4: optimal speed and accuracy of object detection</article-title> (<year>2020</year>). <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2004.10934">https://arxiv.org/abs/2004.10934</ext-link>.</citation>
</ref>
<ref id="B46">
<label>46.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Jocher</surname>
<given-names>G</given-names>
</name>
</person-group>. <source>YOLOv5 by Ultralytics</source> (<year>2020</year>).</citation>
</ref>
<ref id="B47">
<label>47.</label>
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Ge</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Yolox: exceeding yolo series in 2021</article-title> (<year>2021</year>). <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2107.08430">https://arxiv.org/abs/2107.08430</ext-link>.</citation>
</ref>
<ref id="B48">
<label>48.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>C-Y</given-names>
</name>
<name>
<surname>Bochkovskiy</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Liao</surname>
<given-names>H-YM</given-names>
</name>
</person-group>. <article-title>Yolov7: trainable bag-of-freebies sets new state-of-the-art for real-time object detectors</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>; <conf-date>June 2020</conf-date>; <conf-loc>Vancouver, BC, Canada</conf-loc> (<year>2023</year>). p. <fpage>7464</fpage>&#x2013;<lpage>75</lpage>.</citation>
</ref>
<ref id="B49">
<label>49.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Jocher</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Chaurasia</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Qiu</surname>
<given-names>J</given-names>
</name>
</person-group>. <source>YOLO by Ultralytics</source> (<year>2023</year>).</citation>
</ref>
<ref id="B50">
<label>50.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dang</surname>
<given-names>TP</given-names>
</name>
<name>
<surname>Tran</surname>
<given-names>NT</given-names>
</name>
<name>
<surname>To</surname>
<given-names>VH</given-names>
</name>
<name>
<surname>Tran Thi</surname>
<given-names>MK</given-names>
</name>
</person-group>. <article-title>Improved yolov5 for real-time traffic signs recognition in bad weather conditions</article-title>. <source>J Supercomputing</source> (<year>2023</year>) <volume>79</volume>:<fpage>10706</fpage>&#x2013;<lpage>24</lpage>. <pub-id pub-id-type="doi">10.1007/s11227-023-05097-3</pub-id>
</citation>
</ref>
<ref id="B51">
<label>51.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Han</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Yolo-sg: small traffic signs detection method in complex scene</article-title>. <source>J Supercomputing</source> (<year>2023</year>) <fpage>1</fpage>&#x2013;<lpage>22</lpage>. <pub-id pub-id-type="doi">10.1007/s11227-023-05547-y</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>