<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Neurorobot.</journal-id>
<journal-title>Frontiers in Neurorobotics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Neurorobot.</abbrev-journal-title>
<issn pub-type="epub">1662-5218</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fnbot.2023.1273251</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Neuroscience</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Learning geometric Jensen-Shannon divergence for tiny object detection in remote sensing images</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Ni</surname> <given-names>Shuyan</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Lin</surname> <given-names>Cunbao</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Wang</surname> <given-names>Haining</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2400420/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Li</surname> <given-names>Yang</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Liao</surname> <given-names>Yurong</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Li</surname> <given-names>Na</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Department of Electronic and Optical Engineering, Space Engineering University</institution>, <addr-line>Beijing</addr-line>, <country>China</country></aff>
<aff id="aff2"><sup>2</sup><institution>Institute of Artificial Intelligence, Beihang University</institution>, <addr-line>Beijing</addr-line>, <country>China</country></aff>
<aff id="aff3"><sup>3</sup><institution>Aerospace Optical-Microwave Integrated Precision Intelligent Sensing, Key Laboratory of Ministry of Industry and Information Technology, Beihang University</institution>, <addr-line>Beijing</addr-line>, <country>China</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Di Wu, Chinese Academy of Sciences (CAS), China</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Alejandro Zacar&#x000ED;as, National Polytechnic Institute (IPN), Mexico; Xiaoda Liu, Amazon, United States; Enrique Garcia-Trinidad, Tecnol&#x000F3;gico de Estudios Superiores de Huixquilucan, Mexico</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Haining Wang <email>wanghaining&#x00040;buaa.edu.cn</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>09</day>
<month>11</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>17</volume>
<elocation-id>1273251</elocation-id>
<history>
<date date-type="received">
<day>05</day>
<month>08</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>25</day>
<month>10</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2023 Ni, Lin, Wang, Li, Liao and Li.</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Ni, Lin, Wang, Li, Liao and Li</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<p>Tiny objects in remote sensing images only have a few pixels, and the detection difficulty is much higher than that of regular objects. General object detectors lack effective extraction of tiny object features, and are sensitive to the Intersection-over-Union (IoU) calculation and the threshold setting in the prediction stage. Therefore, it is particularly important to design a tiny-object-specific detector that can avoid the above problems. This article proposes the network JSDNet by learning the geometric Jensen-Shannon (JS) divergence representation between Gaussian distributions. First, the Swin Transformer model is integrated into the feature extraction stage as the backbone to improve the feature extraction capability of JSDNet for tiny objects. Second, the anchor box and ground-truth are modeled as two two-dimensional (2D) Gaussian distributions, so that the tiny object is represented as a statistical distribution model. Then, in view of the sensitivity problem faced by the IoU calculation for tiny objects, the JSDM module is designed as a regression sub-network, and the geometric JS divergence between two Gaussian distributions is derived from the perspective of information geometry to guide the regression prediction of anchor boxes. Experiments on the AI-TOD and DOTA datasets show that JSDNet can achieve superior detection performance for tiny objects compared to state-of-the-art general object detectors.</p></abstract>
<kwd-group>
<kwd>tiny object detection</kwd>
<kwd>remote sensing images</kwd>
<kwd>Jensen-Shannon divergence</kwd>
<kwd>deep learning</kwd>
<kwd>Gaussian distribution</kwd>
</kwd-group>
<counts>
<fig-count count="5"/>
<table-count count="5"/>
<equation-count count="14"/>
<ref-count count="37"/>
<page-count count="11"/>
<word-count count="6990"/>
</counts>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1. Introduction</title>
<p>With the rapid development of satellite technology, remote sensing images collected by optical payloads often have a large width and high resolution, so the remote sensing images often contain a large number of tiny objects, which makes remote sensing object detection challenging in applications such as maritime search and rescue, flight scheduling, and ground surveillance. Despite the significant success of deep learning and convolutional neural networks (CNNs), many object detectors can perform various visual detection tasks with high quality (Liu et al., <xref ref-type="bibr" rid="B19">2016</xref>, <xref ref-type="bibr" rid="B18">2020</xref>; Ren et al., <xref ref-type="bibr" rid="B24">2017</xref>; Bochkovskiy et al., <xref ref-type="bibr" rid="B1">2020</xref>), such as salient object detection and crowd density detection. Since the object size and distribution of these scenes are very different from those in remote sensing images, it is particularly important to design a tiny object detection method suitable for remote sensing scenes.</p>
<p>The latest research in tiny object detection has mainly focused on multi-scale feature learning (Zhang X. et al., <xref ref-type="bibr" rid="B37">2022</xref>), context-based detection (Zhang K. et al., <xref ref-type="bibr" rid="B35">2022</xref>), network structure-based optimization (Lu et al., <xref ref-type="bibr" rid="B21">2023</xref>), data augmentation strategies (Kim and Hwang, <xref ref-type="bibr" rid="B11">2022</xref>) and so on. The above methods are all implemented by the CNN architecture, which drives the data training by enhancing the tiny object feature representation. In addition to the representation of tiny object features, the computation of the IoU for network training is also affected by object scale changes (Li et al., <xref ref-type="bibr" rid="B16">2021</xref>). The IoU between the ground-truth and anchor box directly reflects the positive and negative categories of the current anchor box, but the IoU is sensitive to objects of different pixel sizes, and a small position offset leads to a large change in the IoU value. As shown in <xref ref-type="fig" rid="F1">Figure 1</xref>, the results of the IoU calculation for objects with different pixel sizes are different. For example, when the offset pixel is 2, the IoU value of the object of 8 &#x000D7; 8 pixels is calculated as 0.39, and the IoU value of the object of 96 &#x000D7; 96 pixels is calculated as 0.92. Thus, the results are different. Then, when the IoU threshold is used to determine the positive and negative sample labels of the current anchor box, there is inaccurate classification of the respective labels. Therefore, the IoU calculation method is not suitable for the anchor-box label assignment mechanism of tiny objects. In addition, when there is no overlap or mutual inclusion between the anchor boxes and the ground-truth, the value of IoU remains unchanged and cannot reflect the positional regression relationship between the current anchor box and the ground-truth. 
This is often the case with tiny objects in remote sensing images.</p>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>The area difference of the IoU calculation with objects of different pixel sizes. It shows that the IoU calculation method is sensitive to tiny remote-sensing objects.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-17-1273251-g0001.tif"/>
</fig>
<p>Based on the application potential of the Swin Transformer model (Liu et al., <xref ref-type="bibr" rid="B20">2021</xref>) in the remote sensing field, this article proposes a new object metric representation learning method (JSDNet), which uses the geometric JS divergence to measure the distribution distance of anchor boxes and the ground-truth. The closed form of geometric JS divergence was previously used to train Bayesian neural networks in reference (Deasy et al., <xref ref-type="bibr" rid="B4">2020</xref>; Thiagarajan and Ghosh, <xref ref-type="bibr" rid="B25">2022</xref>), which brings new inspiration to train deep networks. In this article, feature extraction based on Swin Transformer is firstly performed to find deeper feature representations and richer semantic information. Second, an object regression module (JSDM) is designed to model the object bounding box as a 2D Gaussian distribution, and we use the proposed geometric JS divergence with a closed-form to measure the similarity between the anchor boxes and ground-truth, avoiding the traditional IoU calculation which results in pixel offset sensitivity for tiny objects. The experiments on AI-TOD (Xu et al., <xref ref-type="bibr" rid="B28">2022</xref>) and DOTA (Xia et al., <xref ref-type="bibr" rid="B27">2018</xref>) datasets demonstrate the advanced performance of the proposed method.</p>
<p>We summarize the main contributions as follows:</p>
<list list-type="order">
<list-item><p>Swin Transformer is integrated into the CNN architecture, and a remote sensing tiny object detector called JSDNet is proposed. The semantic features of the object are extracted by Swin Transformer, and the CNN network is used for the classification and regression processes.</p></list-item>
<list-item><p>The geometric JS divergence with a closed-form is used as the distance measure between Gaussian distributions, which guides the regression loss branch of the object detection network, avoids the sensitivity of the IoU calculation method to tiny objects, and improves the detection performance of remote sensing tiny objects.</p></list-item>
<list-item><p>The regression loss is described from the perspective of information geometry, which provides a new thinking for the algorithm improvement in the tiny object detection field.</p></list-item>
</list>
</sec>
<sec id="s2">
<title>2. Related works</title>
<sec>
<title>2.1. Tiny object detection</title>
<p>Currently, research on tiny object detection mainly focuses on anchor-based optimization, network structure-based optimization, multi-scale feature learning, context-based information, and label classification strategy.</p>
<sec>
<title>2.1.1. Anchor-based optimization</title>
<p>Anchors are multiple bounding boxes with different sizes and aspect ratios that are generated centered on each pixel of the image. Yang et al. (<xref ref-type="bibr" rid="B29">2018</xref>) propose a dynamic mechanism named MetaAnchor, which can select an appropriate anchor for dynamic generation. Zhang et al. (<xref ref-type="bibr" rid="B36">2017</xref>) propose a scale compensation anchor matching mechanism to improve the recall rate for tiny objects. Duan et al. (<xref ref-type="bibr" rid="B6">2019</xref>) propose using center points to improve the discrimination and screening ability of anchors. Tian et al. (<xref ref-type="bibr" rid="B26">2019</xref>) solved the problem of hyperparametric calculation caused by too many anchors. Yang et al. (<xref ref-type="bibr" rid="B33">2019</xref>) use point sets to represent the bounding box of tiny objects. Due to the large number and dense distribution of tiny objects in the image, the effectiveness of current anchor-based tiny object detection algorithms still needs to be improved.</p>
</sec>
<sec>
<title>2.1.2. Network structure-based optimization</title>
<p>Optimizing the backbone and neck can generally enhance feature extraction for objects and improve the performance of tiny object detection (Bochkovskiy et al., <xref ref-type="bibr" rid="B1">2020</xref>). Qiao et al. (<xref ref-type="bibr" rid="B23">2021</xref>) designed a recursive feature pyramid as a backbone network. Kong et al. (<xref ref-type="bibr" rid="B13">2020</xref>) designed new detection heads that can directly learn the possibility of tiny objects. Cai and Vasconcelos (<xref ref-type="bibr" rid="B2">2018</xref>) proposed a multi-stage network structure to improve the value of IoU layer by layer, solving the problem of overfitting in training.</p>
</sec>
<sec>
<title>2.1.3. Multi-scale feature learning</title>
<p>Shallow networks contain coordinate information of tiny objects, and using multi-scale feature learning can better fuse and enhance the features of tiny objects. Liu et al. (<xref ref-type="bibr" rid="B19">2016</xref>) proposed a single shot multi box detector (SSD) algorithm for hierarchical detection of feature maps of different scales. Lu et al. (<xref ref-type="bibr" rid="B22">2019</xref>) designed grid points for spatial feature information fusion. Han et al. (<xref ref-type="bibr" rid="B10">2022</xref>) proposed a multi-scale residual block, which obtains multi-scale context information by using dilated convolution in cascaded residual blocks. Literature (Deng et al., <xref ref-type="bibr" rid="B5">2022</xref>; Zeng et al., <xref ref-type="bibr" rid="B34">2022</xref>) improves the feature pyramid network, which can effectively solve the problem that feature coupling at different scales affects the performance of tiny object detection.</p>
</sec>
<sec>
<title>2.1.4. Context-based information</title>
<p>Effectively utilizing the background environment information around tiny objects can effectively improve the performance of tiny object detection. Feng et al. (<xref ref-type="bibr" rid="B7">2021</xref>) introduced the global context aware enhancement module, which activates the characteristics of the entire object by capturing the global visual context. Li et al. (<xref ref-type="bibr" rid="B15">2019</xref>), Leng et al. (<xref ref-type="bibr" rid="B14">2021</xref>), Cui et al. (<xref ref-type="bibr" rid="B3">2022</xref>) improved the performance of tiny object detection by constructing high-resolution and strong semantic feature maps.</p>
</sec>
<sec>
<title>2.1.5. Label classification strategy</title>
<p>Assigning high-quality anchor boxes to tiny objects is challenging, and much recent work has been carried out (Ge et al., <xref ref-type="bibr" rid="B9">2021</xref>). Kim and Lee (<xref ref-type="bibr" rid="B12">2020</xref>) proposed probabilistic anchor assignment, which assumes that the joint loss distribution of positive and negative samples follows a Gaussian distribution. Xu et al. (<xref ref-type="bibr" rid="B28">2022</xref>) proposed a ranking-based allocation strategy, significantly improving the impact of label allocation on tiny object detection.</p>
</sec>
</sec>
<sec>
<title>2.2. 2D Gaussian modeling for remote sensing object</title>
<p>IoU guided regression losses in object detection may lead to deviations in numerical calculations due to the following two issues: The loss form is not differentiable, and the loss calculated by IoU is inconsistent with the assessment. In order to solve the above challenges in remote sensing images, Yang et al. (<xref ref-type="bibr" rid="B30">2021a</xref>,<xref ref-type="bibr" rid="B31">b</xref>, <xref ref-type="bibr" rid="B32">2023</xref>) proposed to represent an oriented object as a two-dimensional Gaussian distribution of rotation, which brought new inspiration for object detection. Modeling a remote sensing object as a 2D Gaussian distribution <italic>N</italic>(<italic>m</italic>, &#x003A3;) at any angle:</p>
<disp-formula id="E1"><label>(1)</label><mml:math id="M1"><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:mi>m</mml:mi><mml:mo>=</mml:mo><mml:mo stretchy='false'>(</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:msup><mml:mo stretchy='false'>)</mml:mo><mml:mi>T</mml:mi></mml:msup></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msup><mml:mo>&#x003A3;</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>/</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mi>R</mml:mi><mml:mo>&#x0039B;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mi>T</mml:mi></mml:msup></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow></mml:math></disp-formula>
<p>where <italic>R</italic> represents a 2D rotation matrix and &#x0039B; represents a diagonal matrix of eigenvalues. Specifically, the anchor box and the ground-truth of the object are modeled as two 2D rotational Gaussian distributions, and then the distance between the two Gaussian distributions is measured to guide the regression network in training. This design of the regression loss function can effectively adapt to oriented and densely distributed objects in remote sensing images. Yang et al. (<xref ref-type="bibr" rid="B30">2021a</xref>) used Wasserstein distance for spatial distance measurement, while Yang et al. (<xref ref-type="bibr" rid="B31">2021b</xref>) used Kullback-Leibler divergence. These metrics are not closed forms in the information geometry field.</p>
</sec>
</sec>
<sec id="s3">
<title>3. Proposed method</title>
<sec>
<title>3.1. Overall framework</title>
<p>The proposed tiny-object-detection framework JSDNet is shown in <xref ref-type="fig" rid="F2">Figure 2</xref>, using RetinaNet (Lin et al., <xref ref-type="bibr" rid="B17">2020</xref>) as the baseline algorithm. The framework comprises three main parts: the window attention backbone, the feature fusion network and the detection sub-network. First, Swin Transformer is used as the backbone for feature extraction. Owing to the large width and high pixel characteristics of remote sensing images, the original backbone of RetinaNet cannot effectively extract fine small object features from remote sensing images. Therefore, it is theoretically valid to use window-based self-attention operations. Swin Transformer processes the image into patches, proposes the concept of a moving window, and only calculates self-attention inside the window, which can effectively reduce the length of the sequence and reduce the computational complexity. JSDNet uses Swin Transformer as the backbone, which can handle the problem of different scale features hierarchically and then optimize the detection of remote sensing tiny objects by multi-scale feature maps. Second, JSDNet inputs the obtained multi-scale feature map into the feature pyramid network for feature fusion. The fusion process adopts a top-down transfer method to transfer the high-level feature semantics to the underlying structure. This is the same as the original feature fusion structure of RetinaNet.</p>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>The proposed JSDNet framework. It consists of the Swin Transformer backbone, feature fusion module, and classification and regression sub-networks. The regression sub-network JSDM uses the geometric JS divergence for closed-form processing.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-17-1273251-g0002.tif"/>
</fig>
<p>Then, JSDNet feeds the fused features into the detection sub-network, which performs label classification and bounding-box regression tasks. In the bounding-box regression task, the JSDM models the object as a 2D Gaussian distribution from an information geometry perspective and uses the abstract mean to calculate the geometric JS divergence, so that the JS divergence can be approximated as a similarity measurement of two Gaussian distributions that can produce closed-form expressions.</p>
</sec>
<sec>
<title>3.2. Gaussian distribution modeling for bounding box</title>
<p>Yang et al. (<xref ref-type="bibr" rid="B30">2021a</xref>,<xref ref-type="bibr" rid="B31">b</xref>, <xref ref-type="bibr" rid="B32">2023</xref>) proposed that the oriented object is represented as a rotating 2D Gaussian distribution, which brings new inspiration for object detection. However, a tiny object has a small number of pixels in the image, and the IoU calculation method is easily affected by the threshold setting. Modeling the tiny object as a 2D Gaussian distribution can avoid this problem and can also distinguish the object information from the redundant background. Specifically, the anchor box and ground-truth are represented by four parameters (<italic>x</italic><sub>0</sub>, <italic>y</italic><sub>0</sub>, <italic>w, h</italic>) of a centroid notation, where (<italic>x</italic><sub>0</sub>, <italic>y</italic><sub>0</sub>) represents the coordinates of the rectangle center point, <italic>w</italic> and <italic>h</italic> are the length and width of the rectangle, respectively. At this time, it is described as an inscribed ellipse as follows:</p>
<disp-formula id="E2"><label>(2)</label><mml:math id="M2"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mfrac><mml:mrow><mml:mn>4</mml:mn><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>x</mml:mi><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:mfrac><mml:mo>&#x0002B;</mml:mo><mml:mfrac><mml:mrow><mml:mn>4</mml:mn><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>y</mml:mi><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:mfrac><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="M3"><mml:mfrac><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac></mml:math></inline-formula> and <inline-formula><mml:math id="M4"><mml:mfrac><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac></mml:math></inline-formula> are the semi-axes of the ellipse, which are equivalent to half the length and width of the rectangle, respectively.</p>
<p>According to probability statistics, the probability density function of the 2D Gaussian distribution is as follows:</p>
<disp-formula id="E3"><label>(3)</label><mml:math id="M5"><mml:mrow><mml:mi>f</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mtext>x&#x000A0;|&#x000A0;</mml:mtext></mml:mstyle><mml:mi>&#x003BC;</mml:mi><mml:mo>,</mml:mo><mml:mo>&#x02211;</mml:mo><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>exp</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mo>&#x02212;</mml:mo><mml:mstyle scriptlevel='+1'><mml:mfrac><mml:mn>1</mml:mn><mml:mn>2</mml:mn></mml:mfrac></mml:mstyle><mml:msup><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>x</mml:mi></mml:mstyle><mml:mo>&#x02212;</mml:mo><mml:mi>&#x003BC;</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow><mml:mtext>T</mml:mtext></mml:msup><mml:mover><mml:mstyle mathsize='140%' displaystyle='true'><mml:mo>&#x02211;</mml:mo></mml:mstyle><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:mover><mml:mo stretchy='false'>(</mml:mo><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>x</mml:mi></mml:mstyle><mml:mo>&#x02212;</mml:mo><mml:mi>&#x003BC;</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn><mml:mi>&#x003C0;</mml:mi><mml:mo>&#x0007C;</mml:mo><mml:mstyle displaystyle='true'><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:msup><mml:mo>&#x0007C;</mml:mo><mml:mrow><mml:mstyle scriptlevel='+1'><mml:mfrac><mml:mn>1</mml:mn><mml:mn>2</mml:mn></mml:mfrac></mml:mstyle></mml:mrow></mml:msup></mml:mrow></mml:mstyle></mml:mrow></mml:mfrac></mml:mrow></mml:math></disp-formula>
<p>where <bold>x</bold> denotes the coordinate variable (<italic>x, y</italic>), <bold>&#x003BC;</bold> denotes the mean vector, and &#x02211; denotes the covariance matrix. When the inscribed ellipse in (2) is set as a standard 2D Gaussian distribution, there is a conversion relationship between the ellipse and the Gaussian distribution in (3):</p>
<disp-formula id="E4"><label>(4)</label><mml:math id="M6"><mml:mrow><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>&#x003BC;</mml:mi></mml:mstyle><mml:mo>=</mml:mo><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mn>0</mml:mn></mml:msub></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mn>0</mml:mn></mml:msub></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mtext>&#x000A0;</mml:mtext><mml:mo>,</mml:mo><mml:mtext>&#x000A0;&#x000A0;</mml:mtext><mml:mstyle displaystyle='true'><mml:mo>&#x02211;</mml:mo><mml:mo>=</mml:mo></mml:mstyle><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:mstyle scriptlevel='+1'><mml:mfrac><mml:mrow><mml:msup><mml:mi>w</mml:mi><mml:mn>2</mml:mn></mml:msup></mml:mrow><mml:mn>4</mml:mn></mml:mfrac></mml:mstyle></mml:mrow></mml:mtd><mml:mtd><mml:mn>0</mml:mn></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mn>0</mml:mn></mml:mtd><mml:mtd><mml:mrow><mml:mstyle scriptlevel='+1'><mml:mfrac><mml:mrow><mml:msup><mml:mi>h</mml:mi><mml:mn>2</mml:mn></mml:msup></mml:mrow><mml:mn>4</mml:mn></mml:mfrac></mml:mstyle></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow></mml:math></disp-formula>
<p>At this time, both the ground-truth and anchor box can be modeled as a 2D Gaussian distribution according to the above-mentioned corresponding relationship.</p>
</sec>
<sec>
<title>3.3. Closed-form metrics for geometric JS divergence</title>
<p>Let (&#x003C7;, <italic>F</italic>) be the measurable space of the image plane, &#x003C7; be the sample space, and <italic>F</italic> be the &#x003C3;&#x02212;<italic>algebra</italic> of the measurable events. Denote the distribution variable established in the last section as a positive measure &#x003BC;, the predicted frame of the object as <italic>P</italic>(&#x003BC;<sub>1</sub>, &#x003A3;<sub>1</sub>), and the true frame of the object as <italic>G</italic>(&#x003BC;<sub>2</sub>, &#x003A3;<sub>2</sub>). At this time, the most basic distribution distance, the KL divergence, can be defined as follows:</p>
<disp-formula id="E5"><label>(5)</label><mml:math id="M7"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>K</mml:mi><mml:mi>L</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>P</mml:mi><mml:mo>:</mml:mo><mml:mi>G</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>:</mml:mo><mml:mo>=</mml:mo><mml:mi>K</mml:mi><mml:mi>L</mml:mi><mml:mo>&#x0002A;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>G</mml:mi><mml:mo>:</mml:mo><mml:mi>P</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mo>&#x0222B;</mml:mo><mml:mi>p</mml:mi><mml:mo class="qopname">log</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>p</mml:mi><mml:mi>/</mml:mi><mml:mi>g</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>d</mml:mi><mml:mi>&#x003BC;</mml:mi></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>p</italic> and <italic>g</italic> represent the Radon-Nikodym derivatives of the Gaussian distribution <italic>P</italic> and <italic>G</italic> for the positive measure &#x003BC;, respectively, and &#x0201C;&#x0002A;&#x0201D; represents the inverse distance. It is clear that the KL divergence is an asymmetric distance. One method to achieve symmetric KL divergence is to convert to standard JS divergence, as follows:</p>
<disp-formula id="E6"><label>(6)</label><mml:math id="M8"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>J</mml:mi><mml:mi>S</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>P</mml:mi><mml:mo>:</mml:mo><mml:mi>G</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>:</mml:mo><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>K</mml:mi><mml:mi>L</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>P</mml:mi><mml:mo>:</mml:mo><mml:mfrac><mml:mrow><mml:mi>P</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>G</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mi>K</mml:mi><mml:mi>L</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>G</mml:mi><mml:mo>:</mml:mo><mml:mfrac><mml:mrow><mml:mi>P</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>G</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>Yang et al. (<xref ref-type="bibr" rid="B31">2021b</xref>) discussed the use of JS divergence for distance measurement. However, directly applying the above JS divergence as a distance metric is problematic because it overlooks the fact that the JS divergence between two Gaussian distributions is not available in closed form. Thus, a strict distance metric result can hardly be obtained, and the regression process of the anchor box cannot be accurately guided. Therefore, the JS divergence calculation for remote sensing tiny objects needs a closed-form formula, which can be obtained for a given exponential family.</p>
<p><bold>Definition 1</bold> (Abstract mean function, AM). The abstract mean function <italic>AM</italic>(., .) is a continuous binary function, and on the domain of definition <italic>S</italic> &#x02282; &#x0211D;<sub>&#x0002B;</sub>, it satisfies the bounded range as follows:</p>
<disp-formula id="E7"><label>(7)</label><mml:math id="M9"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mo class="qopname">inf</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mo>&#x02264;</mml:mo><mml:mi>A</mml:mi><mml:mi>M</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02264;</mml:mo><mml:mo class="qopname">sup</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x02200;</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mi>S</mml:mi></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>According to Frank (<xref ref-type="bibr" rid="B8">2019</xref>), based on <italic>AM</italic>, we construct a weighted expression <italic>AM</italic><sub>&#x003B1;</sub>(<italic>p, g</italic>) for probability distributions with densities <italic>p</italic> and <italic>g</italic>, where <italic>&#x003B1;</italic> &#x02208; [0, 1].</p>
<p><bold>Definition 2</bold> (Geometric statistical mixture, GSM). For the abstract mean function <italic>AM</italic><sub>&#x003B1;</sub>(<italic>p, g</italic>), with probability densities <italic>p</italic> and <italic>g</italic>, the mixture of distributions <italic>P</italic> and <italic>G</italic> with respect to the geometric mean M can be defined as:</p>
<disp-formula id="E8"><label>(8)</label><mml:math id="M10"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>P</mml:mi><mml:mi>G</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mi>&#x003B1;</mml:mi><mml:mi>M</mml:mi></mml:msubsup><mml:mo stretchy='false'>(</mml:mo><mml:mi>&#x003BC;</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>:</mml:mo><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>A</mml:mi><mml:msub><mml:mi>M</mml:mi><mml:mi>&#x003B1;</mml:mi></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mi>P</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mi>&#x003BC;</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo><mml:mi>G</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mi>&#x003BC;</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo></mml:mrow><mml:mrow><mml:msubsup><mml:mi>N</mml:mi><mml:mi>&#x003B1;</mml:mi><mml:mi>M</mml:mi></mml:msubsup><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>P</mml:mi><mml:mo>:</mml:mo><mml:mi>G</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext>&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;</mml:mtext><mml:mo>=</mml:mo><mml:mi>exp</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mtext>&#x0200B;</mml:mtext><mml:mo>&#x02212;</mml:mo><mml:mtext>&#x0200B;</mml:mtext><mml:mi>&#x003B1;</mml:mi><mml:mo 
stretchy='false'>)</mml:mo><mml:mi>P</mml:mi><mml:mtext>&#x0200B;</mml:mtext><mml:mo>+</mml:mo><mml:mtext>&#x0200B;</mml:mtext><mml:mi>&#x003B1;</mml:mi><mml:mi>G</mml:mi><mml:mtext>&#x0200B;</mml:mtext><mml:mo>&#x02212;</mml:mo><mml:mtext>&#x0200B;</mml:mtext><mml:mi>log</mml:mi><mml:msubsup><mml:mi>N</mml:mi><mml:mi>&#x003B1;</mml:mi><mml:mi>M</mml:mi></mml:msubsup><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>P</mml:mi><mml:mtext>&#x0200B;</mml:mtext><mml:mo>:</mml:mo><mml:mtext>&#x0200B;</mml:mtext><mml:mi>G</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="M11"><mml:msubsup><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow><mml:mrow><mml:mi>M</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mo>:</mml:mo></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> is the normalization sub-function. Now, for the distributions <italic>P</italic> and <italic>G</italic>, a statistical mixture function weighted by the geometric mean is obtained.</p>
<p><bold>Definition 3</bold> (Mean JS-divergence, AM-JS-divergence). Extending the concept of a geometric statistical mixture to the JS-divergence of two exponential family distributions, we obtain a generalized weighted form of geometric JS-divergence, and it is geometrically symmetric. The definition of mean JS divergence is as follows:</p>
<disp-formula id="E9"><label>(9)</label><mml:math id="M12"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>J</mml:mi><mml:mi>S</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>M</mml:mi></mml:mrow></mml:msup><mml:mo>:</mml:mo><mml:mo>=</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mi>&#x003B1;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>K</mml:mi><mml:mi>L</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>P</mml:mi><mml:mo>:</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:msubsup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>P</mml:mi><mml:mi>G</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow><mml:mrow><mml:mi>M</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003B1;</mml:mi><mml:mi>K</mml:mi><mml:mi>L</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>G</mml:mi><mml:mo>:</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:msubsup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>P</mml:mi><mml:mi>G</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow><mml:mrow><mml:mi>M</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>In particular, when &#x003B1; &#x0003D; 0 or &#x003B1; &#x0003D; 1, no significant mean JS divergence is obtained. The weights &#x003B1; imply a geometrical statistical mixture, so for all <italic>&#x003B1;</italic> &#x02208; (0, 1), (<italic>JS</italic>)<sup><italic>M</italic></sup> can be used as the generalized JS divergence of the two exponential family distributions <italic>P</italic> and <italic>G</italic>.</p>
<p><bold>Proposition</bold> Assuming that the prediction box and ground-truth in the image conform to the 2D Gaussian distribution in the exponential family distribution and are denoted as <italic>P</italic>(&#x003BC;<sub>1</sub>, &#x003A3;<sub>1</sub>) and <italic>G</italic>(&#x003BC;<sub>2</sub>, &#x003A3;<sub>2</sub>), respectively, the geometric mean JS divergence between them can be expressed as follows:</p>
<disp-formula id="E10"><label>(10)</label><mml:math id="M13"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:mtext>&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;</mml:mtext><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>J</mml:mi><mml:mi>S</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:msub><mml:mi>G</mml:mi><mml:mi>&#x003B1;</mml:mi></mml:msub></mml:mrow></mml:msup><mml:mo stretchy='false'>(</mml:mo><mml:mi>p</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>&#x003BC;</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mo>&#x003A3;</mml:mo><mml:mn>1</mml:mn></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>:</mml:mo><mml:mi>p</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>&#x003BC;</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mo>&#x003A3;</mml:mo><mml:mn>2</mml:mn></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mn>2</mml:mn></mml:mfrac><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msubsup><mml:mo>&#x003A3;</mml:mo><mml:mi>&#x003B1;</mml:mi><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mo stretchy='false'>(</mml:mo><mml:mo stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mtext>&#x0200B;</mml:mtext><mml:mo>&#x02212;</mml:mo><mml:mtext>&#x0200B;</mml:mtext><mml:mi>&#x003B1;</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:msub><mml:mo>&#x003A3;</mml:mo><mml:mn>1</mml:mn></mml:msub><mml:mtext>&#x0200B;</mml:mtext><mml:mo>+</mml:mo><mml:mtext>&#x0200B;</mml:mtext><mml:mi>&#x003B1;</mml:mi><mml:msub><mml:mo>&#x003A3;</mml:mo><mml:mn>2</mml:mn></mml:msub><mml:mo 
stretchy='false'>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mrow><mml:mo>+</mml:mo><mml:mi>log</mml:mi><mml:mfrac><mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:mrow><mml:msub><mml:mo>&#x003A3;</mml:mo><mml:mi>&#x003B1;</mml:mi></mml:msub></mml:mrow><mml:mo>|</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:mrow><mml:msub><mml:mo>&#x003A3;</mml:mo><mml:mn>1</mml:mn></mml:msub></mml:mrow><mml:mo>|</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x02212;</mml:mo><mml:mi>&#x003B1;</mml:mi></mml:mrow></mml:msup><mml:msup><mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:mrow><mml:msub><mml:mo>&#x003A3;</mml:mo><mml:mn>2</mml:mn></mml:msub></mml:mrow><mml:mo>|</mml:mo></mml:mrow></mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:msup></mml:mrow></mml:mfrac><mml:mo>&#x02212;</mml:mo><mml:mtext>&#x0200B;</mml:mtext><mml:mn>2</mml:mn><mml:mo>+</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mtext>&#x0200B;</mml:mtext><mml:mo>&#x02212;</mml:mo><mml:mtext>&#x0200B;</mml:mtext><mml:mi>&#x003B1;</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>&#x003BC;</mml:mi><mml:mi>&#x003B1;</mml:mi></mml:msub><mml:mtext>&#x0200B;</mml:mtext><mml:mo>&#x02212;</mml:mo><mml:mtext>&#x0200B;</mml:mtext><mml:msub><mml:mi>&#x003BC;</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:msup><mml:mo stretchy='false'>)</mml:mo><mml:mtext>T</mml:mtext></mml:msup><mml:msubsup><mml:mo>&#x003A3;</mml:mo><mml:mi>&#x003B1;</mml:mi><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>&#x003BC;</mml:mi><mml:mi>&#x003B1;</mml:mi></mml:msub><mml:mtext>&#x0200B;</mml:mtext><mml:mo>&#x02212;</mml:mo><mml:mtext>&#x0200B;</mml:mtext><mml:msub><mml:mi>&#x003BC;</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo 
stretchy='false'>)</mml:mo><mml:mtext>&#x0200B;</mml:mtext><mml:mo>+</mml:mo><mml:mtext>&#x0200B;</mml:mtext><mml:mi>&#x003B1;</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>&#x003BC;</mml:mi><mml:mi>&#x003B1;</mml:mi></mml:msub><mml:mtext>&#x0200B;</mml:mtext><mml:mo>&#x02212;</mml:mo><mml:mtext>&#x0200B;</mml:mtext><mml:msub><mml:mi>&#x003BC;</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:msup><mml:mo stretchy='false'>)</mml:mo><mml:mtext>T</mml:mtext></mml:msup><mml:msubsup><mml:mo>&#x003A3;</mml:mo><mml:mi>&#x003B1;</mml:mi><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>&#x003BC;</mml:mi><mml:mi>&#x003B1;</mml:mi></mml:msub><mml:mtext>&#x0200B;</mml:mtext><mml:mo>&#x02212;</mml:mo><mml:mtext>&#x0200B;</mml:mtext><mml:msub><mml:mi>&#x003BC;</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mrow><mml:mo stretchy='false'>)</mml:mo><mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where (&#x003BC;<sub>&#x003B1;</sub>, &#x003A3;<sub>&#x003B1;</sub>) is the matrix harmonic barycenter (the harmonic-weighted center of gravity) of the two Gaussian parameters:</p>
<disp-formula id="E11"><label>(11)</label><mml:math id="M14"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:msub><mml:mi>&#x003BC;</mml:mi><mml:mi>&#x003B1;</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>&#x003BC;</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:msub><mml:mi>&#x003BC;</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:msubsup><mml:mo stretchy='false'>)</mml:mo><mml:mi>&#x003B1;</mml:mi><mml:mi>&#x003BC;</mml:mi></mml:msubsup><mml:mo>=</mml:mo><mml:msub><mml:mo>&#x003A3;</mml:mo><mml:mi>&#x003B1;</mml:mi></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x02212;</mml:mo><mml:mi>&#x003B1;</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:msubsup><mml:mo>&#x003A3;</mml:mo><mml:mn>1</mml:mn><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:msub><mml:mi>&#x003BC;</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>+</mml:mo><mml:mi>&#x003B1;</mml:mi><mml:msubsup><mml:mo>&#x003A3;</mml:mo><mml:mn>2</mml:mn><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:msub><mml:mi>&#x003BC;</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mo>&#x003A3;</mml:mo><mml:mi>&#x003B1;</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mo>&#x003A3;</mml:mo><mml:mn>1</mml:mn></mml:msub><mml:msub><mml:mo>&#x003A3;</mml:mo><mml:mn>2</mml:mn></mml:msub><mml:msubsup><mml:mo stretchy='false'>)</mml:mo><mml:mi>&#x003B1;</mml:mi><mml:mo>&#x003A3;</mml:mo></mml:msubsup><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x02212;</mml:mo><mml:mi>&#x003B1;</mml:mi><mml:mo 
stretchy='false'>)</mml:mo><mml:msubsup><mml:mo>&#x003A3;</mml:mo><mml:mn>1</mml:mn><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mo>+</mml:mo><mml:mi>&#x003B1;</mml:mi><mml:msubsup><mml:mo>&#x003A3;</mml:mo><mml:mn>2</mml:mn><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>According to the proposition, JSDNet can learn the JS divergence representation between the prediction box and ground-truth, and then use it to guide the regression process of the anchor box (<italic>x</italic><sub>0</sub>, <italic>y</italic><sub>0</sub>, <italic>w, h</italic>). Specifically, the anchor-box regression process is realized by calculating the offset, which is the same as the parameter fine-tuning mechanism of RetinaNet.</p>
<p><xref ref-type="fig" rid="F3">Figure 3</xref> shows the two-dimensional spatial regression calculation process of the JSDM module. First, the bounding box of a tiny object is modeled to obtain a 2D ellipse. Then, the geometric mean JS divergence is used as the distance measure between the two 2D Gaussian distributions. Finally, the four parameters of the prediction box are updated to make the regression network converge.</p>
<fig id="F3" position="float">
<label>Figure 3</label>
<caption><p>2D space approximation process of geometric mean JS divergence module. <bold>(A)</bold> Modeling a Gaussian distribution for bounding box. <bold>(B)</bold> Calculate geometric JS divergence in 2D space. <bold>(C)</bold> Adjust predicted box to approximate two distributions.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-17-1273251-g0003.tif"/>
</fig>
</sec>
<sec>
<title>3.4. JSDNet training</title>
<p>This section defines the classification and regression loss function for JSDNet. First, a nonlinear relationship between the distance function and the geometric mean JS divergence is established. Specifically, square the geometric mean JS divergence in the proposition and convert it into a fractional form, as follows:</p>
<disp-formula id="E12"><label>(12)</label><mml:math id="M15"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>J</mml:mi><mml:mi>S</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>G</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>P</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>G</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>&#x003C4;</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>f</mml:mi><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>J</mml:mi><mml:mi>S</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>G</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>P</mml:mi></mml:mrow></mml:msub><mml:mo>:</mml:mo><mml:msub><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>G</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow></mml:mfrac><mml:mtext>&#x000A0;</mml:mtext><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mi>&#x003C4;</mml:mi><mml:mo>&#x02265;</mml:mo><mml:mn>1</mml:mn></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where &#x003C4; is the offset hyperparameter and <italic>f</italic>(.) is the squaring operation of the distance function, which is a nonlinear expression. Tiny objects in remote sensing usually occupy a small proportion of pixels, so horizontal bounding boxes are chosen to locate tiny objects. Assuming that the predicted bounding box of tiny objects follows a Gaussian distribution <italic>N</italic><sub><italic>P</italic></sub> and the ground-truth follows a Gaussian distribution <italic>N</italic><sub><italic>G</italic></sub>, each horizontal bounding box uses a four-parameter definition (<italic>x</italic><sub>0</sub>, <italic>y</italic><sub>0</sub>, <italic>w, h</italic>) to represent the center point coordinates and side lengths of the rectangle. Therefore, the calculation relationship between the relative translation (&#x003B4;<sub><italic>x</italic></sub>, &#x003B4;<sub><italic>y</italic></sub>) and the size scaling (&#x003B4;<sub><italic>w</italic></sub>, &#x003B4;<sub><italic>h</italic></sub>) is as follows, which guides the horizontal bounding box of tiny objects in updating its coordinates.</p>
<disp-formula id="E13"><label>(13)</label><mml:math id="M16"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:msub><mml:mi>&#x003B4;</mml:mi><mml:mi>x</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mi>x</mml:mi></mml:msub><mml:mo>&#x02212;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>a</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>/</mml:mo><mml:msub><mml:mi>w</mml:mi><mml:mi>a</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:mtext>&#x000A0;&#x000A0;</mml:mtext><mml:msub><mml:mi>&#x003B4;</mml:mi><mml:mi>y</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mi>y</mml:mi></mml:msub><mml:mo>&#x02212;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>a</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>/</mml:mo><mml:msub><mml:mi>h</mml:mi><mml:mi>a</mml:mi></mml:msub></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mi>&#x003B4;</mml:mi><mml:mi>w</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>log</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mi>w</mml:mi></mml:msub><mml:mo>/</mml:mo><mml:msub><mml:mi>w</mml:mi><mml:mi>a</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo><mml:mtext>&#x000A0;&#x000A0;</mml:mtext><mml:msub><mml:mi>&#x003B4;</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>log</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>/</mml:mo><mml:msub><mml:mi>h</mml:mi><mml:mi>a</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where, (<italic>x</italic><sub><italic>a</italic></sub>, <italic>y</italic><sub><italic>a</italic></sub>, <italic>w</italic><sub><italic>a</italic></sub>, <italic>h</italic><sub><italic>a</italic></sub>) represents an anchor box for the regression process. The differential calculation of the anchor box regression process may result in a very small value for (13). This typically results in regression losses that are much smaller than classification losses. Therefore, normalizing the mean and variance of (&#x003B4;<sub><italic>x</italic></sub>, &#x003B4;<sub><italic>y</italic></sub>, &#x003B4;<sub><italic>w</italic></sub>, &#x003B4;<sub><italic>h</italic></sub>), and incorporating the geometric mean form JS divergence in the previous section into the standard regression loss function, as shown in Equation (14), can avoid the limitations of traditional IoU loss.</p>
<disp-formula id="E14"><label>(14)</label><mml:math id="M17"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>J</mml:mi><mml:mi>S</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>G</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>P</mml:mi></mml:mrow></mml:msub><mml:mo>:</mml:mo><mml:msub><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>G</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
</sec>
</sec>
<sec id="s4">
<title>4. Experiments</title>
<p>Our experiments are conducted on the AI-TOD and DOTA1.0 datasets and compared with advanced general object detectors to verify the effectiveness of the proposed method for remote sensing tiny object detection.</p>
<sec>
<title>4.1. Experimental settings</title>
<p>(1) Dataset: AI-TOD dataset is a remote sensing tiny object dataset with 28,036 images of 800 &#x000D7; 800 pixels, including eight categories and 700,621 tiny objects. These instances are different from objects in other datasets, as the instances have a small number of pixels. Therefore, the dataset is suitable for training and testing the tiny object detector proposed in this article. We abbreviate the AI-TOD object classes as airplane (APL), bridge (BR), storage-tank (ST), ship (SH), swimming-pool (SP), vehicle (VE), person (PE), and wind-mill (WM). The DOTA1.0 dataset is a public large-scale remote sensing image object detection dataset, with 2,806 satellite or aerial images of about 4,000 &#x000D7; 4,000 pixels, including 15 object categories and 188,282 instances. We only use data augmentation on the DOTA1.0 dataset to avoid network training overfitting.</p>
<p>(2) Evaluation Metrics: We use average precision (AP) and mean average precision (mAP) to compare the performance of different detectors. Also, we refer to the evaluation indicators definition in AI-TOD dataset, including <italic>AP</italic> calculation under different IoU thresholds, and the evaluation of different scales of pixels (<italic>AP</italic><sub><italic>vt</italic></sub>, <italic>AP</italic><sub><italic>t</italic></sub>, <italic>AP</italic><sub><italic>s</italic></sub> and <italic>AP</italic><sub><italic>m</italic></sub> represent 2-8 pixels, 8-16 pixels, 16-32 pixels, and 32-64 pixels, respectively), along with the accuracy calculations for each category.</p>
<p>(3) Details: All experiments are performed on a workstation with an NVIDIA RTX 3090 GPU (24G). We use Swin Transformer as the pretrained model for network fine-tuning. During model training, the SGD optimizer is used for gradient descent and updates, the initial learning rate is set to 0.001, and the weight coefficient &#x003B1; is compared across multiple sets of values. The numbers of training steps for the AI-TOD and DOTA datasets are 320K and 360K, respectively; the weight momentum and decay are set to 0.9 and 0.0001, respectively; and the batch size for training each model is set to 4.</p>
</sec>
<sec>
<title>4.2. Ablation studies</title>
<p>To verify the effectiveness of the proposed method composition structure, we conduct ablation analysis on two datasets. <xref ref-type="table" rid="T1">Table 1</xref> shows the results of using <italic>AP</italic><sub>50</sub> ablation to analyze the effect of each component in JSDNet, including the effect of the Transformer structure integrated into the CNN network, the effect of directly using the original JS divergence formula, and the improved effect of using the geometric JS divergence. The comparison shows that the model based on the Transformer backbone can slightly improve the object feature extraction ability. Compared with the baseline algorithm, the AI-TOD and DOTA datasets increase the <italic>AP</italic> value by 5.2% and 2.6% respectively. Compared with the original JS divergence formula, the improved geometric JS divergence with closed-form formula can better improve the performance of object detectors, and the <italic>AP</italic> value is increased by 5.9% and 2.2% respectively. We believe that the JSDM module can greatly improve the detection results. This module provides a more accurate anchor box regression calculation method, which alleviates two shortcomings of IoU threshold calculation (i.e., imbalance in the number of positive and negative samples for tiny objects and imbalance in scale samples). Compared to using the original JS divergence formula, geometric JS divergence belongs to a more accurate closed form, which can reduce the systematic error of numerical calculation, and thus obtain better detection results for tiny objects.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Ablation study on AI-TOD and DOTA datasets.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:&#x00023;919498;color:&#x00023;ffffff">
<th valign="top" align="left"><bold>Component/dataset</bold></th>
<th valign="top" align="center"><bold>Baseline</bold></th>
<th valign="top" align="center" colspan="5"><bold>Different setting of JSDNet</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Swin-trans.</td>
<td/>
<td valign="top" align="center">&#x02713;</td>
<td/>
<td/>
<td valign="top" align="center">&#x02713;</td>
<td valign="top" align="center">&#x02713;</td>
</tr>
<tr>
<td valign="top" align="left">JSDM-Ori.</td>
<td/>
<td/>
<td valign="top" align="center">&#x02713;</td>
<td/>
<td valign="top" align="center">&#x02713;</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">JSDM</td>
<td/>
<td/>
<td/>
<td valign="top" align="center">&#x02713;</td>
<td/>
<td valign="top" align="center">&#x02713;</td>
</tr>
<tr>
<td valign="top" align="left">AI-TOD</td>
<td valign="top" align="center">24.2</td>
<td valign="top" align="center">29.4</td>
<td valign="top" align="center">40.1</td>
<td valign="top" align="center">46.6</td>
<td valign="top" align="center">46.3</td>
<td valign="top" align="center">52.2</td>
</tr>
<tr>
<td valign="top" align="left">DOTA</td>
<td valign="top" align="center">62.0</td>
<td valign="top" align="center">64.6</td>
<td valign="top" align="center">68.5</td>
<td valign="top" align="center">70.7</td>
<td valign="top" align="center">70.9</td>
<td valign="top" align="center">73.1</td>
</tr></tbody>
</table>
</table-wrap>
<p><xref ref-type="table" rid="T2">Table 2</xref> explores the impact of the weight coefficients of geometric JS divergence on detection performance on the AI-TOD dataset. As can be seen, when &#x003B1; &#x0003D; 0.5, the detector was able to achieve the optimal detection effect, with the <italic>AP</italic><sub>50</sub> value reaching 52.2%. The smaller or larger the value of &#x003B1;, the more unbalanced the coupling between the covariance matrices of the two Gaussian distributions. This leads to deviations in the regression constraints and weakens the detection effect. The experiment shows that the improved geometric JS divergence can obtain closed-form calculation results. When the covariance matrices of the two Gaussian distributions are coupled in a balanced manner, better detection results can be obtained, and these results are approximately symmetric.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Effect study of different &#x003B1; values on AI-TOD dataset.</p></caption>
<table frame="box" rules="all">
<tbody>
<tr>
<td valign="top" align="left"><bold>&#x003B1;</bold></td>
<td valign="top" align="center"><bold>&#x003B1; &#x0003D; 0.1</bold></td>
<td valign="top" align="center"><bold>&#x003B1; &#x0003D; 0.2</bold></td>
<td valign="top" align="left"><bold>&#x003B1; &#x0003D; 0.3</bold></td>
<td valign="top" align="left"><bold>&#x003B1; &#x0003D; 0.4</bold></td>
<td valign="top" align="center"><bold>&#x003B1; &#x0003D; 0.5</bold></td>
</tr>
<tr>
<td valign="top" align="left"><italic>AP</italic><sub>50</sub></td>
<td valign="top" align="center">43.2</td>
<td valign="top" align="center">47.1</td>
<td valign="top" align="left">49.7</td>
<td valign="top" align="left">51.6</td>
<td valign="top" align="center">52.2</td>
</tr>
<tr>
<td valign="top" align="left">&#x003B1;</td>
<td valign="top" align="center">&#x003B1; &#x0003D; 0.6</td>
<td valign="top" align="center">&#x003B1; &#x0003D; 0.7</td>
<td valign="top" align="left">&#x003B1; &#x0003D; 0.8</td>
<td valign="top" align="left">&#x003B1; &#x0003D; 0.9</td>
<td/>
</tr>
<tr>
<td valign="top" align="left"><italic>AP</italic><sub>50</sub></td>
<td valign="top" align="center">51.5</td>
<td valign="top" align="center">50.4</td>
<td valign="top" align="left">48.3</td>
<td valign="top" align="left">45.0</td>
<td/>
</tr></tbody>
</table>
</table-wrap>
</sec>
<sec>
<title>4.3. Comparison and discussion</title>
<p>This section evaluates JSDNet and various algorithms on AI-TOD and DOTA datasets.</p>
<p>(1) AI-TOD dataset: We have conducted experiments on some baseline object detectors, including methods with and without anchor boxes. <xref ref-type="table" rid="T3">Table 3</xref> is a comparison of the quantitative results of the algorithms, listing the <italic>AP</italic> value calculation results for different thresholds and scales. It can be seen that the proposed algorithm significantly improves the detection performance of tiny objects in remote sensing. JSDNet achieved 52.2% on the <italic>AP</italic><sub>50</sub> and 13.0% on the <italic>AP</italic><sub>75</sub>, leading other methods, including the GWD and KLD methods under horizontal bounding box detection. CenterNet and YOLOv5 have achieved good results in traditional detectors, but it is clear that these methods are weak for tiny object detection. Experimental results demonstrate the effectiveness of using the analytic form of geometric JS divergence in the measurement of object detection distribution, achieving state-of-the-art performance. The <italic>AP</italic><sub><italic>vt</italic></sub> and <italic>AP</italic><sub><italic>t</italic></sub> represent the evaluation of tiny object detection, with JSDNet reaching 8.6% and 19.3%, respectively, which is better than other methods, indicating that JSDNet can effectively learn the geometric JS divergence representation of remote sensing tiny objects, thereby avoiding the traditional IoU calculation.</p>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Comparison of quantitative results of different indicators on AI-TOD.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:&#x00023;919498;color:&#x00023;ffffff">
<th valign="top" align="left"><bold>Methods</bold></th>
<th valign="top" align="center"><bold>Backbone</bold></th>
<th valign="top" align="center"><bold><italic>AP</italic></bold></th>
<th valign="top" align="center"><bold><italic>AP</italic><sub>50</sub></bold></th>
<th valign="top" align="center"><bold><italic>AP</italic><sub>75</sub></bold></th>
<th valign="top" align="center"><bold><italic>AP</italic><sub><italic>vt</italic></sub></bold></th>
<th valign="top" align="center"><bold><italic>AP</italic><sub><italic>t</italic></sub></bold></th>
<th valign="top" align="center"><bold><italic>AP</italic><sub><italic>s</italic></sub></bold></th>
<th valign="top" align="center"><bold><italic>AP</italic><sub><italic>m</italic></sub></bold></th>
</tr>
</thead>
<tbody>
<tr style="background-color:#dee1e1">
<td valign="top" align="left" colspan="9"><bold>Anchor-free</bold></td>
</tr>
<tr>
<td valign="top" align="left">RepPoints (Yang et al., <xref ref-type="bibr" rid="B33">2019</xref>)</td>
<td valign="top" align="center">Resnet-50</td>
<td valign="top" align="center">9.2</td>
<td valign="top" align="center">23.6</td>
<td valign="top" align="center">5.3</td>
<td valign="top" align="center">2.5</td>
<td valign="top" align="center">9.2</td>
<td valign="top" align="center">12.9</td>
<td valign="top" align="center">14.4</td>
</tr>
<tr>
<td valign="top" align="left">FoveaBox (Kong et al., <xref ref-type="bibr" rid="B13">2020</xref>)</td>
<td valign="top" align="center">Resnet-50</td>
<td valign="top" align="center">11.3</td>
<td valign="top" align="center">28.1</td>
<td valign="top" align="center">7.4</td>
<td valign="top" align="center">1.4</td>
<td valign="top" align="center">8.6</td>
<td valign="top" align="center">17.8</td>
<td valign="top" align="center">32.2</td>
</tr>
<tr>
<td valign="top" align="left">FCOS (Tian et al., <xref ref-type="bibr" rid="B26">2019</xref>)</td>
<td valign="top" align="center">Resnet-50</td>
<td valign="top" align="center">12.0</td>
<td valign="top" align="center">30.2</td>
<td valign="top" align="center">7.3</td>
<td valign="top" align="center">2.2</td>
<td valign="top" align="center">11.1</td>
<td valign="top" align="center">16.6</td>
<td valign="top" align="center">26.9</td>
</tr>
<tr>
<td valign="top" align="left">Grid R-CNN (Lu et al., <xref ref-type="bibr" rid="B22">2019</xref>)</td>
<td valign="top" align="center">Resnet-50</td>
<td valign="top" align="center">14.3</td>
<td valign="top" align="center">31.1</td>
<td valign="top" align="center">11.0</td>
<td valign="top" align="center">0.1</td>
<td valign="top" align="center">11.0</td>
<td valign="top" align="center">25.7</td>
<td valign="top" align="center">36.7</td>
</tr>
<tr style="background-color:#dee1e1">
<td valign="top" align="left" colspan="9"><bold>Two-stage</bold></td>
</tr>
<tr>
<td valign="top" align="left">TridentNet (Li et al., <xref ref-type="bibr" rid="B15">2019</xref>)</td>
<td valign="top" align="center">Resnet-50</td>
<td valign="top" align="center">10.1</td>
<td valign="top" align="center">24.5</td>
<td valign="top" align="center">6.7</td>
<td valign="top" align="center">0.1</td>
<td valign="top" align="center">6.3</td>
<td valign="top" align="center">19.8</td>
<td valign="top" align="center">31.9</td>
</tr>
<tr>
<td valign="top" align="left">Faster R-CNN (Ren et al., <xref ref-type="bibr" rid="B24">2017</xref>)</td>
<td valign="top" align="center">Resnet-50</td>
<td valign="top" align="center">12.8</td>
<td valign="top" align="center">29.9</td>
<td valign="top" align="center">9.4</td>
<td valign="top" align="center">0.0</td>
<td valign="top" align="center">9.2</td>
<td valign="top" align="center">24.6</td>
<td valign="top" align="center">37.0</td>
</tr>
<tr>
<td valign="top" align="left">Cascade R-CNN (Cai and Vasconcelos, <xref ref-type="bibr" rid="B2">2018</xref>)</td>
<td valign="top" align="center">Resnet-50</td>
<td valign="top" align="center">15.1</td>
<td valign="top" align="center">34.2</td>
<td valign="top" align="center">11.2</td>
<td valign="top" align="center">0.1</td>
<td valign="top" align="center">11.5</td>
<td valign="top" align="center">26.7</td>
<td valign="top" align="center">38.5</td>
</tr>
<tr>
<td valign="top" align="left">DetectoRS (Qiao et al., <xref ref-type="bibr" rid="B23">2021</xref>)</td>
<td valign="top" align="center">Resnet-50</td>
<td valign="top" align="center">16.1</td>
<td valign="top" align="center">35.5</td>
<td valign="top" align="center">12.5</td>
<td valign="top" align="center">0.1</td>
<td valign="top" align="center">12.6</td>
<td valign="top" align="center">28.3</td>
<td valign="top" align="center"><bold>40.0</bold></td>
</tr>
<tr style="background-color:#dee1e1">
<td valign="top" align="left" colspan="9"><bold>One-stage</bold></td>
</tr>
<tr>
<td valign="top" align="left">RetinaNet (Lin et al., <xref ref-type="bibr" rid="B17">2020</xref>)</td>
<td valign="top" align="center">Resnet-50</td>
<td valign="top" align="center">8.9</td>
<td valign="top" align="center">24.2</td>
<td valign="top" align="center">4.6</td>
<td valign="top" align="center">2.7</td>
<td valign="top" align="center">8.4</td>
<td valign="top" align="center">13.1</td>
<td valign="top" align="center">20.4</td>
</tr>
<tr>
<td valign="top" align="left">SSD (Liu et al., <xref ref-type="bibr" rid="B19">2016</xref>)</td>
<td valign="top" align="center">VGG-16</td>
<td valign="top" align="center">10.7</td>
<td valign="top" align="center">32.5</td>
<td valign="top" align="center">4.0</td>
<td valign="top" align="center">2.0</td>
<td valign="top" align="center">8.7</td>
<td valign="top" align="center">16.8</td>
<td valign="top" align="center">28.0</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv5 (Bochkovskiy et al., <xref ref-type="bibr" rid="B1">2020</xref>)</td>
<td valign="top" align="center">DarkNet-53</td>
<td valign="top" align="center">11.5</td>
<td valign="top" align="center">36.6</td>
<td valign="top" align="center">4.7</td>
<td valign="top" align="center">3.5</td>
<td valign="top" align="center">9.1</td>
<td valign="top" align="center">19.2</td>
<td valign="top" align="center">27.2</td>
</tr>
<tr>
<td valign="top" align="left">CenterNet (Duan et al., <xref ref-type="bibr" rid="B6">2019</xref>)</td>
<td valign="top" align="center">DLA-34</td>
<td valign="top" align="center">16.7</td>
<td valign="top" align="center">37.1</td>
<td valign="top" align="center">3.7</td>
<td valign="top" align="center">2.8</td>
<td valign="top" align="center">10.1</td>
<td valign="top" align="center">15.5</td>
<td valign="top" align="center">18.0</td>
</tr>
<tr>
<td valign="top" align="left">GWD-hor (Yang et al., <xref ref-type="bibr" rid="B30">2021a</xref>)</td>
<td valign="top" align="center">Resnet-101</td>
<td valign="top" align="center">17.0</td>
<td valign="top" align="center">41.9</td>
<td valign="top" align="center">7.8</td>
<td valign="top" align="center">4.4</td>
<td valign="top" align="center">15.3</td>
<td valign="top" align="center">22.7</td>
<td valign="top" align="center">28.8</td>
</tr>
<tr>
<td valign="top" align="left">KLD-hor (Yang et al., <xref ref-type="bibr" rid="B31">2021b</xref>)</td>
<td valign="top" align="center">Resnet-101</td>
<td valign="top" align="center">17.7</td>
<td valign="top" align="center">44.3</td>
<td valign="top" align="center">11.3</td>
<td valign="top" align="center">4.8</td>
<td valign="top" align="center">17.1</td>
<td valign="top" align="center">23.6</td>
<td valign="top" align="center">30.3</td>
</tr>
<tr>
<td valign="top" align="left">JSDNet (ours)</td>
<td valign="top" align="center">Resnet-50</td>
<td valign="top" align="center">18.2</td>
<td valign="top" align="center">46.6</td>
<td valign="top" align="center">10.5</td>
<td valign="top" align="center">5.4</td>
<td valign="top" align="center">15.9</td>
<td valign="top" align="center">24.4</td>
<td valign="top" align="center">31.6</td>
</tr>
<tr>
<td valign="top" align="left">JSDNet (ours)</td>
<td valign="top" align="center">Resnet-101</td>
<td valign="top" align="center">19.8</td>
<td valign="top" align="center">49.4</td>
<td valign="top" align="center">11.6</td>
<td valign="top" align="center">7.3</td>
<td valign="top" align="center">18.7</td>
<td valign="top" align="center">26.4</td>
<td valign="top" align="center">32.4</td>
</tr>
<tr>
<td valign="top" align="left">JSDNet (ours)</td>
<td valign="top" align="center">Swin-Trans</td>
<td valign="top" align="center"><bold>21.4</bold></td>
<td valign="top" align="center"><bold>52.2</bold></td>
<td valign="top" align="center"><bold>13.0</bold></td>
<td valign="top" align="center"><bold>8.6</bold></td>
<td valign="top" align="center"><bold>19.3</bold></td>
<td valign="top" align="center"><bold>29.0</bold></td>
<td valign="top" align="center">35.7</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Bold values indicate the maximum value of the vertical column.</p>
</table-wrap-foot>
</table-wrap>
<p><xref ref-type="table" rid="T4">Table 4</xref> shows the detection results for eight object categories in the AI-TOD dataset. The proposed method is leading in terms of effectiveness in six categories, only second to the optimal results in the other two categories. The horizontal bounding box detection results using Wasserstein distance and KL divergence for distance measurement are listed in the table. Although they have also achieved good results, they do not have closed-form expressions in information geometry, resulting in errors in the similarity measurements. Therefore, using the geometric JS divergence method achieves better detection performance. In addition, in some challenging object categories, such as SP, PE, WM, etc., JSDNet has advantages in detection effectiveness. The distribution of samples in these categories is uneven, and the background around the object is complex. Therefore, all methods have obtained lower AP values. <xref ref-type="fig" rid="F4">Figure 4</xref> shows some qualitative reasoning results for JSDNet. It is worth noting that JSDNet can accurately detect densely distributed tiny objects, such as vehicles, ships, storage tanks, and so on. Although JSDNet uses a horizontal bounding box, from the visual effect, the horizontal box is more suitable for positioning tiny objects in remote sensing images, and using a rotated box has little significance.</p>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>Comparison of quantitative results of different categories on AI-TOD.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:&#x00023;919498;color:&#x00023;ffffff">
<th valign="top" align="left"><bold>Methods</bold></th>
<th valign="top" align="left"><bold>Backbone</bold></th>
<th valign="top" align="center"><bold>APL</bold></th>
<th valign="top" align="center"><bold>BR</bold></th>
<th valign="top" align="center"><bold>ST</bold></th>
<th valign="top" align="center"><bold>SH</bold></th>
<th valign="top" align="center"><bold>SP</bold></th>
<th valign="top" align="center"><bold>VE</bold></th>
<th valign="top" align="center"><bold>PE</bold></th>
<th valign="top" align="center"><bold>WM</bold></th>
<th valign="top" align="center"><bold><italic>AP</italic><sub>50</sub></bold></th>
</tr>
</thead>
<tbody>
<tr style="background-color:#dee1e1">
<td valign="top" align="left" colspan="11"><bold>Anchor-free</bold></td>
</tr>
<tr>
<td valign="top" align="left">RepPoints (Yang et al., <xref ref-type="bibr" rid="B33">2019</xref>)</td>
<td valign="top" align="center">Resnet-50</td>
<td valign="top" align="center">0.0</td>
<td valign="top" align="center">0.1</td>
<td valign="top" align="center">22.5</td>
<td valign="top" align="center">28.8</td>
<td valign="top" align="center">0.2</td>
<td valign="top" align="center">18.3</td>
<td valign="top" align="center">4.1</td>
<td valign="top" align="center">0.0</td>
<td valign="top" align="center">23.6</td>
</tr>
<tr>
<td valign="top" align="left">FoveaBox (Kong et al., <xref ref-type="bibr" rid="B13">2020</xref>)</td>
<td valign="top" align="center">Resnet-50</td>
<td valign="top" align="center">15.6</td>
<td valign="top" align="center">3.3</td>
<td valign="top" align="center">21.1</td>
<td valign="top" align="center">20.8</td>
<td valign="top" align="center">9.7</td>
<td valign="top" align="center">16.3</td>
<td valign="top" align="center">4.0</td>
<td valign="top" align="center">0.0</td>
<td valign="top" align="center">28.1</td>
</tr>
<tr>
<td valign="top" align="left">FCOS (Tian et al., <xref ref-type="bibr" rid="B26">2019</xref>)</td>
<td valign="top" align="center">Resnet-50</td>
<td valign="top" align="center">7.2</td>
<td valign="top" align="center">13.4</td>
<td valign="top" align="center">20.2</td>
<td valign="top" align="center">26.7</td>
<td valign="top" align="center">8.4</td>
<td valign="top" align="center">16.3</td>
<td valign="top" align="center">3.5</td>
<td valign="top" align="center">0.0</td>
<td valign="top" align="center">30.2</td>
</tr>
<tr>
<td valign="top" align="left">Grid R-CNN (Lu et al., <xref ref-type="bibr" rid="B22">2019</xref>)</td>
<td valign="top" align="center">Resnet-50</td>
<td valign="top" align="center">24.5</td>
<td valign="top" align="center">11.7</td>
<td valign="top" align="center">20.9</td>
<td valign="top" align="center">23.5</td>
<td valign="top" align="center">12.1</td>
<td valign="top" align="center">16.1</td>
<td valign="top" align="center">5.1</td>
<td valign="top" align="center">0.4</td>
<td valign="top" align="center">31.1</td>
</tr>
<tr style="background-color:#dee1e1">
<td valign="top" align="left" colspan="11"><bold>Two-stage</bold></td>
</tr>
<tr>
<td valign="top" align="left">TridentNet (Li et al., <xref ref-type="bibr" rid="B15">2019</xref>)</td>
<td valign="top" align="center">Resnet-50</td>
<td valign="top" align="center">19.3</td>
<td valign="top" align="center">0.1</td>
<td valign="top" align="center">17.2</td>
<td valign="top" align="center">16.2</td>
<td valign="top" align="center">12.4</td>
<td valign="top" align="center">12.5</td>
<td valign="top" align="center">3.4</td>
<td valign="top" align="center">0.0</td>
<td valign="top" align="center">24.5</td>
</tr>
<tr>
<td valign="top" align="left">Faster R-CNN (Ren et al., <xref ref-type="bibr" rid="B24">2017</xref>)</td>
<td valign="top" align="center">Resnet-50</td>
<td valign="top" align="center">19.7</td>
<td valign="top" align="center">4.8</td>
<td valign="top" align="center">19.0</td>
<td valign="top" align="center">19.9</td>
<td valign="top" align="center">3.7</td>
<td valign="top" align="center">14.4</td>
<td valign="top" align="center">4.8</td>
<td valign="top" align="center">0.0</td>
<td valign="top" align="center">29.9</td>
</tr>
<tr>
<td valign="top" align="left">Cascade R-CNN (Cai and Vasconcelos, <xref ref-type="bibr" rid="B2">2018</xref>)</td>
<td valign="top" align="center">Resnet-50</td>
<td valign="top" align="center">26.2</td>
<td valign="top" align="center">9.6</td>
<td valign="top" align="center">24.0</td>
<td valign="top" align="center">24.3</td>
<td valign="top" align="center">13.2</td>
<td valign="top" align="center">17.5</td>
<td valign="top" align="center">5.8</td>
<td valign="top" align="center">0.1</td>
<td valign="top" align="center">34.2</td>
</tr>
<tr>
<td valign="top" align="left">DetectoRS (Qiao et al., <xref ref-type="bibr" rid="B23">2021</xref>)</td>
<td valign="top" align="center">Resnet-50</td>
<td valign="top" align="center">28.5</td>
<td valign="top" align="center">11.7</td>
<td valign="top" align="center">23.2</td>
<td valign="top" align="center">26.4</td>
<td valign="top" align="center">14.9</td>
<td valign="top" align="center">17.6</td>
<td valign="top" align="center">6.5</td>
<td valign="top" align="center">0.2</td>
<td valign="top" align="center">35.5</td>
</tr>
<tr style="background-color:#dee1e1">
<td valign="top" align="left" colspan="11"><bold>One-stage</bold></td>
</tr>
<tr>
<td valign="top" align="left">RetinaNet (Lin et al., <xref ref-type="bibr" rid="B17">2020</xref>)</td>
<td valign="top" align="center">Resnet-50</td>
<td valign="top" align="center">1.3</td>
<td valign="top" align="center">11.8</td>
<td valign="top" align="center">14.3</td>
<td valign="top" align="center">23.6</td>
<td valign="top" align="center">5.8</td>
<td valign="top" align="center">11.4</td>
<td valign="top" align="center">2.3</td>
<td valign="top" align="center">0.5</td>
<td valign="top" align="center">24.2</td>
</tr>
<tr>
<td valign="top" align="left">SSD (Liu et al., <xref ref-type="bibr" rid="B19">2016</xref>)</td>
<td valign="top" align="center">VGG-16</td>
<td valign="top" align="center">14.9</td>
<td valign="top" align="center">9.6</td>
<td valign="top" align="center">13.2</td>
<td valign="top" align="center">18.2</td>
<td valign="top" align="center">10.6</td>
<td valign="top" align="center">12.7</td>
<td valign="top" align="center">2.9</td>
<td valign="top" align="center">3.1</td>
<td valign="top" align="center">32.5</td>
</tr>
<tr>
<td valign="top" align="left">YOLOv5 (Bochkovskiy et al., <xref ref-type="bibr" rid="B1">2020</xref>)</td>
<td valign="top" align="center">DarkNet-53</td>
<td valign="top" align="center">19.6</td>
<td valign="top" align="center">10.7</td>
<td valign="top" align="center">11.3</td>
<td valign="top" align="center">22.0</td>
<td valign="top" align="center">9.2</td>
<td valign="top" align="center">14.3</td>
<td valign="top" align="center">3.7</td>
<td valign="top" align="center">0.9</td>
<td valign="top" align="center">36.6</td>
</tr>
<tr>
<td valign="top" align="left">CenterNet (Duan et al., <xref ref-type="bibr" rid="B6">2019</xref>)</td>
<td valign="top" align="center">DLA-34</td>
<td valign="top" align="center">29.2</td>
<td valign="top" align="center">13.1</td>
<td valign="top" align="center">22.9</td>
<td valign="top" align="center">27.7</td>
<td valign="top" align="center"><bold>15.6</bold></td>
<td valign="top" align="center">19.0</td>
<td valign="top" align="center">7.2</td>
<td valign="top" align="center">0.2</td>
<td valign="top" align="center">37.1</td>
</tr>
<tr>
<td valign="top" align="left">GWD-hor (Yang et al., <xref ref-type="bibr" rid="B30">2021a</xref>)</td>
<td valign="top" align="center">Resnet-101</td>
<td valign="top" align="center">26.3</td>
<td valign="top" align="center">12.6</td>
<td valign="top" align="center">28.1</td>
<td valign="top" align="center">25.5</td>
<td valign="top" align="center">13.1</td>
<td valign="top" align="center">21.3</td>
<td valign="top" align="center">5.9</td>
<td valign="top" align="center">3.5</td>
<td valign="top" align="center">41.9</td>
</tr>
<tr>
<td valign="top" align="left">KLD-hor (Yang et al., <xref ref-type="bibr" rid="B31">2021b</xref>)</td>
<td valign="top" align="center">Resnet-101</td>
<td valign="top" align="center">25.1</td>
<td valign="top" align="center">13.8</td>
<td valign="top" align="center">28.9</td>
<td valign="top" align="center">27.4</td>
<td valign="top" align="center">14.3</td>
<td valign="top" align="center">22.0</td>
<td valign="top" align="center">6.2</td>
<td valign="top" align="center">4.1</td>
<td valign="top" align="center">44.3</td>
</tr>
<tr>
<td valign="top" align="left">JSDNet (ours)</td>
<td valign="top" align="center">Resnet-50</td>
<td valign="top" align="center">25.8</td>
<td valign="top" align="center">15.8</td>
<td valign="top" align="center">30.4</td>
<td valign="top" align="center">29.7</td>
<td valign="top" align="center">12.5</td>
<td valign="top" align="center">20.6</td>
<td valign="top" align="center">6.0</td>
<td valign="top" align="center">4.9</td>
<td valign="top" align="center">46.6</td>
</tr>
<tr>
<td valign="top" align="left">JSDNet (ours)</td>
<td valign="top" align="center">Resnet-101</td>
<td valign="top" align="center">27.1</td>
<td valign="top" align="center"><bold>16.4</bold></td>
<td valign="top" align="center">33.6</td>
<td valign="top" align="center">31.5</td>
<td valign="top" align="center">13.9</td>
<td valign="top" align="center">23.0</td>
<td valign="top" align="center">7.2</td>
<td valign="top" align="center">5.7</td>
<td valign="top" align="center">49.4</td>
</tr>
<tr>
<td valign="top" align="left">JSDNet (ours)</td>
<td valign="top" align="center">Swin-Trans</td>
<td valign="top" align="center"><bold>29.9</bold></td>
<td valign="top" align="center">16.2</td>
<td valign="top" align="center"><bold>34.4</bold></td>
<td valign="top" align="center"><bold>33.0</bold></td>
<td valign="top" align="center">14.7</td>
<td valign="top" align="center"><bold>26.5</bold></td>
<td valign="top" align="center"><bold>8.6</bold></td>
<td valign="top" align="center"><bold>7.9</bold></td>
<td valign="top" align="center"><bold>52.2</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Bold values indicate the maximum value of the vertical column.</p>
</table-wrap-foot>
</table-wrap>
<fig id="F4" position="float">
<label>Figure 4</label>
<caption><p>Qualitative inference results of JSDNet on AI-TOD. <bold>(A)</bold> Airplane. <bold>(B)</bold> Bridge. <bold>(C)</bold> Storage tank. <bold>(D)</bold> Ship. <bold>(E)</bold> Swimming pool. <bold>(F)</bold> Vehicle. <bold>(G)</bold> Person. <bold>(H)</bold> Wind mill.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-17-1273251-g0004.tif"/>
</fig>
<p>(2) DOTA dataset: <xref ref-type="table" rid="T5">Table 5</xref> lists the detection results of JSDNet and some baseline algorithms on the DOTA dataset. When using Resnet-50 as the backbone network, the detection results of this method are still good, with <italic>AP</italic><sub>50</sub> achieving 70.7%. In terms of the <italic>AP</italic><sub><italic>vt</italic></sub> and <italic>AP</italic><sub><italic>t</italic></sub> indicators, some general detectors performed weakly. We believe that this is due to the impact of IoU calculation and threshold setting for tiny objects, while GWD, KLD and JSDNet with horizontal bounding boxes have improved this issue somewhat. When using Swin Transformer as the backbone network, JSDNet can extract features of tiny objects more sufficiently, improving the detection results. The <italic>AP</italic><sub>50</sub> achieved 73.1%. <xref ref-type="fig" rid="F5">Figure 5</xref> shows the visual results of JSDNet on the DOTA test set. JSDNet can accurately regress the spatial location information of tiny objects. The figure shows the detection effect of bridges and airplanes, which belong to smaller objects in the dataset and can still be accurately located.</p>
<table-wrap position="float" id="T5">
<label>Table 5</label>
<caption><p>Comparison of quantitative results on DOTA.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:&#x00023;919498;color:&#x00023;ffffff">
<th valign="top" align="left"><bold>Methods</bold></th>
<th valign="top" align="center"><bold>Backbone</bold></th>
<th valign="top" align="center"><bold><italic>AP</italic></bold></th>
<th valign="top" align="center"><bold><italic>AP</italic><sub>50</sub></bold></th>
<th valign="top" align="center"><bold><italic>AP</italic><sub>75</sub></bold></th>
<th valign="top" align="center"><bold><italic>AP</italic><sub><italic>vt</italic></sub></bold></th>
<th valign="top" align="center"><bold><italic>AP</italic><sub><italic>t</italic></sub></bold></th>
<th valign="top" align="center"><bold><italic>AP</italic><sub><italic>s</italic></sub></bold></th>
<th valign="top" align="center"><bold><italic>AP</italic><sub><italic>m</italic></sub></bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Faster R-CNN (Ren et al., <xref ref-type="bibr" rid="B24">2017</xref>)</td>
<td valign="top" align="center">Resnet-50</td>
<td valign="top" align="center">35.6</td>
<td valign="top" align="center">59.5</td>
<td valign="top" align="center">37.2</td>
<td valign="top" align="center">0.0</td>
<td valign="top" align="center">7.1</td>
<td valign="top" align="center">28.9</td>
<td valign="top" align="center">42.1</td>
</tr>
<tr>
<td valign="top" align="left">Cascade R-CNN (Cai and Vasconcelos, <xref ref-type="bibr" rid="B2">2018</xref>)</td>
<td valign="top" align="center">Resnet-50</td>
<td valign="top" align="center">37.0</td>
<td valign="top" align="center">59.5</td>
<td valign="top" align="center">39.6</td>
<td valign="top" align="center">0.0</td>
<td valign="top" align="center">5.9</td>
<td valign="top" align="center">28.4</td>
<td valign="top" align="center">44.0</td>
</tr>
<tr>
<td valign="top" align="left">DetectoRS (Qiao et al., <xref ref-type="bibr" rid="B23">2021</xref>)</td>
<td valign="top" align="center">Resnet-50</td>
<td valign="top" align="center">40.8</td>
<td valign="top" align="center">62.6</td>
<td valign="top" align="center">44.4</td>
<td valign="top" align="center">0.0</td>
<td valign="top" align="center">7.0</td>
<td valign="top" align="center">29.9</td>
<td valign="top" align="center">47.8</td>
</tr>
<tr>
<td valign="top" align="left">RetinaNet (Lin et al., <xref ref-type="bibr" rid="B17">2020</xref>)</td>
<td valign="top" align="center">Resnet-50</td>
<td valign="top" align="center">40.5</td>
<td valign="top" align="center">62.0</td>
<td valign="top" align="center">43.9</td>
<td valign="top" align="center">0.1</td>
<td valign="top" align="center">6.5</td>
<td valign="top" align="center">30.2</td>
<td valign="top" align="center">46.7</td>
</tr>
<tr>
<td valign="top" align="left">GWD-hor (Yang et al., <xref ref-type="bibr" rid="B30">2021a</xref>)</td>
<td valign="top" align="center">Resnet-101</td>
<td valign="top" align="center">41.1</td>
<td valign="top" align="center">63.8</td>
<td valign="top" align="center">43.3</td>
<td valign="top" align="center">0.4</td>
<td valign="top" align="center">8.5</td>
<td valign="top" align="center">30.5</td>
<td valign="top" align="center">48.7</td>
</tr>
<tr>
<td valign="top" align="left">KLD-hor (Yang et al., <xref ref-type="bibr" rid="B31">2021b</xref>)</td>
<td valign="top" align="center">Resnet-101</td>
<td valign="top" align="center">41.8</td>
<td valign="top" align="center">67.2</td>
<td valign="top" align="center">44.2</td>
<td valign="top" align="center">0.8</td>
<td valign="top" align="center">9.4</td>
<td valign="top" align="center">32.0</td>
<td valign="top" align="center">50.1</td>
</tr>
<tr>
<td valign="top" align="left">JSDNet (ours)</td>
<td valign="top" align="center">Resnet-50</td>
<td valign="top" align="center">43.4</td>
<td valign="top" align="center">70.7</td>
<td valign="top" align="center">44.9</td>
<td valign="top" align="center">1.3</td>
<td valign="top" align="center">10.1</td>
<td valign="top" align="center">32.4</td>
<td valign="top" align="center">50.3</td>
</tr>
<tr>
<td valign="top" align="left">JSDNet (ours)</td>
<td valign="top" align="center">Swin-Trans</td>
<td valign="top" align="center"><bold>45.2</bold></td>
<td valign="top" align="center"><bold>73.1</bold></td>
<td valign="top" align="center"><bold>47.0</bold></td>
<td valign="top" align="center"><bold>1.7</bold></td>
<td valign="top" align="center"><bold>12.9</bold></td>
<td valign="top" align="center"><bold>34.2</bold></td>
<td valign="top" align="center"><bold>52.4</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Bold values indicate the maximum value of the vertical column.</p>
</table-wrap-foot>
</table-wrap>
<fig id="F5" position="float">
<label>Figure 5</label>
<caption><p>Qualitative inference results of JSDNet on DOTA. <bold>(A)</bold> Bridge. <bold>(B)</bold> Plane.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-17-1273251-g0005.tif"/>
</fig></sec>
</sec>
<sec sec-type="conclusions" id="s5">
<title>5. Conclusion</title>
<p>The sensitivity of tiny object detection in remote sensing images to the IoU threshold and the IoU calculation process makes a robust tiny object detector particularly important. A small position offset leads to a large change in the IoU value. Therefore, this article has adopted the closed-form geometric JS divergence representation of tiny objects as the similarity measure for the bounding-box distribution. In this article, the Swin Transformer model is adaptively integrated into the tiny object detection network to efficiently extract tiny features. The JSDM module is based on the Gaussian distribution modeling of the ground-truth and anchor box, and then the geometric JS divergence with the closed-form formula is applied to measure the distribution distance. The ablation and comparison experiments have been carried out on the AI-TOD and DOTA datasets, and the results show that the proposed JSDNet can effectively improve the performance of remote sensing tiny object detection and can fully learn the geometric JS divergence representation of tiny objects.</p>
</sec>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>SN: Conceptualization, Resources, Writing&#x02014;review &#x00026; editing. CL: Methodology, Writing&#x02014;review &#x00026; editing. HW: Software, Writing&#x02014;original draft. YLi: Validation, Writing&#x02014;review &#x00026; editing. YLia: Formal analysis, Investigation, Writing&#x02014;review &#x00026; editing. NL: Writing&#x02014;review &#x00026; editing.</p>
</sec>
</body>
<back>
<sec sec-type="funding-information" id="s8">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. This work was supported by the National Natural Science Foundation of China under Grant Nos. 61805283, 61805284, and 61906213.</p>
</sec>
<ack><p>The authors would like to thank all reviewers and editors for their comments on this study.</p>
</ack>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s9">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bochkovskiy</surname> <given-names>A.</given-names></name> <name><surname>Wang</surname> <given-names>C.-Y.</given-names></name> <name><surname>Liao</surname> <given-names>H.-Y. M.</given-names></name></person-group> (<year>2020</year>). <article-title>YOLOv4: Optimal speed and accuracy of object detection</article-title>. <source>arXiv</source> [preprint]. <pub-id pub-id-type="doi">10.48550/arXiv.2004.10934</pub-id></citation>
</ref>
<ref id="B2">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Cai</surname> <given-names>Z.</given-names></name> <name><surname>Vasconcelos</surname> <given-names>N.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;Cascade r-cnn: delving into high quality object detection,&#x0201D;</article-title> in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Piscataway</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>6154</fpage>&#x02013;<lpage>6162</lpage>.</citation>
</ref>
<ref id="B3">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cui</surname> <given-names>L.</given-names></name> <name><surname>Lv</surname> <given-names>P.</given-names></name> <name><surname>Jiang</surname> <given-names>X.</given-names></name> <name><surname>Gao</surname> <given-names>Z.</given-names></name> <name><surname>Zhou</surname> <given-names>B.</given-names></name> <name><surname>Zhang</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Context-aware block net for small object detection</article-title>. <source>IEEE Trans. Cybern</source>. <volume>52</volume>, <fpage>2300</fpage>&#x02013;<lpage>2313</lpage>. <pub-id pub-id-type="doi">10.1109/TCYB.2020.3004636</pub-id><pub-id pub-id-type="pmid">32721905</pub-id></citation></ref>
<ref id="B4">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Deasy</surname> <given-names>J.</given-names></name> <name><surname>Simidjievski</surname> <given-names>N.</given-names></name> <name><surname>Li&#x000F2;</surname> <given-names>P.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Constraining variational inference with geometric jensen-shannon divergence,&#x0201D;</article-title> in <source>Proceedings of Advances in Neural Information Processing Systems</source> (<publisher-loc>Massachusetts</publisher-loc>: <publisher-name>Cambridge</publisher-name>).</citation>
</ref>
<ref id="B5">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Deng</surname> <given-names>C.</given-names></name> <name><surname>Wang</surname> <given-names>M.</given-names></name> <name><surname>Liu</surname> <given-names>L.</given-names></name> <name><surname>Liu</surname> <given-names>L.</given-names></name> <name><surname>Jiang</surname> <given-names>Y.</given-names></name></person-group> (<year>2022</year>). <article-title>Extended feature pyramid network for small object detection</article-title>. <source>IEEE Trans. Multimedia</source> <volume>24</volume>, <fpage>1968</fpage>&#x02013;<lpage>1979</lpage>. <pub-id pub-id-type="doi">10.1109/TMM.2021.3074273</pub-id><pub-id pub-id-type="pmid">36137470</pub-id></citation></ref>
<ref id="B6">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Duan</surname> <given-names>K.</given-names></name> <name><surname>Bai</surname> <given-names>S.</given-names></name> <name><surname>Xie</surname> <given-names>L.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;Centernet: keypoint triplets for object detection,&#x0201D;</article-title> in <source>Proceedings of IEEE International Conference on Computer Vision</source> (<publisher-loc>Piscataway</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>6568</fpage>&#x02013;<lpage>6577</lpage>.</citation>
</ref>
<ref id="B7">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Feng</surname> <given-names>X.</given-names></name> <name><surname>Han</surname> <given-names>J.</given-names></name> <name><surname>Yao</surname> <given-names>X.</given-names></name> <name><surname>Cheng</surname> <given-names>G.</given-names></name></person-group> (<year>2021</year>). <article-title>Tcanet: triple context-aware network for weakly supervised object detection in remote sensing images</article-title>. <source>IEEE Trans. Geosci. Remote Sens</source>. <volume>59</volume>, <fpage>6946</fpage>&#x02013;<lpage>6955</lpage>. <pub-id pub-id-type="doi">10.1109/TGRS.2020.3030990</pub-id></citation>
</ref>
<ref id="B8">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Nielsen</surname> <given-names>F.</given-names></name></person-group> (<year>2019</year>). <article-title>On the jensen&#x02013;shannon symmetrization of distances relying on abstract means</article-title>. <source>Entropy</source> <volume>21</volume>, <fpage>1</fpage>&#x02013;<lpage>23</lpage>. <pub-id pub-id-type="doi">10.3390/e21050485</pub-id></citation>
</ref>
<ref id="B9">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Ge</surname> <given-names>Z.</given-names></name> <name><surname>Liu</surname> <given-names>S.</given-names></name> <name><surname>Li</surname> <given-names>Z.</given-names></name> <name><surname>Yoshie</surname> <given-names>O.</given-names></name> <name><surname>Sun</surname> <given-names>J.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Ota: optimal transport assignment for object detection,&#x0201D;</article-title> in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Piscataway</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>303</fpage>&#x02013;<lpage>312</lpage>.</citation>
</ref>
<ref id="B10">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Han</surname> <given-names>W.</given-names></name> <name><surname>Kuerban</surname> <given-names>A.</given-names></name> <name><surname>Yang</surname> <given-names>Y.</given-names></name> <name><surname>Huang</surname> <given-names>Z.</given-names></name> <name><surname>Liu</surname> <given-names>B.</given-names></name> <name><surname>Gao</surname> <given-names>J.</given-names></name></person-group> (<year>2022</year>). <article-title>Multi-vision network for accurate and real-time small object detection in optical remote sensing images</article-title>. <source>IEEE Geosci. Remote Sens. Lett</source>. <volume>19</volume>, <fpage>1</fpage>&#x02013;<lpage>5</lpage>. <pub-id pub-id-type="doi">10.1109/LGRS.2020.3044422</pub-id></citation>
</ref>
<ref id="B11">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kim</surname> <given-names>J.</given-names></name> <name><surname>Hwang</surname> <given-names>Y.</given-names></name></person-group> (<year>2022</year>). <article-title>Gan-based synthetic data augmentation for infrared small target detection</article-title>. <source>IEEE Trans. Geosci. Remote Sens</source>. <volume>60</volume>, <fpage>1</fpage>&#x02013;<lpage>12</lpage>. <pub-id pub-id-type="doi">10.1109/TGRS.2022.3179891</pub-id></citation>
</ref>
<ref id="B12">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Kim</surname> <given-names>K.</given-names></name> <name><surname>Lee</surname> <given-names>H.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Probabilistic anchor assignment with iou prediction for object detection,&#x0201D;</article-title> in <source>Proceedings of the European Conference on Computer Vision</source> (<publisher-loc>Berlin</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>355</fpage>&#x02013;<lpage>371</lpage>.</citation>
</ref>
<ref id="B13">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kong</surname> <given-names>T.</given-names></name> <name><surname>Sun</surname> <given-names>F.</given-names></name> <name><surname>Liu</surname> <given-names>H.</given-names></name> <name><surname>Jiang</surname> <given-names>Y.</given-names></name> <name><surname>Li</surname> <given-names>L.</given-names></name> <name><surname>Shi</surname> <given-names>J.</given-names></name></person-group> (<year>2020</year>). <article-title>Foveabox: beyound anchor-based object detection</article-title>. <source>IEEE Trans. Image Process</source>. <volume>29</volume>, <fpage>7389</fpage>&#x02013;<lpage>7398</lpage>. <pub-id pub-id-type="doi">10.1109/TIP.2020.3002345</pub-id></citation>
</ref>
<ref id="B14">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Leng</surname> <given-names>J.</given-names></name> <name><surname>Ren</surname> <given-names>Y.</given-names></name> <name><surname>Jiang</surname> <given-names>W.</given-names></name> <name><surname>Sun</surname> <given-names>X.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name></person-group> (<year>2021</year>). <article-title>Realize your surroundings: exploiting context information for small object detection</article-title>. <source>Neurocomputing</source> <volume>433</volume>:<fpage>287</fpage>&#x02013;<lpage>299</lpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2020.12.093</pub-id></citation>
</ref>
<ref id="B15">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Chen</surname> <given-names>Y.</given-names></name> <name><surname>Wang</surname> <given-names>N.</given-names></name> <name><surname>Zhang</surname> <given-names>Z.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;Scale-aware trident networks for object detection,&#x0201D;</article-title> in <source>Proceedings of IEEE International Conference on Computer Vision</source> (<publisher-loc>Piscataway</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>6054</fpage>&#x02013;<lpage>6063</lpage>.</citation>
</ref>
<ref id="B16">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Huang</surname> <given-names>Q.</given-names></name> <name><surname>Pei</surname> <given-names>X.</given-names></name></person-group> (<year>2021</year>). <article-title>Cross-layer attention network for small object detection in remote sensing imagery</article-title>. <source>IEEE J. Sel. Top. Appl. Earth Obs. Remote Sens</source>. <volume>14</volume>, <fpage>2148</fpage>&#x02013;<lpage>2161</lpage>. <pub-id pub-id-type="doi">10.1109/JSTARS.2020.3046482</pub-id></citation>
</ref>
<ref id="B17">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lin</surname> <given-names>T.</given-names></name> <name><surname>Goyal</surname> <given-names>P.</given-names></name> <name><surname>Girshick</surname> <given-names>R.</given-names></name> <name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Dollar</surname> <given-names>P.</given-names></name></person-group> (<year>2020</year>). <article-title>Focal loss for dense object detection</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell</source>. <volume>42</volume>, <fpage>317</fpage>&#x02013;<lpage>328</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2018.2858826</pub-id><pub-id pub-id-type="pmid">30040631</pub-id></citation></ref>
<ref id="B18">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>L.</given-names></name> <name><surname>Ouyang</surname> <given-names>W.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Fieguth</surname> <given-names>P.</given-names></name> <name><surname>Chen</surname> <given-names>J.</given-names></name> <name><surname>Liu</surname> <given-names>X.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Deep learning for generic object detection: a survey</article-title>. <source>Int. J. Comput. Vision</source> <volume>128</volume>, <fpage>261</fpage>&#x02013;<lpage>318</lpage>. <pub-id pub-id-type="doi">10.1007/s11263-019-01247-4</pub-id></citation>
</ref>
<ref id="B19">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>W.</given-names></name> <name><surname>Anguelov</surname> <given-names>D.</given-names></name> <name><surname>Erhan</surname> <given-names>D.</given-names></name></person-group> (<year>2016</year>). <article-title>&#x0201C;SSD: single shot multibox detector,&#x0201D;</article-title> in <source>Proceedings of the European Conference on Computer Vision</source> (<publisher-loc>Berlin</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>21</fpage>&#x02013;<lpage>37</lpage>.</citation>
</ref>
<ref id="B20">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>Z.</given-names></name> <name><surname>Lin</surname> <given-names>Y.</given-names></name> <name><surname>Cao</surname> <given-names>Y.</given-names></name> <name><surname>Hu</surname> <given-names>H.</given-names></name> <name><surname>Wei</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>&#x0201C;Swin transformer: hierarchical vision transformer using shifted windows,&#x0201D;</article-title> in <source>Proceedings of the IEEE Conference on Computer Vision</source>, <fpage>10012</fpage>&#x02013;<lpage>10022</lpage>.</citation>
</ref>
<ref id="B21">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lu</surname> <given-names>W.</given-names></name> <name><surname>Lan</surname> <given-names>C.</given-names></name> <name><surname>Niu</surname> <given-names>C.</given-names></name></person-group> (<year>2023</year>). <article-title>A cnn-transformer hybrid model based on cswin transformer for uav image object detection</article-title>. <source>IEEE J. Sel. Top. Appl. Earth Obs. Remote Sens</source>. <volume>16</volume>, <fpage>1211</fpage>&#x02013;<lpage>1231</lpage>. <pub-id pub-id-type="doi">10.1109/JSTARS.2023.3234161</pub-id></citation>
</ref>
<ref id="B22">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Lu</surname> <given-names>X.</given-names></name> <name><surname>Li</surname> <given-names>B.</given-names></name> <name><surname>Yue</surname> <given-names>Y.</given-names></name> <name><surname>Li</surname> <given-names>Q.</given-names></name> <name><surname>Yan</surname> <given-names>J.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;Grid r-cnn,&#x0201D;</article-title> in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Piscataway</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>7363</fpage>&#x02013;<lpage>7372</lpage>.</citation>
</ref>
<ref id="B23">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Qiao</surname> <given-names>S.</given-names></name> <name><surname>Chen</surname> <given-names>L.-C.</given-names></name> <name><surname>Yuille</surname> <given-names>A.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Detectors: detecting objects with recursive feature pyramid and switchable atrous convolution,&#x0201D;</article-title> in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Piscataway</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>10213</fpage>&#x02013;<lpage>10224</lpage>.<pub-id pub-id-type="pmid">34677300</pub-id></citation></ref>
<ref id="B24">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ren</surname> <given-names>S.</given-names></name> <name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Girshick</surname> <given-names>R.</given-names></name> <name><surname>Sun</surname> <given-names>J.</given-names></name></person-group> (<year>2017</year>). <article-title>Faster R-CNN: Towards real-time object detection with region proposal networks</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell</source>. <volume>39</volume>, <fpage>1137</fpage>&#x02013;<lpage>1149</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2016.2577031</pub-id><pub-id pub-id-type="pmid">27295650</pub-id></citation></ref>
<ref id="B25">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Thiagarajan</surname> <given-names>P.</given-names></name> <name><surname>Ghosh</surname> <given-names>S.</given-names></name></person-group> (<year>2022</year>). <article-title>Jensen-shannon divergence based novel loss functions for bayesian neural networks</article-title>. <source>arXiv</source> [preprint]. <pub-id pub-id-type="doi">10.48550/arXiv.2209.11366</pub-id></citation>
</ref>
<ref id="B26">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Tian</surname> <given-names>Z.</given-names></name> <name><surname>Shen</surname> <given-names>C.</given-names></name> <name><surname>Chen</surname> <given-names>H.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;Fcos: Fully convolutional one-stage object detection,&#x0201D;</article-title> in <source>Proceedings of the IEEE Conference on Computer Vision</source> (<publisher-loc>Piscataway</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>9627</fpage>&#x02013;<lpage>9636</lpage>.</citation>
</ref>
<ref id="B27">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Xia</surname> <given-names>G.-S.</given-names></name> <name><surname>Bai</surname> <given-names>X.</given-names></name> <name><surname>Ding</surname> <given-names>J.</given-names></name> <name><surname>Zhu</surname> <given-names>Z.</given-names></name> <name><surname>Belongie</surname> <given-names>S.</given-names></name> <name><surname>Luo</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>&#x0201C;Dota: a large-scale dataset for object detection in aerial images,&#x0201D;</article-title> in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Piscataway</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>3974</fpage>&#x02013;<lpage>3983</lpage>.</citation>
</ref>
<ref id="B28">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>C.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name> <name><surname>Yang</surname> <given-names>W.</given-names></name> <name><surname>Yu</surname> <given-names>H.</given-names></name> <name><surname>Yu</surname> <given-names>L.</given-names></name> <name><surname>Xia</surname> <given-names>G.</given-names></name></person-group> (<year>2022</year>). <article-title>Detecting tiny objects in aerial images: a normalized wasserstein distance and a new benchmark</article-title>. <source>ISPRS J. Photogramm. Remote Sens</source>. <volume>190</volume>, <fpage>79</fpage>&#x02013;<lpage>93</lpage>. <pub-id pub-id-type="doi">10.1016/j.isprsjprs.2022.06.002</pub-id></citation>
</ref>
<ref id="B29">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>T.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Li</surname> <given-names>Z.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;Metaanchor: learning to detect objects with customized anchors,&#x0201D;</article-title> in <source>Proceedings of the Advances in Neural Information Processing Systems</source> (<publisher-loc>Massachusetts</publisher-loc>: <publisher-name>Cambridge</publisher-name>), <fpage>318</fpage>&#x02013;<lpage>328</lpage>.</citation>
</ref>
<ref id="B30">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>X.</given-names></name> <name><surname>Yan</surname> <given-names>J.</given-names></name> <name><surname>Ming</surname> <given-names>Q.</given-names></name> <name><surname>Wang</surname> <given-names>W.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Tian</surname> <given-names>Q.</given-names></name></person-group> (<year>2021a</year>). <article-title>&#x0201C;Rethinking rotated object detection with gaussian wasserstein distance loss,&#x0201D;</article-title> in <source>Proceedings of the 38th International Conference on Machine Learning</source> (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>11830</fpage>&#x02013;<lpage>11841</lpage>.</citation>
</ref>
<ref id="B31">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>X.</given-names></name> <name><surname>Yang</surname> <given-names>X.</given-names></name> <name><surname>Yang</surname> <given-names>J.</given-names></name> <name><surname>Ming</surname> <given-names>Q.</given-names></name> <name><surname>Wang</surname> <given-names>W.</given-names></name> <name><surname>Tian</surname> <given-names>Q.</given-names></name> <etal/></person-group>. (<year>2021b</year>). <article-title>&#x0201C;Learning high-precision bounding box for rotated object detection via kullback-leibler divergence,&#x0201D;</article-title> in <source>Proceedings of the Advances in Neural Information Processing Systems</source> (<publisher-loc>Massachusetts</publisher-loc>: <publisher-name>Cambridge</publisher-name>), <fpage>18381</fpage>&#x02013;<lpage>18394</lpage>.</citation>
</ref>
<ref id="B32">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>X.</given-names></name> <name><surname>Zhang</surname> <given-names>G.</given-names></name> <name><surname>Yang</surname> <given-names>X.</given-names></name> <name><surname>Zhou</surname> <given-names>Y.</given-names></name> <name><surname>Wang</surname> <given-names>W.</given-names></name> <name><surname>Tang</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Detecting rotated objects as gaussian distributions and its 3-d generalization</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell</source>. <volume>45</volume>, <fpage>4335</fpage>&#x02013;<lpage>4354</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2022.3197152</pub-id><pub-id pub-id-type="pmid">35939469</pub-id></citation></ref>
<ref id="B33">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>Z.</given-names></name> <name><surname>Liu</surname> <given-names>S.</given-names></name> <name><surname>Hu</surname> <given-names>H.</given-names></name> <name><surname>Wang</surname> <given-names>L.</given-names></name> <name><surname>Lin</surname> <given-names>S.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;Reppoints: point set representation for object detection,&#x0201D;</article-title> in <source>Proceedings of the International Conference on Computer Vision</source> (<publisher-loc>Piscataway</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>9657</fpage>&#x02013;<lpage>9666</lpage>.</citation>
</ref>
<ref id="B34">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zeng</surname> <given-names>N.</given-names></name> <name><surname>Wu</surname> <given-names>P.</given-names></name> <name><surname>Wang</surname> <given-names>Z.</given-names></name> <name><surname>Li</surname> <given-names>H.</given-names></name> <name><surname>Liu</surname> <given-names>W.</given-names></name> <name><surname>Liu</surname> <given-names>X.</given-names></name></person-group> (<year>2022</year>). <article-title>A small-sized object detection oriented multi-scale feature fusion approach with application to defect detection</article-title>. <source>IEEE Trans. Instrum. Meas</source>. <volume>71</volume>, <fpage>1</fpage>&#x02013;<lpage>14</lpage>. <pub-id pub-id-type="doi">10.1109/TIM.2022.3153997</pub-id></citation>
</ref>
<ref id="B35">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>K.</given-names></name> <name><surname>Wu</surname> <given-names>Y.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>Q.</given-names></name></person-group> (<year>2022</year>). <article-title>A hierarchical context embedding network for object detection in remote sensing images</article-title>. <source>IEEE Geosci. Remote Sens. Lett</source>. <volume>19</volume>, <fpage>1</fpage>&#x02013;<lpage>5</lpage>. <pub-id pub-id-type="doi">10.1109/LGRS.2022.3161938</pub-id></citation>
</ref>
<ref id="B36">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>S.</given-names></name> <name><surname>Zhu</surname> <given-names>X.</given-names></name> <name><surname>Lei</surname> <given-names>Z.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;S3fd: single shot scale-invariant face detector,&#x0201D;</article-title> in <source>Proceedings of the International Conference on Computer Vision</source> (<publisher-loc>Piscataway</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>192</fpage>&#x02013;<lpage>201</lpage>.</citation>
</ref>
<ref id="B37">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Feng</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>S.</given-names></name></person-group> (<year>2022</year>). <article-title>Finding nonrigid tiny person with densely cropped and local attention object detector networks in low-altitude aerial images</article-title>. <source>IEEE J. Sel. Top. Appl. Earth Obs. Remote Sens</source>. <volume>15</volume>, <fpage>4371</fpage>&#x02013;<lpage>4385</lpage>. <pub-id pub-id-type="doi">10.1109/JSTARS.2022.3175498</pub-id></citation>
</ref>
</ref-list>
</back>
</article>