<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Artif. Intell.</journal-id>
<journal-title>Frontiers in Artificial Intelligence</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Artif. Intell.</abbrev-journal-title>
<issn pub-type="epub">2624-8212</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/frai.2024.1347898</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Artificial Intelligence</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Discriminative context-aware network for camouflaged object detection</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Ike</surname> <given-names>Chidiebere Somadina</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2649311/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Muhammad</surname> <given-names>Nazeer</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/391306/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Bibi</surname> <given-names>Nargis</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Alhazmi</surname> <given-names>Samah</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2576052/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Eoghan</surname> <given-names>Furey</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c002"><sup>&#x002A;</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Department of Computing, Atlantic Technological University</institution>, <addr-line>Letterkenny</addr-line>, <country>Ireland</country></aff>
<aff id="aff2"><sup>2</sup><institution>School of Computing, Pak-Austria Fachhochschule Institute of Applied Sciences and Technology</institution>, <addr-line>Haripur</addr-line>, <country>Pakistan</country></aff>
<aff id="aff3"><sup>3</sup><institution>Department of Computer Science, Fatima Jinnah Women University</institution>, <addr-line>Rawalpindi</addr-line>, <country>Pakistan</country></aff>
<aff id="aff4"><sup>4</sup><institution>Computer Science Department, College of Computing and Informatics, Saudi Electronic University</institution>, <addr-line>Riyadh</addr-line>, <country>Saudi Arabia</country></aff>
<author-notes>
<fn fn-type="edited-by" id="fn0001"><p>Edited by: Hanqi Zhuang, Florida Atlantic University, United States</p></fn>
<fn fn-type="edited-by" id="fn0002"><p>Reviewed by: Khalil Khan, Nazarbayev University, Kazakhstan</p><p>Anum Masood, NTNU, Norway</p></fn>
<corresp id="c001">&#x002A;Correspondence: Samah Alhazmi, <email>s.alhazmi@seu.edu.sa</email></corresp>
<corresp id="c002">Furey Eoghan, <email>eoghan.furey@atu.ie</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>27</day>
<month>03</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>7</volume>
<elocation-id>1347898</elocation-id>
<history>
<date date-type="received">
<day>01</day>
<month>12</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>12</day>
<month>03</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2024 Ike, Muhammad, Bibi, Alhazmi and Eoghan.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Ike, Muhammad, Bibi, Alhazmi and Eoghan</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Animals use camouflage (background matching, disruptive coloration, etc.) for protection, confusing predators and making detection difficult. Camouflage Object Detection (COD) tackles this challenge by identifying objects seamlessly blended into their surroundings. Existing COD techniques struggle with hidden objects due to noisy inferences inherent in natural environments. To address this, we propose the Discriminative Context-aware Network (DiCANet) for improved COD performance.</p>
</sec>
<sec>
<title>Methods</title>
<p>DiCANet addresses camouflage challenges through a two-stage approach. First, an adaptive restoration block intelligently learns feature weights, prioritizing informative channels and pixels. This enhances convolutional neural networks&#x2019; ability to represent diverse data and handle complex camouflage. Second, a cascaded detection module with an enlarged receptive field refines the object prediction map, achieving clear boundaries without post-processing.</p>
</sec>
<sec>
<title>Results</title>
<p>Without post-processing, DiCANet achieves state-of-the-art performance on challenging COD datasets (CAMO, CHAMELEON, COD10K) by generating accurate saliency maps with rich contextual details and precise boundaries.</p>
</sec>
<sec>
<title>Discussion</title>
<p>DiCANet tackles the challenge of identifying camouflaged objects in noisy environments with its two-stage restoration and cascaded detection approach. This innovative architecture surpasses existing methods in COD tasks, as proven by benchmark dataset experiments.</p>
</sec>
</abstract>
<kwd-group>
<kwd>camouflage object detection</kwd>
<kwd>COD</kwd>
<kwd>dataset</kwd>
<kwd>feature extraction</kwd>
<kwd>benchmark</kwd>
<kwd>deep learning</kwd>
<kwd>convolutional neural network</kwd>
<kwd>artificial intelligence</kwd>
</kwd-group>
<counts>
<fig-count count="9"/>
<table-count count="1"/>
<equation-count count="9"/>
<ref-count count="95"/>
<page-count count="12"/>
<word-count count="9041"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Machine Learning and Artificial Intelligence</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec1">
<label>1</label>
<title>Introduction</title>
<p>The evolution of prey camouflage patterns, and the study of animal cognition in an ecological context, are rooted in Charles Darwin&#x2019;s theory of evolution and natural selection. The earliest research on camouflage dates to the last century (<xref ref-type="bibr" rid="ref7">Cott, 1940</xref>). Research by <xref ref-type="bibr" rid="ref77">Thayer (1918)</xref> and <xref ref-type="bibr" rid="ref7">Cott (1940)</xref> comprehensively studied the phenomenon of camouflage. Camouflage is an evolutionary concealment technique that masks an object&#x2019;s location, identity, and movement within its surrounding environment. To adapt to their environment, living organisms must exhibit adaptive traits or behavioral strategies suited to that environment. The combination of physiological characteristics such as color, pattern, morphology, and behavior (<xref ref-type="bibr" rid="ref25">Gleeson et al., 2018</xref>; <xref ref-type="bibr" rid="ref75">Stevens and Ruxton, 2019</xref>) provides them with survival advantages by disrupting the visual silhouette of animals or potential predators. Inspired by this important natural phenomenon, humans have attempted to replicate these patterns in many fields.</p>
<p>As a multidisciplinary study at the intersection of computer science and evolutionary biology, camouflaged object detection (COD) has a wide range of applications in practical scenarios, including wildlife preservation and animal monitoring; arts (e.g., recreational art) (<xref ref-type="bibr" rid="ref6">Chu et al., 2010</xref>; <xref ref-type="bibr" rid="ref22">Ge et al., 2018</xref>); agriculture (e.g., locust detection to prevent invasions); computer vision and other vision-related areas (e.g., search-and-rescue missions in natural disasters, military target detection and surveillance systems, and rare species discovery); and medical image analysis [e.g., polyp segmentation (<xref ref-type="bibr" rid="ref16">Fan et al., 2020b</xref>) and lung infection segmentation (<xref ref-type="bibr" rid="ref17">Fan et al., 2020c</xref>; <xref ref-type="bibr" rid="ref85">Wu et al., 2021</xref>)], to mention a few.</p>
<p>There are two types of camouflaged objects: naturally camouflaged objects and artificially camouflaged objects (<xref ref-type="bibr" rid="ref74">Stevens and Merilaita, 2009</xref>). Natural camouflage results from the coevolution of predators and prey. <xref ref-type="fig" rid="fig1">Figures 1A</xref>,<xref ref-type="fig" rid="fig1">B</xref> show disruptive coloration and background pattern matching in animals attempting to exploit predators&#x2019; visual processing and cognition. Other camouflage strategies include countershading, transparency, masquerade, distractive markings (<xref ref-type="bibr" rid="ref20">Galloway et al., 1802</xref>), etc. Artificial camouflage is a predatory concealment strategy employed by humans, for example to hide military troops, vehicles, weapons, and positions in war zones (<xref ref-type="bibr" rid="ref96">Zheng et al., 2018</xref>). In this case, the environment is first observed and texture patterns are blended into it to create a scene that appears familiar, deceiving the visual perception systems of potential observers, as shown in <xref ref-type="fig" rid="fig1">Figure 1C</xref>.</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>Natural and artificial camouflaged objects. <bold>(A,B)</bold> show Natural camouflage and <bold>(C)</bold> shows Artificial camouflage.</p>
</caption>
<graphic xlink:href="frai-07-1347898-g001.tif"/>
</fig>
<p>COD has gained increasing attention in the computer vision community but remains under-studied, owing to the lack of large training datasets and standard benchmarks such as Pascal-VOC (<xref ref-type="bibr" rid="ref12">Everingham et al., 2015</xref>), ImageNet (<xref ref-type="bibr" rid="ref9">Deng et al., 2009</xref>), and MS-COCO (<xref ref-type="bibr" rid="ref47">Lin T. Y. et al., 2014</xref>).</p>
<p>The computer vision literature is largely concerned with the detection/segmentation of non-camouflaged objects (<xref ref-type="bibr" rid="ref66">Ren et al., 2017</xref>). From the detection and segmentation viewpoint (<xref ref-type="bibr" rid="ref95">Zhao Z. Q. et al., 2019</xref>), objects can be divided into three categories: generic objects, salient objects, and camouflaged objects. Generic object detection (GOD) is a popular direction in cognitive computer vision that aims to find common objects; these can be either salient or camouflaged. Salient object detection (SOD) aims to find attention-grabbing objects in an image, i.e., objects with pre-defined classes. There exists a vast body of research on both generic object detection (<xref ref-type="bibr" rid="ref71">Shotton et al., 2006</xref>; <xref ref-type="bibr" rid="ref52">Liu et al., 2010</xref>; <xref ref-type="bibr" rid="ref24">Girshick et al., 2014</xref>; <xref ref-type="bibr" rid="ref12">Everingham et al., 2015</xref>; <xref ref-type="bibr" rid="ref23">Girshick, 2015</xref>; <xref ref-type="bibr" rid="ref65">Ren et al., 2015</xref>; <xref ref-type="bibr" rid="ref38">Kirillov et al., 2019</xref>; <xref ref-type="bibr" rid="ref43">Le et al., 2020</xref>) and salient object detection (<xref ref-type="bibr" rid="ref80">Wang et al., 2017</xref>; <xref ref-type="bibr" rid="ref86">Wu et al., 2019</xref>; <xref ref-type="bibr" rid="ref92">Zhao J. X. et al., 2019</xref>; <xref ref-type="bibr" rid="ref94">Zhao and Wu, 2019</xref>; <xref ref-type="bibr" rid="ref15">Fan et al., 2020a</xref>; <xref ref-type="bibr" rid="ref63">Qin et al., 2020</xref>; <xref ref-type="bibr" rid="ref83">Waqas Zamir et al., 2021</xref>). COD aims to identify objects whose shape and outline are not easily recognizable in images, as shown in <xref ref-type="fig" rid="fig2">Figure 2</xref>. The high intrinsic similarity between camouflaged objects and the background requires a significant amount of visual perception knowledge, making COD far more challenging than conventional salient object detection or generic object detection (<xref ref-type="bibr" rid="ref22">Ge et al., 2018</xref>; <xref ref-type="bibr" rid="ref95">Zhao Z. Q. et al., 2019</xref>; <xref ref-type="bibr" rid="ref92">Zhao J. X. et al., 2019</xref>).</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Object segmentation exemplars: <bold>(A)</bold> Given input image, <bold>(B)</bold> GOD, <bold>(C)</bold> SOD, <bold>(D)</bold> COD.</p>
</caption>
<graphic xlink:href="frai-07-1347898-g002.tif"/>
</fig>
<p>In this paper, we present a review of deep-learning object detection from a camouflaged perspective and propose a discriminative context-aware network called &#x201C;DiCANet.&#x201D; Owing to the noisy interference in natural scenes, the low-frequency distribution carries smoothly disordered data while the high-frequency details receive unwanted approximation. The resulting channel-wise and pixel-wise features are unevenly distributed across the camouflaged image and should be weighted differently to obtain an appropriate representation of the salient features of objects. Therefore, rather than directly assigning equal weights to the channel-wise and pixel-wise features (<xref ref-type="bibr" rid="ref84">Woo et al., 2018</xref>), and inspired by <xref ref-type="bibr" rid="ref63">Qin et al. (2020)</xref>, we introduce an adaptive restoration block (ARB) that adaptively learns the weights of the image features and assigns different weights to them. This not only contributes to the representative ability of convolutional neural networks (CNNs) but also provides the required robustness for various types of information preservation. After the ARB, these features are fused in a complementary-aware manner through the fusion pipeline to generate restored camouflaged images. Next, a cascaded detection module (<xref ref-type="bibr" rid="ref15">Fan et al., 2020a</xref>) fortified with a modified receptive field block (<xref ref-type="bibr" rid="ref49">Liu and Huang, 2018</xref>) is adopted to segment ecological signals and drive the segmentation performance for the target objects during the detection stage. As a result, a more refined camouflaged object prediction map is attained, with clear boundaries and an accurate saliency map rich in contextual detail.</p>
<p>With the above considerations, the proposed DiCANet is used to produce a high-quality camouflaged object prediction map. Our contributions can be summarized as follows: (1) We propose a discriminative context-aware network (&#x201C;DiCANet&#x201D;) for camouflaged object segmentation; (2) We infuse an adaptive restoration block into a bio-inspired cascaded detection block to effectively guide detection and segmentation performance. The ARB comprises three key components: (a) a feature attention block (FAB), (b) group architecture incorporation, and (c) an attention-based feature fusion network. Details of these components are discussed in subsequent sections; (3) The proposed COD model boosts performance to a new state of the art (SOTA), and experiments verify the effectiveness of our proposed method.</p>
</sec>
<sec id="sec2">
<label>2</label>
<title>Related work</title>
<p>This section reviews related work along two lines: image restoration approaches and deep learning-based COD approaches.</p>
<sec id="sec3">
<label>2.1</label>
<title>Image restoration</title>
<p>Visual information captured in the real world contains undesired image content, and restoration is a position-sensitive problem that requires pixel-to-pixel correspondence between the input and output images. To recover image content from natural images, traditional approaches showed promising reconstruction performance but suffered from computational drawbacks (<xref ref-type="bibr" rid="ref78">Ulyanov et al., 2018</xref>). Recently, deep-learning-based restoration models have surpassed conventional approaches and achieved state-of-the-art results (<xref ref-type="bibr" rid="ref83">Waqas Zamir et al., 2021</xref>; <xref ref-type="bibr" rid="ref89">Zamir et al., 2022</xref>). Designing algorithms robust enough to maintain a spatially precise, high-resolution representation with strong semantic information throughout the entire network has remained a challenge. Research by <xref ref-type="bibr" rid="ref90">Zamir et al. (2020)</xref> proposed a novel multi-scale residual block to learn enriched features for effective real image restoration and enhancement. Despite recent major advancements, state-of-the-art methods suffer from high system complexity, which makes them computationally inefficient (<xref ref-type="bibr" rid="ref58">Nah et al., 2017</xref>; <xref ref-type="bibr" rid="ref1">Abdelhamed et al., 2018</xref>; <xref ref-type="bibr" rid="ref5">Chu et al., 2021</xref>). To reduce the inter-block complexity of other SOTA methods, <xref ref-type="bibr" rid="ref3">Chen et al. (2022)</xref> adopted stacked neural networks in a UNet architecture with skip connections (<xref ref-type="bibr" rid="ref69">Ronneberger et al., 2015</xref>), following <xref ref-type="bibr" rid="ref81">Wang et al. (2022)</xref> and <xref ref-type="bibr" rid="ref89">Zamir et al. (2022)</xref>, to design a nonlinear-activation-free framework based on CNNs rather than transformers, owing to the performance drawbacks reported by <xref ref-type="bibr" rid="ref51">Liu et al. (2022)</xref> and <xref ref-type="bibr" rid="ref26">Han et al. (2021)</xref>. Research by <xref ref-type="bibr" rid="ref63">Qin et al. (2020)</xref> proposed a feature fusion attention network that fuses the FAB with an attention-based multipath local residual structure, focusing on learning the weights of important spatial information to generate accurate results.</p>
</sec>
<sec id="sec4">
<label>2.2</label>
<title>COD</title>
<p>Research into COD has deep roots in biology and the arts (<xref ref-type="bibr" rid="ref77">Thayer, 1918</xref>; <xref ref-type="bibr" rid="ref7">Cott, 1940</xref>), and these studies remain relevant in widening our knowledge of visual perception. The recognition of camouflaged objects has not been well explored in the literature. Early camouflage research focused on detecting the foreground region even when the foreground texture resembled that of the background (<xref ref-type="bibr" rid="ref21">Galun et al., 2003</xref>; <xref ref-type="bibr" rid="ref73">Song and Geng, 2010</xref>; <xref ref-type="bibr" rid="ref88">Xue et al., 2016</xref>). Based on cues such as color, shape, intensity, edge, and orientation, these works distinguished the foreground from the background. To address the camouflage detection problem, a few techniques based on hand-crafted features such as texture (<xref ref-type="bibr" rid="ref70">Sengottuvelan et al., 2008</xref>; <xref ref-type="bibr" rid="ref61">Pan et al., 2011</xref>; <xref ref-type="bibr" rid="ref50">Liu et al., 2012</xref>) and motion (<xref ref-type="bibr" rid="ref30">Hou, 2011</xref>; <xref ref-type="bibr" rid="ref42">Le et al., 2019</xref>) have been put forth. However, due to the high similarity between the foreground and background, none of these approaches performs well at segmenting camouflaged objects in real application scenarios; they are only effective against simple backgrounds. Despite the numerous CNN-based object detection models available, unique designs are required to build models for COD. In contrast to pixel-level segmentation, GOD detects objects with bounding boxes. Furthermore, the segmentation in COD is based on saliency from a human perspective, not semantics, which is not modeled in GOD models. On the other hand, models designed for SOD are unable to effectively detect concealed objects: SOD models perform non-semantic segmentation and model saliency, but they do not specialize in finding the indefinite boundaries of camouflaged objects, as salient objects tend to be of potential human interest. Researchers have proposed several feasible methods for COD.</p>
<p>Recently, <xref ref-type="bibr" rid="ref42">Le et al. (2019)</xref> proposed an end-to-end network for segmenting camouflaged objects by integrating classification into the segmentation framework. Research by <xref ref-type="bibr" rid="ref39">Lamdouar et al. (2020)</xref> and <xref ref-type="bibr" rid="ref98">Zhu et al. (2021)</xref> proposed novel approaches based on the assumption that camouflaged objects exist in an image, which is not always practical in the real world. To better simulate the real world, <xref ref-type="bibr" rid="ref41">Le et al. (2021)</xref> proposed camouflaged instance segmentation without any assumption that camouflaged objects exist in an image. Following the same motivation, <xref ref-type="bibr" rid="ref15">Fan et al. (2020a)</xref> proposed a Search Identification Network (SINet) comprising two modules, namely a search module and an identification module, where the former searches for potential prey while the latter identifies the target animal. The SINet framework leverages a modified Receptive Field Block (<xref ref-type="bibr" rid="ref49">Liu and Huang, 2018</xref>) to search for camouflaged object regions. Furthermore, aside from their COD model, <xref ref-type="bibr" rid="ref15">Fan et al. (2020a)</xref> presented a large COD dataset, called COD10K, which progressed COD research to a new level in the field of computer vision. Similarly, <xref ref-type="bibr" rid="ref10">Dong et al. (2021)</xref> proposed MCIF-Net, which integrates a large receptive field and an effective feature aggregation strategy into a unified framework to extract rich context features for accurate COD. In addition to the existing literature, recent advancements and relevant studies, such as the notable works of <xref ref-type="bibr" rid="ref34">Hussain et al. (2021)</xref>, <xref ref-type="bibr" rid="ref62">Qadeer et al. (2022)</xref>, and <xref ref-type="bibr" rid="ref59">Naqvi et al. (2023)</xref>, contribute to the understanding of object detection, tracking, and recognition in various contexts, enhancing the breadth and depth of the related literature. Despite the research devoted to the challenges of COD and to achieving outstanding accuracy, existing deep learning-based COD methods suffer from major limitations such as weak boundaries (i.e., edges), low boundary contrast, and variations in object appearance (e.g., object size and shape), leading to unsatisfactory segmentation performance (<xref ref-type="bibr" rid="ref15">Fan et al., 2020a</xref>; <xref ref-type="bibr" rid="ref56">Mei et al., 2021</xref>; <xref ref-type="bibr" rid="ref36">Ji et al., 2022</xref>) and raising the demand for more advanced feature fusion strategies.</p>
<p>Biological studies (<xref ref-type="bibr" rid="ref74">Stevens and Merilaita, 2009</xref>; <xref ref-type="bibr" rid="ref57">Merilaita et al., 2017</xref>; <xref ref-type="bibr" rid="ref67">Rida et al., 2020</xref>) have shown that deliberately hidden targets cause noisier inferences in the visual perception system, which contributes to object concealment. This is a common phenomenon in nature, and finding ecologically relevant signals hidden under such extreme conditions becomes a challenge. Moreover, without precise control of the feature fusion process, detectors are vulnerable to significant interference from low-frequency details, which causes vague object boundaries and misjudgment in extreme situations. Inspired by this real-world phenomenon, this paper aims to design a novel baseline model that balances the accuracy and efficiency of COD by adaptively exploiting semantic and spatial information to obtain plausible final context-aware camouflage prediction maps with refined edge boundaries.</p>
</sec>
</sec>
<sec sec-type="materials|methods" id="sec5">
<label>3</label>
<title>Materials and methods</title>
<sec id="sec6">
<label>3.1</label>
<title>Motivation and proposed framework</title>
<p>The notion of &#x201C;survival of the fittest&#x201D; is closely associated with Charles Darwin&#x2019;s theory of evolution (<xref ref-type="bibr" rid="ref18">Flannelly, 2017</xref>). The survival of numerous species in the wild depends on adaptation; hunting occurs across a wide variety of ecosystems, and organisms must adapt in order to thrive in their environment. Motivated by the first two stages of predation in nature, i.e., search (a sensory mechanism) and identification, the DiCANet framework is proposed. A simplified version of the proposed framework is shown in <xref ref-type="fig" rid="fig3">Figure 3</xref>. Details of each component are discussed in subsequent sections.</p>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>Proposed DiCANet architecture.</p>
</caption>
<graphic xlink:href="frai-07-1347898-g003.tif"/>
</fig>
</sec>
<sec id="sec7">
<label>3.2</label>
<title>Camouflaged image</title>
<p>The art of camouflage hinges on manipulating an object&#x2019;s visual appearance so that it blends into its surroundings. At the heart of this strategy is the concept of pixel similarity. Digital images, including those used in camouflage analysis, are represented by pixels: tiny blocks of varying features that collectively form the image. For an input camouflaged image, pixel similarity measures how closely the pixels of objects in the image match their surroundings in terms of color, visual pattern, surface variation, and intensity (<xref ref-type="bibr" rid="ref76">Talas et al., 2017</xref>). The more similar the pixels of the camouflaged object are to those of its intended background (<xref ref-type="fig" rid="fig3">Figure 3</xref>), the more effective the camouflage and the harder it is for observers to spot detectable features of the concealed object. Conversely, any detectable discrepancy in pixel similarity will reveal the presence of the hidden object, undermining the effectiveness of the camouflage. By analyzing these features and strategically manipulating the pixel attributes of a camouflaged object, we propose an effective context-aware network for camouflaged object detection.</p>
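<p>As a loose illustration of this notion (an expository sketch of our own, not part of the proposed model), the following Python snippet compares the mean color of object pixels against background pixels given a hypothetical binary object mask; a value close to zero indicates that the object closely matches its surroundings.</p>
<preformat>
# Illustrative sketch only: a crude pixel-similarity cue, not the DiCANet pipeline.
import numpy as np

def color_gap(image, mask):
    """Mean absolute color gap between object and background pixels.

    image: float array of shape (H, W, 3) with values in [0, 1]
    mask:  bool array of shape (H, W), True where the object lies
    Values near 0 suggest effective camouflage (object resembles background).
    """
    obj_mean = image[mask].mean(axis=0)    # average color inside the object
    bg_mean = image[~mask].mean(axis=0)    # average color of the background
    return float(np.abs(obj_mean - bg_mean).mean())

# Toy usage with a synthetic image and mask
rng = np.random.default_rng(0)
img = rng.random((64, 64, 3))
obj = np.zeros((64, 64), dtype=bool)
obj[20:40, 20:40] = True
print(color_gap(img, obj))
</preformat>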
</sec>
<sec id="sec8">
<label>3.3</label>
<title>Adaptive restoration block (ARB)</title>
<p>To restore concealed images, redundant information unevenly distributed across a real-world image should be adaptively bypassed while robustly allowing the network architecture to focus on more effective information. The ARB framework&#x2019;s internal block contains several key elements, including (a) the feature attention block (FAB), (b) the attention-based basic block structure, and (c) the feature fusion framework. A detailed framework is shown in <xref ref-type="fig" rid="fig4">Figure 4</xref>.</p>
<fig position="float" id="fig4">
<label>Figure 4</label>
<caption>
<p>Adaptive restoration block architecture.</p>
</caption>
<graphic xlink:href="frai-07-1347898-g004.tif"/>
</fig>
<p>Given a <inline-formula><mml:math id="M1"><mml:mn>3</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>D</mml:mi></mml:math></inline-formula> real-world camouflage input image <inline-formula><mml:math id="M2"><mml:msub><mml:mi>I</mml:mi><mml:mi>c</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi mathvariant="double-struck">R</mml:mi><mml:mrow><mml:mi>H</mml:mi><mml:mspace width="0.25em"/><mml:mi>x</mml:mi><mml:mspace width="0.25em"/><mml:mi>W</mml:mi><mml:mspace width="0.25em"/><mml:mi>x</mml:mi><mml:mspace width="0.25em"/><mml:msub><mml:mi>C</mml:mi><mml:mi mathvariant="italic">in</mml:mi></mml:msub></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mi mathvariant="normal">where</mml:mi><mml:mspace width="0.25em"/><mml:mi>H</mml:mi><mml:mo>,</mml:mo><mml:mi>W</mml:mi></mml:math></inline-formula> and <inline-formula><mml:math id="M3"><mml:msub><mml:mi>C</mml:mi><mml:mi mathvariant="italic">in</mml:mi></mml:msub></mml:math></inline-formula> are the shape of the image (i.e., dimensions and input channel number) respectively. To map the input camouflaged image space into a higher dimensional feature space, a <inline-formula><mml:math id="M4"><mml:mn>3</mml:mn><mml:mo>&#x2217;</mml:mo><mml:mn>3</mml:mn></mml:math></inline-formula> convolution <inline-formula><mml:math id="M5"><mml:msub><mml:mi>H</mml:mi><mml:mrow><mml:mi>S</mml:mi><mml:mi>F</mml:mi></mml:mrow></mml:msub><mml:mfenced open="(" close=")"><mml:mo>&#x22C5;</mml:mo></mml:mfenced></mml:math></inline-formula> was applied to extract shallow features with edge information <inline-formula><mml:math id="M6"><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi mathvariant="double-struck">R</mml:mi><mml:mrow><mml:mi>H</mml:mi><mml:mspace width="0.25em"/><mml:mi>x</mml:mi><mml:mspace width="0.25em"/><mml:mi>W</mml:mi><mml:mspace width="0.25em"/><mml:mi>x</mml:mi><mml:mspace width="0.25em"/><mml:mi>C</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> formulated as:</p>
<disp-formula id="EQ1"><label>(1)</label><mml:math id="M7"><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>H</mml:mi><mml:mrow><mml:mi>S</mml:mi><mml:mi>F</mml:mi></mml:mrow></mml:msub><mml:mfenced open="(" close=")"><mml:msub><mml:mi>I</mml:mi><mml:mi>c</mml:mi></mml:msub></mml:mfenced></mml:math></disp-formula>
<p>Deep features <inline-formula><mml:math id="M8"><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi mathvariant="double-struck">R</mml:mi><mml:mrow><mml:mi>H</mml:mi><mml:mspace width="0.25em"/><mml:mi>X</mml:mi><mml:mspace width="0.25em"/><mml:mi>W</mml:mi><mml:mspace width="0.25em"/><mml:mi>X</mml:mi><mml:mspace width="0.25em"/><mml:mi>C</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> are then extracted from <inline-formula><mml:math id="M9"><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mi>f</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> as:</p>
<disp-formula id="EQ3"><label>(2)</label><mml:math id="M10"><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>H</mml:mi><mml:mrow><mml:mi>D</mml:mi><mml:mi>F</mml:mi></mml:mrow></mml:msub><mml:mfenced open="(" close=")"><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mi>f</mml:mi></mml:mrow></mml:msub></mml:mfenced></mml:math></disp-formula>
<p>Where <inline-formula><mml:math id="M11"><mml:msub><mml:mi>H</mml:mi><mml:mrow><mml:mi>D</mml:mi><mml:mi>F</mml:mi></mml:mrow></mml:msub><mml:mfenced open="(" close=")"><mml:mo>&#x22C5;</mml:mo></mml:mfenced></mml:math></inline-formula> is the deep features extraction module and it contains <inline-formula><mml:math id="M12"><mml:mi>K</mml:mi></mml:math></inline-formula> residual Group Architectures block (RGAB) with multiple skip connections. More specifically, intermediate features <inline-formula><mml:math id="M13"><mml:msub><mml:mi>F</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>..</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mi>K</mml:mi></mml:msub></mml:math></inline-formula> and output deep features <inline-formula><mml:math id="M14"><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>D</mml:mi><mml:mi>F</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> are extracted block by block as:</p>
<disp-formula id="EQ4"><label>(3)</label><mml:math id="M15"><mml:msub><mml:mi>F</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>H</mml:mi><mml:mrow><mml:mi>R</mml:mi><mml:mi>G</mml:mi><mml:mi>A</mml:mi><mml:msub><mml:mi>B</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:msub><mml:mspace width="0.25em"/><mml:mfenced open="(" close=")"><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mfenced><mml:mo>,</mml:mo><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mi>K</mml:mi><mml:mo>,</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>D</mml:mi><mml:mi>F</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>H</mml:mi><mml:mi mathvariant="italic">CONV</mml:mi></mml:msub><mml:mfenced open="(" close=")"><mml:msub><mml:mi>F</mml:mi><mml:mi>K</mml:mi></mml:msub></mml:mfenced><mml:mtext>,</mml:mtext></mml:math></disp-formula>
<p>Where <inline-formula><mml:math id="M16"><mml:msub><mml:mi>H</mml:mi><mml:mrow><mml:mi>R</mml:mi><mml:mi>G</mml:mi><mml:mi>A</mml:mi><mml:msub><mml:mi>B</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:msub><mml:mfenced open="(" close=")"><mml:mo>&#x22C5;</mml:mo></mml:mfenced></mml:math></inline-formula> represents the <inline-formula><mml:math id="M17"><mml:mi>i</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:math></inline-formula> RGAB and <inline-formula><mml:math id="M18"><mml:msub><mml:mi>H</mml:mi><mml:mi mathvariant="italic">CONV</mml:mi></mml:msub></mml:math></inline-formula> is the last convolutional layer, which introduces the convolution operation&#x2019;s inductive bias into the network and sets the stage for shallow and deep feature aggregation.</p>
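<p>For illustration, the following PyTorch-style sketch mirrors Equations (1)&#x2013;(3) under simplifying assumptions of our own: each residual group architecture block is reduced to a plain two-convolution residual block, and the channel width and number of blocks are placeholder values rather than the exact ARB configuration.</p>
<preformat>
# Illustrative sketch of Eqs. (1)-(3); layer sizes are assumptions, not the exact ARB.
import torch
import torch.nn as nn

class SimpleRGAB(nn.Module):
    """Stand-in residual group architecture block: two 3x3 convs with a local skip."""
    def __init__(self, channels):
        super().__init__()
        self.body = nn.Sequential(
            nn.Conv2d(channels, channels, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(channels, channels, 3, padding=1),
        )

    def forward(self, x):
        return x + self.body(x)   # local residual connection

class ARBFeatures(nn.Module):
    """Shallow feature extraction (Eq. 1) followed by K RGABs and a final conv (Eqs. 2-3)."""
    def __init__(self, in_channels=3, channels=64, K=4):
        super().__init__()
        self.shallow = nn.Conv2d(in_channels, channels, 3, padding=1)           # H_SF
        self.groups = nn.ModuleList([SimpleRGAB(channels) for _ in range(K)])   # H_RGAB_i
        self.conv_last = nn.Conv2d(channels, channels, 3, padding=1)            # H_CONV

    def forward(self, image):
        f_sf = self.shallow(image)        # F_sf = H_SF(I_c)
        f = f_sf
        for block in self.groups:         # F_i = H_RGAB_i(F_(i-1))
            f = block(f)
        f_df = self.conv_last(f)          # F_DF = H_CONV(F_K)
        return f_sf, f_df

sf, df = ARBFeatures()(torch.randn(1, 3, 128, 128))
print(sf.shape, df.shape)
</preformat>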
</sec>
<sec id="sec9">
<label>3.4</label>
<title>Feature attention block (FAB)</title>
<p>To improve model representation, attention mechanisms have been introduced inside CNNs (<xref ref-type="bibr" rid="ref91">Zhang et al., 2018</xref>; <xref ref-type="bibr" rid="ref8">Dai et al., 2019</xref>; <xref ref-type="bibr" rid="ref60">Niu et al., 2020</xref>). Many image restoration networks treat channel-level and pixel-level features equally, making them incapable of efficiently handling images with uneven low- and high-frequency distributions. Realistically, redundant information is unevenly distributed across images, and the weight given to unwanted pixels should differ significantly for each channel-wise and pixel-wise feature. In the attention block, features are learned via a dynamic mechanism that enables the model to concentrate on diverse segments of the input data, highlighting pertinent features and attenuating or suppressing irrelevant ones. This process is typically realized by computing attention weights, which signify the importance or relevance of various input features. This adaptive learning approach provides additional flexibility for the network hierarchy in dealing with different types of information. The feature attention block consists of a residual block with channel attention (RB-CA) and a residual block with pixel attention (RB-PA), as shown in <xref ref-type="fig" rid="fig5">Figure 5</xref>. The former ensures that different channel features carry different weighted information (<xref ref-type="bibr" rid="ref28">He et al., 2010</xref>), while the latter attentively focuses on informative features in high-frequency pixel regions.</p>
<fig position="float" id="fig5">
<label>Figure 5</label>
<caption>
<p>Feature attention block. <bold>(A)</bold> Channel attention (CA). <bold>(B)</bold> Pixel attention (PA).</p>
</caption>
<graphic xlink:href="frai-07-1347898-g005.tif"/>
</fig>
<sec id="sec10">
<label>3.4.1</label>
<title>Channel attention (CA)</title>
<p>To achieve channel-wise weighting for each channel in feature maps, global average pooling (GAP) was employed before feeding the data into fully connected layers for classification tasks. The concept of GAP in CNNs focuses on each feature map (channel) and aggregates information across the entire spatial extent of the feature maps, resulting in a single value per channel (<xref ref-type="bibr" rid="ref45">Lin M. et al., 2014</xref>; <xref ref-type="bibr" rid="ref19">Forrest, 2016</xref>; <xref ref-type="bibr" rid="ref31">Hu et al., 2018</xref>; <xref ref-type="bibr" rid="ref54">Machine Learning Mastery, 2019</xref>). The 1D vector (channel descriptors) obtained from GAP can then be used in subsequent calculations to extract meaningful features from the image. The mathematical expression detailing how channel descriptors achieve weighted information is as follows:</p>
<disp-formula id="EQ5"><label>(4)</label><mml:math id="M19"><mml:msub><mml:mi>g</mml:mi><mml:mi>c</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>H</mml:mi><mml:mi>p</mml:mi></mml:msub><mml:mfenced open="(" close=")"><mml:msub><mml:mi>F</mml:mi><mml:mi>c</mml:mi></mml:msub></mml:mfenced><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:mi>H</mml:mi><mml:mspace width="0.25em"/><mml:mi>x</mml:mi><mml:mspace width="0.25em"/><mml:mi>W</mml:mi></mml:mrow></mml:mfrac><mml:mspace width="0.25em"/><mml:mstyle displaystyle="true"><mml:msubsup><mml:mo stretchy="true">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>H</mml:mi></mml:msubsup><mml:mrow><mml:mstyle displaystyle="true"><mml:msubsup><mml:mo stretchy="true">&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>W</mml:mi></mml:msubsup><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mi>c</mml:mi></mml:msub><mml:mfenced open="(" close=")" separators=","><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mfenced></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula>
<p>Where <inline-formula><mml:math id="M20"><mml:msub><mml:mi>H</mml:mi><mml:mi>p</mml:mi></mml:msub></mml:math></inline-formula> represents the global pooling function, <inline-formula><mml:math id="M21"><mml:msub><mml:mi>F</mml:mi><mml:mi>c</mml:mi></mml:msub></mml:math></inline-formula> the input, and <inline-formula><mml:math id="M22"><mml:msub><mml:mi>X</mml:mi><mml:mi>c</mml:mi></mml:msub><mml:mfenced open="(" close=")" separators=","><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mfenced></mml:math></inline-formula> denotes the value of <inline-formula><mml:math id="M23"><mml:mi>c</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mi>t</mml:mi><mml:mi>h</mml:mi></mml:math></inline-formula>channel <inline-formula><mml:math id="M24"><mml:msub><mml:mi>X</mml:mi><mml:mi>c</mml:mi></mml:msub></mml:math></inline-formula> at spatial position <inline-formula><mml:math id="M25"><mml:mfenced open="(" close=")" separators=","><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mfenced></mml:math></inline-formula>. The shape of the feature map changes from <inline-formula><mml:math id="M26"><mml:mi>C</mml:mi><mml:mspace width="0.25em"/><mml:mi>x</mml:mi><mml:mspace width="0.25em"/><mml:mi>H</mml:mi><mml:mspace width="0.25em"/><mml:mi>x</mml:mi><mml:mspace width="0.25em"/><mml:mi>W</mml:mi></mml:math></inline-formula> to <inline-formula><mml:math id="M27"><mml:mi>C</mml:mi><mml:mspace width="0.25em"/><mml:mi>x</mml:mi><mml:mspace width="0.25em"/><mml:mn>1</mml:mn><mml:mspace width="0.25em"/><mml:mi>x</mml:mi><mml:mspace width="0.25em"/><mml:mn>1</mml:mn></mml:math></inline-formula> i.e., collapsing <inline-formula><mml:math id="M28"><mml:mi>H</mml:mi><mml:mspace width="0.25em"/><mml:mi>x</mml:mi><mml:mspace width="0.25em"/><mml:mi>W</mml:mi></mml:math></inline-formula>. These feature maps are fed through two convolution layers and a computationally efficient sigmoid, followed by ReLu activation function (<xref ref-type="fig" rid="fig5">Figure 5A</xref>) to provide the weights of the different channels formulated as follows:</p>
<disp-formula id="E1"><label>(5)</label><mml:math id="M29"><mml:mi>C</mml:mi><mml:msub><mml:mi>A</mml:mi><mml:mi>c</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>&#x03C3;</mml:mi><mml:mfenced open="(" close=")"><mml:mrow><mml:mi mathvariant="italic">Conv</mml:mi><mml:mfenced open="(" close=")"><mml:mrow><mml:mi>&#x03B4;</mml:mi><mml:mfenced open="(" close=")"><mml:mrow><mml:mi mathvariant="italic">Conv</mml:mi><mml:mfenced open="(" close=")"><mml:msub><mml:mi>A</mml:mi><mml:mi>c</mml:mi></mml:msub></mml:mfenced></mml:mrow></mml:mfenced></mml:mrow></mml:mfenced></mml:mrow></mml:mfenced></mml:math></disp-formula>
<p>Where <inline-formula><mml:math id="M30"><mml:mi>&#x03C3;</mml:mi></mml:math></inline-formula> and <inline-formula><mml:math id="M31"><mml:mi>&#x03B4;</mml:mi></mml:math></inline-formula> represent the sigmoid function and the ReLu activation function, respectively. By elementwise multiplication of the input <inline-formula><mml:math id="M32"><mml:msub><mml:mi>F</mml:mi><mml:mi>c</mml:mi></mml:msub></mml:math></inline-formula> and weights of the channels <inline-formula><mml:math id="M33"><mml:mi>C</mml:mi><mml:msub><mml:mi>A</mml:mi><mml:mi>c</mml:mi></mml:msub></mml:math></inline-formula>, the output of the channel attention <inline-formula><mml:math id="M34"><mml:msubsup><mml:mi>F</mml:mi><mml:mi>c</mml:mi><mml:mo>&#x2217;</mml:mo></mml:msubsup></mml:math></inline-formula> can be deduced as follows:</p>
<disp-formula id="EQ7"><label>(6)</label><mml:math id="M35"><mml:msubsup><mml:mi>F</mml:mi><mml:mi>c</mml:mi><mml:mo>&#x2217;</mml:mo></mml:msubsup><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:msub><mml:mi>A</mml:mi><mml:mi>c</mml:mi></mml:msub><mml:mo>&#x2297;</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mi>c</mml:mi></mml:msub></mml:math></disp-formula>
</sec>
<sec id="sec11">
<label>3.4.2</label>
<title>Pixel attention (PA)</title>
<p>To capture fine-grained details about the spatial context, pixel attention (PA) mechanisms actively focus on specific pixels within the entire area (spatial extent) of the feature maps. The concept of attention mechanisms in CNNs, including those that focus on pixel-level details, has been explored in various research studies (e.g., <xref ref-type="bibr" rid="ref35">Ismail Fawaz et al., 2019</xref>; <xref ref-type="bibr" rid="ref11">Dosovitskiy et al., 2020</xref>). Inspired by CA (<xref ref-type="bibr" rid="ref31">Hu et al., 2018</xref>) and spatial attention (SA) (<xref ref-type="bibr" rid="ref84">Woo et al., 2018</xref>), PA is used to improve the feature representation capacity and obtain images with clear object boundaries. Comparable to CA, the input <inline-formula><mml:math id="M36"><mml:msubsup><mml:mi>F</mml:mi><mml:mi>c</mml:mi><mml:mo>&#x2217;</mml:mo></mml:msubsup></mml:math></inline-formula> (i.e., the output of the channel attention block) is fed through two convolution layers with a ReLU activation in between, followed by a sigmoid function (<xref ref-type="fig" rid="fig5">Figure 5B</xref>). The shape of the feature map changes from <inline-formula><mml:math id="M37"><mml:mi>C</mml:mi><mml:mspace width="0.25em"/><mml:mi>x</mml:mi><mml:mspace width="0.25em"/><mml:mi>H</mml:mi><mml:mspace width="0.25em"/><mml:mi>x</mml:mi><mml:mspace width="0.25em"/><mml:mi>W</mml:mi></mml:math></inline-formula> to 1<inline-formula><mml:math id="M38"><mml:mi>x</mml:mi><mml:mspace width="0.25em"/><mml:mi>H</mml:mi><mml:mspace width="0.25em"/><mml:mi>x</mml:mi><mml:mspace width="0.25em"/><mml:mi>W</mml:mi></mml:math></inline-formula>.</p>
<disp-formula id="E2"><label>(7)</label><mml:math id="M39"><mml:mi>P</mml:mi><mml:mi>A</mml:mi><mml:mo>=</mml:mo><mml:mi>&#x03C3;</mml:mi><mml:mfenced open="(" close=")"><mml:mrow><mml:mi mathvariant="italic">Conv</mml:mi><mml:mfenced open="(" close=")"><mml:mrow><mml:mi>&#x03B4;</mml:mi><mml:mfenced open="(" close=")"><mml:mrow><mml:mi mathvariant="italic">Conv</mml:mi><mml:mfenced open="(" close=")"><mml:msubsup><mml:mi>F</mml:mi><mml:mi>c</mml:mi><mml:mo>&#x2217;</mml:mo></mml:msubsup></mml:mfenced></mml:mrow></mml:mfenced></mml:mrow></mml:mfenced></mml:mrow></mml:mfenced></mml:math></disp-formula>
<p>Recall that activation maps are often passed elementwise through an activation function such as ReLU. Therefore, by elementwise multiplication of <inline-formula><mml:math id="M40"><mml:msubsup><mml:mi>F</mml:mi><mml:mi>c</mml:mi><mml:mo>&#x2217;</mml:mo></mml:msubsup></mml:math></inline-formula> and PA, the feature attention block (FAB) output <inline-formula><mml:math id="M41"><mml:mover accent="true"><mml:mi>F</mml:mi><mml:mo stretchy="true">&#x02DC;</mml:mo></mml:mover></mml:math></inline-formula> is given by:</p>
<disp-formula id="EQ9"><label>(8)</label><mml:math id="M42"><mml:mover accent="true"><mml:mi>F</mml:mi><mml:mo stretchy="true">&#x02DC;</mml:mo></mml:mover><mml:mo>=</mml:mo><mml:msubsup><mml:mi>F</mml:mi><mml:mi>c</mml:mi><mml:mo>&#x2217;</mml:mo></mml:msubsup><mml:mo>&#x2297;</mml:mo><mml:mi>P</mml:mi><mml:mi>A</mml:mi></mml:math></disp-formula>
<p>Integrating Channel Attention and Pixel Attention within CNNs empowers the network to learn both the overall image context and the finer details of specific regions simultaneously. This leads to stronger and more informative feature representations, improving the network&#x2019;s ability to distinguish objects. Recent research (e.g., <xref ref-type="bibr" rid="ref31">Hu et al., 2018</xref>; <xref ref-type="bibr" rid="ref35">Ismail Fawaz et al., 2019</xref>; <xref ref-type="bibr" rid="ref11">Dosovitskiy et al., 2020</xref>) has explored this combined approach to enhance CNN performance in various computer vision tasks like image classification, object detection, and semantic segmentation.</p>
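<p>Continuing the sketch above, a hypothetical pixel attention module and its composition with a channel attention module into a feature attention block could be written as follows; the layer sizes are again illustrative assumptions rather than the exact FAB configuration.</p>
<preformat>
# Illustrative sketch of Eqs. (7)-(8); layer sizes are assumptions.
import torch
import torch.nn as nn

class PixelAttention(nn.Module):
    """conv-ReLU-conv-sigmoid producing a 1 x H x W attention map (Eq. 7)."""
    def __init__(self, channels, reduction=8):
        super().__init__()
        self.weights = nn.Sequential(
            nn.Conv2d(channels, channels // reduction, 1),
            nn.ReLU(inplace=True),
            nn.Conv2d(channels // reduction, 1, 1),
            nn.Sigmoid(),
        )

    def forward(self, f_star):
        return f_star * self.weights(f_star)    # Eq. (8): elementwise spatial re-weighting

class FeatureAttentionBlock(nn.Module):
    """FAB: channel attention followed by pixel attention (any CA module can be plugged in)."""
    def __init__(self, channel_attention, pixel_attention):
        super().__init__()
        self.ca = channel_attention
        self.pa = pixel_attention

    def forward(self, f):
        return self.pa(self.ca(f))

# With an identity in place of channel attention, the block reduces to plain PA
fab = FeatureAttentionBlock(nn.Identity(), PixelAttention(64))
print(fab(torch.randn(2, 64, 32, 32)).shape)
</preformat>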
</sec>
</sec>
<sec id="sec12">
<label>3.5</label>
<title>Block structure (BBS)</title>
<p>The performance of deep neural networks has improved significantly since attention mechanisms (<xref ref-type="bibr" rid="ref87">Xu et al., 2015</xref>; <xref ref-type="bibr" rid="ref79">Vaswani et al., 2017</xref>; <xref ref-type="bibr" rid="ref82">Wang et al., 2018</xref>) and residual connections (<xref ref-type="bibr" rid="ref29">He et al., 2016</xref>) were introduced for training deep networks. The design of the BBS <inline-formula><mml:math id="M43"><mml:mfenced open="(" close=")"><mml:msub><mml:mi>B</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mfenced></mml:math></inline-formula> is built on the combination of these two concepts. As shown in <xref ref-type="fig" rid="fig6">Figure 6</xref>, the BBS consists of multiple local residual learning (LRL) skip connections and a FAB. Local residual learning permits low-frequency details to be bypassed through the local skips, allowing the main network to learn discriminatively useful information. The combination of several basic block structures with skip connections increases the depth and capability of the ARB and helps overcome training challenges.</p>
<fig position="float" id="fig6">
<label>Figure 6</label>
<caption>
<p>Basic block structure.</p>
</caption>
<graphic xlink:href="frai-07-1347898-g006.tif"/>
</fig>
<p>A two-layer convolutional network is implemented at the end of the ARB network (as shown in <xref ref-type="fig" rid="fig4">Figure 4</xref>), and a long-skip-connection global residual learning module is employed as a recovery strategy to restore the input camouflaged image.</p>
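<p>The overall shape of this structure can be sketched as follows: each basic block applies a convolution and an attention step behind a local residual skip, and a trailing two-layer convolution with a long (global) skip recovers the restored image. The attention module is left as a placeholder here, and the block count and channel width are assumptions for illustration.</p>
<preformat>
# Illustrative sketch of the basic block structure and global residual tail.
import torch
import torch.nn as nn

class BasicBlock(nn.Module):
    """Conv plus placeholder attention behind a local residual learning (LRL) skip."""
    def __init__(self, channels, attention=None):
        super().__init__()
        self.conv = nn.Conv2d(channels, channels, 3, padding=1)
        self.attn = attention if attention is not None else nn.Identity()

    def forward(self, x):
        return x + self.attn(self.conv(x))     # low-frequency content bypassed via the skip

class RestorationTail(nn.Module):
    """Two-layer conv at the end of the ARB with a long global residual skip."""
    def __init__(self, channels, out_channels=3):
        super().__init__()
        self.blocks = nn.Sequential(BasicBlock(channels), BasicBlock(channels))
        self.tail = nn.Sequential(
            nn.Conv2d(channels, channels, 3, padding=1),
            nn.Conv2d(channels, out_channels, 3, padding=1),
        )

    def forward(self, features, image):
        return image + self.tail(self.blocks(features))    # global residual learning

restored = RestorationTail(64)(torch.randn(1, 64, 64, 64), torch.randn(1, 3, 64, 64))
print(restored.shape)
</preformat>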
</sec>
<sec id="sec13">
<label>3.6</label>
<title>Feature fusion attention strategy</title>
<p>Shallow feature information can be difficult to retain as the network gets deeper. U-Net (<xref ref-type="bibr" rid="ref69">Ronneberger et al., 2015</xref>) and other networks strive to fuse shallow and deep features from different levels. As depicted in <xref ref-type="fig" rid="fig4">Figure 4</xref>, the feature maps produced by the &#x1D43A; group architectures are concatenated in the channel direction. Following the FAB weighting strategy, the retained low-level features in the shallow layers, which carry edge information and preserve the spatial details needed to establish object boundaries, are fed into the deep layers, allowing the ARB network (ARB-Net) to focus more on semantic information such as high-frequency textures and thereby improve the visibility of hidden objects in real-world scenes.</p>
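<p>A rough sketch of this fusion step is shown below. It assumes that every group outputs feature maps of the same shape and simplifies the FAB-based weighting to a learned 1 x 1 convolution, so it illustrates the data flow rather than the exact fusion network.</p>
<preformat>
# Illustrative sketch of channel-direction fusion; the weighting is simplified.
import torch
import torch.nn as nn

class FeatureFusion(nn.Module):
    """Concatenate G group outputs along the channel axis, then learn fusion weights."""
    def __init__(self, channels, num_groups):
        super().__init__()
        self.reduce = nn.Conv2d(channels * num_groups, channels, 1)   # simplified weighting
        self.refine = nn.Conv2d(channels, channels, 3, padding=1)

    def forward(self, group_features):
        fused = torch.cat(group_features, dim=1)    # channel-direction concatenation
        return self.refine(self.reduce(fused))

# Example: fuse the outputs of three groups
groups = [torch.randn(2, 64, 32, 32) for _ in range(3)]
print(FeatureFusion(64, 3)(groups).shape)
</preformat>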
</sec>
<sec id="sec14">
<label>3.7</label>
<title>Loss function</title>
<p>According to <xref ref-type="bibr" rid="ref44">Lim et al. (2017)</xref>, training with L1 loss often outperformed training with L2 loss for image restoration tasks. Following the same strategy, we adopted L1 loss as our default loss function for training the ARB-Net. The total loss function L is:</p>
<disp-formula id="EQ10"><label>(9)</label><mml:math id="M44"><mml:mi>L</mml:mi><mml:mfenced open="(" close=")"><mml:mi>&#x0398;</mml:mi></mml:mfenced><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>N</mml:mi></mml:mfrac><mml:mspace width="0.25em"/><mml:mstyle displaystyle="true"><mml:msubsup><mml:mo stretchy="true">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:msubsup><mml:mrow><mml:mfenced open="&#x2016;" close="&#x2016;"><mml:mrow><mml:msubsup><mml:mi>I</mml:mi><mml:mrow><mml:mi>g</mml:mi><mml:mi>t</mml:mi></mml:mrow><mml:mi>i</mml:mi></mml:msubsup><mml:mo>&#x2212;</mml:mo><mml:mi>A</mml:mi><mml:mi>R</mml:mi><mml:mi>B</mml:mi><mml:mfenced open="(" close=")"><mml:msubsup><mml:mi>I</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi></mml:msubsup></mml:mfenced></mml:mrow></mml:mfenced></mml:mrow></mml:mstyle></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="M45"><mml:mi>&#x0398;</mml:mi></mml:math></inline-formula> represents the ARB-Net parameters, <inline-formula><mml:math id="M46"><mml:msubsup><mml:mi>I</mml:mi><mml:mrow><mml:mi>g</mml:mi><mml:mi>t</mml:mi></mml:mrow><mml:mi>i</mml:mi></mml:msubsup></mml:math></inline-formula> stands for the ground truth, and <inline-formula><mml:math id="M47"><mml:msubsup><mml:mi>I</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi></mml:msubsup></mml:math></inline-formula> stands for the real-world camouflaged input image. The proposed ARB-Net extends the hyperparameters detailed in <xref ref-type="bibr" rid="ref63">Qin et al. (2020)</xref>, covering vital settings such as image size, learning rate, optimizer, batch size, and loss function. The configuration of the adaptive restoration block (ARB) was selected through a systematic approach combining experimentation, domain knowledge, and optimization techniques. Leveraging our understanding of camouflaged object detection and image restoration, we fine-tuned the hyperparameters to meet the unique demands of the task, and through iterative adjustments and validation on test data we identified the most effective configuration for the ARB. This ensures that the ARB-Net is well tuned for the intricate domain of camouflaged object detection, enhancing its performance and applicability in real-world scenarios.</p>
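<p>In PyTorch terms, Equation (9) corresponds to the mean absolute error between the restored image and the ground truth. The short training-step sketch below illustrates how the objective is applied; the model and optimizer are placeholders rather than our exact training setup.</p>
<preformat>
# Illustrative training step for the L1 objective of Eq. (9); names are placeholders.
import torch
import torch.nn as nn

def train_step(arb_net, optimizer, camouflaged_batch, ground_truth_batch):
    """One optimization step minimizing the mean absolute error of Eq. (9)."""
    criterion = nn.L1Loss()                        # mean absolute error over the batch
    optimizer.zero_grad()
    restored = arb_net(camouflaged_batch)          # ARB(I_c)
    loss = criterion(restored, ground_truth_batch)
    loss.backward()
    optimizer.step()
    return loss.item()
</preformat>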
</sec>
<sec id="sec15">
<label>3.8</label>
<title>Cascaded detection block</title>
<sec id="sec16">
<label>3.8.1</label>
<title>Sensory module (SM)</title>
<p>According to a neuroscience study by <xref ref-type="bibr" rid="ref40">Langley et al. (1996)</xref>, when prey indiscriminately hides in the background, selective search attention (<xref ref-type="bibr" rid="ref68">Riley and Roitblat, 2018</xref>) plays a significant role in the predatory sensory mechanism to reduce non-prey details, thus saving computational time. To take advantage of the sensory mechanism, search attention is used in the initial feature learning to select and aggregate semantic features from the restored camouflage image <inline-formula><mml:math id="M48"><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mi>A</mml:mi><mml:mi>R</mml:mi><mml:mi>B</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> in the previous section.</p>
<p>Given an input image <inline-formula><mml:math id="M49"><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mi>A</mml:mi><mml:mi>R</mml:mi><mml:mi>B</mml:mi></mml:mrow></mml:msub><mml:mspace width="0.25em"/><mml:mi>&#x03F5;</mml:mi><mml:mspace width="0.25em"/><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mi>W</mml:mi><mml:mspace width="0.25em"/><mml:mi>x</mml:mi><mml:mspace width="0.25em"/><mml:mi>H</mml:mi><mml:mspace width="0.25em"/><mml:mi>x</mml:mi><mml:mn>3</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> (the output of the ARB), a set of features <inline-formula><mml:math id="M50"><mml:mfenced open="{" close="}"><mml:mrow><mml:msub><mml:mi>f</mml:mi><mml:mi>k</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:mi>k</mml:mi><mml:mspace width="0.25em"/><mml:mi>&#x03F5;</mml:mi><mml:mspace width="0.25em"/><mml:mfenced open="{" close="}" separators=",,,,"><mml:mn>1</mml:mn><mml:mn>2</mml:mn><mml:mn>3</mml:mn><mml:mn>4</mml:mn><mml:mn>5</mml:mn></mml:mfenced></mml:mrow></mml:mfenced></mml:math></inline-formula> is extracted from the ResNet-50 (<xref ref-type="bibr" rid="ref29">He et al., 2016</xref>) backbone architecture. The resolution of each feature <inline-formula><mml:math id="M51"><mml:msub><mml:mi>f</mml:mi><mml:mi>k</mml:mi></mml:msub></mml:math></inline-formula> is <inline-formula><mml:math id="M52"><mml:mfrac><mml:mi>H</mml:mi><mml:msup><mml:mn>2</mml:mn><mml:mi>k</mml:mi></mml:msup></mml:mfrac><mml:mspace width="0.25em"/><mml:mi>x</mml:mi></mml:math></inline-formula> <inline-formula><mml:math id="M53"><mml:mfrac><mml:mi>W</mml:mi><mml:msup><mml:mn>2</mml:mn><mml:mi>k</mml:mi></mml:msup></mml:mfrac></mml:math></inline-formula>, <inline-formula><mml:math id="M54"><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mfenced open="{" close="}" separators=",,,,"><mml:mn>4</mml:mn><mml:mn>4</mml:mn><mml:mn>8</mml:mn><mml:mn>16</mml:mn><mml:mn>32</mml:mn></mml:mfenced><mml:mtext>.</mml:mtext></mml:math></inline-formula> Studies by <xref ref-type="bibr" rid="ref46">Lin et al. (2017)</xref> demonstrated that high-level features in deep layers keep semantic information for finding objects, whereas low-level features in shallow layers preserve spatial details for establishing object boundaries. Based on this property of neural networks, the extracted features are categorized as low-level <inline-formula><mml:math id="M55"><mml:mfenced open="{" close="}" separators=","><mml:msub><mml:mi>X</mml:mi><mml:mn>0</mml:mn></mml:msub><mml:msub><mml:mi>X</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mfenced></mml:math></inline-formula>, intermediate-level <inline-formula><mml:math id="M56"><mml:mfenced open="{" close="}"><mml:msub><mml:mi>X</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mfenced></mml:math></inline-formula>, and high-level features <inline-formula><mml:math id="M57"><mml:mfenced open="{" close="}" separators=","><mml:msub><mml:mi>X</mml:mi><mml:mn>3</mml:mn></mml:msub><mml:msub><mml:mi>X</mml:mi><mml:mn>4</mml:mn></mml:msub></mml:mfenced><mml:mspace width="0.25em"/><mml:mtext>,</mml:mtext></mml:math></inline-formula> which are later fused through concatenation, up-sampling, and down-sampling operations; thereafter, a dense convolutional network strategy (<xref ref-type="bibr" rid="ref33">Huang et al., 2017</xref>) is leveraged to preserve more information from different layers, and a modified receptive field block (<xref ref-type="bibr" rid="ref49">Liu and Huang, 2018</xref>) is used to enlarge the receptive field and output a set of enhanced features.</p>
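<p>The sketch below illustrates, under our own assumptions rather than the authors&#x2019; released code, how five backbone features with the stated down-sampling factors {4, 4, 8, 16, 32} can be taken from a standard torchvision ResNet-50 and grouped into low-, intermediate-, and high-level sets; the subsequent fusion by concatenation, re-sampling, dense connections, and the modified receptive field block is omitted here.</p>
<preformat>
import torch
import torchvision.models as models

# Plain ResNet-50 backbone (He et al., 2016); weights=None keeps the sketch
# self-contained, whereas ImageNet-pretrained weights would normally be used.
backbone = models.resnet50(weights=None)

def extract_feature_groups(i_arb):
    """i_arb: restored image tensor of shape (B, 3, H, W) from the ARB."""
    x = backbone.maxpool(backbone.relu(backbone.bn1(backbone.conv1(i_arb))))
    x0 = x                    # low-level, H/4 x W/4
    x1 = backbone.layer1(x0)  # low-level, H/4 x W/4
    x2 = backbone.layer2(x1)  # intermediate-level, H/8 x W/8
    x3 = backbone.layer3(x2)  # high-level, H/16 x W/16
    x4 = backbone.layer4(x3)  # high-level, H/32 x W/32
    return (x0, x1), (x2,), (x3, x4)
</preformat>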
</sec>
<sec id="sec17">
<label>3.8.2</label>
<title>Identification module (IM)</title>
<p>In the identification module, disguised objects need to be precisely identified using the output features obtained from the previous sensory module. Following the identification network of (<xref ref-type="bibr" rid="ref15">Fan et al., 2020a</xref>), our final context-aware camouflaged object prediction maps with refined boundaries are generated.</p>
</sec>
</sec>
</sec>
<sec sec-type="results" id="sec18">
<label>4</label>
<title>Results</title>
<p>To demonstrate the generality of our newly proposed DiCANet COD model, the ARB-Net goes through a fine-tuning stage with different key network parameters and is trained on local image patches to perform restoration for more complex image background scenarios. For optimal results that preserve the camouflaged object&#x2019;s latent spectral content and structural details, the number of Group Structures 𝐺 and the number of Basic Block Structures 𝐵 per group are set to 3 and 19, respectively, in the ARB. The filter size for all convolution layers is set to <inline-formula><mml:math id="M58"><mml:mn>3</mml:mn><mml:mo>&#x2217;</mml:mo><mml:mn>3</mml:mn></mml:math></inline-formula>, except for the Channel Attention, whose kernel size is <inline-formula><mml:math id="M59"><mml:mn>1</mml:mn><mml:mo>&#x2217;</mml:mo><mml:mn>1</mml:mn></mml:math></inline-formula>. Additionally, all feature maps maintain a fixed size except within the Channel Attention module, and each Group Structure outputs 64 filters.</p>
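<p>A minimal sketch of how these reported hyperparameters could be assembled is given below, assuming a residual channel-attention layout; the class names, the squeeze-and-excitation-style attention, and the skip connections are illustrative assumptions on our part, and the pixel-wise attention and feature fusion used by the full ARB are not reproduced here.</p>
<preformat>
import torch.nn as nn

class ChannelAttention(nn.Module):
    """Channel attention built from 1x1 convolutions, the only non-3x3 kernels."""
    def __init__(self, channels=64, reduction=16):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Conv2d(channels, channels // reduction, kernel_size=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(channels // reduction, channels, kernel_size=1),
            nn.Sigmoid())

    def forward(self, x):
        return x * self.fc(self.pool(x))  # re-weight each channel

class BasicBlock(nn.Module):
    """One Basic Block Structure: 3x3 convolutions, channel attention, skip."""
    def __init__(self, channels=64):
        super().__init__()
        self.body = nn.Sequential(
            nn.Conv2d(channels, channels, 3, padding=1), nn.ReLU(inplace=True),
            nn.Conv2d(channels, channels, 3, padding=1),
            ChannelAttention(channels))

    def forward(self, x):
        return x + self.body(x)  # spatial size and 64 channels are preserved

def make_group(num_blocks=19, channels=64):
    """One Group Structure: B = 19 basic blocks, each outputting 64 maps."""
    return nn.Sequential(*[BasicBlock(channels) for _ in range(num_blocks)])

# G = 3 group structures, as reported for the final ARB configuration.
arb_trunk = nn.Sequential(*[make_group() for _ in range(3)])
</preformat>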
</sec>
<sec sec-type="discussion" id="sec19">
<label>5</label>
<title>Discussion</title>
<sec id="sec20">
<label>5.1</label>
<title>Experimental settings</title>
<sec id="sec21">
<label>5.1.1</label>
<title>Training/Testing details</title>
<p>ARB-Net builds on the training settings of <xref ref-type="bibr" rid="ref63">Qin et al. (2020)</xref>, while the CDB follows the hyperparameter configurations of <xref ref-type="bibr" rid="ref15">Fan et al. (2020a)</xref>. We evaluate the DiCANet models on the whole CHAMELEON dataset (<xref ref-type="bibr" rid="ref72">Skurowski et al., 2018</xref>) and on the test sets of CAMO (<xref ref-type="bibr" rid="ref42">Le et al., 2019</xref>) and COD10K (<xref ref-type="bibr" rid="ref15">Fan et al., 2020a</xref>). The entire experiment was executed on a 2.2&#x2009;GHz dual-core Intel Core i7 CPU with 8&#x2009;GB of RAM, using Google Colab as our working interface. Evaluation metrics: we adopt four benchmark evaluation metrics to assess the performance of the DiCANet model: S-measure (<xref ref-type="bibr" rid="ref13">Fan et al., 2017</xref>), mean E-measure (<xref ref-type="bibr" rid="ref14">Fan et al., 2018</xref>), weighted F-measure (<xref ref-type="bibr" rid="ref55">Margolin et al., 2014</xref>), and mean absolute error (MAE).</p>
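<p>Of these four metrics, only the mean absolute error has a one-line definition; the sketch below (our own illustration, not the official benchmark toolbox) computes it for prediction maps normalized to [0, 1], while the S-measure, mean E-measure, and weighted F-measure follow the definitions in the cited papers.</p>
<preformat>
import numpy as np

def mean_absolute_error(pred, gt):
    """MAE between a predicted camouflage map and the binary ground-truth mask.
    Both arrays must share the same spatial size and lie in [0, 1]."""
    pred = pred.astype(np.float64)
    gt = gt.astype(np.float64)
    return np.abs(pred - gt).mean()

# Illustrative usage over a test set of (prediction, ground-truth) pairs:
# dataset_mae = float(np.mean([mean_absolute_error(p, g) for p, g in pairs]))
</preformat>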
</sec>
</sec>
<sec id="sec22">
<label>5.2</label>
<title>Baseline models</title>
<p>To demonstrate the robustness of DiCANet, this research selected 13 strong baseline methods that adopted ResNet50 (<xref ref-type="bibr" rid="ref29">He et al., 2016</xref>) as the backbone network for feature extraction and achieved SOTA performance in related fields such as GOD and SOD: object detection FPN (<xref ref-type="bibr" rid="ref46">Lin et al., 2017</xref>); semantic segmentation PSPNet (<xref ref-type="bibr" rid="ref93">Zhao et al., 2017</xref>); instance segmentation Mask RCNN (<xref ref-type="bibr" rid="ref27">He et al., 2017</xref>), HTC (<xref ref-type="bibr" rid="ref4">Chen et al., 2019</xref>), and MSRCNN (<xref ref-type="bibr" rid="ref32">Huang et al., 2019</xref>); medical image segmentation UNet++ (<xref ref-type="bibr" rid="ref97">Zhou et al., 2018</xref>) and PraNet (<xref ref-type="bibr" rid="ref16">Fan et al., 2020b</xref>); salient object detection PiCANet (<xref ref-type="bibr" rid="ref48">Liu et al., 2018</xref>), BASNet (<xref ref-type="bibr" rid="ref64">Qin et al., 2019</xref>), CPD (<xref ref-type="bibr" rid="ref86">Wu et al., 2019</xref>), PFANet (<xref ref-type="bibr" rid="ref94">Zhao and Wu, 2019</xref>), and EGNet (<xref ref-type="bibr" rid="ref92">Zhao J. X. et al., 2019</xref>); and camouflaged object segmentation SINet (<xref ref-type="bibr" rid="ref15">Fan et al., 2020a</xref>).</p>
</sec>
<sec id="sec23">
<label>5.3</label>
<title>Quantitative comparison</title>
<p><xref ref-type="table" rid="tab1">Table 1</xref> summarizes the quantitative results of different baselines on three standard COD datasets. The proposed model achieved the highest values for the evaluation metrics, which indicates superior performance.</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>Quantitative comparison in terms of <inline-formula><mml:math id="M60"><mml:msub><mml:mi>S</mml:mi><mml:mo>&#x221D;</mml:mo></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math id="M61"><mml:msub><mml:mi>E</mml:mi><mml:mi>&#x03D5;</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msubsup><mml:mi>F</mml:mi><mml:mi>&#x03B2;</mml:mi><mml:mi>&#x03C9;</mml:mi></mml:msubsup><mml:mo>,</mml:mo><mml:mi mathvariant="italic">and</mml:mi><mml:mspace width="0.25em"/><mml:mi>M</mml:mi></mml:math></inline-formula> on three benchmark COD datasets (<xref ref-type="bibr" rid="ref15">Fan et al., 2020a</xref>).</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top" rowspan="2">Baseline models</th>
<th align="center" valign="top" colspan="4">CHAMELEON</th>
<th align="center" valign="top" colspan="4">CAMO &#x2013; Test</th>
<th align="center" valign="top" colspan="4">COD10K &#x2013; Test</th>
</tr>
<tr>
<th align="center" valign="top">
<inline-formula><mml:math id="M62"><mml:msub><mml:mi>S</mml:mi><mml:mo>&#x221D;</mml:mo></mml:msub><mml:mo>&#x2191;</mml:mo></mml:math></inline-formula>
</th>
<th align="center" valign="top">
<inline-formula><mml:math id="M63"><mml:msub><mml:mi>E</mml:mi><mml:mi>&#x03D5;</mml:mi></mml:msub><mml:mo>&#x2191;</mml:mo></mml:math></inline-formula>
</th>
<th align="center" valign="top">
<inline-formula><mml:math id="M64"><mml:msubsup><mml:mi>F</mml:mi><mml:mi>&#x03B2;</mml:mi><mml:mi>&#x03C9;</mml:mi></mml:msubsup><mml:mo>&#x2191;</mml:mo></mml:math></inline-formula>
</th>
<th align="center" valign="top">
<inline-formula><mml:math id="M65"><mml:mi>M</mml:mi><mml:mo>&#x2193;</mml:mo></mml:math></inline-formula>
</th>
<th align="center" valign="top">
<inline-formula><mml:math id="M66"><mml:msub><mml:mi>S</mml:mi><mml:mo>&#x221D;</mml:mo></mml:msub><mml:mo>&#x2191;</mml:mo></mml:math></inline-formula>
</th>
<th align="center" valign="top">
<inline-formula><mml:math id="M67"><mml:msub><mml:mi>E</mml:mi><mml:mi>&#x03D5;</mml:mi></mml:msub><mml:mo>&#x2191;</mml:mo></mml:math></inline-formula>
</th>
<th align="center" valign="top">
<inline-formula><mml:math id="M68"><mml:msubsup><mml:mi>F</mml:mi><mml:mi>&#x03B2;</mml:mi><mml:mi>&#x03C9;</mml:mi></mml:msubsup><mml:mo>&#x2191;</mml:mo></mml:math></inline-formula>
</th>
<th align="center" valign="top">
<inline-formula><mml:math id="M69"><mml:mi>M</mml:mi><mml:mo>&#x2193;</mml:mo></mml:math></inline-formula>
</th>
<th align="center" valign="top">
<inline-formula><mml:math id="M70"><mml:msub><mml:mi>S</mml:mi><mml:mo>&#x221D;</mml:mo></mml:msub><mml:mo>&#x2191;</mml:mo></mml:math></inline-formula>
</th>
<th align="center" valign="top">
<inline-formula><mml:math id="M71"><mml:msub><mml:mi>E</mml:mi><mml:mi>&#x03D5;</mml:mi></mml:msub><mml:mo>&#x2191;</mml:mo></mml:math></inline-formula>
</th>
<th align="center" valign="top">
<inline-formula><mml:math id="M72"><mml:msubsup><mml:mi>F</mml:mi><mml:mi>&#x03B2;</mml:mi><mml:mi>&#x03C9;</mml:mi></mml:msubsup><mml:mo>&#x2191;</mml:mo></mml:math></inline-formula>
</th>
<th align="center" valign="top">
<inline-formula><mml:math id="M73"><mml:mi>M</mml:mi><mml:mo>&#x2193;</mml:mo></mml:math></inline-formula>
</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">FPN</td>
<td align="center" valign="top">0.794</td>
<td align="center" valign="top">0.783</td>
<td align="center" valign="top">0.590</td>
<td align="center" valign="top">0.075</td>
<td align="center" valign="top">0.684</td>
<td align="center" valign="top">0.677</td>
<td align="center" valign="top">0.483</td>
<td align="center" valign="top">0.131</td>
<td align="center" valign="top">0.697</td>
<td align="center" valign="top">0.691</td>
<td align="center" valign="top">0.411</td>
<td align="center" valign="top">0.075</td>
</tr>
<tr>
<td align="left" valign="top">MaskRCNN</td>
<td align="center" valign="top">0.643</td>
<td align="center" valign="top">0.778</td>
<td align="center" valign="top">0.518</td>
<td align="center" valign="top">0.099</td>
<td align="center" valign="top">0.574</td>
<td align="center" valign="top">0.715</td>
<td align="center" valign="top">0.430</td>
<td align="center" valign="top">0.151</td>
<td align="center" valign="top">0.613</td>
<td align="center" valign="top">0.748</td>
<td align="center" valign="top">0.402</td>
<td align="center" valign="top">0.080</td>
</tr>
<tr>
<td align="left" valign="top">PSPNet</td>
<td align="center" valign="top">0.773</td>
<td align="center" valign="top">0.758</td>
<td align="center" valign="top">0.555</td>
<td align="center" valign="top">0.085</td>
<td align="center" valign="top">0.663</td>
<td align="center" valign="top">0.659</td>
<td align="center" valign="top">0.455</td>
<td align="center" valign="top">0.139</td>
<td align="center" valign="top">0.678</td>
<td align="center" valign="top">0.680</td>
<td align="center" valign="top">0.377</td>
<td align="center" valign="top">0.080</td>
</tr>
<tr>
<td align="left" valign="top">UNet++</td>
<td align="center" valign="top">0.695</td>
<td align="center" valign="top">0.762</td>
<td align="center" valign="top">0.501</td>
<td align="center" valign="top">0.094</td>
<td align="center" valign="top">0.599</td>
<td align="center" valign="top">0.653</td>
<td align="center" valign="top">0.392</td>
<td align="center" valign="top">0.149</td>
<td align="center" valign="top">0.623</td>
<td align="center" valign="top">0.672</td>
<td align="center" valign="top">0.350</td>
<td align="center" valign="top">0.086</td>
</tr>
<tr>
<td align="left" valign="top">PiCANet</td>
<td align="center" valign="top">0.769</td>
<td align="center" valign="top">0.749</td>
<td align="center" valign="top">0.536</td>
<td align="center" valign="top">0.085</td>
<td align="center" valign="top">0.609</td>
<td align="center" valign="top">0.584</td>
<td align="center" valign="top">0.356</td>
<td align="center" valign="top">0.156</td>
<td align="center" valign="top">0.649</td>
<td align="center" valign="top">0.643</td>
<td align="center" valign="top">0.322</td>
<td align="center" valign="top">0.090</td>
</tr>
<tr>
<td align="left" valign="top">MSRCNN</td>
<td align="center" valign="top">0.637</td>
<td align="center" valign="top">0.686</td>
<td align="center" valign="top">0.443</td>
<td align="center" valign="top">0.091</td>
<td align="center" valign="top">0.617</td>
<td align="center" valign="top">0.669</td>
<td align="center" valign="top">0.454</td>
<td align="center" valign="top">0.133</td>
<td align="center" valign="top">0.641</td>
<td align="center" valign="top">0.706</td>
<td align="center" valign="top">0.419</td>
<td align="center" valign="top">0.073</td>
</tr>
<tr>
<td align="left" valign="top">BASNet</td>
<td align="center" valign="top">0.687</td>
<td align="center" valign="top">0.721</td>
<td align="center" valign="top">0.474</td>
<td align="center" valign="top">0.118</td>
<td align="center" valign="top">0.618</td>
<td align="center" valign="top">0.661</td>
<td align="center" valign="top">0.413</td>
<td align="center" valign="top">0.159</td>
<td align="center" valign="top">0.634</td>
<td align="center" valign="top">0.678</td>
<td align="center" valign="top">0.365</td>
<td align="center" valign="top">0.105</td>
</tr>
<tr>
<td align="left" valign="top">PFANet</td>
<td align="center" valign="top">0.679</td>
<td align="center" valign="top">0.648</td>
<td align="center" valign="top">0.378</td>
<td align="center" valign="top">0.144</td>
<td align="center" valign="top">0.659</td>
<td align="center" valign="top">0.622</td>
<td align="center" valign="top">0.391</td>
<td align="center" valign="top">0.172</td>
<td align="center" valign="top">0.636</td>
<td align="center" valign="top">0.618</td>
<td align="center" valign="top">0.286</td>
<td align="center" valign="top">0.128</td>
</tr>
<tr>
<td align="left" valign="top">CPD</td>
<td align="center" valign="top">0.853</td>
<td align="center" valign="top">0.866</td>
<td align="center" valign="top">0.706</td>
<td align="center" valign="top">0.052</td>
<td align="center" valign="top">0.726</td>
<td align="center" valign="top">0.729</td>
<td align="center" valign="top">0.550</td>
<td align="center" valign="top">0.115</td>
<td align="center" valign="top">0.747</td>
<td align="center" valign="top">0.770</td>
<td align="center" valign="top">0.508</td>
<td align="center" valign="top">0.059</td>
</tr>
<tr>
<td align="left" valign="top">HTC</td>
<td align="center" valign="top">0.517</td>
<td align="center" valign="top">0.489</td>
<td align="center" valign="top">0.204</td>
<td align="center" valign="top">0.129</td>
<td align="center" valign="top">0.476</td>
<td align="center" valign="top">0.442</td>
<td align="center" valign="top">0.174</td>
<td align="center" valign="top">0.172</td>
<td align="center" valign="top">0.548</td>
<td align="center" valign="top">0.520</td>
<td align="center" valign="top">0.221</td>
<td align="center" valign="top">0.088</td>
</tr>
<tr>
<td align="left" valign="top">EGNet</td>
<td align="center" valign="top">0.848</td>
<td align="center" valign="top">0.870</td>
<td align="center" valign="top">0.702</td>
<td align="center" valign="top">0.050</td>
<td align="center" valign="top">0.732</td>
<td align="center" valign="top">0.768</td>
<td align="center" valign="top">0.583</td>
<td align="center" valign="top">0.104</td>
<td align="center" valign="top">0.737</td>
<td align="center" valign="top">0.779</td>
<td align="center" valign="top">0.509</td>
<td align="center" valign="top">0.056</td>
</tr>
<tr>
<td align="left" valign="top">PraNet</td>
<td align="center" valign="top">0.860</td>
<td align="center" valign="top">0.907</td>
<td align="center" valign="top">0.763</td>
<td align="center" valign="top">0.044</td>
<td align="center" valign="top">0.769</td>
<td align="center" valign="top">0.824</td>
<td align="center" valign="top">0.663</td>
<td align="center" valign="top">0.094</td>
<td align="center" valign="top">0.789</td>
<td align="center" valign="top">0.861</td>
<td align="center" valign="top">0.629</td>
<td align="center" valign="top">0.045</td>
</tr>
<tr>
<td align="left" valign="top">SINet</td>
<td align="center" valign="top">0.869</td>
<td align="center" valign="top">0.891</td>
<td align="center" valign="top">0.740</td>
<td align="center" valign="top">0.044</td>
<td align="center" valign="top">0.751</td>
<td align="center" valign="top">0.771</td>
<td align="center" valign="top">0.606</td>
<td align="center" valign="top">0.100</td>
<td align="center" valign="top">0.771</td>
<td align="center" valign="top">0.806</td>
<td align="center" valign="top">0.551</td>
<td align="center" valign="top">0.051</td>
</tr>
<tr>
<td align="left" valign="top">DiCANet (Ours)</td>
<td align="center" valign="top"><bold>0.871</bold></td>
<td align="center" valign="top"><bold>0.950</bold></td>
<td align="center" valign="top"><bold>0.805</bold></td>
<td align="center" valign="top"><bold>0.034</bold></td>
<td align="center" valign="top">0.747</td>
<td align="center" valign="top"><bold>0.828</bold></td>
<td align="center" valign="top"><bold>0.647</bold></td>
<td align="center" valign="top"><bold>0.091</bold></td>
<td align="center" valign="top"><bold>0.775</bold></td>
<td align="center" valign="top"><bold>0.872</bold></td>
<td align="center" valign="top"><bold>0.629</bold></td>
<td align="center" valign="top"><bold>0.043</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The best scores are highlighted in bold. &#x2191; indicates that a higher score is better, and &#x2193; indicates that a lower score is better.</p>
</table-wrap-foot>
</table-wrap>
<p>For the CAMO dataset, comparing the DiCANet model with the two top-performing baselines, PraNet and SINet, the proposed method reduces <inline-formula><mml:math id="M74"><mml:mi>M</mml:mi></mml:math></inline-formula> by 0.003 and 0.009, respectively, and improves <inline-formula><mml:math id="M75"><mml:msub><mml:mi>E</mml:mi><mml:mi>&#x03D5;</mml:mi></mml:msub></mml:math></inline-formula> and <inline-formula><mml:math id="M76"><mml:msubsup><mml:mi>F</mml:mi><mml:mi>&#x03B2;</mml:mi><mml:mi>&#x03C9;</mml:mi></mml:msubsup></mml:math></inline-formula> over SINet by 0.057 and 0.041, respectively. Although DiCANet achieved a slightly lower structural similarity score <inline-formula><mml:math id="M77"><mml:msub><mml:mi>S</mml:mi><mml:mo>&#x221D;</mml:mo></mml:msub></mml:math></inline-formula>, it still produced accurate predictions with well-preserved edge details and clear boundaries. Similarly, when compared with the edge/boundary-aware models EGNet and PFANet on the CHAMELEON dataset, our DiCANet improves <inline-formula><mml:math id="M78"><mml:msub><mml:mi>E</mml:mi><mml:mi>&#x03D5;</mml:mi></mml:msub></mml:math></inline-formula> and <inline-formula><mml:math id="M79"><mml:msubsup><mml:mi>F</mml:mi><mml:mi>&#x03B2;</mml:mi><mml:mi>&#x03C9;</mml:mi></mml:msubsup></mml:math></inline-formula> by (0.080 and 0.103) and (0.302 and 0.427), respectively, while drastically reducing the MAE by 0.016 and 0.110. On CHAMELEON, DiCANet also achieves a notable improvement in <inline-formula><mml:math id="M80"><mml:msub><mml:mi>S</mml:mi><mml:mo>&#x221D;</mml:mo></mml:msub></mml:math></inline-formula> of 0.011 over the best competing baseline, PraNet. Interestingly, for the most challenging dataset, COD10K, DiCANet outperformed the competition in prediction accuracy on all metrics and boosted performance to a new SOTA.</p>
</sec>
<sec id="sec24">
<label>5.4</label>
<title>Qualitative comparison</title>
<p><xref ref-type="fig" rid="fig7">Figure 7</xref> shows the qualitative comparison of the camouflaged prediction map of DiCANet against the top four cutting-edge models. Row 1 to row 2, (top to bottom) are examples from CHAMELEON datasets; row 3 are examples from CAMO datasets; row 4 is an example from COD10K&#x2019;s super-class: amphibious. It is evident that DiCANet outperforms all competing models and provides the best prediction that is the closest to ground truth (best viewed when zoomed).</p>
<fig position="float" id="fig7">
<label>Figure 7</label>
<caption>
<p>Camouflaged objects segmentation results. <bold>(A)</bold> Image, <bold>(B)</bold> GT, <bold>(C)</bold> DiCANet, <bold>(D)</bold> SINet, <bold>(E)</bold> PraNet, <bold>(F)</bold> EGNet, <bold>(G)</bold> CPD.</p>
</caption>
<graphic xlink:href="frai-07-1347898-g007.tif"/>
</fig>
<p>The compared methods consistently include noncamouflaged regions in their results while neglecting some details of the camouflaged objects; as a consequence, they detect disguised objects inaccurately and produce unreliable visual results. In contrast, the proposed model locates concealed objects accurately, with rich, fine details and clear boundaries in its predictions. In particular, our method captures object boundaries well owing to the ARB&#x2019;s adaptive weighting mechanism and feature fusion strategy.</p>
<sec id="sec25">
<label>5.4.1</label>
<title>Failure case</title>
<p>Despite achieving satisfactory quantitative performance and setting a record in the COD task, the proposed DiCANet framework exhibits limitations in specific scenarios, as shown in <xref ref-type="fig" rid="fig8">Figure 8</xref>. When dealing with multiple camouflaged objects grouped closely together (<italic>row 1</italic>), DiCANet might struggle to accurately predict the number of objects; this limitation can be attributed to the network&#x2019;s limited prior knowledge of scenes containing a specific number of objects. Complicated topological structures with dense details (<italic>row 2</italic>) also pose challenges for DiCANet owing to distraction from background complexity: this complexity overwhelms the attention mechanisms and diverts focus from the camouflaged objects. Additionally, intricate background details can share similar features with the camouflage patterns, making it difficult to distinguish the camouflaged object from its surroundings. These limitations provide valuable insights and potential areas for future investigation. By tackling these challenges and exploring novel approaches, researchers can create more resilient COD systems capable of managing even the most intricate and challenging scenarios.</p>
<fig position="float" id="fig8">
<label>Figure 8</label>
<caption>
<p>Failure cases of our DiCANet. <bold>(A)</bold> Images, <bold>(B)</bold> GT, <bold>(C)</bold> Ours, <bold>(D)</bold> SINet.</p>
</caption>
<graphic xlink:href="frai-07-1347898-g008.tif"/>
</fig>
</sec>
<sec id="sec26">
<label>5.4.2</label>
<title>Ablation study</title>
<p>To further demonstrate the superiority of the DiCANet architecture over previous state-of-the-art methods, we conducted an ablation study considering challenging camouflage scenarios (<xref ref-type="fig" rid="fig9">Figure 9</xref>). The study shows that DiCANet consistently detects and segments concealed objects in challenging natural scenarios, such as partial occlusion (1st row), weak object/background contrast (2nd row), and strong background descriptors (3rd row). Meanwhile, the structural similarity 𝑺&#x221D; scores of DiCANet are much higher, and its errors are minimal (<italic>both shown in red</italic>), compared to the competitors, which further demonstrates the superiority of our method. We can also clearly see that the combination of the proposed adaptive ARB-Net and the <italic>Feature Fusion Attention Strategy</italic> has significantly elevated our results to an exceptional level.</p>
<fig position="float" id="fig9">
<label>Figure 9</label>
<caption>
<p>Visual comparison with top three baselines on COD10K <inline-formula><mml:math id="M81"><mml:mfenced open="(" close=")"><mml:mrow><mml:msub><mml:mi mathvariant="normal">S</mml:mi><mml:mo>&#x221D;</mml:mo></mml:msub><mml:mo stretchy="true">/</mml:mo><mml:mi>M</mml:mi></mml:mrow></mml:mfenced></mml:math></inline-formula>.</p>
</caption>
<graphic xlink:href="frai-07-1347898-g009.tif"/>
</fig>
</sec>
</sec>
</sec>
<sec sec-type="conclusions" id="sec27">
<label>6</label>
<title>Conclusion</title>
<p>This paper presents the Discriminative Context-Aware Network (DiCANet), a novel joint learning framework for detecting concealed objects with refined edges. The proposed model leverages two key components: the ARB-Net and the CDB. To improve camouflage scene visibility, we employed the ARB-Net to adaptively generate different attention weights for each channel- and pixel-wise feature and to strategically fuse the feature maps, expanding the discriminative power and representational ability of the convolutional networks. To drive camouflaged object localization and segmentation performance, we employed the CDB module. Building on the ARB and CDB modules, we proposed a context-aware network that pays more attention to local contextual information when evaluating the objectness of the camouflage prediction map. Extensive experiments show that mining distinctive information can overcome the difficulties of both SOD and COD tasks with superior performance; thus, DiCANet outperforms SOTA methods under the commonly used evaluation metrics and deserves further exploration in other related computer vision tasks.</p>
</sec>
<sec sec-type="data-availability" id="sec28">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding authors.</p>
</sec>
<sec sec-type="author-contributions" id="sec29">
<title>Author contributions</title>
<p>CI: Conceptualization, Data curation, Formal analysis, Investigation, Methodology, Resources, Software, Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. NM: Data curation, Formal analysis, Investigation, Methodology, Project administration, Resources, Software, Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. NB: Conceptualization, Data curation, Formal analysis, Investigation, Project administration, Resources, Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. SA: Data curation, Formal analysis, Funding acquisition, Investigation, Project administration, Resources, Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. FE: Conceptualization, Data curation, Formal analysis, Investigation, Methodology, Project administration, Resources, Supervision, Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing.</p>
</sec>
</body>
<back>
<sec sec-type="funding-information" id="sec30">
<title>Funding</title>
<p>The author(s) declare that no financial support was received for the research, authorship, and/or publication of this article.</p>
</sec>
<ack>
<p>The authors would like to thank Atlantic Technological University, Pak-Austria Fachhochschule Institute of Applied Sciences and Technology, Fatima Jinnah Women University, and Saudi Electronic University.</p>
</ack>
<sec sec-type="COI-statement" id="sec31">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="sec100" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list><title>References</title>
<ref id="ref1"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Abdelhamed</surname> <given-names>A.</given-names></name> <name><surname>Lin</surname> <given-names>S.</given-names></name> <name><surname>Brown</surname> <given-names>M. S.</given-names></name></person-group> (<year>2018</year>). <article-title>A high-quality denoising dataset for smartphone cameras</article-title>, in &#x201C;<conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>&#x201D;, pp. <fpage>1692</fpage>&#x2013;<lpage>1700</lpage>.</citation></ref>
<ref id="ref3"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>L.</given-names></name> <name><surname>Chu</surname> <given-names>X.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Sun</surname> <given-names>J.</given-names></name></person-group> (<year>2022</year>). <article-title>Simple baselines for image restoration</article-title>. <source>arXiv</source>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2204.04676</pub-id></citation></ref>
<ref id="ref4"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>K.</given-names></name> <name><surname>Pang</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name> <name><surname>Xiong</surname> <given-names>Y.</given-names></name> <name><surname>Li</surname> <given-names>X.</given-names></name> <name><surname>Sun</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>Hybrid task cascade for instance segmentation</article-title>. <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>, pp. <fpage>4974</fpage>&#x2013;<lpage>4983</lpage>.</citation></ref>
<ref id="ref5"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chu</surname> <given-names>X.</given-names></name> <name><surname>Chen</surname> <given-names>L.</given-names></name> <name><surname>Chen</surname> <given-names>C.</given-names></name> <name><surname>Lu</surname> <given-names>X.</given-names></name></person-group> (<year>2021</year>). <article-title>Revisiting global statistics aggregation for improving image restoration</article-title>. <source>arXiv</source></citation></ref>
<ref id="ref6"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chu</surname> <given-names>H. K.</given-names></name> <name><surname>Hsu</surname> <given-names>W. H.</given-names></name> <name><surname>Mitra</surname> <given-names>N. J.</given-names></name> <name><surname>Cohen-Or</surname> <given-names>D.</given-names></name> <name><surname>Wong</surname> <given-names>T. T.</given-names></name> <name><surname>Lee</surname> <given-names>T. Y.</given-names></name></person-group> (<year>2010</year>). <article-title>Camouflage images</article-title>. <source>ACM Trans. Graph.</source> <volume>29</volume>:<fpage>1</fpage>. doi: <pub-id pub-id-type="doi">10.1145/1833351.1778788</pub-id></citation></ref>
<ref id="ref7"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Cott</surname> <given-names>H. B.</given-names></name></person-group> (<year>1940</year>). <source>Adaptive coloration in animals</source>. Methuen, London.</citation></ref>
<ref id="ref8"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Dai</surname> <given-names>T.</given-names></name> <name><surname>Cai</surname> <given-names>J.</given-names></name> <name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Xia</surname> <given-names>S. T.</given-names></name> <name><surname>Zhang</surname> <given-names>L.</given-names></name></person-group> (<year>2019</year>). <article-title>Second-order attention network for single image super-resolution</article-title>. <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>, pp. <fpage>11065</fpage>&#x2013;<lpage>11074</lpage>.</citation></ref>
<ref id="ref9"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Deng</surname> <given-names>J.</given-names></name> <name><surname>Dong</surname> <given-names>W.</given-names></name> <name><surname>Socher</surname> <given-names>R.</given-names></name> <name><surname>Li</surname> <given-names>L. J.</given-names></name> <name><surname>Li</surname> <given-names>K.</given-names></name> <name><surname>Fei-Fei</surname> <given-names>L.</given-names></name></person-group> (<year>2009</year>). <article-title>Imagenet: a large-scale hierarchical image database</article-title>. In <conf-name>2009 IEEE conference on computer vision and pattern recognition, IEEE</conf-name>, pp. <fpage>248</fpage>&#x2013;<lpage>255</lpage>.</citation></ref>
<ref id="ref10"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dong</surname> <given-names>B.</given-names></name> <name><surname>Zhuge</surname> <given-names>M.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name> <name><surname>Bi</surname> <given-names>H.</given-names></name> <name><surname>Chen</surname> <given-names>G.</given-names></name></person-group> (<year>2021</year>). <article-title>Accurate camouflaged object detection via mixture convolution and interactive fusion</article-title>. <source>arXiv</source>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2101.05687</pub-id></citation></ref>
<ref id="ref11"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dosovitskiy</surname> <given-names>A.</given-names></name> <name><surname>Beyer</surname> <given-names>L.</given-names></name> <name><surname>Kolesnikov</surname> <given-names>A.</given-names></name> <name><surname>Weissenborn</surname> <given-names>D.</given-names></name> <name><surname>Zhai</surname> <given-names>X.</given-names></name> <name><surname>Unterthiner</surname> <given-names>T.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>An image is worth 16x16 words:Transformers for image recognition at scale</article-title>. <source>arXiv</source>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2010.11929</pub-id></citation></ref>
<ref id="ref12"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Everingham</surname> <given-names>M.</given-names></name> <name><surname>Eslami</surname> <given-names>S. M.</given-names></name> <name><surname>Van Gool</surname> <given-names>L.</given-names></name> <name><surname>Williams</surname> <given-names>C. K.</given-names></name> <name><surname>Winn</surname> <given-names>J.</given-names></name> <name><surname>Zisserman</surname> <given-names>A.</given-names></name></person-group> (<year>2015</year>). <article-title>The pascal visual object classes challenge: a retrospective</article-title>. <source>Int. J. Comput. Vis.</source> <volume>111</volume>, <fpage>98</fpage>&#x2013;<lpage>136</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11263-014-0733-5</pub-id></citation></ref>
<ref id="ref13"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Fan</surname> <given-names>D. P.</given-names></name> <name><surname>Cheng</surname> <given-names>M. M.</given-names></name> <name><surname>Liu</surname> <given-names>Y.</given-names></name> <name><surname>Li</surname> <given-names>T.</given-names></name> <name><surname>Borji</surname> <given-names>A.</given-names></name></person-group> <article-title>Structure-measure: a new way to evaluate foreground maps</article-title>. <conf-name>Proceedings of the IEEE international conference on computer vision</conf-name>. (<year>2017</year>), pp. <fpage>4548</fpage>&#x2013;<lpage>4557</lpage>.</citation></ref>
<ref id="ref14"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Fan</surname> <given-names>D. P.</given-names></name> <name><surname>Gong</surname> <given-names>C.</given-names></name> <name><surname>Cao</surname> <given-names>Y.</given-names></name> <name><surname>Ren</surname> <given-names>B.</given-names></name> <name><surname>Cheng</surname> <given-names>M. M.</given-names></name> <name><surname>Borji</surname> <given-names>A.</given-names></name></person-group> (<year>2018</year>). <article-title>Enhanced-alignment measure for binary foreground map evaluation</article-title>. <source>arXiv</source>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1805.10421</pub-id></citation></ref>
<ref id="ref15"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Fan</surname> <given-names>D. P.</given-names></name> <name><surname>Ji</surname> <given-names>G. P.</given-names></name> <name><surname>Sun</surname> <given-names>G.</given-names></name> <name><surname>Cheng</surname> <given-names>M. M.</given-names></name> <name><surname>Shen</surname> <given-names>J.</given-names></name> <name><surname>Shao</surname> <given-names>L.</given-names></name></person-group> (<year>2020a</year>). <article-title>Camouflaged object detection</article-title>. In <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>, pp. <fpage>2777</fpage>&#x2013;<lpage>2787</lpage>.</citation></ref>
<ref id="ref16"><citation citation-type="book"><person-group person-group-type="author"><name><surname>Fan</surname> <given-names>D. P.</given-names></name> <name><surname>Ji</surname> <given-names>G. P.</given-names></name> <name><surname>Zhou</surname> <given-names>T.</given-names></name> <name><surname>Chen</surname> <given-names>G.</given-names></name> <name><surname>Fu</surname> <given-names>H.</given-names></name> <name><surname>Shen</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2020b</year>). &#x201C;<article-title>Pranet: Parallel reverse attention network for polyp segmentation</article-title>&#x201D; in <source>International conference on medical image computing and computer-assisted intervention</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>263</fpage>&#x2013;<lpage>273</lpage>.</citation></ref>
<ref id="ref17"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Fan</surname> <given-names>D. P.</given-names></name> <name><surname>Zhou</surname> <given-names>T.</given-names></name> <name><surname>Ji</surname> <given-names>G. P.</given-names></name> <name><surname>Zhou</surname> <given-names>Y.</given-names></name> <name><surname>Chen</surname> <given-names>G.</given-names></name> <name><surname>Fu</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2020c</year>). <article-title>INF-net: automatic COVID-19 lung infection segmentation from ct images</article-title>. <source>IEEE Trans. Med. Imaging</source> <volume>39</volume>, <fpage>2626</fpage>&#x2013;<lpage>2637</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TMI.2020.2996645</pub-id>, PMID: <pub-id pub-id-type="pmid">32730213</pub-id></citation></ref>
<ref id="ref18"><citation citation-type="book"><person-group person-group-type="author"><name><surname>Flannelly</surname> <given-names>K. J.</given-names></name></person-group> (<year>2017</year>). <source>Religious beliefs, evolutionary psychiatry, and mental health in America</source>. <publisher-loc>New York, NY</publisher-loc>: <publisher-name>Springer</publisher-name>.</citation></ref>
<ref id="ref19"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Forrest</surname> <given-names>N.</given-names></name></person-group> (<year>2016</year>). <article-title>SqueezeNet: AlexNet-level accuracy with 50x fewer parameters</article-title>. <source>arXiv</source>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1602.07360</pub-id></citation></ref>
<ref id="ref20"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Galloway</surname> <given-names>J. A.</given-names></name> <name><surname>Green</surname> <given-names>S. D.</given-names></name> <name><surname>Stevens</surname> <given-names>M.</given-names></name> <name><surname>Kelley</surname> <given-names>L. A.</given-names></name></person-group> (<year>1802</year>). <article-title>Finding a signal hidden among noise: how can predators overcome camouflage strategies?</article-title> <source>Philos. Trans. R. Soc. B</source> <volume>2020</volume>:<fpage>20190478</fpage>.</citation></ref>
<ref id="ref21"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Galun</surname> <given-names>M.</given-names></name> <name><surname>Sharon</surname> <given-names>E.</given-names></name> <name><surname>Basri</surname> <given-names>R.</given-names></name> <name><surname>Brandt</surname> <given-names>A.</given-names></name></person-group> (<year>2003</year>). <article-title>Texture segmentation by multiscale aggregation of filter responses and shape elements</article-title>. <source>ICCV</source> <volume>3</volume>:<fpage>716</fpage>. doi: <pub-id pub-id-type="doi">10.1109/ICCV.2003.1238418</pub-id></citation></ref>
<ref id="ref22"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ge</surname> <given-names>S.</given-names></name> <name><surname>Jin</surname> <given-names>X.</given-names></name> <name><surname>Ye</surname> <given-names>Q.</given-names></name> <name><surname>Luo</surname> <given-names>Z.</given-names></name> <name><surname>Li</surname> <given-names>Q.</given-names></name></person-group> (<year>2018</year>). <article-title>Image editing by object-aware optimal boundary searching and mixed-domain composition</article-title>. <source>Comput. Vis. Media</source> <volume>4</volume>, <fpage>71</fpage>&#x2013;<lpage>82</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s41095-017-0102-8</pub-id></citation></ref>
<ref id="ref23"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Girshick</surname> <given-names>R.</given-names></name></person-group> (<year>2015</year>). &#x201C;<article-title>Fast r-cnn</article-title>&#x201D; in <source>Proceedings of the IEEE international conference on computer vision</source>, <fpage>1440</fpage>&#x2013;<lpage>1448</lpage>.</citation></ref>
<ref id="ref24"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Girshick</surname> <given-names>R.</given-names></name> <name><surname>Donahue</surname> <given-names>J.</given-names></name> <name><surname>Darrell</surname> <given-names>T.</given-names></name> <name><surname>Malik</surname> <given-names>J.</given-names></name></person-group> <article-title>Rich feature hierarchies for accurate object detection and semantic segmentation</article-title>. In <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>. (<year>2014</year>), pp. <fpage>580</fpage>&#x2013;<lpage>587</lpage>.</citation></ref>
<ref id="ref25"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gleeson</surname> <given-names>P.</given-names></name> <name><surname>Lung</surname> <given-names>D.</given-names></name> <name><surname>Grosu</surname> <given-names>R.</given-names></name> <name><surname>Hasani</surname> <given-names>R.</given-names></name> <name><surname>Larson</surname> <given-names>S. D.</given-names></name></person-group> (<year>2018</year>). <article-title>c302: a multiscale framework for modelling the nervous system of <italic>Caenorhabditis elegans</italic></article-title>. <source>Philos. Trans. Royal Soc. B</source> <volume>373</volume>:<fpage>20170379</fpage>. doi: <pub-id pub-id-type="doi">10.1098/rstb.2017.0379</pub-id>, PMID: <pub-id pub-id-type="pmid">30201842</pub-id></citation></ref>
<ref id="ref26"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Han</surname> <given-names>Q.</given-names></name> <name><surname>Fan</surname> <given-names>Z.</given-names></name> <name><surname>Dai</surname> <given-names>Q.</given-names></name> <name><surname>Sun</surname> <given-names>L.</given-names></name> <name><surname>Cheng</surname> <given-names>M. M.</given-names></name> <name><surname>Liu</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Demystifying local vision transformer: Sparse connectivity, weight sharing, and dynamic weight</article-title>. <source>arXiv</source> <volume>2</volume>.</citation></ref>
<ref id="ref27"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Gkioxari</surname> <given-names>G.</given-names></name> <name><surname>Doll&#x00E1;r</surname> <given-names>P.</given-names></name> <name><surname>Girshick</surname> <given-names>R.</given-names></name></person-group> (<year>2017</year>). <article-title>Mask R-CNN</article-title>. <conf-name>Proceedings of the IEEE international conference on computer vision</conf-name>, pp. <fpage>2961</fpage>&#x2013;<lpage>2969</lpage>.</citation></ref>
<ref id="ref28"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Sun</surname> <given-names>J.</given-names></name> <name><surname>Tang</surname> <given-names>X.</given-names></name></person-group> (<year>2010</year>). <article-title>Single image haze removal using dark channel prior</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>33</volume>, <fpage>2341</fpage>&#x2013;<lpage>2353</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TPAMI.2010.168</pub-id>, PMID: <pub-id pub-id-type="pmid">20820075</pub-id></citation></ref>
<ref id="ref29"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Ren</surname> <given-names>S.</given-names></name> <name><surname>Sun</surname> <given-names>J.</given-names></name></person-group> (<year>2016</year>). <article-title>Deep residual learning for image recognition</article-title>. <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>, pp. <fpage>770</fpage>&#x2013;<lpage>778</lpage>.</citation></ref>
<ref id="ref30"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hou</surname> <given-names>J. Y. Y. H. W.</given-names></name></person-group> (<year>2011</year>). <article-title>Detection of the mobile object with camouflage color under dynamic background based on optical flow</article-title>. <source>Procedia Eng.</source> <volume>15</volume>, <fpage>2201</fpage>&#x2013;<lpage>2205</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.proeng.2011.08.412</pub-id></citation></ref>
<ref id="ref31"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Hu</surname> <given-names>J.</given-names></name> <name><surname>Shen</surname> <given-names>L.</given-names></name> <name><surname>Sun</surname> <given-names>G.</given-names></name></person-group> (<year>2018</year>). <article-title>Squeeze-and-excitation networks</article-title>. <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>, <fpage>7132</fpage>&#x2013;<lpage>7141</lpage>.</citation></ref>
<ref id="ref32"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>Z.</given-names></name> <name><surname>Huang</surname> <given-names>C.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name></person-group> (<year>2019</year>). <article-title>Mask scoring R-CNN</article-title>. <source>CVPR</source>, <fpage>6409</fpage>&#x2013;<lpage>6418</lpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1903.00241</pub-id></citation></ref>
<ref id="ref33"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>G.</given-names></name> <name><surname>Liu</surname> <given-names>Z.</given-names></name> <name><surname>Van Der Maaten</surname> <given-names>L.</given-names></name> <name><surname>Weinberger</surname> <given-names>K. Q.</given-names></name></person-group> (<year>2017</year>). <article-title>Densely connected convolutional networks</article-title>. <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>, pp. <fpage>4700</fpage>&#x2013;<lpage>4708</lpage>.</citation></ref>
<ref id="ref34"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hussain</surname> <given-names>N.</given-names></name> <name><surname>Khan</surname> <given-names>M. A.</given-names></name> <name><surname>Kadry</surname> <given-names>S.</given-names></name> <name><surname>Tariq</surname> <given-names>U.</given-names></name> <name><surname>Mostafa</surname> <given-names>R. R.</given-names></name> <name><surname>Choi</surname> <given-names>J. I.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Intelligent deep learning and improved whale optimization algorithm based framework for object recognition</article-title>. <source>Hum. Cent. Comput. Inf. Sci</source> <volume>11</volume>:<fpage>2021</fpage>. doi: <pub-id pub-id-type="doi">10.22967/HCIS.2021.11.034</pub-id></citation></ref>
<ref id="ref35"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ismail Fawaz</surname> <given-names>H.</given-names></name> <name><surname>Forestier</surname> <given-names>G.</given-names></name> <name><surname>Weber</surname> <given-names>J.</given-names></name> <name><surname>Idoumghar</surname> <given-names>L.</given-names></name> <name><surname>Muller</surname> <given-names>P. A.</given-names></name></person-group> (<year>2019</year>). <article-title>Deep learning for time series classification: a review</article-title>. <source>Data Min. Knowl. Disc.</source> <volume>33</volume>, <fpage>917</fpage>&#x2013;<lpage>963</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s10618-019-00619-1</pub-id></citation></ref>
<ref id="ref36"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ji</surname> <given-names>G. P.</given-names></name> <name><surname>Zhu</surname> <given-names>L.</given-names></name> <name><surname>Zhuge</surname> <given-names>M.</given-names></name> <name><surname>Fu</surname> <given-names>K.</given-names></name></person-group> (<year>2022</year>). <article-title>Fast camouflaged object detection via edge-based reversible re-calibration network</article-title>. <source>Pattern Recogn.</source> <volume>123</volume>:<fpage>108414</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.patcog.2021.108414</pub-id></citation></ref>
<ref id="ref38"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Kirillov</surname> <given-names>A.</given-names></name> <name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Girshick</surname> <given-names>R.</given-names></name> <name><surname>Rother</surname> <given-names>C.</given-names></name> <name><surname>Doll&#x00E1;r</surname> <given-names>P.</given-names></name></person-group> (<year>2019</year>). <article-title>Panoptic segmentation</article-title>. <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>, pp. <fpage>9404</fpage>&#x2013;<lpage>9413</lpage>.</citation></ref>
<ref id="ref39"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Lamdouar</surname> <given-names>H.</given-names></name> <name><surname>Yang</surname> <given-names>C.</given-names></name> <name><surname>Xie</surname> <given-names>W.</given-names></name> <name><surname>Zisserman</surname> <given-names>A.</given-names></name></person-group> (<year>2020</year>). <article-title>Betrayed by motion: Camouflaged object discovery via motion segmentation</article-title>. <conf-name>Proceedings of the Asian Conference on Computer Vision</conf-name>.</citation></ref>
<ref id="ref40"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Langley</surname> <given-names>C. M.</given-names></name> <name><surname>Riley</surname> <given-names>D. A.</given-names></name> <name><surname>Bond</surname> <given-names>A. B.</given-names></name> <name><surname>Goel</surname> <given-names>N.</given-names></name></person-group> (<year>1996</year>). <article-title>Visual search for natural grains in pigeons (<italic>Columba livia</italic>): search images and selective attention</article-title>. <source>J. Exp. Psychol. Anim. Behav. Process.</source> <volume>22</volume>, <fpage>139</fpage>&#x2013;<lpage>151</lpage>. PMID: <pub-id pub-id-type="pmid">8618099</pub-id></citation></ref>
<ref id="ref41"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Le</surname> <given-names>T. N.</given-names></name> <name><surname>Cao</surname> <given-names>Y.</given-names></name> <name><surname>Nguyen</surname> <given-names>T. C.</given-names></name> <name><surname>Le</surname> <given-names>M. Q.</given-names></name> <name><surname>Nguyen</surname> <given-names>K. D.</given-names></name> <name><surname>Do</surname> <given-names>T. T.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Camouflaged instance segmentation in-the-wild: dataset, method, and benchmark suite</article-title>. <source>IEEE Trans. Image Process.</source> <volume>31</volume>, <fpage>287</fpage>&#x2013;<lpage>300</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TIP.2021.3130490</pub-id></citation></ref>
<ref id="ref42"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Le</surname> <given-names>T. N.</given-names></name> <name><surname>Nguyen</surname> <given-names>T. V.</given-names></name> <name><surname>Nie</surname> <given-names>Z.</given-names></name> <name><surname>Tran</surname> <given-names>M. T.</given-names></name> <name><surname>Sugimoto</surname> <given-names>A.</given-names></name></person-group> (<year>2019</year>). <article-title>Anabranch network for camouflaged object segmentation</article-title>. <source>Comput. Vis. Image Underst.</source> <volume>184</volume>, <fpage>45</fpage>&#x2013;<lpage>56</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.cviu.2019.04.006</pub-id></citation></ref>
<ref id="ref43"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Le</surname> <given-names>T. N.</given-names></name> <name><surname>Ono</surname> <given-names>S.</given-names></name> <name><surname>Sugimoto</surname> <given-names>A.</given-names></name> <name><surname>Kawasaki</surname> <given-names>H.</given-names></name></person-group> (<year>2020</year>). <article-title>Attention R-CNN for accident detection</article-title>. In <conf-name>2020 IEEE intelligent vehicles symposium (IV)</conf-name>, pp. <fpage>313</fpage>&#x2013;<lpage>320</lpage>.</citation></ref>
<ref id="ref44"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Lim</surname> <given-names>B.</given-names></name> <name><surname>Son</surname> <given-names>S.</given-names></name> <name><surname>Kim</surname> <given-names>H.</given-names></name> <name><surname>Nah</surname> <given-names>S.</given-names></name> <name><surname>Lee</surname> <given-names>M.</given-names></name></person-group>, (<year>2017</year>). <article-title>Enhanced deep residual networks for single image super-resolution</article-title>. <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition workshops</conf-name>, pp. <fpage>136</fpage>&#x2013;<lpage>144</lpage>.</citation></ref>
<ref id="ref45"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lin</surname> <given-names>M.</given-names></name> <name><surname>Chen</surname> <given-names>Q.</given-names></name> <name><surname>Yan</surname> <given-names>S.</given-names></name></person-group> (<year>2014</year>). <article-title>Network in Network</article-title>. <source>arXiv</source>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1312.4400</pub-id></citation></ref>
<ref id="ref46"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Lin</surname> <given-names>T. Y.</given-names></name> <name><surname>Doll&#x00E1;r</surname> <given-names>P.</given-names></name> <name><surname>Girshick</surname> <given-names>R.</given-names></name> <name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Hariharan</surname> <given-names>B.</given-names></name> <name><surname>Belongie</surname> <given-names>S.</given-names></name></person-group> (<year>2017</year>). <article-title>Feature pyramid networks for object detection</article-title>. <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>, pp. <fpage>2117</fpage>&#x2013;<lpage>2125</lpage>.</citation></ref>
<ref id="ref47"><citation citation-type="book"><person-group person-group-type="author"><name><surname>Lin</surname> <given-names>T. Y.</given-names></name> <name><surname>Maire</surname> <given-names>M.</given-names></name> <name><surname>Belongie</surname> <given-names>S.</given-names></name> <name><surname>Hays</surname> <given-names>J.</given-names></name> <name><surname>Perona</surname> <given-names>P.</given-names></name> <name><surname>Ramanan</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2014</year>). &#x201C;<article-title>Microsoft coco: Common objects in context</article-title>&#x201D; in <source>European conference on computer vision</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>740</fpage>&#x2013;<lpage>755</lpage>.</citation></ref>
<ref id="ref48"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>N.</given-names></name> <name><surname>Han</surname> <given-names>J.</given-names></name> <name><surname>Yang</surname> <given-names>M. H.</given-names></name></person-group> (<year>2018</year>). <article-title>Picanet: learning pixel-wise contextual attention for saliency detection</article-title>. <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>, pp. <fpage>3089</fpage>&#x2013;<lpage>3098</lpage>.</citation></ref>
<ref id="ref49"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>S.</given-names></name> <name><surname>Huang</surname> <given-names>D.</given-names></name></person-group> (<year>2018</year>). <article-title>Receptive field block net for accurate and fast object detection</article-title>. In <conf-name>Proceedings of the European conference on computer vision</conf-name>, pp. <fpage>385</fpage>&#x2013;<lpage>400</lpage>.</citation></ref>
<ref id="ref50"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>Z.</given-names></name> <name><surname>Huang</surname> <given-names>K.</given-names></name> <name><surname>Tan</surname> <given-names>T.</given-names></name></person-group> (<year>2012</year>). <article-title>Foreground object detection using top-down information based on EM framework</article-title>. <source>IEEE Trans. Image Process.</source> <volume>21</volume>, <fpage>4204</fpage>&#x2013;<lpage>4217</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TIP.2012.2200492</pub-id>, PMID: <pub-id pub-id-type="pmid">22645266</pub-id></citation></ref>
<ref id="ref51"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>Z.</given-names></name> <name><surname>Mao</surname> <given-names>H.</given-names></name> <name><surname>Wu</surname> <given-names>C. Y.</given-names></name> <name><surname>Feichtenhofer</surname> <given-names>C.</given-names></name> <name><surname>Darrell</surname> <given-names>T.</given-names></name> <name><surname>Xie</surname> <given-names>S.</given-names></name></person-group> (<year>2022</year>). <article-title>A convnet for the 2020s</article-title>. In <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>, pp. <fpage>11976</fpage>&#x2013;<lpage>11986</lpage>.</citation></ref>
<ref id="ref52"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>C.</given-names></name> <name><surname>Yuen</surname> <given-names>J.</given-names></name> <name><surname>Torralba</surname> <given-names>A.</given-names></name></person-group> (<year>2010</year>). <article-title>Sift flow: dense correspondence across scenes and its applications</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>33</volume>, <fpage>978</fpage>&#x2013;<lpage>994</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TPAMI.2010.147</pub-id></citation></ref>
<ref id="ref54"><citation citation-type="other"><person-group person-group-type="author"><collab id="coll1">Machine Learning Mastery</collab></person-group>. (<year>2019</year>) A Gentle Introduction to Pooling Layers for Convolutional Neural Networks. Available at: <ext-link xlink:href="https://machinelearningmastery.com/crash-course-convolutional-neural-networks/" ext-link-type="uri">https://machinelearningmastery.com/crash-course-convolutional-neural-networks/</ext-link></citation></ref>
<ref id="ref55"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Margolin</surname> <given-names>R.</given-names></name> <name><surname>Zelnik-Manor</surname> <given-names>L.</given-names></name> <name><surname>Tal</surname> <given-names>A.</given-names></name></person-group> (<year>2014</year>). <article-title>How to evaluate foreground maps?</article-title>. <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>, pp. <fpage>248</fpage>&#x2013;<lpage>255</lpage>.</citation></ref>
<ref id="ref56"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Mei</surname> <given-names>H.</given-names></name> <name><surname>Ji</surname> <given-names>G. P.</given-names></name> <name><surname>Wei</surname> <given-names>Z.</given-names></name> <name><surname>Yang</surname> <given-names>X.</given-names></name> <name><surname>Wei</surname> <given-names>X.</given-names></name> <name><surname>Fan</surname> <given-names>D. P.</given-names></name></person-group> (<year>2021</year>). <article-title>Camouflaged object segmentation with distraction mining</article-title>. In <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>, pp. <fpage>8772</fpage>&#x2013;<lpage>8781</lpage>.</citation></ref>
<ref id="ref57"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Merilaita</surname> <given-names>S.</given-names></name> <name><surname>Scott-Samuel</surname> <given-names>N. E.</given-names></name> <name><surname>Cuthill</surname> <given-names>I. C.</given-names></name></person-group> (<year>2017</year>). <article-title>How camouflage works</article-title>. <source>Philos. Trans. Royal Soc. B</source> <volume>372</volume>:<fpage>20160341</fpage>. doi: <pub-id pub-id-type="doi">10.1098/rstb.2016.0341</pub-id>, PMID: <pub-id pub-id-type="pmid">28533458</pub-id></citation></ref>
<ref id="ref58"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Nah</surname> <given-names>S.</given-names></name> <name><surname>Hyun Kim</surname> <given-names>T.</given-names></name> <name><surname>Lee</surname> <given-names>M.</given-names></name></person-group>. (<year>2017</year>). <article-title>Deep multi-scale convolutional neural network for dynamic scene deblurring</article-title>. In <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>, pp. <fpage>3883</fpage>&#x2013;<lpage>3891</lpage>.</citation></ref>
<ref id="ref59"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Naqvi</surname> <given-names>S. M. A.</given-names></name> <name><surname>Shabaz</surname> <given-names>M.</given-names></name> <name><surname>Khan</surname> <given-names>M. A.</given-names></name> <name><surname>Hassan</surname> <given-names>S. I.</given-names></name></person-group> (<year>2023</year>). <article-title>Adversarial attacks on visual objects using the fast gradient sign method</article-title>. <source>J Grid Comput</source> <volume>21</volume>:<fpage>52</fpage>. doi: <pub-id pub-id-type="doi">10.1007/s10723-023-09684-9</pub-id></citation></ref>
<ref id="ref60"><citation citation-type="book"><person-group person-group-type="author"><name><surname>Niu</surname> <given-names>B.</given-names></name> <name><surname>Wen</surname> <given-names>W.</given-names></name> <name><surname>Ren</surname> <given-names>W.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Yang</surname> <given-names>L.</given-names></name> <name><surname>Wang</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2020</year>). &#x201C;<article-title>Single image super-resolution via a holistic attention network</article-title>&#x201D; in <source>European conference on computer vision</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>191</fpage>&#x2013;<lpage>207</lpage>.</citation></ref>
<ref id="ref61"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pan</surname> <given-names>Y.</given-names></name> <name><surname>Chen</surname> <given-names>Y.</given-names></name> <name><surname>Fu</surname> <given-names>Q.</given-names></name> <name><surname>Zhang</surname> <given-names>P.</given-names></name> <name><surname>Xu</surname> <given-names>X.</given-names></name></person-group> (<year>2011</year>). <article-title>Study on the camouflaged target detection method based on 3D convexity</article-title>. <source>Mod. Appl. Sci.</source> <volume>5</volume>:<fpage>152</fpage>. doi: <pub-id pub-id-type="doi">10.5539/mas.v5n4p152</pub-id></citation></ref>
<ref id="ref62"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Qadeer</surname> <given-names>N.</given-names></name> <name><surname>Shah</surname> <given-names>J. H.</given-names></name> <name><surname>Sharif</surname> <given-names>M.</given-names></name> <name><surname>Khan</surname> <given-names>M. A.</given-names></name> <name><surname>Muhammad</surname> <given-names>G.</given-names></name> <name><surname>Zhang</surname> <given-names>Y. D.</given-names></name></person-group> (<year>2022</year>). <article-title>Intelligent tracking of mechanically thrown objects by industrial catching robot for automated in-plant logistics 4.0</article-title>. <source>Sensors</source> <volume>22</volume>:<fpage>2113</fpage>. doi: <pub-id pub-id-type="doi">10.3390/s22062113</pub-id>, PMID: <pub-id pub-id-type="pmid">35336292</pub-id></citation></ref>
<ref id="ref63"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Qin</surname> <given-names>X.</given-names></name> <name><surname>Wang</surname> <given-names>Z.</given-names></name> <name><surname>Bai</surname> <given-names>Y.</given-names></name> <name><surname>Xie</surname> <given-names>X.</given-names></name> <name><surname>Jia</surname> <given-names>H.</given-names></name></person-group> (<year>2020</year>). <article-title>FFA-net: feature fusion attention network for single image dehazing</article-title>. <source>Proc. AAAI Conf. Artif. Intel.</source> <volume>34</volume>, <fpage>11908</fpage>&#x2013;<lpage>11915</lpage>. doi: <pub-id pub-id-type="doi">10.1609/aaai.v34i07.6865</pub-id></citation></ref>
<ref id="ref64"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Qin</surname> <given-names>X.</given-names></name> <name><surname>Zhang</surname> <given-names>Z.</given-names></name> <name><surname>Huang</surname> <given-names>C.</given-names></name> <name><surname>Gao</surname> <given-names>C.</given-names></name> <name><surname>Dehghan</surname> <given-names>M.</given-names></name> <name><surname>Jagersand</surname> <given-names>M.</given-names></name></person-group> (<year>2019</year>). <article-title>Basnet: boundary-aware salient object detection</article-title>. <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>, pp. <fpage>7479</fpage>&#x2013;<lpage>7489</lpage>.</citation></ref>
<ref id="ref65"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ren</surname> <given-names>S.</given-names></name> <name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Girshick</surname> <given-names>R.</given-names></name> <name><surname>Sun</surname> <given-names>J.</given-names></name></person-group> (<year>2015</year>). <article-title>Faster R-CNN: towards real-time object detection with region proposal networks</article-title>. <source>Adv. Neural Inf. Proces. Syst.</source> <volume>28</volume>, <fpage>1137</fpage>&#x2013;<lpage>1149</lpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1506.01497</pub-id></citation></ref>
<ref id="ref66"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ren</surname> <given-names>S.</given-names></name> <name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Girshick</surname> <given-names>R.</given-names></name> <name><surname>Sun</surname> <given-names>J.</given-names></name></person-group> (<year>2017</year>). <article-title>Faster R-CNN: towards real-time object detection with region proposal networks</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>39</volume>, <fpage>1137</fpage>&#x2013;<lpage>1149</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TPAMI.2016.2577031</pub-id>, PMID: <pub-id pub-id-type="pmid">27295650</pub-id></citation></ref>
<ref id="ref67"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rida</surname> <given-names>I.</given-names></name> <name><surname>Al-Maadeed</surname> <given-names>N.</given-names></name> <name><surname>Al-Maadeed</surname> <given-names>S.</given-names></name> <name><surname>Bakshi</surname> <given-names>S.</given-names></name></person-group> (<year>2020</year>). <article-title>A comprehensive overview of feature representation for biometric recognition</article-title>. <source>Multimed. Tools Appl.</source> <volume>79</volume>, <fpage>4867</fpage>&#x2013;<lpage>4890</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11042-018-6808-5</pub-id></citation></ref>
<ref id="ref68"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Riley</surname> <given-names>D. A.</given-names></name> <name><surname>Roitblat</surname> <given-names>H. L.</given-names></name></person-group> (<year>2018</year>). <article-title>Selective attention and related cognitive processes in pigeons</article-title>. <source>Cogn. Proces. Anim. Behav.</source>, <fpage>249</fpage>&#x2013;<lpage>276</lpage>. doi: <pub-id pub-id-type="doi">10.4324/9780203710029-9</pub-id></citation></ref>
<ref id="ref69"><citation citation-type="book"><person-group person-group-type="author"><name><surname>Ronneberger</surname> <given-names>O.</given-names></name> <name><surname>Fischer</surname> <given-names>P.</given-names></name> <name><surname>Brox</surname> <given-names>T.</given-names></name></person-group> (<year>2015</year>). &#x201C;<article-title>U-net: convolutional networks for biomedical image segmentation</article-title>&#x201D; in <source>International conference on medical image computing and computer-assisted intervention</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>234</fpage>&#x2013;<lpage>241</lpage>.</citation></ref>
<ref id="ref70"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Sengottuvelan</surname> <given-names>P.</given-names></name> <name><surname>Wahi</surname> <given-names>A.</given-names></name> <name><surname>Shanmugam</surname> <given-names>A.</given-names></name></person-group> (<year>2008</year>). <article-title>Performance of decamouflaging through exploratory image analysis</article-title>. <conf-name>2008 first international conference on emerging trends in engineering and technology</conf-name>, pp. <fpage>6</fpage>&#x2013;<lpage>10</lpage>.</citation></ref>
<ref id="ref71"><citation citation-type="book"><person-group person-group-type="author"><name><surname>Shotton</surname> <given-names>J.</given-names></name> <name><surname>Winn</surname> <given-names>J.</given-names></name> <name><surname>Rother</surname> <given-names>C.</given-names></name> <name><surname>Criminisi</surname> <given-names>A.</given-names></name></person-group> (<year>2006</year>). &#x201C;<article-title>Textonboost: joint appearance, shape and context modeling for multi-class object recognition and segmentation</article-title>&#x201D; in <source>European conference on computer vision</source> (<publisher-loc>Berlin</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>15</lpage>.</citation></ref>
<ref id="ref72"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Skurowski</surname> <given-names>P.</given-names></name> <name><surname>Abdulameer</surname> <given-names>H.</given-names></name> <name><surname>B&#x0142;aszczyk</surname> <given-names>J.</given-names></name> <name><surname>Depta</surname> <given-names>T.</given-names></name> <name><surname>Kornacki</surname> <given-names>A.</given-names></name> <name><surname>Kozie&#x0142;</surname> <given-names>P.</given-names></name></person-group> (<year>2018</year>), Animal camouflage analysis: CHAMELEON database. <italic>Unpublished manuscript</italic>, 2, p. 7.</citation></ref>
<ref id="ref73"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Song</surname> <given-names>L.</given-names></name> <name><surname>Geng</surname> <given-names>W.</given-names></name></person-group> (<year>2010</year>). <article-title>A new camouflage texture evaluation method based on WSSIM and nature image features</article-title>. <conf-name>2010 international conference on multimedia technology</conf-name>, pp. <fpage>1</fpage>&#x2013;<lpage>4</lpage>.</citation></ref>
<ref id="ref74"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Stevens</surname> <given-names>M.</given-names></name> <name><surname>Merilaita</surname> <given-names>S.</given-names></name></person-group> (<year>2009</year>). <article-title>Animal camouflage: current issues and new perspectives</article-title>. <source>Philos. Trans. Royal Soc. B</source> <volume>364</volume>, <fpage>423</fpage>&#x2013;<lpage>427</lpage>. doi: <pub-id pub-id-type="doi">10.1098/rstb.2008.0217</pub-id>, PMID: <pub-id pub-id-type="pmid">18990674</pub-id></citation></ref>
<ref id="ref75"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Stevens</surname> <given-names>M.</given-names></name> <name><surname>Ruxton</surname> <given-names>G. D.</given-names></name></person-group> (<year>2019</year>). <article-title>The key role of behaviour in animal camouflage</article-title>. <source>Biol. Rev.</source> <volume>94</volume>, <fpage>116</fpage>&#x2013;<lpage>134</lpage>. doi: <pub-id pub-id-type="doi">10.1111/brv.12438</pub-id>, PMID: <pub-id pub-id-type="pmid">29927061</pub-id></citation></ref>
<ref id="ref76"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Talas</surname> <given-names>L.</given-names></name> <name><surname>Baddeley</surname> <given-names>R. J.</given-names></name> <name><surname>Cuthill</surname> <given-names>I. C.</given-names></name></person-group> (<year>2017</year>). <article-title>Cultural evolution of military camouflage</article-title>. <source>Philos. Trans. Royal Soc. B</source> <volume>372</volume>:<fpage>20160351</fpage>. doi: <pub-id pub-id-type="doi">10.1177/10482911211032971</pub-id>, PMID: <pub-id pub-id-type="pmid">28533466</pub-id></citation></ref>
<ref id="ref77"><citation citation-type="book"><person-group person-group-type="author"><name><surname>Thayer</surname> <given-names>G. H.</given-names></name></person-group> (<year>1918</year>). <source>Concealing-coloration in the animal kingdom: An exposition of the laws of disguise through color and pattern</source>. <publisher-loc>New York</publisher-loc>: <publisher-name>Macmillan Company</publisher-name>.</citation></ref>
<ref id="ref78"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Ulyanov</surname> <given-names>D.</given-names></name> <name><surname>Vedaldi</surname> <given-names>A.</given-names></name> <name><surname>Lempitsky</surname> <given-names>V.</given-names></name></person-group> (<year>2018</year>). <article-title>Deep image prior</article-title>. <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name> (pp. <fpage>9446</fpage>&#x2013;<lpage>9454</lpage>).</citation></ref>
<ref id="ref79"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Vaswani</surname> <given-names>A.</given-names></name> <name><surname>Shazeer</surname> <given-names>N.</given-names></name> <name><surname>Parmar</surname> <given-names>N.</given-names></name> <name><surname>Uszkoreit</surname> <given-names>J.</given-names></name> <name><surname>Jones</surname> <given-names>L.</given-names></name> <name><surname>Gomez</surname> <given-names>A. N.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>Attention is all you need</article-title>. <source>Adv. Neural Inf. Proces. Syst.</source> <volume>30</volume>, <fpage>5998</fpage>&#x2013;<lpage>6008</lpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1706.03762</pub-id></citation></ref>
<ref id="ref80"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>T.</given-names></name> <name><surname>Borji</surname> <given-names>A.</given-names></name> <name><surname>Zhang</surname> <given-names>L.</given-names></name> <name><surname>Zhang</surname> <given-names>P.</given-names></name> <name><surname>Lu</surname> <given-names>H.</given-names></name></person-group> (<year>2017</year>). <article-title>A stagewise refinement model for detecting salient objects in images</article-title>. In <conf-name>Proceedings of the IEEE international conference on computer vision</conf-name>, pp. <fpage>4019</fpage>&#x2013;<lpage>4028</lpage>.</citation></ref>
<ref id="ref81"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>Z.</given-names></name> <name><surname>Cun</surname> <given-names>X.</given-names></name> <name><surname>Bao</surname> <given-names>J.</given-names></name> <name><surname>Zhou</surname> <given-names>W.</given-names></name> <name><surname>Liu</surname> <given-names>J.</given-names></name> <name><surname>Li</surname> <given-names>H.</given-names></name></person-group> (<year>2022</year>). <article-title>Uformer: a general u-shaped transformer for image restoration</article-title>. In <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>, pp. <fpage>17683</fpage>&#x2013;<lpage>17693</lpage>.</citation></ref>
<ref id="ref82"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Girshick</surname> <given-names>R.</given-names></name> <name><surname>Gupta</surname> <given-names>A.</given-names></name> <name><surname>He</surname> <given-names>K.</given-names></name></person-group> (<year>2018</year>). <article-title>Non-local neural networks</article-title>. <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>, pp. <fpage>7794</fpage>&#x2013;<lpage>7803</lpage>.</citation></ref>
<ref id="ref83"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Waqas Zamir</surname> <given-names>S.</given-names></name> <name><surname>Arora</surname> <given-names>A.</given-names></name> <name><surname>Khan</surname> <given-names>S.</given-names></name> <name><surname>Hayat</surname> <given-names>M.</given-names></name> <name><surname>Shahbaz Khan</surname> <given-names>F.</given-names></name> <name><surname>Yang</surname> <given-names>M. H.</given-names></name></person-group> (<year>2021</year>). <article-title>Restormer: efficient transformer for high-resolution image restoration</article-title>. <source>arXiv</source>:<fpage>2111</fpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2111.09881</pub-id></citation></ref>
<ref id="ref84"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Woo</surname> <given-names>S.</given-names></name> <name><surname>Park</surname> <given-names>J.</given-names></name> <name><surname>Lee</surname> <given-names>J. Y.</given-names></name> <name><surname>Kweon</surname> <given-names>I. S.</given-names></name></person-group> (<year>2018</year>). <article-title>CBAM: convolutional block attention module</article-title>. <conf-name>Proceedings of the European conference on computer vision (ECCV)</conf-name>, pp. <fpage>3</fpage>&#x2013;<lpage>19</lpage>.</citation></ref>
<ref id="ref85"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>Y. H.</given-names></name> <name><surname>Gao</surname> <given-names>S. H.</given-names></name> <name><surname>Mei</surname> <given-names>J.</given-names></name> <name><surname>Xu</surname> <given-names>J.</given-names></name> <name><surname>Fan</surname> <given-names>D. P.</given-names></name> <name><surname>Zhang</surname> <given-names>R. G.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Jcs: an explainable COVID-19 diagnosis system by joint classification and segmentation</article-title>. <source>IEEE Trans. Image Process.</source> <volume>30</volume>, <fpage>3113</fpage>&#x2013;<lpage>3126</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TIP.2021.3058783</pub-id>, PMID: <pub-id pub-id-type="pmid">33600316</pub-id></citation></ref>
<ref id="ref86"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>Z.</given-names></name> <name><surname>Su</surname> <given-names>L.</given-names></name> <name><surname>Huang</surname> <given-names>Q.</given-names></name></person-group> (<year>2019</year>). <article-title>Cascaded partial decoder for fast and accurate salient object detection</article-title>. In <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>, pp. <fpage>3907</fpage>&#x2013;<lpage>3916</lpage>.</citation></ref>
<ref id="ref87"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>K.</given-names></name> <name><surname>Ba</surname> <given-names>J.</given-names></name> <name><surname>Kiros</surname> <given-names>R.</given-names></name> <name><surname>Cho</surname> <given-names>K.</given-names></name> <name><surname>Courville</surname> <given-names>A.</given-names></name> <name><surname>Salakhudinov</surname> <given-names>R.</given-names></name> <etal/></person-group>. (<year>2015</year>). <article-title>Show, attend and tell: Neural image caption generation with visual attention</article-title>. <conf-name>International conference on machine learning</conf-name>, pp. <fpage>2048</fpage>&#x2013;<lpage>2057</lpage></citation></ref>
<ref id="ref88"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Xue</surname> <given-names>F.</given-names></name> <name><surname>Yong</surname> <given-names>C.</given-names></name> <name><surname>Xu</surname> <given-names>S.</given-names></name> <name><surname>Dong</surname> <given-names>H.</given-names></name> <name><surname>Luo</surname> <given-names>Y.</given-names></name> <name><surname>Jia</surname> <given-names>W.</given-names></name></person-group> (<year>2016</year>). <article-title>Camouflage performance analysis and evaluation framework based on features fusion</article-title>. <source>Multimed. Tools Appl.</source> <volume>75</volume>, <fpage>4065</fpage>&#x2013;<lpage>4082</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11042-015-2946-1</pub-id></citation></ref>
<ref id="ref89"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Zamir</surname> <given-names>S. W.</given-names></name> <name><surname>Arora</surname> <given-names>A.</given-names></name> <name><surname>Khan</surname> <given-names>S.</given-names></name> <name><surname>Hayat</surname> <given-names>M.</given-names></name> <name><surname>Khan</surname> <given-names>F. S.</given-names></name> <name><surname>Yang</surname> <given-names>M. H.</given-names></name></person-group> (<year>2022</year>). <article-title>Restormer: efficient transformer for high-resolution image restoration</article-title>. In <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>, pp. <fpage>5728</fpage>&#x2013;<lpage>5739</lpage>.</citation></ref>
<ref id="ref90"><citation citation-type="book"><person-group person-group-type="author"><name><surname>Zamir</surname> <given-names>S. W.</given-names></name> <name><surname>Arora</surname> <given-names>A.</given-names></name> <name><surname>Khan</surname> <given-names>S.</given-names></name> <name><surname>Hayat</surname> <given-names>M.</given-names></name> <name><surname>Khan</surname> <given-names>F. S.</given-names></name> <name><surname>Yang</surname> <given-names>M. H.</given-names></name> <etal/></person-group>. (<year>2020</year>). &#x201C;<article-title>Learning enriched features for real image restoration and enhancement</article-title>&#x201D; in <source>European conference on computer vision</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>492</fpage>&#x2013;<lpage>511</lpage>.</citation></ref>
<ref id="ref91"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Li</surname> <given-names>K.</given-names></name> <name><surname>Li</surname> <given-names>K.</given-names></name> <name><surname>Wang</surname> <given-names>L.</given-names></name> <name><surname>Zhong</surname> <given-names>B.</given-names></name> <name><surname>Fu</surname> <given-names>Y.</given-names></name></person-group> (<year>2018</year>). <article-title>Image super-resolution using very deep residual channel attention networks</article-title>. In <conf-name>Proceedings of the European conference on computer vision (ECCV)</conf-name>, pp. <fpage>286</fpage>&#x2013;<lpage>301</lpage>.</citation></ref>
<ref id="ref92"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Zhao</surname> <given-names>J. X.</given-names></name> <name><surname>Liu</surname> <given-names>J. J.</given-names></name> <name><surname>Fan</surname> <given-names>D. P.</given-names></name> <name><surname>Cao</surname> <given-names>Y.</given-names></name> <name><surname>Yang</surname> <given-names>J.</given-names></name> <name><surname>Cheng</surname> <given-names>M. M.</given-names></name></person-group> (<year>2019</year>). <article-title>EGNet: edge guidance network for salient object detection</article-title>. <conf-name>Proceedings of the IEEE/CVF international conference on computer vision</conf-name>, pp. <fpage>8779</fpage>&#x2013;<lpage>8788</lpage>.</citation></ref>
<ref id="ref93"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Zhao</surname> <given-names>H.</given-names></name> <name><surname>Shi</surname> <given-names>J.</given-names></name> <name><surname>Qi</surname> <given-names>X.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Jia</surname> <given-names>J.</given-names></name></person-group> (<year>2017</year>). <article-title>Pyramid scene parsing network</article-title>. <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>, pp. <fpage>2881</fpage>&#x2013;<lpage>2890</lpage>.</citation></ref>
<ref id="ref94"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Zhao</surname> <given-names>T.</given-names></name> <name><surname>Wu</surname> <given-names>X.</given-names></name></person-group> (<year>2019</year>). <article-title>Pyramid feature attention network for saliency detection</article-title>. In <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>, pp. <fpage>3085</fpage>&#x2013;<lpage>3094</lpage>.</citation></ref>
<ref id="ref95"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhao</surname> <given-names>Z. Q.</given-names></name> <name><surname>Zheng</surname> <given-names>P.</given-names></name> <name><surname>Xu</surname> <given-names>S. T.</given-names></name> <name><surname>Wu</surname> <given-names>X.</given-names></name></person-group> (<year>2019</year>). <article-title>Object detection with deep learning: a review</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst.</source> <volume>30</volume>, <fpage>3212</fpage>&#x2013;<lpage>3232</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TNNLS.2018.2876865</pub-id></citation></ref>
<ref id="ref96"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zheng</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Wang</surname> <given-names>F.</given-names></name> <name><surname>Cao</surname> <given-names>T.</given-names></name> <name><surname>Sun</surname> <given-names>M.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name></person-group> (<year>2018</year>). <article-title>Detection of people with camouflage pattern via dense deconvolution network</article-title>. <source>IEEE Signal Proces. Lett.</source> <volume>26</volume>, <fpage>29</fpage>&#x2013;<lpage>33</lpage>. doi: <pub-id pub-id-type="doi">10.1109/LSP.2018.2825959</pub-id></citation></ref>
<ref id="ref97"><citation citation-type="book"><person-group person-group-type="author"><name><surname>Zhou</surname> <given-names>Z.</given-names></name> <name><surname>Rahman Siddiquee</surname> <given-names>M. M.</given-names></name> <name><surname>Tajbakhsh</surname> <given-names>N.</given-names></name> <name><surname>Liang</surname> <given-names>J.</given-names></name></person-group> (<year>2018</year>). &#x201C;<article-title>UNET++: A nested U-Net architecture for medical image segmentation</article-title>&#x201D; in <source>Deep learning in medical image analysis and multimodal learning for clinical decision support</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>3</fpage>&#x2013;<lpage>11</lpage>.</citation></ref>
<ref id="ref98"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Zhu</surname> <given-names>J.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Zhang</surname> <given-names>S.</given-names></name> <name><surname>Liu</surname> <given-names>J.</given-names></name></person-group> (<year>2021</year>). <article-title>Inferring camouflaged objects by texture-aware interactive guidance network</article-title>. <conf-name>Proceedings of the AAAI Conference on Artificial Intelligence</conf-name>, <volume>35</volume>, pp. <fpage>3599</fpage>&#x2013;<lpage>3607</lpage>.</citation></ref>
</ref-list>
</back>
</article>