<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Med.</journal-id>
<journal-title>Frontiers in Medicine</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Med.</abbrev-journal-title>
<issn pub-type="epub">2296-858X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmed.2024.1418048</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Medicine</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Enhancing the ophthalmic AI assessment with a fundus image quality classifier using local and global attention mechanisms</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>Wang</surname> <given-names>Shengzhan</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2614018/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Shen</surname> <given-names>Wenyue</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1687234/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Gao</surname> <given-names>Zhiyuan</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Jiang</surname> <given-names>Xiaoyu</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1892951/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Wang</surname> <given-names>Yaqi</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1972258/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Li</surname> <given-names>Yunxiang</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Ma</surname> <given-names>Xiaoyu</given-names></name>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Wang</surname> <given-names>Wenhao</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Xin</surname> <given-names>Shuanghua</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Ren</surname> <given-names>Weina</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2614096/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Jin</surname> <given-names>Kai</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c002"><sup>&#x002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1283614/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Ye</surname> <given-names>Juan</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c003"><sup>&#x002A;</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>The Affiliated People&#x2019;s Hospital of Ningbo University, Ningbo</institution>, <addr-line>Zhejiang</addr-line>, <country>China</country></aff>
<aff id="aff2"><sup>2</sup><institution>Eye Center, School of Medicine, The Second Affiliated Hospital, Zhejiang University, Hangzhou</institution>, <addr-line>Zhejiang</addr-line>, <country>China</country></aff>
<aff id="aff3"><sup>3</sup><institution>College of Control Science and Engineering, Zhejiang University</institution>, <addr-line>Hangzhou</addr-line>, <country>China</country></aff>
<aff id="aff4"><sup>4</sup><institution>College of Media, Communication University of Zhejiang</institution>, <addr-line>Hangzhou</addr-line>, <country>China</country></aff>
<aff id="aff5"><sup>5</sup><institution>College of Computer Science and Technology, Hangzhou Dianzi University</institution>, <addr-line>Hangzhou</addr-line>, <country>China</country></aff>
<aff id="aff6"><sup>6</sup><institution>Institute of Intelligent Media, Communication University of Zhejiang</institution>, <addr-line>Hangzhou</addr-line>, <country>China</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Shida Chen, Sun Yat-sen University, China</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Ji&#x00E0;n xi&#x00F3;ng, Second Affiliated Hospital of Nanchang University, China</p><p>Guoming Zhang, Shenzhen Eye Hospital, China</p></fn>
<corresp id="c001">&#x002A;Correspondence: Shengzhan Wang, <email>wangshengzhan886@163.com</email></corresp>
<corresp id="c002">Kai Jin, <email>jinkai@zju.edu.cn</email></corresp>
<corresp id="c003">Juan Ye, <email>yejuan@zju.edu.cn</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>07</day>
<month>08</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>11</volume>
<elocation-id>1418048</elocation-id>
<history>
<date date-type="received">
<day>15</day>
<month>04</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>23</day>
<month>07</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2024 Wang, Shen, Gao, Jiang, Wang, Li, Ma, Wang, Xin, Ren, Jin and Ye.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Wang, Shen, Gao, Jiang, Wang, Li, Ma, Wang, Xin, Ren, Jin and Ye</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<sec>
<title>Background</title>
<p>The assessment of image quality (IQA) plays a pivotal role in the realm of image-based computer-aided diagnosis techniques, with fundus imaging standing as the primary method for the screening and diagnosis of ophthalmic diseases. Conventional studies on fundus IQA tend to rely on simplistic datasets for evaluation, predominantly focusing on either local or global information, rather than a synthesis of both. Moreover, the interpretability of these studies often lacks compelling evidence. In order to address these issues, this study introduces the Local and Global Attention Aggregated Deep Neural Network (LGAANet), an innovative approach that integrates both local and global information for enhanced analysis.</p>
</sec>
<sec>
<title>Methods</title>
<p>The LGAANet was developed and validated using a Multi-Source Heterogeneous Fundus (MSHF) database, encompassing a diverse collection of images. This dataset includes 802 color fundus photography (CFP) images (302 from portable cameras), and 500 ultrawide-field (UWF) images from 904 patients with diabetic retinopathy (DR) and glaucoma, as well as healthy individuals. The assessment of image quality was meticulously carried out by a trio of ophthalmologists, leveraging the human visual system as a benchmark. Furthermore, the model employs attention mechanisms and saliency maps to bolster its interpretability.</p>
</sec>
<sec>
<title>Results</title>
<p>In testing with the CFP dataset, LGAANet demonstrated remarkable accuracy in three critical dimensions of image quality (illumination, clarity, and contrast, selected based on the characteristics of the human visual system and indicating the potential aspects to improve the image quality), recording scores of 0.947, 0.924, and 0.947, respectively. Similarly, when applied to the UWF dataset, the model achieved accuracies of 0.889, 0.913, and 0.923, respectively. These results underscore the efficacy of LGAANet in distinguishing between varying degrees of image quality with high precision.</p>
</sec>
<sec>
<title>Conclusion</title>
<p>To our knowledge, LGAANet represents the inaugural algorithm trained on an MSHF dataset specifically for fundus IQA, marking a significant milestone in the advancement of computer-aided diagnosis in ophthalmology. This research significantly contributes to the field, offering a novel methodology for the assessment and interpretation of fundus images in the detection and diagnosis of ocular diseases.</p>
</sec>
</abstract>
<kwd-group>
<kwd>fundus photography</kwd>
<kwd>attention mechanism</kwd>
<kwd>image quality assessment</kwd>
<kwd>spatial information</kwd>
<kwd>multiscale feature extraction</kwd>
</kwd-group>
<counts>
<fig-count count="5"/>
<table-count count="5"/>
<equation-count count="5"/>
<ref-count count="23"/>
<page-count count="9"/>
<word-count count="5446"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Ophthalmology</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="S1" sec-type="intro">
<title>Introduction</title>
<p>Fundus photography stands as a cornerstone in the diagnosis of diabetic retinopathy (DR), glaucoma, age-related macular degeneration (AMD), among various ocular disorders (<xref ref-type="bibr" rid="B1">1</xref>). With the advent of artificial intelligence (AI), the automation of disease screening through fundus imaging has emerged as a focal area of research and clinical application (<xref ref-type="bibr" rid="B2">2</xref>). Several algorithms have been explored, with a notable number being translated into clinical settings (<xref ref-type="bibr" rid="B3">3</xref>&#x2013;<xref ref-type="bibr" rid="B5">5</xref>). The quality of fundus images is critical to the diagnostic accuracy of these models, necessitating a robust Image Quality Assessment (IQA) for automated systems.</p>
<p>Manual IQA, though reliable, places a significant burden on medical professionals, as it requires direct assessment of images to ensure pathological structures are discernibly visible. Conversely, automated IQA methods offer a less labor-intensive alternative, utilizing algorithms to evaluate image quality. These methods range from structure-analysis-based to generic image-statistics approaches (<xref ref-type="bibr" rid="B6">6</xref>). In the era of deep learning, innovations in IQA have significantly benefited from the advanced feature-extraction capabilities of convolutional neural networks (CNNs) (<xref ref-type="bibr" rid="B7">7</xref>&#x2013;<xref ref-type="bibr" rid="B9">9</xref>), employing strategies such as hallucinated reference generation and distortion identification to enhance quality prediction and feature weighting through visual saliency (<xref ref-type="bibr" rid="B10">10</xref>). DeepFundus, a deep learning-based fundus image classifier, addresses the data quality gap in medical AI by offering automated, multidimensional image sorting, significantly enhancing model performance across various retinopathies and supporting a data-driven paradigm for the entire medical AI lifecycle (<xref ref-type="bibr" rid="B11">11</xref>).</p>
<p>Despite these advancements, challenges persist, particularly in the generalizability of algorithms across diverse imaging conditions and the integration of both local and global information critical for comprehensive quality assessment. Furthermore, the interpretability of deep learning models in this context remains uncertain. In order to fill these gaps, this study introduces the Local and Global Attention Aggregated Deep Neural Network (LGAANet), designed to leverage both local and global information in assessing the quality of fundus images. Most existing IQA datasets are single-center collections that overlook variations in imaging devices, eye conditions, and imaging environments. Our approach involves training on a multi-source heterogeneous fundus (MSHF) database (<xref ref-type="bibr" rid="B12">12</xref>), encompassing a broad spectrum of normal and pathological images captured through various imaging modalities, to enhance the model&#x2019;s generalizability and interpretability. This database was selected due to its diverse and representative nature, which allows for robust validation of the LGAANet model across various imaging conditions and sources.</p>
</sec>
<sec id="S2" sec-type="materials|methods">
<title>Materials and methods</title>
<p>An overview of the study approach and methodology is presented in <xref ref-type="fig" rid="F1">Figure 1</xref>. Our MSHF dataset consisted of various sub-databases collected from different devices and exhibited diverse appearance patterns. The dataset comprises 802 color fundus photography (CFP) images (302 from portable fundus cameras) and 500 ultrawide-field (UWF) images. These images originate from 904 patients, encompassing DR and glaucoma patients, in addition to normal individuals. Such samples collected via various domains are capable of providing more diversity during training of CNNs, which is beneficial for improving the generalization ability of models. Three critical dimensions of image quality&#x2014;illumination, clarity, and contrast&#x2014;are selected based on the characteristics of the human visual system, and they indicate the potential aspects to improve the image quality. In order to validate the performance of our approach, we used an external dataset and noise dataset. A detailed description of each stage follows.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption><p>An overview of the study approach and methodology. The multi-source heterogeneous fundus (MSHF) dataset is collected, and then serves as an input to train the local and global attention aggregated deep neural network (LGAANet). The output is the image quality of each image based on three metrics, and a heat map is created to show the interpretability.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-11-1418048-g001.tif"/>
</fig>
<sec id="S2.SS1">
<title>The spatial-information-retained multi-scale feature extractor</title>
<p>Multi-scale features and spatial attention mechanisms have shown potential for quality prediction (<xref ref-type="bibr" rid="B13">13</xref>&#x2013;<xref ref-type="bibr" rid="B19">19</xref>). However, existing multi-scale-feature-incorporated quality-prediction studies tend to leverage Multi-Level Spatially Pooled (MLSP) strategy to aggregate features from various scales, i.e., using Global Average Pooling (GAP) to extract the multi-dimensional activations into a one-dimensional vector and concatenate vectors from various scales. The MLSP method yields one-dimensional vectors and inevitably leaves out much spatial information. Therefore, it is challenging to integrate spatial attention mechanisms into the one-dimensional feature.</p>
<p>In order to improve prediction accuracy and combine both multi-scale features and spatial mechanisms into our quality prediction model, we included a spatial-information-retained (SIR) multi-scale feature extractor to combine both local and global quality-aware features through an attention-incorporated perspective.</p>
<p>Specifically, let <italic>X</italic> denote the input image with size [3, <italic>H</italic>, <italic>W</italic>], and denote the multi-scale feature (Scale#1 to Scale#3) extracted from ResNet50 as:</p>
<disp-formula id="E1">
<label>(1)</label>
<mml:math id="M1">
<mml:mrow>
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mpadded>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>X</mml:mi>
<mml:mo lspace="2.5pt" rspace="2.5pt" stretchy="false">|</mml:mo>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>g</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:msub>
<mml:mi>e</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>3</mml:mn>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Where <italic>f</italic>(&#x22C5;|<italic>Stage</italic><sub><italic>i</italic></sub>) denotes the activations extracted from the last convolutional layer of ResNet50 in Stage#<italic>i</italic>. The <italic>s<sub>i</sub></italic> is rescaled channel-wise via a convolutional layer with kernel size 1x1 and followed by a batch-normalization and a RELU layer, i.e., <inline-formula><mml:math id="INEQ3"><mml:mrow><mml:mi>s</mml:mi><mml:mmultiscripts><mml:mo rspace="5.8pt">=</mml:mo><mml:mprescripts/><mml:none/><mml:mo>&#x2032;</mml:mo><mml:mi>i</mml:mi><mml:none/></mml:mmultiscripts><mml:mi>g</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">|</mml:mo><mml:msubsup><mml:mi>T</mml:mi><mml:mrow><mml:msub><mml:mi>I</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>O</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>0</mml:mn></mml:mrow></mml:msubsup><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>, in which <inline-formula><mml:math id="INEQ4"><mml:mrow><mml:mi>g</mml:mi><mml:mo>&#x2062;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mtext>&#x22C5;</mml:mtext><mml:mo lspace="2.5pt" rspace="2.5pt" stretchy="false">|</mml:mo><mml:msubsup><mml:mi>T</mml:mi><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x2062;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mo>&#x2062;</mml:mo><mml:mi>u</mml:mi><mml:mo>&#x2062;</mml:mo><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>k</mml:mi><mml:mo>,</mml:mo><mml:mi>p</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> denotes the convolutional unit mentioned above with kernel size, padding, input channel size 
<italic>C</italic><sub><italic>in</italic></sub>, and output channel size <italic>C</italic><sub><italic>out</italic></sub>. In the architecture of ResNet50, [<italic>I</italic><sub>1</sub>, <italic>I</italic><sub>2</sub>, <italic>I</italic><sub>3</sub>] = [256, 512, 1024], and we set [<italic>O</italic><sub>1</sub>, <italic>O</italic><sub>2</sub>, <italic>O</italic><sub>3</sub>] = [16, 32, 64] to prevent the channel size after concatenation from being too large. Therefore, the size of <inline-formula><mml:math id="INEQ7"><mml:mrow><mml:mi>s</mml:mi><mml:mmultiscripts><mml:mo>,</mml:mo><mml:mprescripts/><mml:none/><mml:mo>&#x2032;</mml:mo><mml:mn>1</mml:mn><mml:none/></mml:mmultiscripts><mml:mi>s</mml:mi><mml:mmultiscripts><mml:mo>,</mml:mo><mml:mprescripts/><mml:none/><mml:mo>&#x2032;</mml:mo><mml:mn>2</mml:mn><mml:none/></mml:mmultiscripts><mml:mi>s</mml:mi><mml:mmultiscripts><mml:mo>,</mml:mo><mml:mprescripts/><mml:none/><mml:mo>&#x2032;</mml:mo><mml:mn>3</mml:mn><mml:none/></mml:mmultiscripts></mml:mrow></mml:math></inline-formula> is [16, <italic>W</italic>/4, <italic>H</italic>/4], [32, <italic>W</italic>/8, <italic>H</italic>/8], [64, <italic>W</italic>/16, <italic>H</italic>/16], respectively.</p>
<p>In order to maintain the detailed spatial information of features extracted from each scale and simultaneously rescale them to coordinate with features extracted from the last Stage of ResNet50 (i.e., Stage#4 with spatial size [<italic>W</italic>/32, <italic>H</italic>/32]), the <inline-formula><mml:math id="INEQ12"><mml:mrow><mml:mi>s</mml:mi><mml:mmultiscripts><mml:mo>,</mml:mo><mml:mprescripts/><mml:none/><mml:mo>&#x2032;</mml:mo><mml:mn>1</mml:mn><mml:none/></mml:mmultiscripts><mml:mi>s</mml:mi><mml:mmultiscripts><mml:mo>,</mml:mo><mml:mprescripts/><mml:none/><mml:mo>&#x2032;</mml:mo><mml:mn>2</mml:mn><mml:none/></mml:mmultiscripts><mml:mi>s</mml:mi><mml:mmultiscripts><mml:mo>,</mml:mo><mml:mprescripts/><mml:none/><mml:mo>&#x2032;</mml:mo><mml:mn>3</mml:mn><mml:none/></mml:mmultiscripts></mml:mrow></mml:math></inline-formula> are non-overlapped and spatially split into several chunks with spatial size [<italic>W</italic>/32, <italic>H</italic>/32], i.e.,:</p>
<disp-formula id="E2">
<label>(2)</label>
<mml:math id="M2">
<mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>n</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mpadded width="+3.3pt">
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mpadded>
</mml:mrow>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mmultiscripts>
<mml:mi>s</mml:mi>
<mml:none/>
<mml:mo>&#x2032;</mml:mo>
<mml:mi>i</mml:mi>
<mml:none/>
</mml:mmultiscripts>
<mml:mo rspace="5.8pt">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:mo>[</mml:mo>
<mml:mtable columnspacing="5pt" displaystyle="true" rowspacing="0pt">
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mrow>
<mml:msubsup>
<mml:mi>c</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2062;</mml:mo>
<mml:mi mathvariant="normal">&#x22EF;</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:msubsup>
<mml:mi>c</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mtd>
<mml:mtd/>
<mml:mtd/>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mrow>
<mml:mi mathvariant="normal">&#x22EE;</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi mathvariant="normal">&#x22F1;</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi mathvariant="normal">&#x22EE;</mml:mi>
</mml:mrow>
</mml:mtd>
<mml:mtd/>
<mml:mtd/>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:mrow>
<mml:msubsup>
<mml:mi>c</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2062;</mml:mo>
<mml:mi mathvariant="normal">&#x22EF;</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:msubsup>
<mml:mi>c</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mtd>
<mml:mtd/>
<mml:mtd/>
</mml:mtr>
</mml:mtable>
<mml:mo>]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Where <italic>chunk</italic><sub><italic>i</italic></sub> denotes the set of chunks after spatial split from <inline-formula><mml:math id="INEQ15"><mml:mmultiscripts><mml:mi>s</mml:mi><mml:none/><mml:mo>&#x2032;</mml:mo><mml:mi>i</mml:mi><mml:none/></mml:mmultiscripts></mml:math></inline-formula>, and each of the chunks is denoted as <inline-formula><mml:math id="INEQ16"><mml:msubsup><mml:mi>c</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup></mml:math></inline-formula> (<italic>m</italic> and <italic>n</italic> denote the spatial index of the chunk) with a channel size coordinated with <inline-formula><mml:math id="INEQ17"><mml:mmultiscripts><mml:mi>s</mml:mi><mml:none/><mml:mo>&#x2032;</mml:mo><mml:mi>i</mml:mi><mml:none/></mml:mmultiscripts></mml:math></inline-formula> and a spatial size of [<italic>W</italic>/32, <italic>H</italic>/32]. In addition, <italic>k</italic><sub>1</sub> = 64, <italic>k</italic><sub>2</sub> = 16, <italic>k</italic><sub>3</sub> = 4.</p>
<p>As for each <italic>chunk</italic><sub><italic>i</italic></sub>, its elements are concatenated channel-wise by,</p>
<disp-formula id="E3">
<label>(3)</label>
<mml:math id="M3">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mmultiscripts>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mprescripts/>
<mml:none/>
<mml:mo>&#x2033;</mml:mo>
<mml:mi>i</mml:mi>
<mml:none/>
</mml:mmultiscripts>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:msubsup>
<mml:mi>c</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mo>|</mml:mo>
<mml:mi>m</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>}</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>i</mml:mi>
<mml:mpadded width="+6.6pt">
<mml:mi>m</mml:mi>
</mml:mpadded>
<mml:mi>c</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
<mml:mtext>_</mml:mtext>
<mml:mi>w</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>After this, the size of <inline-formula><mml:math id="INEQ23"><mml:mrow><mml:mi>s</mml:mi><mml:mmultiscripts><mml:mo>,</mml:mo><mml:mprescripts/><mml:none/><mml:mo>&#x2033;</mml:mo><mml:mn>1</mml:mn><mml:none/></mml:mmultiscripts><mml:mi>s</mml:mi><mml:mmultiscripts><mml:mo>,</mml:mo><mml:mprescripts/><mml:none/><mml:mo>&#x2033;</mml:mo><mml:mn>2</mml:mn><mml:none/></mml:mmultiscripts><mml:mi>s</mml:mi><mml:mmultiscripts><mml:mo>,</mml:mo><mml:mprescripts/><mml:none/><mml:mo>&#x2033;</mml:mo><mml:mn>3</mml:mn><mml:none/></mml:mmultiscripts></mml:mrow></mml:math></inline-formula> is [16&#x002A;64, <italic>W</italic>/32, <italic>H</italic>/32], [32&#x002A;16, <italic>W</italic>/32, <italic>H</italic>/32], [64&#x002A;4, <italic>W</italic>/32, <italic>H</italic>/32]. Finally, <inline-formula><mml:math id="INEQ29"><mml:mrow><mml:mi>s</mml:mi><mml:mmultiscripts><mml:mo>,</mml:mo><mml:mprescripts/><mml:none/><mml:mo>&#x2033;</mml:mo><mml:mn>1</mml:mn><mml:none/></mml:mmultiscripts><mml:mi>s</mml:mi><mml:mmultiscripts><mml:mo>,</mml:mo><mml:mprescripts/><mml:none/><mml:mo>&#x2033;</mml:mo><mml:mn>2</mml:mn><mml:none/></mml:mmultiscripts><mml:mi>s</mml:mi><mml:mmultiscripts><mml:mo>,</mml:mo><mml:mprescripts/><mml:none/><mml:mo>&#x2033;</mml:mo><mml:mn>3</mml:mn><mml:none/></mml:mmultiscripts></mml:mrow></mml:math></inline-formula> and the activations extracted via <italic>f</italic>(&#x22C5;|<italic>Stage</italic><sub>4</sub>) are fed into <inline-formula><mml:math id="INEQ33"><mml:mrow><mml:mi>g</mml:mi><mml:mo>&#x2062;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mtext>&#x22C5;</mml:mtext><mml:mo lspace="2.5pt" rspace="2.5pt" 
stretchy="false">|</mml:mo><mml:msubsup><mml:mi>T</mml:mi><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x2062;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mn>128</mml:mn></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>0</mml:mn></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> and yield 4 multi-dimensional features with the same size, representing both local and global information. Channel-wise concatenation is then employed to obtain a local spatial-information-retained multi-scale feature with size [128&#x002A;4, <italic>W</italic>/32, <italic>H</italic>/32].</p>
<p>The above-described spatial-information-retained multi-scale feature extraction is also illustrated in <xref ref-type="fig" rid="F2">Figure 2</xref>, taking Stage#1 as an example, and the pseudocode is listed in <xref ref-type="table" rid="T1">Table 1</xref>.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption><p>Illustration of spatial-information-retained (SIR) multi-scale feature extraction. The activations extracted from Stage#1 of ResNet50, denoted as <italic>s<sub>1</sub></italic>, are first rescaled into <inline-formula><mml:math id="INEQ35"><mml:mmultiscripts><mml:mi>s</mml:mi><mml:none/><mml:mo>&#x2032;</mml:mo><mml:mn>1</mml:mn><mml:none/></mml:mmultiscripts></mml:math></inline-formula> by a convolutional layer with kernel size 1x1. Then <inline-formula><mml:math id="INEQ36"><mml:mmultiscripts><mml:mi>s</mml:mi><mml:none/><mml:mo>&#x2032;</mml:mo><mml:mn>1</mml:mn><mml:none/></mml:mmultiscripts></mml:math></inline-formula> is spatially split into multiple chunks whose spatial size is coordinated with the features extracted from Stage#4 of ResNet50. The chunks are concatenated into <inline-formula><mml:math id="INEQ37"><mml:mpadded width="+5pt"><mml:mmultiscripts><mml:mi>s</mml:mi><mml:none/><mml:mo>&#x2033;</mml:mo><mml:mn>1</mml:mn><mml:none/></mml:mmultiscripts></mml:mpadded></mml:math></inline-formula> and rescaled to a size of [128, <italic>H</italic>/32, <italic>W</italic>/32]. In this way, the spatial information of multi-scale features is retained while the feature size within each scale is consistent.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-11-1418048-g002.tif"/>
</fig>
<table-wrap position="float" id="T1">
<label>TABLE 1</label>
<caption><p>Pseudocode of spatial-information-retained multi-scale feature extractor.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<tbody>
<tr>
<td valign="top" align="left">Let <italic>X</italic> denote the input image</td>
</tr>
<tr>
<td valign="top" align="left">Step1. Extract multi-scale feature <italic>s<sub>i</sub></italic>, <italic>i</italic> = {1,2,3} from ResNet50 according to <xref ref-type="disp-formula" rid="E1">Equation 1</xref></td>
</tr>
<tr>
<td valign="top" align="left">Step2. For each scale <italic>i</italic>:</td>
</tr>
<tr>
<td valign="top" align="left">Rescale <italic>s<sub>i</sub></italic> via <inline-formula><mml:math id="INEQ48"><mml:mrow><mml:mi>s</mml:mi><mml:mmultiscripts><mml:mo>=</mml:mo><mml:mprescripts/><mml:none/><mml:mo>&#x2032;</mml:mo><mml:mi>i</mml:mi><mml:none/></mml:mmultiscripts><mml:mi>g</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">|</mml:mo><mml:msubsup><mml:mi>T</mml:mi><mml:mrow><mml:msub><mml:mi>I</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>O</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>0</mml:mn></mml:mrow></mml:msubsup><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> channel-wise</td>
</tr>
<tr>
<td valign="top" align="left">Spatially split <italic>s<sub>i</sub></italic> into <italic>chunk</italic><sub><italic>i</italic></sub> according to <xref ref-type="disp-formula" rid="E2">Equation 2</xref></td>
</tr>
<tr>
<td valign="top" align="left">Concatenate elements in <italic>chunk</italic><sub><italic>i</italic></sub> channel-wise according to <xref ref-type="disp-formula" rid="E3">Equation 3</xref> and obtain <inline-formula><mml:math id="INEQ51"><mml:mmultiscripts><mml:mi>s</mml:mi><mml:none/><mml:mo>&#x2033;</mml:mo><mml:mi>i</mml:mi><mml:none/></mml:mmultiscripts></mml:math></inline-formula></td>
</tr>
<tr>
<td valign="top" align="left">Rescale <inline-formula><mml:math id="INEQ52"><mml:mmultiscripts><mml:mi>s</mml:mi><mml:none/><mml:mo>&#x2033;</mml:mo><mml:mi>i</mml:mi><mml:none/></mml:mmultiscripts></mml:math></inline-formula> channel-wise via <inline-formula><mml:math id="INEQ53"><mml:mrow><mml:mi>g</mml:mi><mml:mo>&#x2062;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mtext>&#x22C5;</mml:mtext><mml:mo lspace="2.5pt" rspace="2.5pt" stretchy="false">|</mml:mo><mml:msubsup><mml:mi>T</mml:mi><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x2062;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mn>128</mml:mn></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>0</mml:mn></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> according to <xref ref-type="disp-formula" rid="E4">Equations 4</xref>, <xref ref-type="disp-formula" rid="E5">5</xref> and obtain <italic>ft</italic><sub><italic>i</italic></sub></td>
</tr>
<tr>
<td valign="top" align="left">End</td>
</tr>
<tr>
<td valign="top" align="left">Step3. Get <italic>ft</italic><sub>4</sub> by feeding <italic>f</italic>(X|<italic>Stage</italic><sub>4</sub>) into <inline-formula><mml:math id="INEQ57"><mml:mrow><mml:mi>g</mml:mi><mml:mo>&#x2062;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mtext>&#x22C5;</mml:mtext><mml:mo lspace="2.5pt" rspace="2.5pt" stretchy="false">|</mml:mo><mml:msubsup><mml:mi>T</mml:mi><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x2062;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mn>128</mml:mn></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>0</mml:mn></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula></td>
</tr>
<tr>
<td valign="top" align="left">Step4. Concatenate {<italic>ft</italic><sub><italic>i</italic></sub>|<italic>i</italic>&#x03F5;[1,4]} channel-wise and obtain the final spatial-information-retained multi-scale feature</td>
</tr>
</tbody>
</table></table-wrap>
</sec>
<sec id="S2.SS2">
<title>LGAANet</title>
<p>Based on the proposed SIR multi-scale feature extractor, we developed the LGAANet, as shown in <xref ref-type="fig" rid="F3">Figure 3</xref>. Our LGAANet comprises a ResNet50-based SIR multi-scale feature extractor <italic>f</italic>(&#x22C5;;&#x03B8;), an attention module <italic>Att</italic>(&#x22C5;;&#x03B3;), and a feature-aggregation module <italic>g</italic>(&#x22C5;;&#x03B4;). Let <italic>X</italic> denote the input image; the final quality prediction <inline-formula><mml:math id="INEQ43"><mml:mover accent="true"><mml:mi>q</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:math></inline-formula> is obtained via,</p>
<disp-formula id="E4">
<label>(4)</label>
<mml:math id="M4">
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:mover accent="true">
<mml:mi>q</mml:mi>
<mml:mo stretchy="false">^</mml:mo>
</mml:mover>
</mml:mpadded>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>X</mml:mi>
<mml:mo>;</mml:mo>
<mml:mi mathvariant="normal">&#x03B8;</mml:mi>
<mml:mo rspace="5.8pt">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo rspace="5.8pt">&#x00D7;</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>X</mml:mi>
<mml:mo>;</mml:mo>
<mml:mi mathvariant="normal">&#x03B8;</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>;</mml:mo>
<mml:mi mathvariant="normal">&#x03B3;</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>;</mml:mo>
<mml:mi mathvariant="normal">&#x03B4;</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption><p>Overall pipeline of proposed local and global attention aggregated deep neural network (LGAANet) for quality prediction. <bold>(A)</bold> ResNet50 structure. <bold>(B)</bold> Spatial-information-retained (SIR) multi-scale feature extractor illustrated in <xref ref-type="fig" rid="F2">Figure 2</xref> and Section Methods-D. The green sphere labeled &#x201C;C&#x201D; denotes channel-wise concatenation of SIR features extracted at each scale. <bold>(C)</bold> The attention module is leveraged to learn the spatial weighting strategies and multiplied element-wise with the SIR multi-scale feature. <bold>(D)</bold> The global average pooling layer is incorporated and followed by several fully connected layers to aggregate the quality prediction.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-11-1418048-g003.tif"/>
</fig>
<p>Since the quality label <italic>q</italic> is binary, the loss to be optimized, denoted as <italic>L</italic>, is calculated by,</p>
<disp-formula id="E5">
<label>(5)</label>
<mml:math id="M5">
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:mi>L</mml:mi>
</mml:mpadded>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>E</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>g</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>m</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>o</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>d</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mover accent="true">
<mml:mi>q</mml:mi>
<mml:mo stretchy="false">^</mml:mo>
</mml:mover>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>q</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Where <italic>Sigmoid</italic>(&#x22C5;) denotes the Sigmoid layer and <italic>BCE</italic>(&#x22C5;) denotes the binary cross-entropy.</p>
<p>The attention mechanism could be implemented via various CNN architectures. Here spatial attention [denoted as BaseLine (BL) + SpatialAtt + MultiScale (MS)] and self-attention (denoted as BL+SelfAtt+MS) are leveraged to learn the spatial weighting strategy for multi-scale quality-aware features. The spatial attention is implemented by several stacks of convolutional-batch normalization-ReLU units, while the self-attention follows (<xref ref-type="bibr" rid="B20">20</xref>). Also, we constructed a multi-scale excluded and attention-incorporated CNN framework for the ablation study, denoted as BL+SpatialAtt.</p>
<p>For the sake of comparison, we included the BL in the performance evaluation, in which the feature extracted from ResNet50 was directly fed into a GAP followed by stacks of fully-connected layers. The MASK-incorporated model is also involved (denoted as BL+MASK) and has an overall pipeline similar to the BL, but the extracted features are multiplied element-wise with the MASK signal before being fed into the GAP layer.</p>
<p>Network hyperparameters: the minibatch size is 8, and the learning rate is 1e-3. The optimizer is Adam, and the weight-decay is 5e-4. The ratio of the learning rate of the ResNet model parameters to the subsequent newly added layer is 1:10; that is, the learning rate of the newly added layer is 1e-3, and of the ResNet layer is 1e-4. The training process traverses the training data in the database 20 times, which means the epoch = 20, and the highest test accuracy is selected as the final result. The division of training-test samples is randomly generated (a total of two, namely round = 2). The image index being used for training/testing is in the supplementary files teIdx01.mat (first test index), trIdx01.mat (first-time training index), teIdx02.mat (second test index), trIdx02.mat (second training index). The host configuration is i7-8700 CPU @3.2GHz &#x0026; 32GB RAM + GTX1080@8GB.</p>
<p>To facilitate the development of deep learning models using the MSHF dataset, it was manually segmented into an 80% training set and a 20% test set. The training set facilitated model learning, while the test set served for performance evaluation. There was no overlap between these two sets, ensuring a fair distribution of image variety. Each set maintained an approximately equal proportion of high- and low-quality images.</p>
</sec>
<sec id="S2.SS3">
<title>Statistical methods</title>
<p>For statistical validation, we employed a stratified 5-fold cross-validation technique to ensure that each subset of data was representative of the overall distribution, thus mitigating any potential bias due to imbalanced data. This method involved dividing the data into 5 folds, each containing an equal proportion of images from different categories and quality levels, ensuring that each fold was used once as a test set while the others served as the training set. We utilized the Receiver Operating Characteristic (ROC) curve to evaluate the sensitivity and specificity of LGAANet across different thresholds of classification.</p>
</sec>
</sec>
<sec id="S3" sec-type="results">
<title>Results</title>
<sec id="S3.SS1">
<title>Experimental settings</title>
<p>We cropped blank areas of each image so that the width and height were equal and then scaled the cropped image to a resolution of 512 &#x00D7; 512. The eye-area mask was obtained through brightness and edge information, which was the alpha channel, denoted as MASK. The prediction model outputs a real value in the range of [0,1], outputs a 0/1 signal through the threshold judgment, and then compares it with the ground truth. In the experiment, the threshold (TH) was selected as 0.5.</p>
</sec>
<sec id="S3.SS2">
<title>Color fundus photography dataset</title>
<p>The dataset annotations are listed in <xref ref-type="table" rid="T2">Table 2</xref>. For the color fundus photography (CFP) dataset, images with good I/C accounted for 61.0%, while GLU contained 86.5% of the poor I/C images. As for &#x2018;blur&#x2019;, the CFP dataset had 58.6% images without noticeable blur conditions, where DRIVE and NORMAL datasets had no blurry images. The same thing happened with regard to LC, and 68.3% of the images in the CFP dataset showed eligible contrast. In each aspect, images from LOCAL_1 and LOCAL_2 were inferior to those from DR_1 and DR_2.</p>
<table-wrap position="float" id="T2">
<label>TABLE 2</label>
<caption><p>Dataset annotations.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<td valign="top" align="left" style="color:#ffffff;background-color: #7f8080;">Item</td>
<td valign="top" align="center" colspan="2" style="color:#ffffff;background-color: #7f8080;">I/C</td>
<td valign="top" align="center" colspan="2" style="color:#ffffff;background-color: #7f8080;">Blur</td>
<td valign="top" align="center" colspan="2" style="color:#ffffff;background-color: #7f8080;">LC</td>
<td valign="top" align="center" colspan="2" style="color:#ffffff;background-color: #7f8080;">Overall</td>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left" style="color:#ffffff;background-color: #7f8080;"></td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;"><bold>0</bold></td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;"><bold>1</bold></td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;"><bold>0</bold></td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;"><bold>1</bold></td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;"><bold>0</bold></td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;"><bold>1</bold></td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;"><bold>0</bold></td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;"><bold>1</bold></td>
</tr>
<tr>
<td valign="top" align="left">LOCAL_1</td>
<td valign="top" align="center">158</td>
<td valign="top" align="center">41</td>
<td valign="top" align="center">94</td>
<td valign="top" align="center">105</td>
<td valign="top" align="center">85</td>
<td valign="top" align="center">114</td>
<td valign="top" align="center">142</td>
<td valign="top" align="center">57</td>
</tr>
<tr>
<td valign="top" align="left">LOCAL_2</td>
<td valign="top" align="center">78</td>
<td valign="top" align="center">25</td>
<td valign="top" align="center">59</td>
<td valign="top" align="center">44</td>
<td valign="top" align="center">41</td>
<td valign="top" align="center">62</td>
<td valign="top" align="center">77</td>
<td valign="top" align="center">26</td>
</tr>
<tr>
<td valign="top" align="left">DR_1</td>
<td valign="top" align="center">31</td>
<td valign="top" align="center">156</td>
<td valign="top" align="center">34</td>
<td valign="top" align="center">153</td>
<td valign="top" align="center">6</td>
<td valign="top" align="center">181</td>
<td valign="top" align="center">40</td>
<td valign="top" align="center">147</td>
</tr>
<tr>
<td valign="top" align="left">DR_2</td>
<td valign="top" align="center">36</td>
<td valign="top" align="center">199</td>
<td valign="top" align="center">120</td>
<td valign="top" align="center">115</td>
<td valign="top" align="center">78</td>
<td valign="top" align="center">157</td>
<td valign="top" align="center">117</td>
<td valign="top" align="center">118</td>
</tr>
<tr>
<td valign="top" align="left">GLU</td>
<td valign="top" align="center">45</td>
<td valign="top" align="center">7</td>
<td valign="top" align="center">48</td>
<td valign="top" align="center">4</td>
<td valign="top" align="center">42</td>
<td valign="top" align="center">10</td>
<td valign="top" align="center">50</td>
<td valign="top" align="center">2</td>
</tr>
<tr>
<td valign="top" align="left">NORMAL</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">24</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">26</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">26</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">26</td>
</tr>
<tr>
<td valign="top" align="left">DRIMDB</td>
<td valign="top" align="center">54</td>
<td valign="top" align="center">140</td>
<td valign="top" align="center">74</td>
<td valign="top" align="center">120</td>
<td valign="top" align="center">76</td>
<td valign="top" align="center">118</td>
<td valign="top" align="center">70</td>
<td valign="top" align="center">124</td>
</tr>
<tr>
<td valign="top" align="left">DRIVE</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">40</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">40</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">40</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">40</td>
</tr>
<tr>
<td valign="top" align="left">DR_UWF</td>
<td valign="top" align="center">215</td>
<td valign="top" align="center">285</td>
<td valign="top" align="center">163</td>
<td valign="top" align="center">337</td>
<td valign="top" align="center">50</td>
<td valign="top" align="center">450</td>
<td valign="top" align="center">168</td>
<td valign="top" align="center">332</td>
</tr>
</tbody>
</table></table-wrap>
<p>Except for the DRIVE database, 80% of the CFP databases were randomly selected as the training set and 20% as the test set. We calculated the average prediction accuracy of the test set, attaining an acceptable result for the baseline; and with the addition of MASK, the accuracy increased to over 0.9. Spatial attention, multiscale, and self-attention algorithms all improved accuracy: BL+SelfAtt+MS achieved the best I/C and blur results, with accuracies of 0.947 and 0.924, respectively, and BL+SpatialAtt+MS produced the best results for LC, with an accuracy of 0.947.</p>
<p>Also, we added Gaussian white noise (Gauss) with a mean of 0 and a variance of 0.05 to images in the CFP datasets to evaluate the robustness of the human visual system (HVS)-based algorithm. We conducted the experiments on each model, and the results showed robust properties, with the best accuracy over 0.85.</p>
<p>ROC curves were drawn to further evaluate the performance of the models, as shown in <xref ref-type="fig" rid="F4">Figure 4</xref>, and the areas under the ROC curves (AUCs) were calculated. For the CFP dataset, the AUC of each model on every item was over 0.95. Detailed information on accuracy and AUCs of the datasets is presented in <xref ref-type="table" rid="T3">Tables 3</xref>,<xref ref-type="table" rid="T4">4</xref>, respectively.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption><p>ROC curve of different items for <bold>(A&#x2013;D)</bold> CFP datasets. <bold>(E&#x2013;H)</bold> UWF datasets. <bold>(A,E)</bold> Detection of uneven illumination or color. <bold>(B,F)</bold> Detection of blur. <bold>(C,G)</bold> Detection of low contrast. <bold>(D,H)</bold> Overall quality.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-11-1418048-g004.tif"/>
</fig>
<table-wrap position="float" id="T3">
<label>TABLE 3</label>
<caption><p>Overall accuracy of different models on various datasets.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<td valign="top" align="left" style="color:#ffffff;background-color: #7f8080;">Model</td>
<td valign="top" align="center" colspan="4" style="color:#ffffff;background-color: #7f8080;">CFP dataset</td>
<td valign="top" align="center" colspan="4" style="color:#ffffff;background-color: #7f8080;">UWF dataset</td>
<td valign="top" align="center" colspan="4" style="color:#ffffff;background-color: #7f8080;">Noise dataset</td>
</tr>
<tr>
<td valign="top" align="left" style="color:#ffffff;background-color: #7f8080;"></td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;">I/C</td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;">Blur</td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;">LC</td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;">Overall</td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;">I/C</td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;">Blur</td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;">LC</td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;">Overall</td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;">I/C</td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;">Blur</td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;">LC</td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;">Overall</td>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">BL</td>
<td valign="top" align="center">0.886</td>
<td valign="top" align="center">0.874</td>
<td valign="top" align="center">0.874</td>
<td valign="top" align="center">0.897</td>
<td valign="top" align="center">0.826</td>
<td valign="top" align="center">0.839</td>
<td valign="top" align="center">0.852</td>
<td valign="top" align="center">0.876</td>
<td valign="top" align="center">0.802</td>
<td valign="top" align="center">0.802</td>
<td valign="top" align="center">0.819</td>
<td valign="top" align="center">0.809</td>
</tr>
<tr>
<td valign="top" align="left">+MASK</td>
<td valign="top" align="center">0.922</td>
<td valign="top" align="center">0.902</td>
<td valign="top" align="center">0.917</td>
<td valign="top" align="center">0.919</td>
<td valign="top" align="center">0.852</td>
<td valign="top" align="center">0.862</td>
<td valign="top" align="center">0.889</td>
<td valign="top" align="center">0.893</td>
<td valign="top" align="center">0.819</td>
<td valign="top" align="center">0.822</td>
<td valign="top" align="center">0.839</td>
<td valign="top" align="center">0.826</td>
</tr>
<tr>
<td valign="top" align="left">+SpatialAtt</td>
<td valign="top" align="center">0.927</td>
<td valign="top" align="center">0.914</td>
<td valign="top" align="center">0.929</td>
<td valign="top" align="center">0.932</td>
<td valign="top" align="center">0.869</td>
<td valign="top" align="center">0.899</td>
<td valign="top" align="center">0.903</td>
<td valign="top" align="center">0.909</td>
<td valign="top" align="center">0.832</td>
<td valign="top" align="center">0.813</td>
<td valign="top" align="center">0.852</td>
<td valign="top" align="center">0.849</td>
</tr>
<tr>
<td valign="top" align="left">+SpatialAtt+MS</td>
<td valign="top" align="center"><bold>0.947</bold></td>
<td valign="top" align="center">0.919</td>
<td valign="top" align="center"><bold>0.947</bold></td>
<td valign="top" align="center"><bold>0.944</bold></td>
<td valign="top" align="center">0.883</td>
<td valign="top" align="center">0.909</td>
<td valign="top" align="center">0.916</td>
<td valign="top" align="center"><bold>0.926</bold></td>
<td valign="top" align="center">0.852</td>
<td valign="top" align="center">0.856</td>
<td valign="top" align="center"><bold>0.879</bold></td>
<td valign="top" align="center"><bold>0.873</bold></td>
</tr>
<tr>
<td valign="top" align="left">+SelfAtt+MS</td>
<td valign="top" align="center"><bold>0.947</bold></td>
<td valign="top" align="center"><bold>0.924</bold></td>
<td valign="top" align="center">0.942</td>
<td valign="top" align="center">0.939</td>
<td valign="top" align="center"><bold>0.889</bold></td>
<td valign="top" align="center"><bold>0.913</bold></td>
<td valign="top" align="center"><bold>0.923</bold></td>
<td valign="top" align="center">0.923</td>
<td valign="top" align="center"><bold>0.862</bold></td>
<td valign="top" align="center"><bold>0.869</bold></td>
<td valign="top" align="center">0.873</td>
<td valign="top" align="center">0.869</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn><p>The bold values in the table represent the highest values in the respective columns.</p></fn>
</table-wrap-foot>
</table-wrap>
<table-wrap position="float" id="T4">
<label>TABLE 4</label>
<caption><p>The AUC of different models on various datasets.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<td valign="top" align="left" style="color:#ffffff;background-color: #7f8080;">Model</td>
<td valign="top" align="center" colspan="4" style="color:#ffffff;background-color: #7f8080;">CFP dataset</td>
<td valign="top" align="center" colspan="4" style="color:#ffffff;background-color: #7f8080;">UWF dataset</td>
<td valign="top" align="center" colspan="4" style="color:#ffffff;background-color: #7f8080;">Noise dataset</td>
</tr>
<tr>
<td valign="top" align="left" style="color:#ffffff;background-color: #7f8080;"></td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;">I/C</td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;">Blur</td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;">LC</td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;">Overall</td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;">I/C</td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;">Blur</td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;">LC</td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;">Overall</td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;">I/C</td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;">Blur</td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;">LC</td>
<td valign="top" align="center" style="color:#ffffff;background-color: #7f8080;">Overall</td>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">BL</td>
<td valign="top" align="center">0.957</td>
<td valign="top" align="center">0.959</td>
<td valign="top" align="center">0.956</td>
<td valign="top" align="center">0.972</td>
<td valign="top" align="center">0.909</td>
<td valign="top" align="center">0.936</td>
<td valign="top" align="center">0.891</td>
<td valign="top" align="center">0.962</td>
<td valign="top" align="center">0.862</td>
<td valign="top" align="center">0.879</td>
<td valign="top" align="center">0.874</td>
<td valign="top" align="center">0.884</td>
</tr>
<tr>
<td valign="top" align="left">+MASK</td>
<td valign="top" align="center"><bold>0.979</bold></td>
<td valign="top" align="center"><bold>0.975</bold></td>
<td valign="top" align="center">0.972</td>
<td valign="top" align="center">0.983</td>
<td valign="top" align="center">0.931</td>
<td valign="top" align="center">0.938</td>
<td valign="top" align="center">0.925</td>
<td valign="top" align="center">0.958</td>
<td valign="top" align="center">0.874</td>
<td valign="top" align="center">0.877</td>
<td valign="top" align="center">0.878</td>
<td valign="top" align="center">0.854</td>
</tr>
<tr>
<td valign="top" align="left">+SpatialAtt</td>
<td valign="top" align="center">0.968</td>
<td valign="top" align="center">0.967</td>
<td valign="top" align="center">0.983</td>
<td valign="top" align="center">0.981</td>
<td valign="top" align="center">0.907</td>
<td valign="top" align="center"><bold>0.956</bold></td>
<td valign="top" align="center"><bold>0.956</bold></td>
<td valign="top" align="center">0.972</td>
<td valign="top" align="center">0.888</td>
<td valign="top" align="center">0.899</td>
<td valign="top" align="center">0.89</td>
<td valign="top" align="center">0.922</td>
</tr>
<tr>
<td valign="top" align="left">+SpatialAtt+MS</td>
<td valign="top" align="center">0.976</td>
<td valign="top" align="center">0.969</td>
<td valign="top" align="center"><bold>0.986</bold></td>
<td valign="top" align="center"><bold>0.986</bold></td>
<td valign="top" align="center"><bold>0.923</bold></td>
<td valign="top" align="center">0.954</td>
<td valign="top" align="center">0.948</td>
<td valign="top" align="center"><bold>0.974</bold></td>
<td valign="top" align="center">0.891</td>
<td valign="top" align="center"><bold>0.915</bold></td>
<td valign="top" align="center"><bold>0.928</bold></td>
<td valign="top" align="center"><bold>0.931</bold></td>
</tr>
<tr>
<td valign="top" align="left">+SelfAtt+MS</td>
<td valign="top" align="center">0.977</td>
<td valign="top" align="center">0.972</td>
<td valign="top" align="center">0.972</td>
<td valign="top" align="center">0.969</td>
<td valign="top" align="center">0.906</td>
<td valign="top" align="center">0.936</td>
<td valign="top" align="center">0.952</td>
<td valign="top" align="center">0.944</td>
<td valign="top" align="center"><bold>0.905</bold></td>
<td valign="top" align="center">0.894</td>
<td valign="top" align="center">0.88</td>
<td valign="top" align="center">0.917</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn><p>The bold values in the table represent the highest values in the respective columns.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>Visualization of the prediction is interpreted by heat map, as shown in <xref ref-type="fig" rid="F5">Figure 5</xref>. For high-quality images, the activated area is even and covers the whole image. When an image is suspected of poor quality, such as an area of uneven illumination, the model will not activate the designated area.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption><p>Heat map of the proposed model. <bold>(A)</bold> is a high-quality fundus image; the activated area is even and covers the whole image. <bold>(B)</bold> is a fundus image that contains a small area of uneven illumination, and therefore the top of the image is not activated. <bold>(C)</bold> contains a large area of strong light around the optic disk as well as the top of the image, and the rest area is properly activated.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-11-1418048-g005.tif"/>
</fig>
</sec>
<sec id="S3.SS3">
<title>Ultra-wide field fundus image dataset</title>
<p>In the UWF dataset, images with good quality accounted for 66.4%. Blurring was less common in UWF images, and the overall contrast was acceptable. The UWF dataset was not exploited for training, and we tested it with the proposed model as an external dataset. Performance on the BL was moderate, and compared with the BL, the following models all achieved better results. BL+SelfAtt+MS attained accuracies of 0.889, 0.913, and 0.923 for I/C, Blur, and LC, respectively.</p>
<p>The ROC curves for UWF images exhibited similar performance. BL+SpatialAtt+MS attained an AUC of 0.923 for I/C. Nevertheless, the AUCs for Blur and LC reached their maximums (both 0.956) in the BL+SpatialAtt model.</p>
<p><xref ref-type="table" rid="T5">Table 5</xref> provides a clear overview of the key technical terms and concepts used in the study, making it easier for readers from diverse backgrounds to understand the key aspects of the research.</p>
<table-wrap position="float" id="T5">
<label>TABLE 5</label>
<caption><p>Appendix explains key technical terms and concepts.</p></caption>
<table cellspacing="5" cellpadding="5" frame="box" rules="all">
<thead>
<tr>
<td valign="top" align="left" style="color:#ffffff;background-color: #7f8080;">Terms and Concepts</td>
<td valign="top" align="left" style="color:#ffffff;background-color: #7f8080;">Simple Explanation</td>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Image Quality Assessment (IQA)</td>
<td valign="top" align="left">Evaluating how clear and useful an image is for medical purposes.</td>
</tr>
<tr>
<td valign="top" align="left">LGAANet</td>
<td valign="top" align="left">A smart system assessing eye images by analyzing both local details and the overall picture.</td>
</tr>
<tr>
<td valign="top" align="left">Multi-Source Heterogeneous Fundus (MSHF) Database</td>
<td valign="top" align="left">Collection of eye images from various sources and cameras.</td>
</tr>
<tr>
<td valign="top" align="left">Color Fundus Photography (CFP)</td>
<td valign="top" align="left">Standard color images of the retina.</td>
</tr>
<tr>
<td valign="top" align="left">Ultrawide-Field (UWF) Imaging</td>
<td valign="top" align="left">Wide-angle images capturing a broad view of the retina.</td>
</tr>
<tr>
<td valign="top" align="left">Attention Mechanisms</td>
<td valign="top" align="left">Focuses on significant parts of the image for analysis.</td>
</tr>
<tr>
<td valign="top" align="left">Saliency Maps</td>
<td valign="top" align="left">Highlights important image regions for decision-making in the neural network.</td>
</tr>
<tr>
<td valign="top" align="left">Multi-Level Spatially Pooled (MLSP)</td>
<td valign="top" align="left">Combines information from multiple levels of image analysis.</td>
</tr>
<tr>
<td valign="top" align="left">Global Average Pooling (GAP)</td>
<td valign="top" align="left">Computes the average of all feature maps in a neural network layer.</td>
</tr>
<tr>
<td valign="top" align="left">Spatial-Information-Retained (SIR)</td>
<td valign="top" align="left">Method preserving spatial details during image processing.</td>
</tr>
<tr>
<td valign="top" align="left">Receiver Operating Characteristic (ROC)</td>
<td valign="top" align="left">Graphical representation of a classifier&#x2019;s performance.</td>
</tr>
<tr>
<td valign="top" align="left">Human Visual System (HVS)</td>
<td valign="top" align="left">System responsible for processing visual information in humans.</td>
</tr>
<tr>
<td valign="top" align="left">Areas Under the ROC Curves (AUCs)</td>
<td valign="top" align="left">Measure of the overall performance of a classifier.</td>
</tr>
</tbody>
</table></table-wrap>
</sec>
</sec>
<sec id="S4" sec-type="discussion">
<title>Discussion</title>
<p>In the realm of IQA, much of the existing literature has concentrated on singular modalities, predominantly CFP. The incorporation of alternative imaging modalities, such as portable fundus photography and UWF fundus imaging, which may be preferable in certain clinical scenarios, has been relatively overlooked. Wang et al. represented a notable exception, employing both portable fundus camera images and public CFP datasets, demonstrating the machine learning model&#x2019;s robust performance across these modalities (<xref ref-type="bibr" rid="B21">21</xref>).</p>
<p>To date, our review indicates a scarcity of studies employing UWF images for fundus IQA, particularly studies that integrate CFP, portable fundus photography, and UWF imaging. Given that each imaging method addresses specific clinical requirements, developing an IQA system capable of accommodating this diversity is crucial. Furthermore, the challenge of &#x2018;domain variance&#x2019; has been partially addressed in the prior research, which involved collecting images from both the source and target domains to train the network (<xref ref-type="bibr" rid="B22">22</xref>). Therefore, to fill these gaps, we compiled a multi-source heterogeneous fundus (MSHF) dataset, designed to meet the varied demands of clinical practice and mitigate the issue of domain variability.</p>
<p>Our Local and Global Attention Aggregated Deep Neural Network (LGAANet) was initially trained on images from portable and tabletop cameras, yet it demonstrated commendable adaptability and effectiveness when applied to UWF images. This underscores our model&#x2019;s potential and versatility across different clinical settings. Previous contributions have introduced several notable networks, focusing on segmentation or generic evaluation, leveraging both conventional machine learning techniques and advanced deep learning methodologies. Our LGAANet, aimed at enhancing algorithmic performance and accommodating multi-source heterogeneous data, integrates both local and global information, resulting in incremental improvements in accuracy and AUC with each enhancement.</p>
<p>The advent of AI in clinical practice has underscored the importance of medical imaging quality assessment. Li et al. introduced DeepQuality, a deep learning-based system for assessing and enhancing the quality of infantile fundus images to mitigate misdiagnosis risks in infant retinopathy screening, demonstrating significant improvements in diagnostic models&#x2019; performance through analysis of over two million real-world images (<xref ref-type="bibr" rid="B23">23</xref>). This study introduces the innovative LGAANet for evaluating the quality of fundus images. Our MSHF dataset encompasses three primary types of retinal images: those captured by portable cameras, CFP images, and UWF images. These images were annotated by clinical ophthalmologists based on three distinct HVS characteristics and overall quality. The diversity of our dataset is visually represented through a spatial scatter plot. Developed on the sophisticated multi-level feature extractor SIR and incorporating an attention mechanism, the LGAANet was trained with images from portable cameras and CFP images. To evaluate the model&#x2019;s robustness, we also tested it with UWF images and noisy data, analyzing overall accuracy and generating ROC curves to calculate the AUC for each set. Additionally, we propose the use of a saliency map as a post hoc interpretability tool. This model paves the way for further exploration into AI-driven diagnostics, especially in the field of ophthalmology.</p>
<p>While the LGAANet has demonstrated significant advancements in fundus IQA, there are notable limitations that must be addressed in future research. One such limitation is the current model&#x2019;s inability to enhance poor-quality images. Although LGAANet excels at assessing image quality, it does not yet possess the capability to improve subpar images to meet diagnostic standards. Future work should focus on developing algorithms that can transform low-quality images into high-quality ones, thereby increasing their diagnostic utility. Additionally, the reliance on a manually annotated dataset for model training and validation could introduce biases; thus, expanding the dataset and incorporating more diverse imaging conditions will be crucial for further validation. Finally, the generalizability of LGAANet to other imaging modalities and diseases outside of diabetic retinopathy and glaucoma remains to be explored. Addressing these limitations will be essential to fully realize the potential of LGAANet in clinical applications and to enhance the robustness and versatility of computer-aided diagnostic systems in ophthalmology.</p>
</sec>
<sec id="S5" sec-type="data-availability">
<title>Data availability statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found in the article/supplementary material.</p>
</sec>
<sec id="S6" sec-type="ethics-statement">
<title>Ethics statement</title>
<p>Ethical review and approval was not required for the study on human participants in accordance with the local legislation and institutional requirements. Written informed consent from the patients/participants was not required to participate in this study in accordance with the national legislation and the institutional requirements.</p>
</sec>
<sec id="S7" sec-type="author-contributions">
<title>Author contributions</title>
<p>SW: Conceptualization, Formal analysis, Investigation, Validation, Writing&#x2013;original draft. WS: Formal analysis, Validation, Writing&#x2013;original draft, Methodology. ZG: Formal analysis, Methodology, Validation, Data curation, Writing&#x2013;original draft. XJ: Methodology, Writing&#x2013;review and editing, Visualization. YW: Formal analysis, Validation, Writing&#x2013;review and editing, Resources. YL: Writing&#x2013;review and editing, Validation, Visualization. XM: Formal analysis, Methodology, Software, Validation, Visualization, Writing&#x2013;review and editing. WW: Formal analysis, Methodology, Validation, Writing&#x2013;review and editing. SX: Formal analysis, Methodology, Validation, Writing&#x2013;review and editing. WR: Formal analysis, Methodology, Validation, Writing&#x2013;review and editing. KJ: Conceptualization, Formal analysis, Resources, Supervision, Writing&#x2013;review and editing. JY: Conceptualization, Funding acquisition, Resources, Writing&#x2013;review and editing.</p>
</sec>
</body>
<back>
<sec id="S8" sec-type="funding-information">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of the article. This research received financial support from the Natural Science Foundation of China (Grant number 82201195), Ningbo Clinical Research Center for Ophthalmology (2022L003), Ningbo Key Laboratory for neuroretinopathy medical research, Ningbo Clinical Research Center for Ophthalmology and the Project of NINGBO Leading Medical and Health Discipline (2016-S05), Technology Innovation 2025 Major Project of Ningbo (2021Z054), The project of Ningbo Medical Science and Technology (2018A27).</p>
</sec>
<sec id="S9" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="S10" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1"><label>1.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Poplin</surname> <given-names>R</given-names></name> <name><surname>Varadarajan</surname> <given-names>A</given-names></name> <name><surname>Blumer</surname> <given-names>K</given-names></name> <name><surname>Liu</surname> <given-names>Y</given-names></name> <name><surname>McConnell</surname> <given-names>M</given-names></name> <name><surname>Corrado</surname> <given-names>G</given-names></name><etal/></person-group> <article-title>Prediction of cardiovascular risk factors from retinal fundus photographs via deep learning.</article-title> <source><italic>Nat Biomed Eng.</italic></source> (<year>2018</year>) <volume>2</volume>:<fpage>158</fpage>&#x2013;<lpage>64</lpage>.</citation></ref>
<ref id="B2"><label>2.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ting</surname> <given-names>D</given-names></name> <name><surname>Pasquale</surname> <given-names>L</given-names></name> <name><surname>Peng</surname> <given-names>L</given-names></name> <name><surname>Campbell</surname> <given-names>J</given-names></name> <name><surname>Lee</surname> <given-names>A</given-names></name> <name><surname>Raman</surname> <given-names>R</given-names></name><etal/></person-group> <article-title>Artificial intelligence and deep learning in ophthalmology.</article-title> <source><italic>Br J Ophthalmol.</italic></source> (<year>2019</year>) <volume>103</volume>:<fpage>167</fpage>&#x2013;<lpage>75</lpage>.</citation></ref>
<ref id="B3"><label>3.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ting</surname> <given-names>D</given-names></name> <name><surname>Cheung</surname> <given-names>C</given-names></name> <name><surname>Lim</surname> <given-names>G</given-names></name> <name><surname>Tan</surname> <given-names>G</given-names></name> <name><surname>Quang</surname> <given-names>N</given-names></name> <name><surname>Gan</surname> <given-names>A</given-names></name><etal/></person-group> <article-title>Development and Validation of a Deep Learning System for Diabetic Retinopathy and Related Eye Diseases Using Retinal Images From Multiethnic Populations With Diabetes.</article-title> <source><italic>JAMA.</italic></source> (<year>2017</year>) <volume>318</volume>:<issue>22</issue>. <pub-id pub-id-type="doi">10.1001/jama.2017.18152</pub-id> <pub-id pub-id-type="pmid">29234807</pub-id></citation></ref>
<ref id="B4"><label>4.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gulshan</surname> <given-names>V</given-names></name> <name><surname>Rajan</surname> <given-names>R</given-names></name> <name><surname>Widner</surname> <given-names>K</given-names></name> <name><surname>Wu</surname> <given-names>D</given-names></name> <name><surname>Wubbels</surname> <given-names>P</given-names></name> <name><surname>Rhodes</surname> <given-names>T</given-names></name><etal/></person-group> <article-title>Performance of a Deep-Learning Algorithm vs Manual Grading for Detecting Diabetic Retinopathy in India.</article-title> <source><italic>JAMA Ophthalmology.</italic></source> (<year>2019</year>) <volume>137</volume>:<issue>9</issue>. <pub-id pub-id-type="doi">10.1001/jamaophthalmol.2019.2004</pub-id> <pub-id pub-id-type="pmid">31194246</pub-id></citation></ref>
<ref id="B5"><label>5.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sayres</surname> <given-names>R</given-names></name> <name><surname>Taly</surname> <given-names>A</given-names></name> <name><surname>Rahimy</surname> <given-names>E</given-names></name> <name><surname>Blumer</surname> <given-names>K</given-names></name> <name><surname>Coz</surname> <given-names>D</given-names></name> <name><surname>Hammel</surname> <given-names>N</given-names></name><etal/></person-group> <article-title>Using a Deep Learning Algorithm and Integrated Gradients Explanation to Assist Grading for Diabetic Retinopathy.</article-title> <source><italic>Ophthalmology.</italic></source> (<year>2019</year>) <volume>126</volume>:<fpage>552</fpage>&#x2013;<lpage>64</lpage>. <pub-id pub-id-type="doi">10.1016/j.ophtha.2018.11.016</pub-id> <pub-id pub-id-type="pmid">30553900</pub-id></citation></ref>
<ref id="B6"><label>6.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Raj</surname> <given-names>A</given-names></name> <name><surname>Tiwari</surname> <given-names>A</given-names></name> <name><surname>Martini</surname> <given-names>M</given-names></name></person-group>. <article-title>Fundus image quality assessment: survey, challenges, and future scope.</article-title> <source><italic>IET Image Processing.</italic></source> (<year>2019</year>) <volume>13</volume>:<fpage>1211</fpage>&#x2013;<lpage>24</lpage>.</citation></ref>
<ref id="B7"><label>7.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Talebi</surname> <given-names>H</given-names></name> <name><surname>Milanfar</surname> <given-names>P.</given-names></name></person-group> <source><italic>NIMA: Neural Image Assessment.</italic></source> <publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>IEEE</publisher-name> (<year>2018</year>).</citation></ref>
<ref id="B8"><label>8.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>X</given-names></name> <name><surname>Weijer</surname> <given-names>J</given-names></name> <name><surname>Bagdanov</surname> <given-names>A</given-names></name></person-group> <role>editors</role>. <article-title>RankIQA: Learning from Rankings for No-Reference Image Quality Assessment.</article-title> <source><italic>Proceedings of the 2017 IEEE International Conference on Computer Vision (ICCV).</italic></source> <publisher-loc>Piscataway, NJ</publisher-loc>: (<year>2017</year>). <pub-id pub-id-type="doi">10.1109/TIP.2021.3084750</pub-id> <pub-id pub-id-type="pmid">34086572</pub-id></citation></ref>
<ref id="B9"><label>9.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bosse</surname> <given-names>S</given-names></name> <name><surname>Maniry</surname> <given-names>D</given-names></name> <name><surname>Muller</surname> <given-names>K</given-names></name> <name><surname>Wiegand</surname> <given-names>T</given-names></name> <name><surname>Samek</surname> <given-names>W</given-names></name></person-group>. <article-title>Deep Neural Networks for No-Reference and Full-Reference Image Quality Assessment.</article-title> <source><italic>IEEE Trans Image Process.</italic></source> (<year>2018</year>) <volume>27</volume>:<fpage>206</fpage>&#x2013;<lpage>19</lpage>.</citation></ref>
<ref id="B10"><label>10.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ma</surname> <given-names>K</given-names></name> <name><surname>Liu</surname> <given-names>W</given-names></name> <name><surname>Zhang</surname> <given-names>K</given-names></name> <name><surname>Duanmu</surname> <given-names>Z</given-names></name> <name><surname>Wang</surname> <given-names>Z</given-names></name> <name><surname>Zuo</surname> <given-names>W</given-names></name></person-group>. <article-title>End-to-End Blind Image Quality Assessment Using Deep Neural Networks.</article-title> <source><italic>IEEE Trans Image Process.</italic></source> (<year>2018</year>) <volume>27</volume>:<fpage>1202</fpage>&#x2013;<lpage>13</lpage>.</citation></ref>
<ref id="B11"><label>11.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>L</given-names></name> <name><surname>Wu</surname> <given-names>X</given-names></name> <name><surname>Lin</surname> <given-names>D</given-names></name> <name><surname>Zhao</surname> <given-names>L</given-names></name> <name><surname>Li</surname> <given-names>M</given-names></name> <name><surname>Yun</surname> <given-names>D</given-names></name><etal/></person-group> <article-title>DeepFundus: A flow-cytometry-like image quality classifier for boosting the whole life cycle of medical artificial intelligence.</article-title> <source><italic>Cell Reports Medicine.</italic></source> (<year>2023</year>) <volume>4</volume>:<issue>100912</issue>. <pub-id pub-id-type="doi">10.1016/j.xcrm.2022.100912</pub-id> <pub-id pub-id-type="pmid">36669488</pub-id></citation></ref>
<ref id="B12"><label>12.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jin</surname> <given-names>K</given-names></name> <name><surname>Gao</surname> <given-names>Z</given-names></name> <name><surname>Jiang</surname> <given-names>X</given-names></name> <name><surname>Wang</surname> <given-names>Y</given-names></name> <name><surname>Ma</surname> <given-names>X</given-names></name> <name><surname>Li</surname> <given-names>Y</given-names></name><etal/></person-group> <article-title>MSHF: A Multi-Source Heterogeneous Fundus (MSHF) Dataset for Image Quality Assessment.</article-title> <source><italic>Scientific data.</italic></source> (<year>2023</year>) <volume>10</volume>:<issue>286</issue>. <pub-id pub-id-type="doi">10.1038/s41597-023-02188-x</pub-id> <pub-id pub-id-type="pmid">37198230</pub-id></citation></ref>
<ref id="B13"><label>13.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lin</surname> <given-names>K</given-names></name> <name><surname>Wang</surname> <given-names>G</given-names></name></person-group>. <article-title>Hallucinated-IQA: No-Reference Image Quality Assessment via Adversarial Learning.</article-title> <source><italic>Proceedings of the 2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition.</italic></source> <publisher-loc>Piscataway, NJ</publisher-loc>: (<year>2018</year>). p. <fpage>732</fpage>&#x2013;<lpage>41</lpage>.</citation></ref>
<ref id="B14"><label>14.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>D</given-names></name> <name><surname>Jiang</surname> <given-names>T</given-names></name> <name><surname>Lin</surname> <given-names>W</given-names></name> <name><surname>Jiang</surname> <given-names>M</given-names></name></person-group>. <article-title>Which Has Better Visual Quality: The Clear Blue Sky or a Blurry Animal?</article-title> <source><italic>IEEE Trans on Multimedia.</italic></source> (<year>2019</year>) <volume>21</volume>:<fpage>1221</fpage>&#x2013;<lpage>34</lpage>.</citation></ref>
<ref id="B15"><label>15.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Su</surname> <given-names>S</given-names></name> <name><surname>Yan</surname> <given-names>Q</given-names></name> <name><surname>Zhu</surname> <given-names>Y</given-names></name> <name><surname>Zhang</surname> <given-names>C</given-names></name> <name><surname>Ge</surname> <given-names>X</given-names></name> <name><surname>Sun</surname> <given-names>J</given-names></name><etal/></person-group> <article-title>Blindly Assess Image Quality in the Wild Guided by A Self-Adaptive Hyper Network.</article-title> <source><italic>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition.</italic></source> <publisher-loc>Piscataway, NJ</publisher-loc>: <year>(2020)</year>.</citation></ref>
<ref id="B16"><label>16.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>Y</given-names></name> <name><surname>Li</surname> <given-names>H</given-names></name> <name><surname>Du</surname> <given-names>J</given-names></name> <name><surname>Qin</surname> <given-names>J</given-names></name> <name><surname>Wang</surname> <given-names>T</given-names></name> <name><surname>Chen</surname> <given-names>Y</given-names></name><etal/></person-group> <article-title>3D Multi-Attention Guided Multi-Task Learning Network for Automatic Gastric Tumor Segmentation and Lymph Node Classification.</article-title> <source><italic>IEEE Trans Med Imaging.</italic></source> (<year>2021</year>) <volume>40</volume>:<fpage>1618</fpage>&#x2013;<lpage>31</lpage>. <pub-id pub-id-type="doi">10.1109/TMI.2021.3062902</pub-id> <pub-id pub-id-type="pmid">33646948</pub-id></citation></ref>
<ref id="B17"><label>17.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>Q</given-names></name> <name><surname>Keenan</surname> <given-names>T</given-names></name> <name><surname>Allot</surname> <given-names>A</given-names></name> <name><surname>Peng</surname> <given-names>Y</given-names></name> <name><surname>Agron</surname> <given-names>E</given-names></name> <name><surname>Domalpally</surname> <given-names>A</given-names></name><etal/></person-group> <article-title>Multimodal, multitask, multiattention (M3) deep learning detection of reticular pseudodrusen: Toward automated and accessible classification of age-related macular degeneration.</article-title> <source><italic>J Am Med Inform Assoc.</italic></source> (<year>2021</year>) <volume>28</volume>:<fpage>1135</fpage>&#x2013;<lpage>1118</lpage>. <pub-id pub-id-type="doi">10.1093/jamia/ocaa302</pub-id> <pub-id pub-id-type="pmid">33792724</pub-id></citation></ref>
<ref id="B18"><label>18.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>You</surname> <given-names>J</given-names></name> <name><surname>Korhonen</surname> <given-names>J</given-names></name></person-group>. <article-title>Transformer for Image Quality Assessment.</article-title> <source><italic>IEEE International Conference on Image Processing.</italic></source> <publisher-loc>Piscataway, NJ</publisher-loc>: (<year>2020</year>).</citation></ref>
<ref id="B19"><label>19.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>Q</given-names></name> <name><surname>Zhang</surname> <given-names>W</given-names></name> <name><surname>Zhou</surname> <given-names>N</given-names></name> <name><surname>Lei</surname> <given-names>P</given-names></name> <name><surname>Xu</surname> <given-names>Y</given-names></name> <name><surname>Zheng</surname> <given-names>Y</given-names></name><etal/></person-group> <article-title>Adaptive Fractional Dilated Convolution Network for Image Aesthetics Assessment.</article-title> <source><italic>2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR).</italic></source> <publisher-loc>Piscataway, NJ</publisher-loc>: (<year>2020</year>).</citation></ref>
<ref id="B20"><label>20.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>C</given-names></name> <name><surname>Gong</surname> <given-names>D</given-names></name> <name><surname>Wang</surname> <given-names>H</given-names></name> <name><surname>Li</surname> <given-names>Z</given-names></name> <name><surname>Wong</surname> <given-names>K</given-names></name></person-group>. <article-title>Learning Spatial Attention for Face Super-Resolution.</article-title> <source><italic>IEEE Trans Image Process.</italic></source> (<year>2021</year>) <volume>30</volume>:<fpage>1219</fpage>&#x2013;<lpage>31</lpage>.</citation></ref>
<ref id="B21"><label>21.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>S</given-names></name> <name><surname>Jin</surname> <given-names>K</given-names></name> <name><surname>Lu</surname> <given-names>H</given-names></name> <name><surname>Cheng</surname> <given-names>C</given-names></name> <name><surname>Ye</surname> <given-names>J</given-names></name> <name><surname>Qian</surname> <given-names>D</given-names></name></person-group>. <article-title>Human Visual System-Based Fundus Image Quality Assessment of Portable Fundus Camera Photographs.</article-title> <source><italic>IEEE Trans Med Imaging.</italic></source> (<year>2016</year>) <volume>35</volume>:<fpage>1046</fpage>&#x2013;<lpage>55</lpage>. <pub-id pub-id-type="doi">10.1109/TMI.2015.2506902</pub-id> <pub-id pub-id-type="pmid">26672033</pub-id></citation></ref>
<ref id="B22"><label>22.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shen</surname> <given-names>Y</given-names></name> <name><surname>Sheng</surname> <given-names>B</given-names></name> <name><surname>Fang</surname> <given-names>R</given-names></name> <name><surname>Li</surname> <given-names>H</given-names></name> <name><surname>Dai</surname> <given-names>L</given-names></name> <name><surname>Stolte</surname> <given-names>S</given-names></name><etal/></person-group> <article-title>Domain-invariant interpretable fundus image quality assessment.</article-title> <source><italic>Medical image analysis.</italic></source> (<year>2020</year>) <volume>61</volume>:<issue>101654</issue>. <pub-id pub-id-type="doi">10.1016/j.media.2020.101654</pub-id> <pub-id pub-id-type="pmid">32066065</pub-id></citation></ref>
<ref id="B23"><label>23.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>L</given-names></name> <name><surname>Lin</surname> <given-names>D</given-names></name> <name><surname>Lin</surname> <given-names>Z</given-names></name> <name><surname>Li</surname> <given-names>M</given-names></name> <name><surname>Lian</surname> <given-names>Z</given-names></name> <name><surname>Zhao</surname> <given-names>L</given-names></name><etal/></person-group> <article-title>DeepQuality improves infant retinopathy screening.</article-title> <source><italic>NPJ Digit Med.</italic></source> (<year>2023</year>) <volume>6</volume>:<issue>192</issue>. <pub-id pub-id-type="doi">10.1038/s41746-023-00943-3</pub-id> <pub-id pub-id-type="pmid">37845275</pub-id></citation></ref>
</ref-list>
</back>
</article>