<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Astron. Space Sci.</journal-id>
<journal-title>Frontiers in Astronomy and Space Sciences</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Astron. Space Sci.</abbrev-journal-title>
<issn pub-type="epub">2296-987X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1082694</article-id>
<article-id pub-id-type="doi">10.3389/fspas.2023.1082694</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Astronomy and Space Sciences</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>A selective up-sampling method applied upon unbalanced data for flare prediction: potential to improve model performance</article-title>
<alt-title alt-title-type="left-running-head">Liu et&#xa0;al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fspas.2023.1082694">10.3389/fspas.2023.1082694</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Liu</surname>
<given-names>Siwei</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1997174/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Wang</surname>
<given-names>Jingjing</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1893438/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Ming</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2073711/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Cui</surname>
<given-names>Yanmei</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2236155/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Guo</surname>
<given-names>Juan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Shi</surname>
<given-names>Yurong</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Luo</surname>
<given-names>Bingxian</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Liu</surname>
<given-names>Siqing</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>State Key Laboratory of Space Weather</institution>, <institution>National Space Science Center</institution>, <institution>Chinese Academy of Sciences</institution>, <addr-line>Beijing</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Key Laboratory of Science and Technology on Environmental Space Situation Awareness</institution>, <institution>Chinese Academy of Sciences</institution>, <addr-line>Beijing</addr-line>, <country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>University of Chinese Academy of Sciences</institution>, <addr-line>Beijing</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/873548/overview">Ankush Bhaskar</ext-link>, Vikram Sarabhai Space Centre, India</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2254939/overview">Vishal Upendran</ext-link>, Lockheed Martin Solar and Astrophysics Laboratory (LMSAL), United States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1621264/overview">Xiantong Wang</ext-link>, University of Michigan, United States</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Jingjing Wang, <email>wangjingjing@nssc.ac.cn</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>08</day>
<month>06</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>10</volume>
<elocation-id>1082694</elocation-id>
<history>
<date date-type="received">
<day>28</day>
<month>10</month>
<year>2022</year>
</date>
<date date-type="accepted">
<day>24</day>
<month>05</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2023 Liu, Wang, Li, Cui, Guo, Shi, Luo and Liu.</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Liu, Wang, Li, Cui, Guo, Shi, Luo and Liu</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>The Spaceweather HMI Active Region Patch (SHARP) parameters have been widely used to develop flare prediction models. The relatively small number of strong-flare events leads to an unbalanced dataset, and prediction models can be sensitive to such unbalanced data, which might lead to bias and limited performance. In this study, we adopted the logistic regression algorithm to develop a flare prediction model for the next 48&#xa0;h based on the SHARP parameters. The model was trained with five different inputs. The first input was the original unbalanced dataset; the second and third inputs were obtained by using two widely used sampling methods from the original dataset, while the fourth input was the original dataset but accompanied by a weighted classifier. Based on the distribution properties of strong-flare occurrences related to SHARP parameters, we established a new selective up-sampling method and applied it to the mixed-up region (referred to as the confusing distribution areas consisting of both the strong-flare events and non-strong-flare events) to pick up the flare-related samples and add small random values to them and finally create a large number of flare-related samples that are very close to the ground truth. Thus, we obtained the fifth balanced dataset aiming to 1) promote the forecast capability in the mixed-up region and 2) increase the robustness of the model. We compared the model performance and found that the selective up-sampling method has the potential to improve the model performance in strong-flare prediction with its F1 score reaching 0.5501 &#xb1; 0.1200, which is approximately 22%&#x2013;33% higher than other imbalance mitigation schemes.</p>
</abstract>
<kwd-group>
<kwd>solar flare</kwd>
<kwd>solar active regions</kwd>
<kwd>solar photospheric magnetic parameters</kwd>
<kwd>up-sample</kwd>
<kwd>machine learning</kwd>
</kwd-group>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Space Physics</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Flare prediction plays an important role in space weather forecast. The photospheric magnetic field information of active regions (ARs) is valuable (<xref ref-type="bibr" rid="B28">Yu&#xa0;et&#xa0;al., 2009</xref>), and the data are helpful in accurately predicting solar flares, which can be extended up to less than 10&#xa0;days before the eruption (<xref ref-type="bibr" rid="B2">Alipour&#xa0;et&#xa0;al., 2019</xref>). The solar flare might be accompanied by the coronal mass ejection (CME). Furthermore, CMEs might impact the Earth and affect the geospace such as by triggering geomagnetic storms, causing damage to the electricity transmission system (Quebec Blackout event in 1989 (<xref ref-type="bibr" rid="B6">Boteler, 2019</xref>)) and disabling the space satellite equipment.</p>
<p>Many photospheric magnetic parameters of ARs are highly related to strong-flare occurrences, for example, SHARP (Spaceweather HMI Active Region Patch) parameters, which are available from the data product called Spaceweather HMI Active Region Patches (<xref ref-type="bibr" rid="B5">Bobra&#xa0;et&#xa0;al., 2014</xref>), given by the Helioseismic and Magnetic Imager (HMI) onboard the Solar Dynamics Observatory (SDO).</p>
<p>Solar observations are used in different situations, such as the HMI photospheric line-of-sight magnetic field and multi-wavelength EUV filtergrams (<xref ref-type="bibr" rid="B16">Jarolim&#xa0;et&#xa0;al., 2022</xref>), SHARP parameters (<xref ref-type="bibr" rid="B29">Zhang&#xa0;et&#xa0;al., 2022</xref>), critical scales of parameters under the <italic>&#x3ba;</italic>-scheme (<xref ref-type="bibr" rid="B17">Kusano&#xa0;et&#xa0;al., 2020</xref>), and HMI magnetograms (<xref ref-type="bibr" rid="B4">Bobra and Couvidat, 2015</xref>), for predicting solar flares under different conditions. <xref ref-type="bibr" rid="B10">Dhuri&#xa0;et&#xa0;al. (2019)</xref> showed that the SHARP parameters are the leading contributors to the machine classification, so we decided to use the SHARP parameters in the following experiments for flare prediction as our initial research direction.</p>
<p>In recent years, machine learning algorithms have been applied to solar physics and have made progress in flare prediction, especially in extracting new predictors and developing effective models (<xref ref-type="bibr" rid="B20">Liu&#xa0;et&#xa0;al., 2017</xref>; <xref ref-type="bibr" rid="B26">Wang&#xa0;et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B8">Chen&#xa0;et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B24">Sun&#xa0;et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B25">Wang&#xa0;et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B19">Li&#xa0;et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B21">Nishizuka&#xa0;et&#xa0;al., 2021</xref>; <xref ref-type="bibr" rid="B23">Sun&#xa0;et&#xa0;al., 2021</xref>). <xref ref-type="bibr" rid="B20">Liu&#xa0;et&#xa0;al. (2017)</xref> adopted the random forest method for the multiclass classification of flares. <xref ref-type="bibr" rid="B26">Wang&#xa0;et&#xa0;al. (2020)</xref> adopted the long short-term memory (LSTM) network to learn from the time series of magnetic parameters. <xref ref-type="bibr" rid="B24">Sun&#xa0;et&#xa0;al. (2022)</xref> adopted a stacking ensemble approach to combine the convolutional neural network (CNN) and LSTM. <xref ref-type="bibr" rid="B25">Wang&#xa0;et&#xa0;al. (2022)</xref> extracted the predictor MSE, the mean squared errors between the pictures of the ARs and the corresponding reconstructed pictures derived by an unsupervised auto-encoder network, from the radial magnetic field of SHARPs. <xref ref-type="bibr" rid="B19">Li&#xa0;et&#xa0;al. (2022)</xref> adopted the knowledge-informed CNN/fusion model to develop a classification model to predict the strong flares in the next 48&#xa0;h. Furthermore, <xref ref-type="bibr" rid="B21">Nishizuka&#xa0;et&#xa0;al. (2021)</xref> developed the Deep Flare Net with an operable interface to detect ARs, extract their features, and conduct a prediction of the probability of flares within 24&#xa0;h. 
In addition, <xref ref-type="bibr" rid="B23">Sun&#xa0;et&#xa0;al. (2021)</xref> extended their machine learning method to address the interpretability of its neural network.</p>
<p>It is well known that strong flares are rare events, which leads to an unbalanced dataset consisting of a relatively small number of positive samples (referred to as strong-flare events) and a large number of negative samples (referred to as non-strong-flare events). Prediction models trained on an unbalanced dataset might be sensitive to the bias and ultimately achieve limited performance. Therefore, how to balance positive and negative samples in the dataset for flare prediction is considered one of the difficult and crucial problems to tackle.</p>
<p>There are some widely used methods tackling unbalanced data in machine learning, for example, the Synthetic Minority Over-sampling Technique (SMOTE) up-sampling method (<xref ref-type="bibr" rid="B7">Chawla&#xa0;et&#xa0;al., 2002</xref>), which is used to establish a balanced dataset by increasing the number of minority samples; a random down-sampling method (<xref ref-type="bibr" rid="B15">Japkowicz, 2000</xref>), which is used to randomly remove samples from a majority to create a balanced dataset; and the weighted-class method (<xref ref-type="bibr" rid="B13">Hashemi and Karimi, 2018</xref>), which is used to eliminate the bias of the model that was trained on the unbalanced dataset by enlarging the weight of low-probability categories.</p>
<p>However, the question arises whether those widely used methods can be well applied to our dataset for flare prediction and provide good results. On the one hand, randomly increasing the positive samples might lead to too much noise and make the generated samples away from the ground truth. On the other hand, decreasing the negative samples will lead to a loss of a lot of valid information. If we increase the number of positive samples selectively considering the correlation between the flare occurrence and magnetic parameters of ARs, will it help us obtain a better classification model that is more accurate and more reliable?</p>
<p>In this study, we focused on tackling the unbalanced dataset for flare prediction and developed a selective up-sampling method by picking up more positive samples from the mixed-up region (referred to as the confusing distribution areas consisting of both the positive and negative samples). Then, we conducted a comparative analysis of the influence of the input dataset on model performance.</p>
<p>The remainder of the paper is organized as follows: data preparation is given in <xref ref-type="sec" rid="s2">Section&#xa0;2</xref>. In <xref ref-type="sec" rid="s3">Section&#xa0;3</xref>, we introduce the selective up-sampling method and develop a strong-flare prediction model. Then, we conduct a comparative analysis of the model performance based on different input datasets in <xref ref-type="sec" rid="s4">Section&#xa0;4</xref>. The conclusion and discussion are given in <xref ref-type="sec" rid="s5">Section&#xa0;5</xref>.</p>
</sec>
<sec id="s2">
<title>2 Data preparation</title>
<p>The dataset was obtained from the Helioseismic and Magnetic Imager data product on the Solar Dynamics Observatory, which is called SHARPs.</p>
<p>The SHARP parameters, including 16 photospheric magnetic parameters, such as the total magnetic flux, spatial gradients of the field, characteristics of the vertical current density, current helicity, and a proxy for the integrated free magnetic energy (<xref ref-type="bibr" rid="B5">Bobra&#xa0;et&#xa0;al., 2014</xref>), were calculated per patch and were available on a 12-min&#xa0;cadence. Furthermore, SHARP parameters have been widely used to develop flare prediction models by statistical and machine learning methods (<xref ref-type="bibr" rid="B22">Sinha&#xa0;et&#xa0;al., 2022</xref>) because many previous studies showed that these parameters play an important role in characterizing the properties and complexity of ARs (<xref ref-type="bibr" rid="B18">Leka and Barnes, 2003</xref>; <xref ref-type="bibr" rid="B12">Georgoulis and Rust, 2007</xref>).</p>
<p>Active-region parameters were stored in each SHARP series as keywords (<xref ref-type="bibr" rid="B5">Bobra&#xa0;et&#xa0;al., 2014</xref>), and the data we used in this paper were sampled from the hmi.sharp&#x5f;720s dataset, verified by <xref ref-type="bibr" rid="B14">Huang&#xa0;et&#xa0;al. (2018)</xref>, and published on Alibaba Tianchi. The dataset contained 10 photospheric magnetic parameters, whose correlation coefficient matrix was calculated and is shown in <xref ref-type="fig" rid="F1">Figure&#xa0;1</xref> with a thermal heatmap. The meaning of photospheric magnetic parameters has been described in detail by <xref ref-type="bibr" rid="B5">Bobra&#xa0;et&#xa0;al. (2014)</xref>, and we listed a brief description and formula of six keywords of the parameters that have the greatest correlation with strong flares in each active region of the dataset in <xref ref-type="table" rid="T1">Table&#xa0;1</xref>.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Correlation coefficient matrix heatmap of photospheric magnetic parameters and event categories.</p>
</caption>
<graphic xlink:href="fspas-10-1082694-g001.tif"/>
</fig>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Keywords for six active-region parameters in the SHARP series.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Parameter</th>
<th align="center">Description</th>
<th align="center">Formula</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">TOTUSJH</td>
<td align="center">Total unsigned current helicity</td>
<td align="center">
<inline-formula id="inf1">
<mml:math id="m1">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x221d;</mml:mo>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
<mml:mfenced open="|" close="|">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">J</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">TOTPOT</td>
<td align="center">Total photospheric magnetic free-energy density</td>
<td align="center">
<inline-formula id="inf2">
<mml:math id="m2">
<mml:mi>&#x3c1;</mml:mi>
<mml:mo>&#x221d;</mml:mo>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>O</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mi>d</mml:mi>
<mml:mi mathvariant="bold-italic">A</mml:mi>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">TOTUSJZ</td>
<td align="center">Total unsigned vertical current</td>
<td align="center">
<inline-formula id="inf3">
<mml:math id="m3">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">J</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
<mml:mfenced open="|" close="|">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">J</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mi>d</mml:mi>
<mml:mi mathvariant="bold-italic">A</mml:mi>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">ABSNJZH</td>
<td align="center">Absolute value of the net current helicity</td>
<td align="center">
<inline-formula id="inf4">
<mml:math id="m4">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">H</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x221d;</mml:mo>
<mml:mfenced open="|" close="|">
<mml:mrow>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">J</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">SAVNCPP</td>
<td align="center">Sum of the modulus of the net current per polarity</td>
<td align="center">
<inline-formula id="inf5">
<mml:math id="m5">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">J</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="|" close="|">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2b;</mml:mo>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:msup>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">J</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>d</mml:mi>
<mml:mi mathvariant="bold-italic">A</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:mfenced open="|" close="|">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:msup>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">J</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>d</mml:mi>
<mml:mi mathvariant="bold-italic">A</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">USFLUX</td>
<td align="center">Total unsigned flux</td>
<td align="center">
<inline-formula id="inf6">
<mml:math id="m6">
<mml:mi mathvariant="normal">&#x3a6;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
<mml:mfenced open="|" close="|">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mi>d</mml:mi>
<mml:mi mathvariant="bold-italic">A</mml:mi>
</mml:math>
</inline-formula>
</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The strong-flare events (positive samples) included at least one flare of the M and above class within 48&#xa0;h. The non-strong-flare events (negative samples) included no flares or only flares of the C and below class. In this study, we focused on strong-flare events because they are highly related to the geoeffectiveness.</p>
<p>For the flare forecasting task, <xref ref-type="bibr" rid="B1">Ahmadzadeh&#xa0;et&#xa0;al. (2021)</xref> suggested some rules on normalization, class imbalance, temporal coherence, performance metrics, and comparison of models. In this study, we adopted the data normalization method for data preprocessing, proposed a selective up-sampling method considering class imbalance and temporal coherence for model training, and used an evaluation metric (the F1 score, which should be less biased by class imbalance) for model evaluation.</p>
<p>Furthermore, the rules for selecting data for this dataset are as follows:</p>
<p>(1) The time range is from 16:00 on 4 May 2010 to 16:00 on 26 January 2019.</p>
<p>(2) The time interval for sampling the same event is 96&#xa0;min&#xa0;(the sampling frequency is lower than that of the SDO) in order to guarantee enough variations between the closest AR images.</p>
<p>(3) The location range of SHARPs is within &#xb1;30 heliolongitude degrees from the solar disk center to reduce the influence of projection. <xref ref-type="bibr" rid="B9">Cui&#xa0;et&#xa0;al. (2007)</xref> evaluated this issue of the influence of the AR projection effect on the solar flare productivity and found that the projection effect can be ignored for ARs located within &#xb1;30&#xb0; from the solar disk center.</p>
<p>First, we need to divide the training set and testing set in a scientific way. The rules are as follows:</p>
<p>(1) The ratio of positive to negative samples should be similar in both training and testing sets. This has two purposes. First, to avoid the situation where the number of positive examples in the testing set is too small or even zero (very likely to occur if the datasets are divided randomly). Second, as the proportion of positive cases in the testing set is close to that in reality, the testing results can reflect the real performance of the model when facing the actual situation.</p>
<p>(2) The data in the training and testing set cannot be from the same event, which is to ensure that the testing and training sets are independent of each other, and active regions with multiple flares cannot appear in both training and testing sets simultaneously (<xref ref-type="bibr" rid="B10">Dhuri&#xa0;et&#xa0;al., 2019</xref>).</p>
<p>In this study, we used only the photospheric magnetic parameter data that contain 73,810 samples (from 2,542 ARs) of photospheric magnetic parameters. After the statistics, we found that there were only 2,988 positive samples from 155 strong-flare ARs (the climate probability &#x2248; only 6%) and the remaining 70,822 samples from 2,387 non-strong-flare ARs were all negative, which makes it extremely difficult for machine learning methods to predict flares only through these original and imbalanced data because the classifier can easily learn the information of the majority of non-strong-flare events, but there are not enough strong-flare events to learn from.</p>
<p>In each round of experiments, we divided the dataset into the training set and the testing set randomly in a 9:1 ratio. At the same time, in both training and testing sets, we ensured that the ratio of positive to negative samples is approximately the same. Furthermore, we ensured that the data in the testing set and the training set come from different ARs for data independence.</p>
</sec>
<sec id="s3">
<title>3 Application of processing methods upon unbalanced data for flare prediction</title>
<p>In order to eliminate the negative impact of data imbalance, there are some ways that are widely used in industrial applications, which will also be used as control groups in the following data training in this study:</p>
<p>(1) Random down-sampling method: Random sampling from the majority samples (negative samples, as for flare prediction) to make its number equal to the minority samples (positive samples, as for flare prediction).</p>
<p>(2) SMOTE up-sampling method (<xref ref-type="bibr" rid="B7">Chawla&#xa0;et&#xa0;al., 2002</xref>): We took one sample from the minority samples (positive samples) and named it <italic>x</italic>
<sub>
<italic>i</italic>
</sub>, calculated and sorted it according to the Euclidean distance between this sample and other samples, and then took the first <italic>n</italic> samples (<italic>n</italic> is the sampling multiple number set according to the sample imbalance ratio) as the selected nearest neighbors <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>. Furthermore, for each <inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>, we constructed a new sample <italic>x</italic>
<sub>
<italic>new</italic>
</sub> according to the following formula:<disp-formula id="equ1">
<mml:math id="m9">
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>d</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
</disp-formula>
</p>
<p>(3) Weighted classifier: This did not change the samples but changed the weights corresponding to different categories in the classifier. When there are mixed samples, the classifier will be more inclined to retain more minority samples.</p>
<p>The three photospheric magnetic parameters that have the greatest correlations with the strong-flare occurrence were selected for this study, namely, the total unsigned current helicity (TOTUSJH), the absolute value of net current helicity (ABSNJZH), and the sum of the modulus of the net current per polarity (SAVNCPP).</p>
<p>The frequency densities of the photospheric magnetic parameter in positive and negative samples are shown in <xref ref-type="fig" rid="F2">Figure&#xa0;2A</xref> (taking TOTUSJH as an example), and the number distribution is shown in <xref ref-type="fig" rid="F2">Figure&#xa0;2B</xref>. The statistical characteristics indicate that when the values of these photospheric magnetic parameters are low, positive samples are totally submerged by negative samples; on the contrary, when the parameter values are high, almost all samples are positive. This means that, on the one hand, if new samples are created by up-sampling in the low-value region, a large number of negative samples are likely to be classified as positive samples, thus seriously reducing the accuracy of the model; on the other hand, when the parameter values are large, there are only positive samples, so it is unnecessary to up-sample the positive samples there for the model to learn more. Between these two extremes, however, there is a mixed-up region (the confusing distribution area) consisting of both strong-flare events and non-strong-flare events (as marked in <xref ref-type="fig" rid="F2">Figure&#xa0;2B</xref>). The mixed-up region can be quantitatively described as the region near the strong-flare probability of 50%. Here, the strong-flare probability is the probability of a strong flare occurring within 48&#xa0;h. In the mixed-up regions, the numbers of positive and negative samples are similar, so the characteristics are the most difficult to distinguish. We assumed that it is worthwhile to up-sample the positive examples in these mixed-up regions and proposed the selective up-sampling method.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Distribution of the photospheric magnetic parameter (taking TOTUSJH as an example). The <italic>Y</italic>-axis in <bold>(A)</bold> represents the frequency density, and the <italic>Y</italic>-axis in <bold>(B)</bold> represents the number of samples at different intervals (since the negative samples are far more than the positive samples, the range of <italic>Y</italic> only considers the positive sample to show the details). The green circular box indicates the mixed-up region of the two types of samples.</p>
</caption>
<graphic xlink:href="fspas-10-1082694-g002.tif"/>
</fig>
<sec id="s3-1">
<title>3.1 Probability function of flare occurrences based on SHARP parameters</title>
<p>As we need a convincing method to identify the mixed-up regions for the selective up-sampling method, it is necessary to know the quantitative relationship between strong-flare probability and photospheric magnetic parameters. We can see that the photospheric magnetic parameters in strong-flare events and non-strong-flare events both show the characteristics of a skewed distribution, which can be fitted with the standard form of the log-normal function as follows:<disp-formula id="e1">
<mml:math id="m10">
<mml:mi>f</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>m</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>a</mml:mi>
<mml:msqrt>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
</mml:msqrt>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:msup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="italic">log</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mspace width="0.2em"/>
<mml:mi>a</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>e</mml:mi>
<mml:mtext>&#xa0;for the standard form</mml:mtext>
<mml:mo>.</mml:mo>
</mml:math>
<label>(1)</label>
</disp-formula>
</p>
<p>The aforementioned formula represents the probability density of the photospheric magnetic parameter when its value is <italic>x</italic>, which has different <italic>&#x3c3;</italic> values in positive and negative samples.</p>
<p>Furthermore, when the form of the probability density of photospheric magnetic parameters in positive samples and that in negative cases are known, we can derive the form of the probability function of strong-flare occurrence (<italic>P</italic>
<sub>
<italic>f</italic>
</sub>) based on Bayes&#x2019; theorem as follows (<xref ref-type="bibr" rid="B3">Barnes and Leka, 2008</xref>):</p>
<p>Bayes&#x2019; theorem can be used to estimate the probability of a flare event. When the magnetic parameter is equal to <italic>x</italic>, the probability of a strong flare occurring within 48&#xa0;h is equal to<disp-formula id="e2">
<mml:math id="m11">
<mml:mi>P</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>g</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mspace width="0.3333em"/>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mspace width="0.3333em"/>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(2)</label>
</disp-formula>
</p>
<p>Since P (x&#x7c;strong) and P (x&#x7c;not strong) &#x2192; 0, through L&#x2019;H&#xf4;pital&#x2019;s rule, we can obtain<disp-formula id="e3">
<mml:math id="m12">
<mml:mi>P</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>g</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>f</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mspace width="0.3333em"/>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mspace width="0.3333em"/>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(3)</label>
</disp-formula>
</p>
<p>By replacing <italic>f</italic> with Eq.&#xa0;<xref ref-type="disp-formula" rid="e1">1</xref> and <italic>P</italic>(class) with <italic>N</italic>(class)/<italic>N</italic>(total), we can express the strong-flare probability <italic>P</italic>(strong&#x7c;<italic>x</italic>) as a function <italic>P</italic>
<sub>
<italic>f</italic>
</sub> with x as the input and two sigma parameters as tuning parameters:<disp-formula id="e4">
<mml:math id="m13">
<mml:msub>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>m</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>m</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>m</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(4)</label>
</disp-formula>
</p>
<p>This indicates the probability of strong flares when the photospheric magnetic parameter value is x, where <italic>&#x3c3;</italic>
<sub>1</sub> and <italic>&#x3c3;</italic>
<sub>0</sub>, which represent the standard deviations of ln&#xa0;<italic>x</italic> in the positive and negative samples, respectively, are the parameters we need to fit. Moreover, <italic>N</italic>
<sub>1</sub> and <italic>N</italic>
<sub>0</sub> are the number of positive samples and negative samples, respectively.</p>
<p>We can fit the observed strong-flare probability to the probability function as previously mentioned. Dividing the photospheric magnetic parameter into 16 bins according to Doane&#x2019;s rule (<xref ref-type="bibr" rid="B11">Doane, 1976</xref>), the proportion of positive samples in each interval <italic>p</italic>
<sub>
<italic>i</italic>
</sub> can be used as the probability data to be fitted. By assuming each interval as a separate sample population estimated to be normally distributed, the Wilson confidence intervals (<xref ref-type="bibr" rid="B27">Wilson, 1927</xref>) in each interval can be written as follows (<xref ref-type="fig" rid="F3">Figure&#xa0;3</xref>)<disp-formula id="equ2">
<mml:math id="m14">
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2b;</mml:mo>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2261;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>/</mml:mo>
<mml:mn>2</mml:mn>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xb1;</mml:mo>
<mml:mi>z</mml:mi>
<mml:msqrt>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>/</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>/</mml:mo>
<mml:mn>4</mml:mn>
<mml:msubsup>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>/</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:math>
</disp-formula>
</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Wilson confidence intervals under different photospheric magnetic parameter values. The observation probability and its confidence intervals at different intervals containing different sample numbers are shown.</p>
</caption>
<graphic xlink:href="fspas-10-1082694-g003.tif"/>
</fig>
<p>Here, <italic>n</italic>
<sub>
<italic>i</italic>
</sub> represents the number of samples in the <italic>i</italic>th interval and <italic>z</italic> &#x3d; 1.96 for the 95% confidence level. The advantage of using the Wilson confidence intervals is that they can be computed for any sample size without making specific assumptions about the sample size. In addition, the method can also effectively mitigate the bias problem in binomial distribution parameter estimation, thus improving the accuracy of the confidence interval.</p>
<p>In order to accelerate the convergence speed of fitting, we scaled the input <italic>x</italic>. The values of the two sigma parameters in Eq.&#xa0;<xref ref-type="disp-formula" rid="e4">4</xref> and scaling factor <italic>&#x3b1;</italic> (which means that the input value of the function is <italic>&#x3b1;x</italic>) in fitting graphs of three different photospheric magnetic parameters (<xref ref-type="fig" rid="F4">Figure&#xa0;4</xref>) are as follows: for TOTUSJH (<italic>G</italic>
<sup>2</sup>
<italic>m</italic>
<sup>&#x2212;1</sup>), the best fitting parameters are <italic>&#x3c3;</italic>
<sub>1</sub> &#x3d; 3158.18, <italic>&#x3c3;</italic>
<sub>0</sub> &#x3d; 1.53, and <italic>&#x3b1;</italic> &#x3d; 0.23; for ABSNJZH (<italic>G</italic>
<sup>2</sup>
<italic>m</italic>
<sup>&#x2212;1</sup>), the best fitting parameters are <italic>&#x3c3;</italic>
<sub>1</sub> &#x3d; 33.53, <italic>&#x3c3;</italic>
<sub>0</sub> &#x3d; 1.89, and <italic>&#x3b1;</italic> &#x3d; 0.85; and for SAVNCPP (A), the best fitting parameters are <italic>&#x3c3;</italic>
<sub>1</sub> &#x3d; 44.28, <italic>&#x3c3;</italic>
<sub>0</sub> &#x3d; 1.96, and <italic>&#x3b1;</italic> &#x3d; 2.94 &#xd7; 10<sup>&#x2212;11</sup>.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Fitting curve of the flare occurrence probability and the mixed-up region division. The mixed-up regions are selected near the flare yield of 50%. The green parts are the mixed-up regions divided based on three photospheric magnetic parameters (TOTUSJH, ABSNJZH, and SAVNCPP), and the depth of their colors represents different probability ranges: 50%&#xb1;5%, 50%&#xb1;10%, 50%&#xb1;20%, and 50%&#xb1;30% (from dark to light).</p>
</caption>
<graphic xlink:href="fspas-10-1082694-g004.tif"/>
</fig>
<p>After fitting the functional relationship between the strong-flare probability and the photospheric magnetic parameters, we defined four mixed-up regions according to the fitting curve with &#x201c;<italic>p</italic> &#x3d; 50%&#x201d; as the center line: 50%&#xb1;5%, 50%&#xb1;10%, 50%&#xb1;20%, and 50%&#xb1;30%, as shown in detail in <xref ref-type="fig" rid="F4">Figure&#xa0;4</xref>. Furthermore, the selective up-sampling method applied to low-probability positive samples in these regions can then be realized.</p>
</sec>
<sec id="s3-2">
<title>3.2 Flare prediction model based on sampling methods</title>
<p>The training information including the control groups is as follows:</p>
<p>(1) Raw data group: The original unbalanced data are directly sent to the logistic regression model, which contains 56,869 negative samples, 2,837 positive samples, and 59,706 samples in total.</p>
<p>(2) Random down-sampling group: After the down-sampling, 2,837 negative samples, 2,837 positive samples, and 5,674 samples in total are sent to the logistic regression model.</p>
<p>(3) SMOTE up-sampling group: After the up-sampling, 56,869 negative samples, 56,869 positive samples, and 113,738 samples in total are sent to the logistic regression model.</p>
<p>(4) Weighted classifier: The input data are not changed, but the weight of different categories in the classifier is changed according to the number of samples. In this training set, the ratio of majority samples to minority samples is about 20, so the weight of the minority category (strong flare) should be 20 times that of the majority category (non-strong flare). The input data of the model contain 56,869 negative samples, 2,837 positive samples, and 59,706 samples in total.</p>
<p>(5) Selective up-sampling group: We determine the mixed-up regions through the method introduced in the previous section and expand the positive samples by repeating those in the mixed-up region until the number of positive samples equals that of the negative samples. The input data from the first round of experiments contain 56,869 negative samples, 56,869 positive samples, and 113,738 samples in total.</p>
<p>We obtained the parameter distributions before and after resampling, as shown in <xref ref-type="fig" rid="F5">Figure&#xa0;5</xref> (we take TOTUSJH as an example). The KL divergence value of strong-flare events before and after sampling was calculated as follows: KL (random down-sampling) &#x3d; 0; KL (SMOTE up-sampling) &#x3d; 0.02; and KL (selective up-sampling) &#x3d; 1.70. It can be seen that the selective up-sampling method has the greatest impact on the parameter distribution of strong-flare events.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Parameter distribution before and after resampling (taking TOTUSJH as an example). The four images show the distribution before sampling, after down-sampling, SMOTE up-sampling, and selective up-sampling.</p>
</caption>
<graphic xlink:href="fspas-10-1082694-g005.tif"/>
</fig>
<p>Furthermore, for each newborn sample, we added a random perturbation (&#xb1;5%) to each value separately, as small random differences can improve the performance of the model. The reason for not using the SMOTE method is that the distribution of the newborn samples supplemented by the SMOTE method may be quite different from that of the original minority samples, which will weaken the characteristics of the region where we need to strengthen the recognition, whereas the method in <xref ref-type="sec" rid="s3-2">Section&#xa0;3.2</xref> (5) will not have this negative effect.</p>
<p>After feeding the data to each logistic regression model, the logistic regression algorithm will adjust the model parameters through the gradient descent method to reduce the cross-entropy loss round by round until it reaches a certain threshold (we set it at 0.001), which marks the end of training.</p>
<p>The models in different groups are tested on the same testing set and we give the evaluation results in <xref ref-type="sec" rid="s4">Section&#xa0;4</xref>, while evaluation indicators are introduced in <xref ref-type="sec" rid="s3-3">Section&#xa0;3.3</xref>.</p>
</sec>
<sec id="s3-3">
<title>3.3 Evaluation metrics</title>
<p>For a binary classification task like flare prediction, the confusion matrix is listed in <xref ref-type="table" rid="T2">Table&#xa0;2</xref>. The true positive (TP) is the hit case, where the strong-flare events are correctly classified in the strong-flare category. The false positive (FP) is the false alarm case, where the non-strong-flare events are falsely classified as the strong-flare category. The false negative (FN) is the miss case, where strong-flare events are falsely classified as the non-strong-flare category. The true negative (TN) is the correct negative sample, where the non-strong-flare samples are correctly classified as the non-strong-flare category.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Confusion matrix for binary classification.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">Actual class (observations)</th>
<th colspan="2" align="center">Predicted class (forecasts)</th>
</tr>
<tr>
<th align="center">Positive</th>
<th align="center">Negative</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td rowspan="2" align="center">Positive</td>
<td align="center">TP (true positive) hit case</td>
<td align="center">FN (false negative) miss case</td>
</tr>
<tr>
<td align="left"/>
<td align="left"/>
</tr>
<tr>
<td rowspan="2" align="center">Negative</td>
<td align="center">FP (false positive) false alarm case</td>
<td align="center">TN (true negative) correct negative case</td>
</tr>
<tr>
<td align="left"/>
<td align="left"/>
</tr>
</tbody>
</table>
</table-wrap>
<p>Based on the confusion matrix, we adopted three evaluation metrics: recall, precision, and F1 score, which are computed as follows:<disp-formula id="equ3">
<mml:math id="m15">
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mspace width="5.8em"/>
</mml:math>
</disp-formula>
<disp-formula id="equ4">
<mml:math id="m16">
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mspace width="7.3em"/>
</mml:math>
</disp-formula>
<disp-formula id="equ5">
<mml:math id="m17">
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:math>
</disp-formula>
</p>
<p>Because the samples are unbalanced, and we only focus on the prediction of samples with flares, we do not need to calculate the evaluation results of negative samples and the accuracy of positive samples (in fact, they are all close to 1 because of data imbalance). The scikit-learn package (<ext-link ext-link-type="uri" xlink:href="https://scikit-learn.org">https://scikit-learn.org</ext-link>) is used to calculate the aforementioned metrics.</p>
</sec>
</sec>
<sec id="s4">
<title>4 Comparison analysis of model performance</title>
<p>For such flare data samples, due to the imbalance of data, we chose not to consider the accuracy rate because its results are greatly affected by TN, and as the number of negative samples is large, the effect of training on negative samples (non-strong-flare events) is good, which makes the accuracy of the model close to 1 at all times. Thus, recall and precision are used as the references for comparing model performance.</p>
<p>After 10 rounds of random experiments, the evaluation results of the same logistic regression model sampled in different ways are shown in <xref ref-type="fig" rid="F6">Figure&#xa0;6</xref>. The mean and standard deviation of the results are listed in <xref ref-type="table" rid="T3">Table&#xa0;3</xref>.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Model testing results, where SUS refers to the method of selective up-sampling. The dotted line represents the equivalent curve of the F1 score. The more the curve deviates to the upper right, the greater the value of the F1 score is. We can find that most of the results of the selective up-sampling method on mixed-up regions are better than those of the original data and other sampling methods. (The dark-yellow dotted line represents the best result of the F1 score sampled in the mixed-up regions, while the orange dotted line represents the result of training the raw data without sampling. Furthermore, the color depth of the point represents the width selected for the sampling region near the strong-flare yield of 50%).</p>
</caption>
<graphic xlink:href="fspas-10-1082694-g006.tif"/>
</fig>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Prediction results of the model on the testing dataset (average &#xb1; standard deviation).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Sampling method</th>
<th align="center">F1 score</th>
<th align="center">Recall</th>
<th align="center">Precision</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Raw data</td>
<td align="left">0.4459 &#xb1; 0.0804</td>
<td align="left">0.3072 &#xb1; 0.0770</td>
<td align="left">0.8133 &#xb1; 0.1453</td>
</tr>
<tr>
<td align="left">Random down-sampling</td>
<td align="left">0.4488 &#xb1; 0.0855</td>
<td align="left">0.8216 &#xb1; 0.0711</td>
<td align="left">0.3087 &#xb1; 0.0736</td>
</tr>
<tr>
<td align="left">SMOTE up-sampling</td>
<td align="left">0.4122 &#xb1; 0.0790</td>
<td align="left">0.8504 &#xb1; 0.0604</td>
<td align="left">0.2721 &#xb1; 0.0661</td>
</tr>
<tr>
<td align="left">Weighted classifier</td>
<td align="left">0.4455 &#xb1; 0.0788</td>
<td align="left">0.8187 &#xb1; 0.0792</td>
<td align="left">0.3060 &#xb1; 0.0676</td>
</tr>
<tr>
<td align="left">Selective up-sampling (best)</td>
<td align="left">0.5501 &#xb1; 0.1200</td>
<td align="left">0.6197 &#xb1; 0.1750</td>
<td align="left">0.4945 &#xb1; 0.1185</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>It can be found that although the precision of the models trained on the original data is high, the recall is extremely low (about 0.3). On the contrary, after SMOTE up-sampling, random down-sampling, or training using a weighted classifier, although the recall is improved, the precision is seriously reduced at the same time (from 0.8 to 0.3).</p>
<p>By up-sampling the positive samples related to strong-flare events in the mixed-up region before training, the best F1 score reaches 0.5501 &#xb1; 0.1200, which is approximately 22%&#x2013;33% higher than that of the other methods. From this simple comparison, we can conclude that the performance of the model is significantly improved.</p>
<p>The reason for the improvement is that the categories of events cannot be distinguished by a certain photospheric magnetic parameter in its mixed-up region. Therefore, the classifier may lose a dimension of information in this region. By repeatedly up-sampling minority samples in the mixed-up region, we can strengthen the information here for the classifier. Furthermore, by adding some random disturbances, we not only avoid overfitting but also make the newborn data close to the real situation.</p>
</sec>
<sec id="s5">
<title>5 Conclusion and discussion</title>
<p>In this study, we focused on tackling the unbalanced dataset based on SHARP parameters. After repeatedly up-sampling the minority samples in the mixed-up region and adding some random disturbances, we compared the resulting model performance with that of other methods. The methods used in the comparison are as follows: 1) raw data (no processing); 2) random down-sampling; 3) SMOTE up-sampling; and 4) weighted classifier, which are all described in detail in <xref ref-type="sec" rid="s3">Section&#xa0;3</xref>. Furthermore, the result shows that the forecast capability is improved in the mixed-up region, the robustness of the model is increased, and the selective up-sampling method has potential to improve the model performance in strong-flare prediction as its F1 score reaches 0.5501 &#xb1; 0.1200, which is approximately 22%&#x2013;33% higher than that of the other methods.</p>
<p>The purpose of increasing the number of samples from the mixed-up region of positive strong-flare samples (selective up-sampling) is to better distinguish the previous &#x201c;difficult to predict&#x201d; events. Furthermore, the method of adding samples is based on the original samples plus random values, which is close to the ground truth, while it is likely that the created physical parameter values might deviate significantly from the ground truth if we use SMOTE or other up-sampling methods. The selective up-sampling method we proposed could provide a new suggestion on the preparation of data for the machine learning model in the future, especially when we expand data for unbalanced samples.</p>
<p>This study also has an interdisciplinary character. On the one hand, applying AI methods to modeling is valuable; on the other hand, such methods require manual improvements based on the characteristics of the data rather than being used directly as they are.</p>
<p>Although the proposed sampling method reaches a higher F1 score than the other three sampling methods, its performance remains comparable to that of the other existing methods. The main reason is that in this experiment we developed a flare forecasting model based on only three parameters, adopting a relatively simple algorithm (logistic regression) for classification. In this study, we highlighted the importance of the sampling method in tackling the class-imbalance problem. We found that the selective up-sampling method has potential to improve the flare forecasting performance. Considering that many complex machine learning models can help boost the model performance significantly compared to simple statistical models, in the future, we would like to adopt other models (for example, CNN &#x2b; LSTM) and investigate comparable studies on flare forecasting.</p>
<p>Moreover, as the time resolution of SHARP data is high enough (sampling every 12&#xa0;min), timing information of continuous samples will be used as well, which will increase the amount of information contained in the photospheric magnetic parameters by another dimension, so as to better assist solar-flare forecasting. We are currently working on applying this method to combine continuous magnetograms and photospheric magnetic parameters with the time dimension.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/Supplementary Material; further inquiries can be directed to the corresponding author.</p>
</sec>
<sec id="s7">
<title>Author contributions</title>
<p>SL, JW, YC, ML, JG, YS, BL, and SL meet the authorship criteria and agree to be accountable for the content of the work. All authors contributed to the article and approved the submitted version.</p>
</sec>
<sec id="s8">
<title>Funding</title>
<p>JW was supported by the National Science Foundation of China (Grant No. 42074224), the Youth Innovation Promotion Association, CAS, the Key Research Program of the Chinese Academy of Sciences (Grant No. ZDRE-KT-2021-3), and the Pandeng Program of the National Space Science Center, Chinese Academy of Sciences.</p>
</sec>
<ack>
<p>The authors thank the SDO/HMI team members who contributed to the SDO mission. They also thank the reviewers for providing valuable suggestions.</p>
</ack>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors, and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ahmadzadeh</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Aydin</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Georgoulis</surname>
<given-names>M. K.</given-names>
</name>
<name>
<surname>Kempton</surname>
<given-names>D. J.</given-names>
</name>
<name>
<surname>Mahajan</surname>
<given-names>S. S.</given-names>
</name>
<name>
<surname>Angryk</surname>
<given-names>R. A.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>How to train your flare prediction model: Revisiting robust sampling of rare events</article-title>. <source>Astrophysical J. Suppl. Ser.</source>
<volume>254</volume>, <fpage>23</fpage>. <pub-id pub-id-type="doi">10.3847/1538-4365/abec88</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Alipour</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Mohammadi</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Safari</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Prediction of flares within 10 days before they occur on the Sun</article-title>. <source>Astrophysical J. Suppl. Ser.</source>
<volume>243</volume>, <fpage>20</fpage>. <pub-id pub-id-type="doi">10.3847/1538-4365/ab289b</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Barnes</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Leka</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2008</year>). <article-title>Evaluating the performance of solar flare forecasting methods</article-title>. <source>Astrophysical J.</source>
<volume>688</volume>, <fpage>L107</fpage>&#x2013;<lpage>L110</lpage>. <pub-id pub-id-type="doi">10.1086/595550</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bobra</surname>
<given-names>M. G.</given-names>
</name>
<name>
<surname>Couvidat</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Solar flare prediction using SDO/HMI vector magnetic field data with a machine-learning algorithm</article-title>. <source>Astrophysical J.</source>
<volume>798</volume>, <fpage>135</fpage>. <pub-id pub-id-type="doi">10.1088/0004-637x/798/2/135</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Bobra</surname>
<given-names>M. G.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Hoeksema</surname>
<given-names>J. T.</given-names>
</name>
<name>
<surname>Turmon</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hayashi</surname>
<given-names>K.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). <source>The Helioseismic and Magnetic Imager (HMI) vector magnetic field pipeline: SHARPs - Space-weather HMI Active Region Patches</source>. <pub-id pub-id-type="doi">10.1007/s11207-014-0529-3</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Boteler</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>A 21st century view of the March 1989 magnetic storm</article-title>. <source>Space Weather</source>
<volume>17</volume>, <fpage>1427</fpage>&#x2013;<lpage>1441</lpage>. <pub-id pub-id-type="doi">10.1029/2019SW002278</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chawla</surname>
<given-names>N. V.</given-names>
</name>
<name>
<surname>Bowyer</surname>
<given-names>K. W.</given-names>
</name>
<name>
<surname>Hall</surname>
<given-names>L. O.</given-names>
</name>
<name>
<surname>Kegelmeyer</surname>
<given-names>W. P.</given-names>
</name>
</person-group> (<year>2002</year>). <article-title>SMOTE: Synthetic minority over-sampling technique</article-title>. <source>J. Artif. Intell. Res.</source>
<volume>16</volume>, <fpage>321</fpage>&#x2013;<lpage>357</lpage>. <pub-id pub-id-type="doi">10.1613/jair.953</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Peng</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Two-stage solar flare forecasting based on convolutional neural networks</article-title>. <source>Space Sci. Technol.</source>
<volume>2022</volume>. <pub-id pub-id-type="doi">10.34133/2022/9761567</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cui</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>Correlation between solar flare productivity and photospheric magnetic field properties II. Magnetic gradient and magnetic shear</article-title>. <source>Sol. Phys.</source>
<volume>242</volume>, <fpage>1</fpage>&#x2013;<lpage>8</lpage>. <pub-id pub-id-type="doi">10.1007/s11207-007-0369-5</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dhuri</surname>
<given-names>D. B.</given-names>
</name>
<name>
<surname>Hanasoge</surname>
<given-names>S. M.</given-names>
</name>
<name>
<surname>Cheung</surname>
<given-names>M. C.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Machine learning reveals systematic accumulation of electric current in lead-up to solar flares</article-title>. <source>Proc. Natl. Acad. Sci.</source>
<volume>116</volume>, <fpage>11141</fpage>&#x2013;<lpage>11146</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.1820244116</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Doane</surname>
<given-names>D. P.</given-names>
</name>
</person-group> (<year>1976</year>). <article-title>Aesthetic frequency classifications</article-title>. <source>Am. Statistician</source>
<volume>30</volume>, <fpage>181</fpage>&#x2013;<lpage>183</lpage>. <pub-id pub-id-type="doi">10.1080/00031305.1976.10479172</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Georgoulis</surname>
<given-names>M. K.</given-names>
</name>
<name>
<surname>Rust</surname>
<given-names>D. M.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>Quantitative forecasting of major solar flares</article-title>. <source>Astrophysical J.</source>
<volume>661</volume>, <fpage>L109</fpage>&#x2013;<lpage>L112</lpage>. <pub-id pub-id-type="doi">10.1086/518718</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hashemi</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Karimi</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Weighted machine learning</article-title>. <source>Statistics, Optim. Inf. Comput.</source>
<volume>6</volume>, <fpage>497</fpage>&#x2013;<lpage>525</lpage>. <pub-id pub-id-type="doi">10.19139/soic.v6i4.479</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Dai</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Deep learning based solar flare forecasting model. I. Results for line-of-sight magnetograms</article-title>. <source>Astrophysical J.</source>
<volume>856</volume>, <fpage>7</fpage>. <pub-id pub-id-type="doi">10.3847/1538-4357/aaae00</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Japkowicz</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2000</year>). &#x201c;<article-title>Learning from imbalanced data sets: A comparison of various strategies</article-title>,&#x201d; in <source>AAAI workshop on learning from imbalanced data sets</source> (<publisher-name>AAAI Press Menlo Park, CA</publisher-name>), <volume>68</volume>, <fpage>10</fpage>&#x2013;<lpage>15</lpage>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://www.aaai.org/Papers/Workshops/2000/WS-00-05/WS00-05-003.pdf">https://www.aaai.org/Papers/Workshops/2000/WS-00-05/WS00-05-003.pdf</ext-link>
</comment>.</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jarolim</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Veronig</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Podladchikova</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Thalmann</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Narnhofer</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Hofinger</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Interpretable solar flare prediction with deep learning</article-title>. <source>Tech. Rep. Copernic. Meet</source>. <pub-id pub-id-type="doi">10.5194/egusphere-egu22-2994</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kusano</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Iju</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Bamba</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Inoue</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>A physics-based method that can predict imminent large solar flares</article-title>. <source>Science</source>
<volume>369</volume>, <fpage>587</fpage>&#x2013;<lpage>591</lpage>. <pub-id pub-id-type="doi">10.1126/science.aaz2511</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Leka</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Barnes</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2003</year>). <article-title>Photospheric magnetic field properties of flaring versus flare-quiet active regions. II. Discriminant analysis</article-title>. <source>Astrophysical J.</source>
<volume>595</volume>, <fpage>1296</fpage>&#x2013;<lpage>1306</lpage>. <pub-id pub-id-type="doi">10.1086/377512</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Ao</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Knowledge-informed deep neural networks for solar flare forecasting</article-title>. <source>Space Weather</source>
<volume>20</volume>. <pub-id pub-id-type="doi">10.1029/2021SW002985</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J. T.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Predicting solar flares using SDO/HMI vector magnetic data products and the random forest algorithm</article-title>. <source>Astrophysical J.</source>
<volume>843</volume>, <fpage>104</fpage>. <pub-id pub-id-type="doi">10.3847/1538-4357/aa789b</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nishizuka</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Kubo</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Sugiura</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Den</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ishii</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Operational solar flare prediction model using Deep Flare Net</article-title>. <source>Earth, Planets Space</source>
<volume>73</volume>, <fpage>64</fpage>. <pub-id pub-id-type="doi">10.1186/s40623-021-01381-9</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Sinha</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Gupta</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Singh</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Lekshmi</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Nandy</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Mitra</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <source>A comparative analysis of machine learning models for solar flare forecasting: Identifying high performing active region flare indicators</source>. <comment>
<italic>arXiv preprint arXiv:2204.05910</italic>
</comment>. <pub-id pub-id-type="doi">10.3847/1538-4357/ac7955</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Manchester IV</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Improved and interpretable solar flare predictions with spatial and topological features of the polarity inversion line masked magnetograms</article-title>. <source>Space Weather</source>
<volume>19</volume>, <fpage>e2021SW002837</fpage>. <pub-id pub-id-type="doi">10.1029/2021SW002837</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Bobra</surname>
<given-names>M. G.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Gombosi</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Predicting solar flares using CNN and LSTM on two solar cycles of active region data</article-title>. <source>Astrophysical J.</source>
<volume>931</volume>, <fpage>163</fpage>. <pub-id pub-id-type="doi">10.3847/1538-4357/ac64a6</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Precursor identification for strong flares based on anomaly detection algorithm</article-title>. <source>Front. Astronomy Space Sci.</source>
<volume>9</volume>, <fpage>1037863</fpage>. <pub-id pub-id-type="doi">10.3389/fspas.2022.1037863</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Toth</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Manchester</surname>
<given-names>W. B.</given-names>
</name>
<name>
<surname>Gombosi</surname>
<given-names>T. I.</given-names>
</name>
<name>
<surname>Hero</surname>
<given-names>A. O.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Predicting solar flares with machine learning: Investigating solar cycle dependence</article-title>. <source>Astrophysical J.</source>
<volume>895</volume>, <fpage>3</fpage>. <pub-id pub-id-type="doi">10.3847/1538-4357/ab89ac</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wilson</surname>
<given-names>E. B.</given-names>
</name>
</person-group> (<year>1927</year>). <article-title>Probable inference, the law of succession, and statistical inference</article-title>. <source>J. Am. Stat. Assoc.</source>
<volume>22</volume>, <fpage>209</fpage>&#x2013;<lpage>212</lpage>. <pub-id pub-id-type="doi">10.1080/01621459.1927.10502953</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Short-term solar flare prediction using a sequential supervised learning method</article-title>. <source>Sol. Phys.</source>
<volume>255</volume>, <fpage>91</fpage>&#x2013;<lpage>105</lpage>. <pub-id pub-id-type="doi">10.1007/s11207-009-9318-9</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Jing</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J. T.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <source>Solar flare index prediction using SDO/HMI vector magnetic data products with statistical and machine learning methods</source>. <comment>arXiv preprint arXiv:2209.13779. Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2209.13779">https://arxiv.org/abs/2209.13779</ext-link>
</comment>.</citation>
</ref>
</ref-list>
</back>
</article>