<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Artif. Intell.</journal-id>
<journal-title>Frontiers in Artificial Intelligence</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Artif. Intell.</abbrev-journal-title>
<issn pub-type="epub">2624-8212</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">612551</article-id>
<article-id pub-id-type="doi">10.3389/frai.2021.612551</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Artificial Intelligence</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Causal Datasheet for Datasets: An Evaluation Guide for Real-World Data Analysis and Data Collection Design Using Bayesian Networks</article-title>
<alt-title alt-title-type="left-running-head">Butcher et al.</alt-title>
<alt-title alt-title-type="right-running-head">Causal Datasheet for Datasets</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Butcher</surname>
<given-names>Bradley</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="fn" rid="fn1">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1099972/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Huang</surname>
<given-names>Vincent S.</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="fn" rid="fn1">
<sup>&#x2020;</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Robinson</surname>
<given-names>Christopher</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Reffin</surname>
<given-names>Jeremy</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Sgaier</surname>
<given-names>Sema K.</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Charles</surname>
<given-names>Grace</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1051761/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Quadrianto</surname>
<given-names>Novi</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/148909/overview"/>
</contrib>
</contrib-group>
<aff id="aff1">
<label>
<sup>1</sup>
</label>Department of Informatics, Predictive Analytics Lab (PAL), University of Sussex, <addr-line>Brighton</addr-line>, <country>United Kingdom</country>
</aff>
<aff id="aff2">
<label>
<sup>2</sup>
</label>Surgo Ventures, <addr-line>Washington</addr-line>, <addr-line>DC</addr-line>, <country>United States</country>
</aff>
<aff id="aff3">
<label>
<sup>3</sup>
</label>Harvard T. H. Chan School of Public Health, <addr-line>Cambridge</addr-line>, <addr-line>MA</addr-line>, <country>United States</country>
</aff>
<aff id="aff4">
<label>
<sup>4</sup>
</label>Department of Global Health, University of Washington, <addr-line>Seattle</addr-line>, <addr-line>WA</addr-line>, <country>United States</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/740103/overview">Wojciech Samek</ext-link>, Heinrich Hertz Institute (FHG), Germany</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/418412/overview">Martin Becker</ext-link>, Stanford University, United States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/911928/overview">Nandini Ramanan</ext-link>, The University of Texas at Dallas, United States</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Novi Quadrianto, <email>n.quadrianto@sussex.ac.uk</email>
</corresp>
<fn fn-type="equal" id="fn1">
<p>
<sup>&#x2020;</sup>These authors have contributed equally to this work</p>
</fn>
<fn fn-type="other">
<p>This article was submitted to Machine Learning and Artificial Intelligence, a section of the journal Frontiers in Artificial Intelligence</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>14</day>
<month>04</month>
<year>2021</year>
</pub-date>
<pub-date pub-type="collection">
<year>2021</year>
</pub-date>
<volume>4</volume>
<elocation-id>612551</elocation-id>
<history>
<date date-type="received">
<day>30</day>
<month>09</month>
<year>2020</year>
</date>
<date date-type="accepted">
<day>11</day>
<month>02</month>
<year>2021</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2021 Butcher, Huang, Robinson, Reffin, Sgaier, Charles and Quadrianto.</copyright-statement>
<copyright-year>2021</copyright-year>
<copyright-holder>Butcher, Huang, Robinson, Reffin, Sgaier, Charles and Quadrianto</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Developing data-driven solutions that address real-world problems requires understanding of these problems&#x2019; causes and how their interaction affects the outcome&#x2013;often with only observational data. Causal Bayesian Networks (BN) have been proposed as a powerful method for discovering and representing the causal relationships from observational data as a Directed Acyclic Graph (DAG). BNs could be especially useful for research in global health in Lower and Middle Income Countries, where there is an increasing abundance of observational data that could be harnessed for policy making, program evaluation, and intervention design. However, BNs have not been widely adopted by global health professionals, and in real-world applications, confidence in the results of BNs generally remains inadequate. This is partially due to the inability to validate against some ground truth, as the true DAG is not available. This is especially problematic if a learned DAG conflicts with pre-existing domain doctrine. Here we conceptualize and demonstrate an idea of a &#x201c;Causal Datasheet&#x201d; that could approximate and document BN performance expectations for a given dataset, aiming to provide confidence and sample size requirements to practitioners. To generate results for such a Causal Datasheet, a tool was developed which can generate synthetic Bayesian networks and their associated synthetic datasets to mimic real-world datasets. The results given by well-known structure learning algorithms and a novel implementation of the OrderMCMC method using the Quotient Normalized Maximum Likelihood score were recorded. These results were used to populate the Causal Datasheet, and recommendations could be made dependent on whether expected performance met user-defined thresholds. We present our experience in the creation of Causal Datasheets to aid analysis decisions at different stages of the research process. 
First, one was deployed to help determine the appropriate sample size of a planned study of sexual and reproductive health in Madhya Pradesh, India. Second, a datasheet was created to estimate the performance of an existing maternal health survey we conducted in Uttar Pradesh, India. Third, we validated generated performance estimates and investigated current limitations on the well-known ALARM dataset. Our experience demonstrates the utility of the Causal Datasheet, which can help global health practitioners gain more confidence when applying BNs.</p>
</abstract>
<kwd-group>
<kwd>bayesian network</kwd>
<kwd>causality</kwd>
<kwd>causal modeling</kwd>
<kwd>lower middle income country</kwd>
<kwd>machine learning</kwd>
<kwd>big data</kwd>
</kwd-group>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>To meet ambitious global health and development goals in lower and middle income countries (LMICs), policy decisions have been increasingly reliant on data-driven approaches to provide necessary insights. This has spawned numerous programs ranging from specific subjects at the sub-national and national level (e.g., Community Behavior Tracking Survey in Uttar Pradesh, India, and the Social And Living Standards Measurement in Pakistan) to broad health topics with multinational participation (e.g., the Multiple Indicator Cluster Surveys developed by the United Nations Children&#x2019;s Fund, and the USAID-backed Demographic and Health Survey) (<xref ref-type="bibr" rid="B12">Croft et al., 2018</xref>; <xref ref-type="bibr" rid="B29">Khan and Hancioglu, 2019</xref>; <xref ref-type="bibr" rid="B72">Huang et al., 2020</xref>; <xref ref-type="bibr" rid="B42">Pakistan Bureau of Statistics, 2020</xref>). These programs have a mandate to collect and disseminate accurate and population-representative health, nutrition, and population data in the developing world. These surveys allow governments and international agencies to monitor trends across health program areas and set priorities for health policy, interventions, and program funding (<xref ref-type="bibr" rid="B16">Fabic et al., 2012</xref>). As a result, there has been an explosion of data being generated that has the potential to be used to not only monitor/evaluate the status quo but to inform health intervention design.</p>
<p>Global health and development problems are often complex. An understanding of these complexities is often needed to get the right intervention to the right person, at the right time and place&#x2013;also known as a Precision Public Health approach (<xref ref-type="bibr" rid="B15">Desmond-Hellmann, 2016</xref>; <xref ref-type="bibr" rid="B30">Khoury et al., 2016</xref>; <xref ref-type="bibr" rid="B11">Chowkwanyun et al., 2018</xref>). Traditionally, for informing intervention design, randomized controlled trials (RCT) remain the gold standard. However, due to cost, lack of infrastructure, and other practical reasons, RCTs are not always possible in LMICs. As a result, many of the available data collected are observational only and limited in scope. Without an RCT, quantifying which variables are the proximate causes of an outcome or determining causes and effects for a specific set of variables remains a challenge for global health practitioners. Moreover, RCTs are by design conducted with the intent to test a narrow set of hypotheses, not to explore unknown causal structures - a potential missed opportunity to target public health solutions more precisely.</p>
<p>Causal inference and discovery approaches such as causal Bayesian Network (BN) can fill this void. BNs readily deal with observational data, can utilize numerous algorithms to facilitate automatic causal discovery, allow for expert-specified constraints, and can infer the causal effects of hypothetical interventions (<xref ref-type="bibr" rid="B43">Pearl, 1995</xref>; <xref ref-type="bibr" rid="B4">Arora et al., 2019</xref>; <xref ref-type="bibr" rid="B20">Glymour et al., 2019</xref>). Despite causal Bayesian Network&#x2019;s many offerings, we have not seen a wide adoption in real-world problems (<xref ref-type="bibr" rid="B4">Arora et al., 2019</xref>; <xref ref-type="bibr" rid="B35">Kyrimi et al., 2020</xref>; <xref ref-type="bibr" rid="B57">Sgaier et al., 2020</xref>). We have found that validating the structure, parameterization, predictive accuracy, and generalizability of BN presents a significant hurdle and is subject to considerable debate and interpretation when applied to data with real-world complexity. Our inability to communicate uncertainty in structure learning algorithm performance for specific datasets can call entire models into question (<xref ref-type="bibr" rid="B65">van der Bles et al., 2019</xref>). Generally, practitioners using BNs must resort to domain expertise to validate model structure, if they do not forgo validation entirely (<xref ref-type="bibr" rid="B1">Aguilera et al., 2011</xref>; <xref ref-type="bibr" rid="B38">Lewis and McCormick, 2012</xref>; <xref ref-type="bibr" rid="B41">Moglia et al., 2018</xref>). This makes BN model results especially difficult to defend when they, even if just in part, contradict previous domain beliefs or doctrines. 
Thus, BN results are often presented as a proof-of-concept of techniques to show that the method can recover insights already known rather than as an actionable model for discovery, change, or intervention (<xref ref-type="bibr" rid="B38">Lewis and McCormick, 2012</xref>; <xref ref-type="bibr" rid="B41">Moglia et al., 2018</xref>; <xref ref-type="bibr" rid="B49">Requejo-Castro et al., 2018</xref>).</p>
<p>The problem of not knowing how well machine learning algorithms will perform in real-world conditions is not restricted to causal discovery and inference and has been subject to a broader debate. One proposed solution is adopting the standard &#x201c;datasheet&#x201d; practice of constructing and accompanying any given dataset with a full description of the data, its collection context, operating characteristics (i.e., the characteristics of the data to which a machine learning algorithm is applied), and test results (i.e., the expected performance of the machine learning algorithm) (<xref ref-type="bibr" rid="B18">Gebru et al., 2018</xref>). Measuring the expected causal discovery and inference performance and their uncertainties for any given dataset is, however, not straightforward. First, it is not clear what performance metrics should be used to measure BN algorithms&#x2019; ability to recover the ground truth causal structure when the ground truth is unknown. In addition, such data may not include the appropriate variables to establish causal or interventional sufficiency, can have incomplete observations, and may be imbalanced (<xref ref-type="bibr" rid="B62">Spirtes et al., 2000</xref>; <xref ref-type="bibr" rid="B44">Pearl, 2009</xref>; <xref ref-type="bibr" rid="B33">Kleinberg and Hripcsak, 2011</xref>; <xref ref-type="bibr" rid="B45">Peters et al., 2017</xref>). Lastly, the sample size of a dataset may be insufficient to support BN analyses (<xref ref-type="bibr" rid="B67">Wang and Gelfand, 2002</xref>). 
Perhaps due to the data challenges mentioned above, the evaluation of novel BN algorithms has been largely based on standard synthetic datasets such as ALARM, Insurance, Child and others (<xref ref-type="bibr" rid="B6">Beinlich et al., 1989</xref>; <xref ref-type="bibr" rid="B13">Dawid, 1992</xref>; <xref ref-type="bibr" rid="B7">Binder et al., 1997</xref>; <xref ref-type="bibr" rid="B54">Scutari, 2009</xref>), which can have vastly different characteristics compared to real-world data at hand. One suggested method for ranking algorithms&#x2019; performance is to assume the intersection of the structures found by a collection of algorithms as the partial ground truth as in the Intersection-Validation method by <xref ref-type="bibr" rid="B66">Viinikka et al. (2018)</xref>. However, the Intersection-Validation method will often neglect to consider the most complex relationships, and while it provides relative sample size requirements for each algorithm, it cannot directly inform the data collection process. We face the following quandary: with real-world data we lack the ground truth against which to evaluate the modeling algorithms, and with synthetic data we lack the complexity and limitations that are typically imposed in real-world circumstances (<xref ref-type="bibr" rid="B19">Gentzel et al., 2019</xref>).</p>
<p>To solve this quandary and to empower practitioners to estimate uncertainty levels around the causal structures learned under the typical contexts and constraints applicable to their analytical problem of interest, we propose an approach to attach two types of causal extension to such datasheet proposed by <xref ref-type="bibr" rid="B18">Gebru et al. (2018)</xref> to 1) inform study design at the data collection stage to enable subsequent causal discovery analysis similar to, in spirit, conducting power analysis before sample size is determined, and 2) describe expected causal discovery and inference algorithm performance and corresponding uncertainty when presented an existing dataset. The key idea is to generate synthetic data with a spectrum of properties that mimic the existing or projected real-world data. We call our instantiation of this capability the &#x2018;Causal Datasheet Generation Tool&#x2019;, or CDG-T.</p>
<p>In this work, our goal is to provide further confidence in BN results from the perspective of practitioners&#x2019; needs. BNs are introduced in 2.1 of the Materials and Methods Section. In <xref ref-type="sec" rid="s2-2">Section 2.2</xref> we briefly look at pertinent related work. In <xref ref-type="sec" rid="s2-3">Section 2.3</xref>, we introduce the approach taken in generating causal datasheets, including a brief discussion of the assumptions that are made. Following this in <xref ref-type="sec" rid="s2-4">Sections 2.4&#x2013;2.7</xref> we define the data characteristics used to generate synthetic data, what structure learning algorithms were explored, definitions of the performance metrics used in the datasheets, and the two datasheet usage scenarios. In <xref ref-type="sec" rid="s3">Section 3</xref>, Results, we illustrate the usage of three example datasheets. First, to inform data collection design in an LMIC setting, we provide an example on how a Causal Datasheet was used in planning of a Sexual Reproductive Health survey in Madhya Pradesh, India, where the performance value is computed over a range of potential variables and sample sizes. Next, for evaluating data suitability for BN we provide two example Causal Datasheets for existing data evaluation: one example for an existing dataset in the global development domain (a survey about Reproductive Maternal Neonatal Child Health (RMNCH) that we administered in Uttar Pradesh, India), and another generated for the well-known ALARM dataset (<xref ref-type="bibr" rid="B6">Beinlich et al., 1989</xref>). Lastly, we note the implications and future research directions in the Discussion.</p>
</sec>
<sec id="s2">
<title>2 Materials and Methods</title>
<sec id="s2-1">
<title>2.1 Causal Bayesian Network</title>
<p>A Bayesian network <inline-formula id="inf1">
<mml:math id="minf1">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>G</mml:mi>
<mml:mo>,</mml:mo>
<mml:mtext>&#x398;</mml:mtext>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> for a set of variables <italic>X</italic> consists of two components: a directed acyclic graph (DAG), and a set of parameters <inline-formula id="inf2">
<mml:math id="minf2">
<mml:mtext>&#x398;</mml:mtext>
</mml:math>
</inline-formula>. The DAG <inline-formula id="inf3">
<mml:math id="minf3">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> of a BN encodes the statistical dependence among the set of variables <italic>X</italic> by means of the set of edges <italic>E</italic> which connect nodes <italic>V</italic> (<xref ref-type="fig" rid="F1">Figure 1</xref>). Each node <inline-formula id="inf4">
<mml:math id="minf4">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> corresponds to one variable <inline-formula id="inf5">
<mml:math id="minf5">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>X</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>A DAG with three variables <bold>(A&#x2013;C)</bold> and two edges.</p>
</caption>
<graphic xlink:href="frai-04-612551-g001.tif"/>
</fig>
<p>Conversely, the absence of an edge between variables suggests a statistical (conditional) independence. Thus, a BN induces the factorization:<disp-formula id="equ1">
<mml:math id="mequ1">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>X</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>G</mml:mi>
<mml:mo>,</mml:mo>
<mml:mtext>&#x398;</mml:mtext>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo>&#x220f;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>D</mml:mi>
</mml:munderover>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mtext>&#x3a0;</mml:mtext>
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mtext>&#x398;</mml:mtext>
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>where the global distribution <inline-formula id="inf6">
<mml:math id="minf6">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>X</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>G</mml:mi>
<mml:mo>,</mml:mo>
<mml:mtext>&#x398;</mml:mtext>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> factorizes into a set of local distributions; one for each <inline-formula id="inf7">
<mml:math id="minf7">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> with parameters <inline-formula id="inf8">
<mml:math id="minf8">
<mml:mrow>
<mml:msub>
<mml:mtext>&#x398;</mml:mtext>
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, conditional on its parents <inline-formula id="inf9">
<mml:math id="minf9">
<mml:mrow>
<mml:msub>
<mml:mtext>&#x3a0;</mml:mtext>
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>Discrete BNs assume that a variable <inline-formula id="inf10">
<mml:math id="minf10">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is distributed multinomially conditioned on a configuration of its parents <inline-formula id="inf11">
<mml:math id="minf11">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mtext>&#x3a0;</mml:mtext>
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>M</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>l</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:msub>
<mml:mi>&#x3c0;</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>k</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf12">
<mml:math id="minf12">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c0;</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>k</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>P</mml:mtext>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mtext>&#x3a0;</mml:mtext>
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the probability when <inline-formula id="inf13">
<mml:math id="minf13">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> conditioned on the <italic>j</italic>th value of the possible parent combinations. These discrete conditional distributions can be represented as conditional probability tables (CPTs) (<xref ref-type="bibr" rid="B26">Heckerman et al., 1995</xref>).</p>
<p>A factorization can represent multiple DAGs; this set of DAGs is known as the equivalence class, and its members are said to be Markov equivalent. BNs of the same equivalence class share the same skeleton: the underlying undirected graph, and V-structures. The skeleton of a DAG is the undirected graph obtained by ignoring every edge&#x2019;s directionality. A V-structure is an unshielded common effect; that is, for the pattern of edges A <inline-formula id="inf14">
<mml:math id="minf14">
<mml:mo>&#x2192;</mml:mo>
</mml:math>
</inline-formula> C <inline-formula id="inf15">
<mml:math id="minf15">
<mml:mo>&#x2190;</mml:mo>
</mml:math>
</inline-formula> B, A and B are independent (<xref ref-type="fig" rid="F2">Figure 2</xref>). In this example, by having two edges pointing at it, C is said to have an in-degree of 2; A and B are the parent nodes, and C is the child node. The combination of both skeleton and V-structures is known as a complete partially directed acyclic graph, or CPDAG, and represents the equivalence class of DAGs for a factorization. Thus, we believe that how well structural learning algorithms recover the ground truth from observational data should include both skeleton and V-structure recovery.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>A CPDAG with four variables <bold>(A&#x2013;D)</bold>, with two possible DAG instantiations. The edges forming the V-structure <bold>(A&#x2013;C)</bold> are purple, and the two alternative <bold>(B&#x2013;D)</bold> connections are in red.</p>
</caption>
<graphic xlink:href="frai-04-612551-g002.tif"/>
</fig>
<p>In order for a Bayesian network to be considered causal, the parents of each of the nodes must be its direct causes. A node <italic>A</italic> is considered a direct cause of <italic>C</italic> if varying the value of <italic>A</italic>, while all other nodes remain unchanged, affects the distribution of <italic>C</italic> i.e.,:<disp-formula id="equ2">
<mml:math id="mequ2">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>o</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>A</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>o</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>B</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2260;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>o</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>A</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>o</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>B</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>Where <inline-formula id="inf16">
<mml:math id="minf16">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>o</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>A</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the interventional distribution; the distribution of <italic>C</italic> given an intervention on <italic>A</italic> that sets it to <inline-formula id="inf17">
<mml:math id="minf17">
<mml:mrow>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. Additionally, the causal Markov assumption must be made to treat a BN as causal; it is assumed all common causes are present within <italic>G</italic>, with each node being independent conditioned on its direct causes (<xref ref-type="bibr" rid="B25">Hausman and Woodward, 1999</xref>).</p>
</sec>
<sec id="s2-2">
<title>2.2 Related Work</title>
<p>When learning a Bayesian network we are attempting to model the underlying generative model behind a given dataset. Performance of causal discovery algorithms is a function of both the distance to the underlying causal structure, as well as the distance to the true parameters. However, measuring the distance is generally not possible as the ground truth is unavailable. One strategy to obtain some expected level of Bayesian network performance, in the absence of any ground truth to compare against, is to construct a proxy of the ground-truth. Conceptually, this is similar to the previously mentioned intersection-validation method. In this method a proxy agreement graph is constructed by taking the intersection of the output from many structure learning algorithms (<xref ref-type="bibr" rid="B66">Viinikka et al., 2018</xref>). These algorithms are then ranked by how many samples it takes to reach this agreement graph. This forms a dependence between the selection of algorithms and the proxy, and by extension the ranking. Forming a proxy independent of algorithm choice is desirable.</p>
<p>Synthetic data is system-generated data which is not obtained by any direct measurement. Generally, the goal of this generated data is to mimic some real-world data, given some user-defined parameters. One can create synthetic data by two means: by modification or generation. Data can be modified, normally through anonymization, to create a synthetic dataset. Alternatively, generative models such as Generative Adversarial Networks, Variational Auto-encoders or Normalizing Flows can be sampled from to create the data (<xref ref-type="bibr" rid="B32">Kingma and Welling, 2013</xref>; <xref ref-type="bibr" rid="B22">Goodfellow et al., 2014</xref>; <xref ref-type="bibr" rid="B50">Rezende and Mohamed, 2015</xref>). In this study, we required a generative model which could be explicitly represented as a BN, in order to ascertain how well BN learning procedures performed. As BNs are generative models themselves, our goal is to directly create Bayesian networks with similar properties to the underlying generative model behind the real-world processes.</p>
<p>Previous studies have used synthetic Bayesian networks to evaluate performance of structure learning algorithms (<xref ref-type="bibr" rid="B63">Tasaki et al., 2015</xref>; <xref ref-type="bibr" rid="B3">Andrews et al., 2018</xref>; <xref ref-type="bibr" rid="B64">Gadetsky et al., 2020</xref>; <xref ref-type="bibr" rid="B71">Zhang et al., 2020</xref>; <xref ref-type="bibr" rid="B21">Gogoshin et al., 2020</xref>) (<xref ref-type="table" rid="T1">Table 1</xref>). These are often limited in terms of user-controllable parameters, with structures being sampled uniformly from the space of DAGs, or limited in terms of variation in topology. Other studies use standard benchmark datasets (<xref ref-type="bibr" rid="B56">Scutari et al., 2019</xref>; <xref ref-type="bibr" rid="B48">Ramanan and Natarajan, 2020</xref>). A flexible synthetic generation system would allow the user to specify many parameters which influence the BN generation, in order to match a given real dataset as closely as possible.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Existing synthetic Bayesian network generation methods, with network attributes each method implements control over. Properties compared are those known both to vary, and to influence structure learning performance. (&#x2713;) signifies a feature is implemented, (&#x2717;) that it is not, and (<bold>-</bold>) a partial implementation. Existing methods do not support latent confounding or offer control over parameter generation, and offer only random DAGs and at most one pre-defined structure type. Our work presents a step toward fully flexible structure generation, with these features the main remaining known limitations.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Method/Property</th>
<th align="right">Flexible DAG Generation</th>
<th align="right">Flexible Parameter Generation</th>
<th align="right">Controllable levels</th>
<th align="right">Latent confounding</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">CDG-T (ours)</td>
<td align="center">
<bold>-</bold>
</td>
<td align="center">
<bold>-</bold>
</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2717;</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B21">Gogoshin et al. (2020)</xref>
</td>
<td align="center">
<bold>-</bold>
</td>
<td align="center">&#x2717;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2717;</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B71">Zhang et al. (2020)</xref>
</td>
<td align="center">
<bold>-</bold>
</td>
<td align="center">&#x2717;</td>
<td align="center">&#x2717;</td>
<td align="center">&#x2717;</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B3">Andrews et al. (2018)</xref>
</td>
<td align="center">&#x2717;</td>
<td align="center">&#x2717;</td>
<td align="center">&#x2713;</td>
<td align="center">&#x2717;</td>
</tr>
<tr>
<td align="left">
<xref ref-type="bibr" rid="B63">Tasaki et al. (2015)</xref>
</td>
<td align="center">
<bold>-</bold>
</td>
<td align="center">N/A</td>
<td align="center">N/A</td>
<td align="center">&#x2717;</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s2-3">
<title>2.3 Causal Datasheet Generation Tool</title>
<p>There are two primary goals of the Causal Datasheet. The first goal is to provide some expectation of performance given the basic, observable, characteristics of a dataset. The second goal is to provide guidance as to how many samples will be required in order to meet desired performance levels. The proof-of-concept approach we employ is described in the subsequent Section, followed by an outline of the assumptions made using this method.</p>
<sec id="s2-3-1">
<title>2.3.1 Approach</title>
<p>Our general approach is illustrated in <xref ref-type="fig" rid="F3">Figure 3</xref>. In order to provide a performance estimate of structure and parameter learning for a given real dataset, we generate a set of synthetic Bayesian networks to act as a proxy for real data. Because we will have access to the ground-truths of these synthetic networks, we can calculate the performance of the structure learning, parameter learning, and any downstream estimates. Performance estimates will only be accurate so long as the generated synthetic datasets are similar enough to the given real dataset. We therefore generate synthetic BNs, and corresponding datasets, with matching observable characteristics of the real dataset. These characteristics include number of samples, variables, and levels. This corresponds to box I1 of <xref ref-type="fig" rid="F3">Figure 3</xref>. In addition to the observable characteristics, there are a number of unobservable characteristics which are varied throughout the BN generation&#x2013;these are discussed in <xref ref-type="sec" rid="s2-4">Section 2.4</xref>. A small Python library which can generate and sample synthetic BNs was developed, and can be found at: <ext-link ext-link-type="uri" xlink:href="https://pypi.org/project/BayNet/">https://pypi.org/project/BayNet/</ext-link>. An example Jupyter notebook of how to generate a datasheet can be found at: <ext-link ext-link-type="uri" xlink:href="https://github.com/predictive-analytics-lab/datasheet_generation">https://github.com/predictive-analytics-lab/datasheet_generation</ext-link>. The BN and data generation, as well as the learning and evaluation process is described in <xref ref-type="other" rid="alg1">Algorithm 1</xref>, corresponding to boxes A1&#x2013;A4 in <xref ref-type="fig" rid="F3">Figure 3</xref>. Box A1 concerns the generation of the synthetic data, details of which can be found in <xref ref-type="sec" rid="s2-4">Section 2.4</xref>. 
Box A2 and A3 concern learning a BN using the synthetic data, details of the structure learning algorithms used can be found in <xref ref-type="sec" rid="s2-5">Section 2.5</xref>. Box A4 concerns the evaluation of the learned models, the metrics used can be found in <xref ref-type="sec" rid="s2-6">Section 2.6</xref>.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Illustration of the approach used to create the Causal Datasheets presented in <xref ref-type="sec" rid="s3">Section 3</xref>. <xref ref-type="other" rid="alg1">Algorithm 1</xref> refers to the algorithm labelled <xref ref-type="other" rid="alg1">
<italic>Algorithm 1</italic>
</xref>
<italic>:CDG-T Overview</italic> below.</p>
</caption>
<graphic xlink:href="frai-04-612551-g003.tif"/>
</fig>
<p>The synthetic BN generation is performed <italic>T</italic> times per set of data characteristics, where <italic>T</italic> is a user-defined number of trials, in order to capture performance variation. While we attempt to capture as much of the space of possible BNs as we can, the number of experiments that can be performed are limited by finite computation resources. In our experiments, we set <italic>T</italic> to 10 to balance the total computation time for generating the data sets and learning the model spanning across configurations and to capture result uncertainty simply due to random seeding.<boxed-text id="dBox1">
<p>
<statement content-type="algorithm" id="alg1">
<label>
<bold>Algorithm</bold> 1:</label>
<p> CDG-T Overview.</p>
<p>Given:<list list-type="bullet">
<list-item>
<p>The observable properties of a real dataset (sample size <italic>n</italic>, number of variables <italic>p</italic>, average levels <italic>l</italic>)</p>
</list-item>
<list-item>
<p>The unobservable estimates of a real dataset (structure type <italic>t</italic>, CPT imbalance &#x3b1;)</p>
</list-item>
<list-item>
<p>The number of trials <italic>T</italic>
</p>
</list-item>
<list-item>
<p>Hyper-parameters of structure learning algorithms</p>
</list-item>
</list>
</p>
<p>For <inline-formula id="inf18">
<mml:math id="minf18">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2192;</mml:mo>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
<list list-type="simple">
<list-item>
<p>1. Generate a DAG <italic>G</italic> with variables <italic>V</italic> where <inline-formula id="inf19">
<mml:math id="minf19">
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>V</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> according to user-specified structure-types</p>
</list-item>
<list-item>
<p>2. Sample number of levels for each variable from <inline-formula id="inf20">
<mml:math id="minf20">
<mml:mrow>
<mml:mi mathvariant="script">U</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> where <inline-formula id="inf21">
<mml:math id="minf21">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>/</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>3. Populate the parameters of the BN <inline-formula id="inf22">
<mml:math id="minf22">
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>G</mml:mi>
<mml:mo>,</mml:mo>
<mml:mtext>&#x398;</mml:mtext>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> with multinomial distributions <inline-formula id="inf23">
<mml:math id="minf23">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b8;</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> drawn from a Dirichlet distribution with <inline-formula id="inf24">
<mml:math id="minf24">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b1;</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mi>&#x3b1;</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula> for each variable <inline-formula id="inf25">
<mml:math id="minf25">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>4. Draw <italic>s</italic> synthetic samples from synthetically created BN <italic>B</italic> to create synthetic dataset <inline-formula id="inf26">
<mml:math id="minf26">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>S</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>5. Learn a DAG using a set of structure learning algorithms using the synthetic samples</p>
</list-item>
<list-item>
<p>6. Learn the parameters with maximum likelihood estimation (MLE) using the learned DAG and synthetic samples</p>
</list-item>
<list-item>
<p>7. Record the structural performance (skeleton/v-structure precision, recall) and interventional performance (PCOR)</p>
</list-item>
</list>
</p>
</statement>
</p>
</boxed-text>
</p>
<p>We envision that this extensive evaluation is synthesized into a digestible <italic>Causal Datasheet for Existing Datasets</italic> or <italic>Causal Datasheet for Data Collection</italic> format (<xref ref-type="fig" rid="F3">Figure 3</xref>, Box O1). Domain experts can then assess whether this level of performance is sufficient for a particular application. Due to the flexibility of this system, we can not only construct proxies of existing datasets, but of datasets we plan to collect. In this manner, data collection can be designed around desired performance of our models. This concept is extensible to other systems with the capability to produce and evaluate synthetic data sets and structural learning algorithms. As the capability and flexibility of these systems increase, so too will the accuracy of the estimates within the Causal Datasheet.</p>
</sec>
<sec id="s2-3-2">
<title>2.3.2 Assumptions</title>
<p>A number of assumptions are made in the generation of this synthetic data: A) we introduce no latent confounders. In order for a BN to be considered <italic>causal</italic>, one must assume there are no confounders absent from the model. There are potentially complex repercussions of having confounders latent in a BN, but this is currently not examined. B) parameters are generated from a Dirichlet distribution assuming the <italic>&#x3b1;</italic> vector is uniform. The implications of this simplification are, given sufficient samples, the mean of the distributions drawn will be uniform. Therefore, generally, the marginal distributions of all nodes in the synthetic BNs will be uniform&#x2013;this can make the BN learning process easier, potentially inflating performance estimates for cases where variables are highly imbalanced. This is further discussed in <xref ref-type="sec" rid="s4">Section 4</xref>. Initial work has been performed to go beyond this simplification, and can be found in the supplementary material. C) it is assumed that the unobservable characteristics of a real dataset have been appropriately selected. We have assumed the used structure types can sufficiently represent the underlying DAG of a given real dataset. In the case these are incorrectly set, this could lead to incorrect performance estimates. D) we assume that synthetic data can sufficiently mimic a real dataset. Initial work has been performed to guide whether Assumptions C and D hold, and can be found in the supplementary material. These assumptions do not invalidate the concept of the Causal Datasheet, but must be kept in mind when interpreting results of a datasheet generated using CDG-T.</p>
</sec>
</sec>
<sec id="s2-4">
<title>2.4 Dataset Characteristics</title>
<p>To study the variability of structural learning performance with different synthetic data properties, we defined two classes of dataset characteristics that can be varied to produce a distribution of synthetic data: observable and non-observable (<xref ref-type="table" rid="T2">Table 2</xref>).</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>A table on the observability of the properties of BNs, as well as the values the synthetic generation tool can use.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Characteristic</th>
<th align="center">Observable</th>
<th align="center">Possible values</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Number of Samples</td>
<td align="center">&#x2713;</td>
<td align="center">1&#x2013;1,000,000</td>
</tr>
<tr>
<td align="left">Number of variables</td>
<td align="center">&#x2713;</td>
<td align="center">1&#x2013;500</td>
</tr>
<tr>
<td align="left">Average variable levels</td>
<td align="center">&#x2713;</td>
<td align="center">1&#x2013;10</td>
</tr>
<tr>
<td align="left">Structure type</td>
<td align="center">&#x2717;</td>
<td align="center">Forest fire, IC-DAG, barabasi-albert, waxman, Small world</td>
</tr>
<tr>
<td align="left">Maximum in-degree</td>
<td align="center">&#x2717;</td>
<td align="center">1&#x2013;<inline-formula id="inf27">
<mml:math id="minf27">
<mml:mi>&#x221e;</mml:mi>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="left">&#x3b1; (imbalance)</td>
<td align="center">&#x2717;</td>
<td align="center">0&#x2013;<inline-formula id="inf28">
<mml:math id="minf28">
<mml:mi>&#x221e;</mml:mi>
</mml:math>
</inline-formula>
</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Observable characteristics are those over which the designer of the dataset has control and which can be easily calculated (e.g., sample size and number of variables). Non-observable characteristics are properties of the underlying truth (e.g., degree distribution, type of structure, or imbalance). Non-observable characteristics can be estimated, but doing so introduces modeling assumptions. When evaluating a real-world dataset in practice, one could look up a Causal Datasheet with corresponding observable characteristics, to estimate performance uncertainty from the unobservable characteristics. Number of samples, number of variables, and average variable levels are straightforward; we describe the other characteristics below.</p>
<sec id="s2-4-1">
<title>2.4.1 Structure Type</title>
<p>We make use of five existing graph generation algorithms when creating synthetic Bayesian networks (<xref ref-type="fig" rid="F4">Figure 4</xref>):<list list-type="bullet">
<list-item>
<p>
<bold>Forest Fire:</bold> A growing network model which resembles how forest fires spread to nearby nodes (<xref ref-type="bibr" rid="B37">Leskovec et al., 2005</xref>).</p>
</list-item>
<list-item>
<p>
<bold>IC-DAG:</bold> A graph generation algorithm which samples uniformly from the set of DAGs (<xref ref-type="bibr" rid="B27">Ide and Cozman, 2002</xref>).</p>
</list-item>
<list-item>
<p>
<bold>Barabasi-Albert</bold> An evolving graph generation algorithm which adds edges to a new node dependent on current in-degree (<xref ref-type="bibr" rid="B5">Barab&#xe1;si and Albert, 1999</xref>).</p>
</list-item>
<list-item>
<p>
<bold>Waxman:</bold> Nodes are placed uniformly in a rectangular domain (<xref ref-type="bibr" rid="B69">Waxman, 1988</xref>).</p>
</list-item>
<list-item>
<p>
<bold>Small-World:</bold> A type of graph where most nodes are not direct neighbors, but the shortest path between any two nodes is generally low (<xref ref-type="bibr" rid="B68">Watts and Strogatz, 1998</xref>).</p>
</list-item>
</list>
</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>20-node examples of the five structure types used to generate synthetic BNs. As the waxman structure type is a random geometric graph, the connectivity is proportional to the number of nodes&#x2013;at 20 nodes the DAG remains sparse.</p>
</caption>
<graphic xlink:href="frai-04-612551-g004.tif"/>
</fig>
<p>BNs decompose into a set of local distributions <inline-formula id="inf29">
<mml:math id="minf29">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mtext>&#x3a0;</mml:mtext>
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. This property is utilized in structure learning algorithms; local scores or conditional independence tests are used to test a parent <inline-formula id="inf30">
<mml:math id="minf30">
<mml:mo>&#x2192;</mml:mo>
</mml:math>
</inline-formula> child relationship. The difficulty in correctly identifying an edge is a function of the data-to-parameter ratio. The DAG has a direct effect on the number of parameters, as the higher the in-degree for a node, the more parameters it will have: <inline-formula id="inf31">
<mml:math id="minf31">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf32">
<mml:math id="minf32">
<mml:mrow>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the product of number of parent levels. It follows then, that the DAG influences the difficulty of learning a BN by its distribution of node in-degrees. Having some control over this distribution is essential in order to complete a comprehensive evaluation. <xref ref-type="bibr" rid="B21">Gogoshin et al. (2020)</xref>, <xref ref-type="bibr" rid="B3">Andrews et al. (2018)</xref>, and <xref ref-type="bibr" rid="B71">Zhang et al. (2020)</xref> generate random networks with caps on maximum in-degree (<xref ref-type="table" rid="T1">Table 1</xref>). <xref ref-type="bibr" rid="B63">Tasaki et al. (2015)</xref> use graph generation algorithms in order to create synthetic BNs, but limit their use to a single type of topology. Here, we make use of multiple graph generation algorithms, allowing us to model many different <italic>realistic</italic> distributions of in-degrees, without having to specify them explicitly. Knowledge of what type of graphs are present in a particular domain can be incorporated by stratifying the structure type. New structure types can also be added in the case where current structure types do not sufficiently represent the topology of a specific domain.</p>
</sec>
<sec id="s2-4-2">
<title>2.4.2 Maximum In-degree</title>
<p>Maximum in-degree is the parameter which controls the cap on the number of parents each node can have within a network. We have found structures with high in-degrees have a major effect on the performance of structure learning algorithms. Having a parameter which can control this is crucial given prior knowledge about maximum in-degree is available. Unlike other studies, in the absence of domain knowledge we did not specifically cap the maximum in-degree in addition to what structural type would implicitly generate as previously mentioned.</p>
</sec>
<sec id="s2-4-3">
<title>2.4.3 Conditional Probability Table Imbalance: <italic>&#x3b1;</italic>
</title>
<p>The CPT for each node <inline-formula id="inf33">
<mml:math id="minf33">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in the synthetic Bayesian network are populated from parameters drawn from a Dirichlet distribution, with <inline-formula id="inf34">
<mml:math id="minf34">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b1;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mi>&#x3b1;</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula>. Where <inline-formula id="inf35">
<mml:math id="minf35">
<mml:mrow>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the cardinality of variable <inline-formula id="inf36">
<mml:math id="minf36">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf37">
<mml:math id="minf37">
<mml:mrow>
<mml:msub>
<mml:mi>q</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the product of cardinalities of parent set of <inline-formula id="inf38">
<mml:math id="minf38">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. A hyper-parameter, &#x3b1;, controls the over-all conditional imbalance, and thus connection strength, in the network. Consider an example of populating a CPT for a node with three levels and one parent. In the case the parent has two levels, two multinomial distributions must be drawn. One for each parent configuration. For example, using an <italic>&#x3b1;</italic> value of 12, applying the normalization: <inline-formula id="inf39">
<mml:math id="minf39">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b1;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>12</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#x22c5;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. Given this low value, the Dirichlet will likely draw two low-entropy multinomial distributions such as <inline-formula id="inf40">
<mml:math id="minf40">
<mml:mrow>
<mml:mo>[</mml:mo>
<mml:mn>.8</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>.1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>.1</mml:mn>
<mml:mo>]</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf41">
<mml:math id="minf41">
<mml:mrow>
<mml:mo>[</mml:mo>
<mml:mn>.2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>.7</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>.1</mml:mn>
<mml:mo>]</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. As these distributions are substantially different from one another, the relationship between the parent and child should be relatively easy to observe once data has been drawn conditioned on the parent value. Note that <inline-formula id="inf42">
<mml:math id="minf42">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b1;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is a vector <inline-formula id="inf43">
<mml:math id="minf43">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b1;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">&#x211d;</mml:mi>
<mml:mo>&#x2b;</mml:mo>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and the simplifying assumption has been made that it is uniform. Thus, the <italic>&#x3b1;</italic> controls the imbalance of the distributions drawn, but does not enforce any consistency as to which values are imbalanced across the conditional distribution. While we currently present results from a uniform <italic>&#x3b1;</italic> estimate within the datasheets, we have completed preliminary work to estimate a non-uniform <italic>&#x3b1;</italic> from any given dataset. Descriptions of this method, as well as results, can be found in the <xref ref-type="sec" rid="s9">Supplementary Material</xref>.</p>
</sec>
</sec>
<sec id="s2-5">
<title>2.5 Structure Learning Algorithms</title>
<p>The causal discovery step of training a causal Bayesian network is performed by structure learning algorithms (<xref ref-type="bibr" rid="B20">Glymour et al., 2019</xref>). In the current iteration of the Causal Datasheet three state-of-the-art structure learning algorithms are used. Each of the algorithms represents an example of constraint-based, score-based, and hybrid class of structural learning algorithms:<list list-type="bullet">
<list-item>
<p>
<bold>Peter-Clark (PC):</bold> A <bold>Constraint-based</bold> algorithm. This algorithm starts with the graph fully connected, then uses (conditional) independence tests to iteratively prune edges. The chi-square test with mutual information is used (<xref ref-type="bibr" rid="B62">Spirtes et al., 2000</xref>).</p>
</list-item>
<list-item>
<p>
<bold>Greedy Equivalence Search (GES):</bold> A greedy <bold>Score-based</bold> algorithm, which goes through phases of adding then removing edges where doing so increases the score, alternating until the score no longer improves (<xref ref-type="bibr" rid="B10">Chickering, 1995</xref>). A commonly used information theoretic score, the Bayesian Information Criterion (BIC), is used (<xref ref-type="bibr" rid="B53">Schwarz, 1978</xref>).</p>
</list-item>
<list-item>
<p>
<bold>OrderMCMC:</bold> A <bold>Hybrid</bold> algorithm. This optimizes a score in the space of topological node orderings, rather than DAGs. This is an implementation of Markov Chain Monte Carlo method in the topological node ordering space (<xref ref-type="bibr" rid="B17">Friedman and Koller, 2003</xref>), based on modifications proposed by <xref ref-type="bibr" rid="B34">Kuipers et al. (2018)</xref>. Each order is scored by summing the maximum score for each node, out of parent sets falling in the intersection of those permitted by the order and those in a candidate set. This set is typically initialized using a constraint-based method, such as PC, however we instead use pairwise mutual information in an effort to decouple the algorithm&#x2019;s performance from that of <italic>P</italic>C. This candidate parent set is greedily expanded to improve recall. This combination of constrained global search with greedy means performance should be lower bounded by GES, but at much higher computation cost. For consistency with the GES algorithm the BIC score is used. We were also interested in a novel application of OrderMCMC using a recently developed score&#x2013;the Quotient Normalized Maximum Likelihood (qNML) score; we included this to demonstrate differences due to choice of score (<xref ref-type="bibr" rid="B60">Silander et al., 2018</xref>).</p>
</list-item>
</list>
</p>
<p>These algorithms were selected in order to include one of each constraint-based, score-based, and hybrid structure learning algorithms. PC and GES were used as the constraint-based and score-based representatives as they are easily available algorithms, in terms of implementation (<xref ref-type="bibr" rid="B54">Scutari, 2009</xref>; <xref ref-type="bibr" rid="B28">Kalainathan and Goudet, 2019</xref>). OrderMCMC was used as the hybrid representative as it is the algorithm which is currently employed for the real-world examples in <xref ref-type="sec" rid="s3">Section 3</xref>. The OrderMCMC implementation is currently proprietary; access can be granted from the authors on a per-request basis.</p>
<sec id="s2-5-1">
<title>2.5.1 Score Functions</title>
<p>Score-based and hybrid structure learning algorithms&#x2019; performance is highly dependent on choice of score function. While equivalent in the infinite data limit, the qNML and BIC scores differ significantly for small sample sizes (<xref ref-type="bibr" rid="B60">Silander et al., 2018</xref>). This is due to the difference in penalization; while both are based on the minimum description length (MDL) principle, they take differing approaches. The BIC takes a Bayesian approach to the penalization, using the number of parameters scaled by the log of the sample size (<xref ref-type="bibr" rid="B53">Schwarz, 1978</xref>); while the qNML is based on the NML (Normalized Maximum Likelihood), an exact formulation of the minimax code length regret (<xref ref-type="bibr" rid="B23">Gr&#xfc;nwald and Grunwald, 2007</xref>). Both are score equivalent and free of tuning hyper-parameters.</p>
<p>Another score function prominent in literature is Bayesian Dirichlet equivalent uniform (BDeu) (<xref ref-type="bibr" rid="B8">Buntine, 1991</xref>), however it has been shown to be highly sensitive to its hyper-parameter, the effective sample size (<xref ref-type="bibr" rid="B59">Silander et al., 2012</xref>)&#x2013;it is therefore impossible to give a reasonable estimate of performance, thus making it unsuitable for use in a datasheet.</p>
</sec>
</sec>
<sec id="s2-6">
<title>2.6 Metrics</title>
<sec id="s2-6-1">
<title>2.6.1 Structural Performance</title>
<p>Discovery of the entire causal topology through structure learning algorithms is an appealing feature of BNs in global health settings. This sets it apart from simply testing causality between a candidate cause and the outcome of interest (bivariate causal discovery), where a practitioner would be ignorant of the interaction of the system as a whole. To empirically evaluate structure learning methods with different synthetic characteristics, we measure the precision and recall of the learned structure with respect to the ground truth structure. This allowed us to separate errors into learning false edges vs. not identifying true edges, as opposed to quantifying aggregated structural distance measures (e.g., Structural Hamming Distance (<xref ref-type="bibr" rid="B14">de Jongh and Druzdzel, 2009</xref>)). For the same reason, we did not include a summarization of precision and recall, such as the F1 score. Having a clear separation of precision and recall is important in decision making; situations may arise where practitioners must favor one over the other, and the two are often a trade-off.</p>
<p>Structure learning algorithms estimate a DAG up to the equivalence class (CPDAG). Therefore, we do not calculate the precision and recall with respect to the true DAG, but the learned skeleton and V-structures to their ground truth counterparts. Evaluating with respect to the DAG, while helpful from an ease-of-interpretability standpoint, introduces randomly directed edges correlated to the infrequency of V-structures. This correlation can lead to misleading hypotheses when performing experiments across many different types of structure with varying prevalence of V-structures.</p>
<p>Precision and recall for the skeleton and V-structures of a structure are calculated in the standard manner:<disp-formula id="equ3">
<mml:math id="mequ3">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>R</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>In the case of the skeleton, true positives (TPs) are the number of undirected edges which are in both the true and learnt structure. False positives (FPs) are the number of undirected arcs which are in the learnt, yet not present in the true structure. False negatives (FNs) are the number of undirected arcs which are in the true, but not in the learnt structure. For V-structures, true positives are the number of V-structures which are present in both the learnt CPDAG and the true DAG. False positives are the number of V-structures in the learnt CPDAG while not in the true DAG. False negatives are present in the true DAG, but not the learnt CPDAG.</p>
</sec>
<sec id="s2-6-2">
<title>2.6.2 Interventional Performance</title>
<p>One of the key uses of a causal Bayesian Network model is that, for a given outcome variable of interest, one can test hypothetical interventions on each variable. One can then compute the <italic>interventional</italic> odds ratio (OR) of how the outcome may change based on the intervention.<disp-formula id="equ4">
<mml:math id="mequ4">
<mml:mrow>
<mml:mi>O</mml:mi>
<mml:mi>R</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>A</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>o</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>B</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>A</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>o</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>B</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
<mml:mo>/</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>A</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>o</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>B</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>A</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>o</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>B</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mi>a</mml:mi>
<mml:mi>c</mml:mi>
</mml:mfrac>
<mml:mo>/</mml:mo>
<mml:mfrac>
<mml:mi>b</mml:mi>
<mml:mi>d</mml:mi>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>The results of this intervention encompass both the causal structure learned and the parameters (estimated by Maximum Likelihood) of the conditional probability tables at each variable. We calculate the standard error for the odds ratios by:<disp-formula id="equ5">
<mml:math id="mequ5">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2b;</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2b;</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2b;</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:math>
</disp-formula>where <italic>N</italic> is the number of samples in the training data. 95% Confidence intervals are then obtained by <inline-formula id="inf44">
<mml:math id="minf44">
<mml:mrow>
<mml:mtext>log</mml:mtext>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>O</mml:mi>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>1.96</mml:mn>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>E</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>In our Causal Datasheet we also wish to estimate how well we can approximate the true impact of interventions. A metric has been developed to measure the <italic>proportion of correct interventional odds ratios</italic> (PCOR) to quantify the impact of different learned structures on the interventional odds ratios. The measure was designed to answer the question practitioners face: how trustworthy should these interventional odds ratios be with my dataset?</p>
<p>We calculate this metric by first splitting odds ratios into three types of effect: Protective (less than 1), detrimental (greater than 1), and neutral where the confidence interval crosses 1 (<xref ref-type="fig" rid="F5">Figure 5</xref>). This is represented by the piecewise function in <xref ref-type="disp-formula" rid="e2">Eq. 2</xref>. The piecewise function is then used within PCOR (<xref ref-type="disp-formula" rid="e1">Eq. 1</xref>) to calculate the proportion of correctly categorized ORs.<disp-formula id="e1">
<mml:math id="me1">
<mml:mrow>
<mml:mtext>PCOR</mml:mtext>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>O</mml:mi>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>O</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:msup>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mtext>&#x200b;</mml:mtext>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>O</mml:mi>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mtext>max</mml:mtext>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>O</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:msup>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mtext>&#x200b;</mml:mtext>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>O</mml:mi>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mi>f</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>O</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
<disp-formula id="e2">
<mml:math id="me2">
<mml:mrow>
<mml:mtext>with</mml:mtext>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>O</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr columnalign="left">
<mml:mtd columnalign="left">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mrow>
<mml:mtext>if&#xa0;protective</mml:mtext>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr columnalign="left">
<mml:mtd columnalign="left">
<mml:mn>0</mml:mn>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mrow>
<mml:mtext>if&#xa0;neutral</mml:mtext>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr columnalign="left">
<mml:mtd columnalign="left">
<mml:mn>1</mml:mn>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mrow>
<mml:mtext>if&#xa0;detrimental</mml:mtext>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>where <italic>O</italic> is the set of odds ratios obtained by performing all possible interventions on target <italic>T</italic> on the true BN <italic>B</italic> and <inline-formula id="inf45">
<mml:math id="minf45">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>O</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> are the corresponding odds ratio estimates from the learnt BN <inline-formula id="inf46">
<mml:math id="minf46">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>B</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>. For a synthetic Bayesian network, the target is heuristically selected as the variable with the maximum number of ancestors.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Illustration demonstrating the three categories of odds ratio <italic>types</italic>. Protective effects are within the bounds of the yellow Section, Detrimental within red, and neutral where the confidence intervals intersect 1 (examples within the purple boxes).</p>
</caption>
<graphic xlink:href="frai-04-612551-g005.tif"/>
</fig>
<p>Due to variation in importance of interventions vs. outcomes, we allow the user to specify a threshold of PCOR. Recommendations from the Causal Datasheet should then be based on whether this threshold was met. Ultimately, because PCOR relies on both the structure and parameters of the network being learnt sufficiently well, these do not need to be individually assessed.</p>
</sec>
</sec>
<sec id="s2-7">
<title>2.7 Causal Datasheet for Datasets</title>
<p>There are two types of Causal Datasheets one may follow dependent on usage: a <bold>Causal Datasheet for Data Collection</bold> and a <bold>Causal Datasheet for Existing Datasets</bold>. CDG-T is designed to allow the adjustment of numerous characteristics of synthetic data to mimic those of a real-world dataset. For existing datasets, as well as the user-defined characteristics, the structure types are varied in order to capture variation in performance due to differing causal structures. For data collection, the sample size and number of variables are also varied so a user could decide what combinations of sample size and number of variables (and if applicable, structural learning algorithm) best meet the user&#x2019;s analytic needs in a lookup table.</p>
<sec id="s2-7-1">
<title>2.7.1 Causal Datasheet for Data Collection</title>
<p>If researchers are designing their own survey or evaluation instrument, determining the sample size is a critical step. Researchers want to include a sufficient number of samples so that they can have confidence in their model results, but do not want to waste time, money, or effort by collecting unnecessarily large numbers of samples. Researchers are often constrained by budgetary concerns in low-resource settings, and adding an extra thousand samples may end up costing thousands of extra dollars in effort.</p>
<p>In traditional public health and medical studies, a priori power analysis is the preferred tool for quantifying the samples size needed to sufficiently detect changes, treatment effects, or associations between variables (<xref ref-type="bibr" rid="B46">Pourhoseingholi et al., 2013</xref>). The Datasheet for Data Collection can fill a similar role for BN analysis. In the Datasheet for Data Collection, users can specify a range of desired, potential sample size and variable size and then estimate performance for models of interest.</p>
<p>The resulting datasheet is organized in four main sections: Recommendations, Proportion of Correct Odds Ratios, Skeleton Precision and Recall, and V-structure Precision and Recall. The <italic>recommendations</italic> Section outlines the main takeaways and suggestions from the data creator/curator who examined the expected performance across a given range of sample sizes and variable sizes, and should be treated as a guide. <italic>Proportion of Correct Odds Ratios</italic>, <italic>Skeleton</italic> and <italic>V-structure performance</italic> sections allow a user to look up combinations of sample sizes and variable sizes that would fit the analysis requirement for correctness of intervention effect, correct edges ignoring directions, and correct edges considering directions accordingly. The resulting datasheet produces surface plots so a user can explore either or both of the sample size and variable size dimensions to maximize expected (median) model learning performance given the user&#x2019;s desired study design. One should also consider the variation of measure performances - lower variation is better, as it suggests an algorithm is less sensitive to the unobserved characteristics. This is provided as Inter-Quartile Range (IQR) in tables corresponding to the surface plots. If two algorithms, and/or two combinations of sample size and variable size result in similar distributions of performance measures (median and IQR), then one may choose either one. This datasheet provides general guidance for determining the best combination of sample size and number of variables to maximize BN model performance.</p>
</sec>
<sec id="s2-7-2">
<title>2.7.2 Causal Datasheet for Existing Datasets</title>
<p>The Datasheet for Existing Datasets assists researchers in determining the suitability of using BNs to meet research objectives, given they already know their sample size and number of variables. The goal of this datasheet is to provide insight into how much confidence they should have in BN models learnt from this dataset. Additionally, researchers may use this datasheet to determine which algorithms to use, and how much of a ground truth DAG they can expect to recover. For example, researchers in global health often rely on previously deployed datasets to generate insights. A public health researcher might want to use data from an existing survey to generate causal insights around health decisions in a particular area. They could use the Datasheet for Existing Datasets to evaluate its suitability for generating insights and to inform feature engineering decisions.</p>
<p>This type of Causal Datasheet starts with the data characteristics used to generate the synthetic data sets and to compute the various metrics, followed by recommendations to a potential audience who may be considering using the data set to infer causal relationships. This leading section outlines the main takeaways and suggestions from the data creator/curator who examined the data set. The main body of the datasheet is then broken down into 1. Correctness of causal effects, 2. Learning the Skeleton, 3. Learning the Direction, 4. Improving with More Samples, and 5. Improving with Less Variables. The goal is to offer the potential data user the best guesses of expected performance from different perspectives, recognizing that different applications may call for choosing an algorithm or feature engineering approach optimized for different measures. These measures are described above and are on a scale from 0 to 1, with 1 being perfect. They are depicted with violin plots to aid assessment of expected uncertainties given a measure. If there are multiple &#x201c;bumps&#x201d; in the violin plots, in our experience, this is because of different structural learning performance as a result of learning different data sets generated with different structure types. If desired, performance stratification by structure type and further investigation may be warranted; however, this is beyond the scope of the current work. In <italic>Correctness of Causal Effects</italic>, the PCOR metric is used as an attempt to estimate how well we can approximate intervention impact using different structural learning algorithms, where 1 represents that all protective, neutral and detrimental intervention effects are likely to be correctly captured. In the <italic>Learning the Skeleton</italic> Section, the precision and recall of edges, ignoring the directions, are presented. 
In the <italic>Learning the Direction</italic> Section, edge directions (as V-structures) are also considered in the precision and recall. Skeleton learning performance measures are usually better than V-structure learning performance; if one is not too concerned with learning the causal directions, one may be satisfied with good skeleton learning performance alone. Moreover, one may care more about recall over precision (e.g., in an exploratory study aiming to identify all potential relationships between variables) or vice versa. Lastly, the Improving with More Samples and Improving with Less Variables sections show how much more data or how many variables to reduce to improve structure learning performance for different algorithms, assuming the relevant causal statistics are not degraded by the reduction of variables in the latter.</p>
</sec>
</sec>
<sec id="s2-8">
<title>2.8 Meta-Feature Similarity Between Synthetic Datasets and Existing Dataset of Interest</title>
<p>Our approach makes the assumption that we could tune the characteristics of the synthetic datasets such that the synthetic datasets are similar to the dataset of interest, and thus it is reasonable to suggest that expected performance on the dataset of interest could be approximated by metrics computed on the synthetic datasets. How similar are the synthetic datasets to the existing dataset of interest? We should not compare them using only data characteristics that are used to generate the synthetic datasets. Instead, we borrow the concept of defining meta-features and computing dataset similarities from the Bayesian optimization hyperparameter initialization literature (<xref ref-type="bibr" rid="B70">Wistuba et al., 2015</xref>). For categorical datasets, information theoretic meta-features (i.e., Shannon&#x2019;s entropy and Concentration Coefficient) are computed for both the real and synthetic datasets (<xref ref-type="bibr" rid="B40">Michie et al., 1994</xref>; <xref ref-type="bibr" rid="B2">Alexandros and Melanie, 2001</xref>). The similarity between them then is computed as the mean cosine similarity, with 0 being completely dissimilar and 1 being identical. For additional details, please refer to the Supplementary Material.</p>
</sec>
</sec>
<sec id="s3">
<title>3 Results</title>
<sec id="s3-1">
<title>3.1 Causal Datasheet for Data Collection Example: Survey Design of a Study of Sexual and Reproductive Health</title>
<p>We first used a Datasheet for Data Collection, generated using CDG-T, to determine the appropriate sample size of a survey we deployed in Madhya Pradesh, India (<xref ref-type="sec" rid="s9">Supplementary Material A</xref>). In 2019, we had the opportunity to use the CDG-T to determine the sample size of a large-scale survey of sexual and reproductive health (SRH) we conducted in Madhya Pradesh, India. Determining sample size of this study was important because it had implications for the overall budget and timeline of our project. Typically, we wish to have a survey capture as many variables as possible (provided the survey is not too long) with as few samples as possible. Our survey sought to quantify a wide range of causal drivers around family planning decisions. These variables included demographics, knowledge and beliefs, risk perceptions, past experiences, and structural determinants such as accessibility. We estimated that we would have between 30&#x2013;60 variables that would be critical causal drivers of sexual and reproductive health decisions. From previous work, we estimated that causal variables would have, on average, three levels. We decided to use the Datasheet for Data Collection to determine model performance for between 5,000 and 15,000 survey respondents before commissioning the field study. While we had determined that 5,000 respondents was likely a large enough sample to have sufficient power for predictive regression models, we did not know whether this sample size would have sufficient performance for a causal Bayesian network model. The range of 5,000 to 15,000 samples represented the budget constraints of our survey.</p>
<p>For simplicity, we varied the potential number of variables to be included in the model and the potential sample sizes while keeping the other synthetic data properties constant (<xref ref-type="table" rid="T3">Table 3</xref>).</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>The values used for each property when creating synthetic BNs (and their associated datasets) for the SRH datasheet. Combinatorial total of 175 property combinations. Each configuration of properties was repeated 10 times. In total, 1,750 Bayesian networks and datasets were created.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Property</th>
<th align="center">Values</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Variables</td>
<td align="center">30, 35, 40, 45, 50, 55, 60</td>
</tr>
<tr>
<td align="left">Samples</td>
<td align="center">5,000, 7,500, 10,000, 12,500, 15,000</td>
</tr>
<tr>
<td align="left">Average levels</td>
<td align="center">3</td>
</tr>
<tr>
<td align="left">Structure types</td>
<td align="center">Forest fire, barabasi-albert, IC-DAG, waxman, Small-world</td>
</tr>
<tr>
<td align="left">Maximum in-degree</td>
<td align="center">Uncapped</td>
</tr>
<tr>
<td align="left">&#x03B1;</td>
<td align="center">20</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The datasheet revealed insights around the optimal sample size for our study. We found that, in general, the OrderMCMC algorithm was the best for PCOR, skeleton precision and recall, and V-structure precision (<xref ref-type="table" rid="T4">Table 4</xref>) and recall (<xref ref-type="table" rid="T5">Table 5</xref>). When comparing model performance metrics, we found that 5,000 samples would likely not be enough to build robust BN models for designing interventions because, across all numbers of variables, the V-structure recall was low (&#x3c;0.42, GES and PC) or was high but had high IQR with both OrderMCMC instantiations <inline-formula id="inf47">
<mml:math id="minf47">
<mml:mo>&#x3e;</mml:mo>
</mml:math>
</inline-formula> 0.40. Our datasheet showed that as we increased sample size, the IQR of V-structure recall for the OrderMCMC algorithm decreased. In order to have better confidence in our Bayesian network models, we determined that we would need a sample of around 15,000 respondents to balance our desire of having at least 50 variables while minimizing the IQR of V-structure recall (<xref ref-type="table" rid="T5">Table 5</xref>).</p>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Pivot Table of V-structure precision. Rows stratify by number of variables. Columns are over samples size. V-structure Precision performance is provided as: <italic>Median (IQR)</italic>. Highest precision in each sample/variable combination is in bold.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Number of variables</th>
<th align="center">Algorithm</th>
<th align="center">5,000</th>
<th align="center">7,500</th>
<th align="center">10,000</th>
<th align="center">12,500</th>
<th align="center">15,000</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">30</td>
<td align="left">GES</td>
<td align="center">0.37 (0.31)</td>
<td align="center">0.39 (0.31)</td>
<td align="center">0.47 (0.37)</td>
<td align="center">0.39 (0.31)</td>
<td align="center">0.41 (0.31)</td>
</tr>
<tr>
<td align="left">30</td>
<td align="left">OrderMCMC (BIC)</td>
<td align="center">
<bold>1.00</bold> (<bold>0.04)</bold>
</td>
<td align="center">
<bold>1.00</bold> (<bold>0.00)</bold>
</td>
<td align="center">1.00 (0.04)</td>
<td align="center">1.00 (0.02)</td>
<td align="center">
<bold>1.00</bold> (<bold>0.00)</bold>
</td>
</tr>
<tr>
<td align="left">30</td>
<td align="left">OrderMCMC (qNML)</td>
<td align="center">1.00 (0.65)</td>
<td align="center">
<bold>1.00</bold> (<bold>0.00)</bold>
</td>
<td align="center">
<bold>1.00</bold> (<bold>0.00)</bold>
</td>
<td align="center">
<bold>1.00</bold> (<bold>0.00)</bold>
</td>
<td align="center">
<bold>1.00</bold> (<bold>0.00)</bold>
</td>
</tr>
<tr>
<td align="left">30</td>
<td align="left">PC</td>
<td align="center">0.86 (0.24)</td>
<td align="center">0.89 (0.13)</td>
<td align="center">0.89 (0.13)</td>
<td align="center">0.89 (0.12)</td>
<td align="center">0.89 (0.13)</td>
</tr>
<tr>
<td align="left">40</td>
<td align="left">GES</td>
<td align="center">0.37 (0.24)</td>
<td align="center">0.36 (0.29)</td>
<td align="center">0.36 (0.21)</td>
<td align="center">0.38 (0.27)</td>
<td align="center">0.32 (0.37)</td>
</tr>
<tr>
<td align="left">40</td>
<td align="left">OrderMCMC (BIC)</td>
<td align="center">
<bold>1.00</bold> (<bold>0.03)</bold>
</td>
<td align="center">
<bold>1.00</bold> (<bold>0.02)</bold>
</td>
<td align="center">1.00 (0.02)</td>
<td align="center">1.00 (0.03)</td>
<td align="center">1.00 (0.02)</td>
</tr>
<tr>
<td align="left">40</td>
<td align="left">OrderMCMC (qNML)</td>
<td align="center">0.97 (0.50)</td>
<td align="center">1.00 (0.02)</td>
<td align="center">1.00 (0.00)</td>
<td align="center">1.00 (0.00)</td>
<td align="center">1.00 (0.00)</td>
</tr>
<tr>
<td align="left">40</td>
<td align="left">PC</td>
<td align="center">0.89 (0.16)</td>
<td align="center">0.87 (0.13)</td>
<td align="center">0.88 (0.15)</td>
<td align="center">0.89 (0.14)</td>
<td align="center">0.88 (0.16)</td>
</tr>
<tr>
<td align="left">50</td>
<td align="left">GES</td>
<td align="center">0.32 (0.20)</td>
<td align="center">0.33 (0.32)</td>
<td align="center">0.35 (0.25)</td>
<td align="center">0.32 (0.28)</td>
<td align="center">0.34 (0.30)</td>
</tr>
<tr>
<td align="left">50</td>
<td align="left">OrderMCMC (BIC)</td>
<td align="center">0.98 (0.04)</td>
<td align="center">1.00 (0.03)</td>
<td align="center">1.00 (0.03)</td>
<td align="center">0.99 (0.04)</td>
<td align="center">1.00 (0.03)</td>
</tr>
<tr>
<td align="left">50</td>
<td align="left">OrderMCMC (qNML)</td>
<td align="center">0.97 (0.61)</td>
<td align="center">0.99 (0.06)</td>
<td align="center">1.00 (0.01)</td>
<td align="center">1.00 (0.01)</td>
<td align="center">1.00 (0.00)</td>
</tr>
<tr>
<td align="left">50</td>
<td align="left">PC</td>
<td align="center">0.86 (0.16)</td>
<td align="center">0.87 (0.12)</td>
<td align="center">0.88 (0.12)</td>
<td align="center">0.88 (0.16)</td>
<td align="center">0.88 (0.15)</td>
</tr>
<tr>
<td align="left">60</td>
<td align="left">GES</td>
<td align="center">0.34 (0.20)</td>
<td align="center">0.34 (0.22)</td>
<td align="center">0.32 (0.24)</td>
<td align="center">0.30 (0.26)</td>
<td align="center">0.34 (0.25)</td>
</tr>
<tr>
<td align="left">60</td>
<td align="left">OrderMCMC (BIC)</td>
<td align="center">
<bold>0.98</bold> (<bold>0.03)</bold>
</td>
<td align="center">
<bold>0.99</bold> (<bold>0.03)</bold>
</td>
<td align="center">
<bold>1.00</bold> (<bold>0.02)</bold>
</td>
<td align="center">0.98 (0.03)</td>
<td align="center">0.99 (0.03)</td>
</tr>
<tr>
<td align="left">60</td>
<td align="left">OrderMCMC (qNML)</td>
<td align="center">0.93 (0.50)</td>
<td align="center">0.98 (0.23)</td>
<td align="center">
<bold>1.00</bold> (<bold>0.02)</bold>
</td>
<td align="center">
<bold>1.00</bold> (<bold>0.02)</bold>
</td>
<td align="center">
<bold>1.00</bold> (<bold>0.02)</bold>
</td>
</tr>
<tr>
<td align="left">60</td>
<td align="left">PC</td>
<td align="center">0.86 (0.20)</td>
<td align="center">0.87 (0.20)</td>
<td align="center">0.86 (0.17)</td>
<td align="center">0.90 (0.17)</td>
<td align="center">0.88 (0.16)</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>Pivot Table of V-structure recall. Rows stratify by number of variables. Columns are over samples size. V-structure Recall performance is provided as: <italic>Median (IQR)</italic>. Highest recall in each sample/variable combination is in bold.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Number of variables</th>
<th align="center">Algorithm</th>
<th align="center">5,000</th>
<th align="center">7,500</th>
<th align="center">10,000</th>
<th align="center">12,500</th>
<th align="center">15,000</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">30</td>
<td align="left">GES</td>
<td align="center">0.21 (0.25)</td>
<td align="center">0.28 (0.33)</td>
<td align="center">0.30 (0.34)</td>
<td align="center">0.31 (0.25)</td>
<td align="center">0.33 (0.36)</td>
</tr>
<tr>
<td align="left">30</td>
<td align="left">OrderMCMC (BIC)</td>
<td align="center">0.94 (0.33)</td>
<td align="center">0.95 (0.28)</td>
<td align="center">0.95 (0.24)</td>
<td align="center">0.99 (0.21)</td>
<td align="center">0.99 (0.17)</td>
</tr>
<tr>
<td align="left">30</td>
<td align="left">OrderMCMC (qNML)</td>
<td align="center">
<bold>1.00</bold> (<bold>0.02</bold>)
</td>
<td align="center">
<bold>1.00</bold> (<bold>0.00</bold>)
</td>
<td align="center">
<bold>1.00</bold> (<bold>0.00</bold>)
</td>
<td align="center">
<bold>1.00</bold> (<bold>0.00</bold>)
</td>
<td align="center">
<bold>1.00</bold> (<bold>0.00</bold>)
</td>
</tr>
<tr>
<td align="left">30</td>
<td align="left">PC</td>
<td align="center">0.43 (0.49)</td>
<td align="center">0.49 (0.49)</td>
<td align="center">0.54 (0.42)</td>
<td align="center">0.56 (0.44)</td>
<td align="center">0.56 (0.45)</td>
</tr>
<tr>
<td align="left">40</td>
<td align="left">GES</td>
<td align="center">0.26 (0.19)</td>
<td align="center">0.31 (0.25)</td>
<td align="center">0.30 (0.26)</td>
<td align="center">0.33 (0.25)</td>
<td align="center">0.36 (0.23)</td>
</tr>
<tr>
<td align="left">40</td>
<td align="left">OrderMCMC (BIC)</td>
<td align="center">0.84 (0.47)</td>
<td align="center">0.91 (0.43)</td>
<td align="center">0.92 (0.43)</td>
<td align="center">0.94 (0.39)</td>
<td align="center">0.97 (0.28)</td>
</tr>
<tr>
<td align="left">40</td>
<td align="left">OrderMCMC (qNML)</td>
<td align="center">
<bold>1.00</bold> (<bold>0.20</bold>)
</td>
<td align="center">
<bold>1.00</bold> (<bold>0.03</bold>)
</td>
<td align="center">
<bold>1.00</bold> (<bold>0.03</bold>)
</td>
<td align="center">
<bold>1.00</bold> (<bold>0.00</bold>)
</td>
<td align="center">
<bold>1.00</bold> (<bold>0.00</bold>)
</td>
</tr>
<tr>
<td align="left">40</td>
<td align="left">PC</td>
<td align="center">0.37 (0.26)</td>
<td align="center">0.41 (0.33)</td>
<td align="center">0.47 (0.23)</td>
<td align="center">0.46 (0.28)</td>
<td align="center">0.48 (0.27)</td>
</tr>
<tr>
<td align="left">50</td>
<td align="left">GES</td>
<td align="center">0.25 (0.17)</td>
<td align="center">0.28 (0.13)</td>
<td align="center">0.32 (0.20)</td>
<td align="center">0.32 (0.17)</td>
<td align="center">0.36 (0.22)</td>
</tr>
<tr>
<td align="left">50</td>
<td align="left">OrderMCMC (BIC)</td>
<td align="center">0.86 (0.42)</td>
<td align="center">0.90 (0.43)</td>
<td align="center">0.91 (0.40)</td>
<td align="center">0.93 (0.39)</td>
<td align="center">0.94 (0.38)</td>
</tr>
<tr>
<td align="left">50</td>
<td align="left">OrderMCMC (qNML)</td>
<td align="center">
<bold>0.97</bold> (<bold>0.40</bold>)
</td>
<td align="center">
<bold>1.00</bold> (<bold>0.33</bold>)
</td>
<td align="center">
<bold>1.00</bold> (<bold>0.19</bold>)
</td>
<td align="center">
<bold>1.00</bold> (<bold>0.21</bold>)
</td>
<td align="center">
<bold>1.00</bold> (<bold>0.05</bold>)
</td>
</tr>
<tr>
<td align="left">50</td>
<td align="left">PC</td>
<td align="center">0.36 (0.17)</td>
<td align="center">0.38 (0.22)</td>
<td align="center">0.39 (0.31)</td>
<td align="center">0.40 (0.32)</td>
<td align="center">0.39 (0.30)</td>
</tr>
<tr>
<td align="left">60</td>
<td align="left">GES</td>
<td align="center">0.24 (0.17)</td>
<td align="center">0.27 (0.18)</td>
<td align="center">0.29 (0.17)</td>
<td align="center">0.30 (0.21)</td>
<td align="center">0.33 (0.26)</td>
</tr>
<tr>
<td align="left">60</td>
<td align="left">OrderMCMC (BIC)</td>
<td align="center">0.72 (0.39)</td>
<td align="center">0.80 (0.38)</td>
<td align="center">0.84 (0.35)</td>
<td align="center">0.84 (0.34)</td>
<td align="center">0.86 (0.35)</td>
</tr>
<tr>
<td align="left">60</td>
<td align="left">OrderMCMC (qNML)</td>
<td align="center">
<bold>0.93</bold> (<bold>0.34</bold>)
</td>
<td align="center">
<bold>0.98</bold> (<bold>0.33</bold>)
</td>
<td align="center">
<bold>0.99</bold> (<bold>0.31</bold>)
</td>
<td align="center">
<bold>0.98</bold> (<bold>0.29</bold>)
</td>
<td align="center">
<bold>1.00</bold> (<bold>0.30</bold>)
</td>
</tr>
<tr>
<td align="left">60</td>
<td align="left">PC</td>
<td align="center">0.34 (0.29)</td>
<td align="center">0.35 (0.33)</td>
<td align="center">0.35 (0.34)</td>
<td align="center">0.38 (0.35)</td>
<td align="center">0.39 (0.34)</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Fortunately, our budget constraints allowed us to expand our sample size to meet this constraint. However, in many cases, organizations operating in LMICs would not be able to treble their sample size. Here, the CDG-T also provides useful advice. For example, if our sample size remained at 5,000, reducing the number of causal variables from 60 to 30 would cause V-structure recall to increase for all algorithms and V-structure IQR to decrease. This would significantly improve confidence in the produced BN models, but with the implication that a potentially different analytical question may be necessary.</p>
<p>The CDG-T Datasheet for Data Collection provides useful information even if researchers decide that they do not want to reduce variables or increase sample size by estimating the performance of a DAG before a survey is carried out. This allows researchers to know what kind of insights and results they will be able to generate.</p>
</sec>
<sec id="s3-2">
<title>3.2 Causal Datasheet for Existing Datasets Example: Analysis of an Existing Global Health Survey (Surgo Household Dataset)</title>
<p>As the second example, we generated a Causal Datasheet for a global development dataset we administered in Uttar Pradesh, India in 2016 (<xref ref-type="bibr" rid="B61">Smittenaar et al., 2020</xref>) (<xref ref-type="sec" rid="s9">Supplementary Material B</xref>). For simplicity, we refer to this dataset as Surgo Household survey or SHH. It sought to quantify household reproductive, maternal, neonatal, and child health (RMNCH) journeys and to understand the drivers of various RMNCH behaviors. In all, we surveyed over 5,000 women on various RMNCH behaviors and outcomes. From this survey, we initially identified 41 variables we thought represented critical causal drivers of RMNCH outcomes and behaviors such as birth delivery locations and early breastfeeding initiation. We were interested in understanding which interventions might be most important for different health outcomes. While it was possible to use our datasets to generate DAGs, we could not validate their structures, nor could we assign confidence to graphs generated using different structural learning algorithms.</p>
<p>Using survey dataset characteristics, we generated synthetic dataset experiments with similar properties (<xref ref-type="table" rid="T6">Table 6</xref>). Using a method described in <xref ref-type="sec" rid="s2-8">Section 2.8</xref>, we computed information theoretic similarity between the synthetic datasets and the SHH data; the result is that they are indeed similar with a similarity score of 0.89. This is supportive of the assumption that the expected performance on the SHH data can be reasonably approximated by computing the metrics on the corresponding synthetic datasets.</p>
<table-wrap id="T6" position="float">
<label>TABLE 6</label>
<caption>
<p>The values used for each property when creating synthetic BNs (and their associated datasets) for the SHH datasheet. Combinatorial total of 80 property values. Each configuration of properties was repeated 10 times. In total, 800 Bayesian networks and datasets were created. Italicized values were only used when presenting results pertaining to increasing samples or decreasing variables.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Property</th>
<th align="center">Values</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Variables</td>
<td align="center">40, <italic>30, 20, 10</italic>
</td>
</tr>
<tr>
<td align="left">Samples</td>
<td align="center">5,000, <italic>7,500, 10,000, 12,500</italic>
</td>
</tr>
<tr>
<td align="left">Average levels</td>
<td align="center">3</td>
</tr>
<tr>
<td align="left">Structure types</td>
<td align="center">Forest fire, barabasi-albert, IC-DAG, waxman, Small-world</td>
</tr>
<tr>
<td align="left">Maximum in-degree</td>
<td align="center">Uncapped</td>
</tr>
<tr>
<td align="left">&#x03B1;</td>
<td align="center">20</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The expected BN algorithm skeleton (<xref ref-type="fig" rid="F6">Figures 6A,B</xref>), V-structure (<xref ref-type="fig" rid="F6">Figures 6C,D</xref>), and PCOR score (<xref ref-type="fig" rid="F7">Figure 7A</xref>) were then attached to the datasheet for each of the structure learning algorithms. As our primary goal was to successfully simulate interventions, we set a threshold of 0.8 on the PCOR score. Meeting this threshold would imply we could have reasonable confidence in our model and the estimates it produced.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Performance obtained running the structure learning algorithms on the datasets produced for the Surgo Household datasheet (Datasheet for Existing Datasets). The widths of the violin plots are the kernel density estimates of the distribution, the white dots represent the median, and the vertical thick and thin lines represent the IQRs and ranges respectively. <bold>(A)</bold>: Skeleton Precision; <bold>(B)</bold>: Skeleton Recall; <bold>(C)</bold>: V-structure Precision; <bold>(D)</bold>: V-structure Recall. From this figure we can see good skeleton performance can be obtained using OrderMCMC, but V-Structures are more likely to be correct (precision) when using the BIC and less likely to be missed (recall) when using the qNML score.</p>
</caption>
<graphic xlink:href="frai-04-612551-g006.tif"/>
</fig>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>
<bold>(A)</bold>: The PCOR score obtained running the structure learning algorithms on the datasets produced for the Surgo Household dataset datasheet. Performance for OrderMCMC is generally above the set threshold. There are cases where there are no paths to the target variable, causing the PCOR to be 0. <bold>(B)</bold>: The PCOR scores obtained by reducing the number of variables. Median performance passes the threshold on the OrderMCMC results when variables are <inline-formula id="inf48">
<mml:math id="minf48">
<mml:mo>&#x2264;</mml:mo>
</mml:math>
</inline-formula> 20.</p>
</caption>
<graphic xlink:href="frai-04-612551-g007.tif"/>
</fig>
<p>The outputs from the Causal Datasheet provided a number of key insights for this dataset:<list list-type="simple">
<list-item>
<p>1. While all structure learning algorithms could achieve high skeleton precision and recall (<xref ref-type="fig" rid="F6">Figures 6A,B</xref>), the OrderMCMC algorithms (with either BIC or qNML) had superior median predicted performance for V-structure precision and recall (<xref ref-type="fig" rid="F6">Figures 6C,D</xref>).</p>
</list-item>
<list-item>
<p>2. The median PCOR for OrderMCMC (qNML) (0.89) and OrderMCMC (BIC) met our threshold test of 0.8 (<xref ref-type="fig" rid="F7">Figure 7A</xref>). There are cases when the PCOR is 0 due to target variables being determined as independent during structure learning. This is not a major concern as it will be clear to a practitioner when this has occurred once the DAG has been visualized.</p>
</list-item>
<list-item>
<p>3. Decreasing the number of variables from 40 to 20 could improve the mean V-structure recall by <inline-formula id="inf49">
<mml:math id="minf49">
<mml:mrow>
<mml:mo>&#x2248;</mml:mo>
<mml:mn>0.08</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> (<xref ref-type="sec" rid="s9">Supplementary Material B</xref>) and improve PCOR (<xref ref-type="fig" rid="F7">Figure 7B</xref>).</p>
</list-item>
<list-item>
<p>4. Structure type, specifically the distribution of in-degree, has a large effect on expected performance levels, particularly on V-structure precision (<xref ref-type="fig" rid="F6">Figure 6C</xref>). This leads to a multi-modal distribution of performance where similar structures are grouped. If a practitioner can ascertain the ground truth structure type or the distribution of in-degree, even if he/she cannot ascertain the ground truth structure itself, the uncertainty of the performance estimation can be reduced.</p>
</list-item>
</list>
</p>
<p>Specifically on ground truth skeleton recovery, we found that PC may provide marginally higher performance on skeleton precision (1 vs. 0.98), but performs poorly in recall by comparison with OrderMCMC (qNML) (0.75 vs. 1). OrderMCMC (BIC) had the highest median skeleton precision at 1. OrderMCMC (qNML), PC and GES were at 0.97, 0.91 and 0.36 respectively. However, the precision with PC is less sensitive with an IQR of 0.02, whereas OrderMCMC (qNML) was 0.21 and OrderMCMC (BIC) was 0.05.</p>
<p>On ground truth V-structure recovery, OrderMCMC (BIC) performs best in terms of V-structure precision, but suffers with V-structure recall compared to OrderMCMC (qNML), particularly on structure types with higher in-degrees (forest fire and Barabasi-Albert). PC recall for V-structures is much worse than OrderMCMC (qNML) (0.37 vs. 1). Overall, GES with BIC is a poor performer for our needs, especially where V-structure recovery is concerned, despite having the same score function as OrderMCMC (BIC).</p>
<p>These insights were invaluable for decision making in the relevant context and showed us that we would need to further reduce our number of variables or seek expert input before we could have confidence in our understanding of the effects of interventions on maternal health outcomes. The results also suggested that the OrderMCMC algorithm qNML would generally provide the best overall model performance among those tested. Ultimately, given we had a clear outcome variable of interest, we used multivariate regression to select 18 out of 40 variables based on significance of regression coefficients from the original dataset; this reduction in the number of variables allowed us to have more confidence in our resulting DAG structures. It should be pointed out that there are many general-purpose feature selection schemes, but feature selection with the intent for subsequent causal structural discovery is not well understood and beyond the scope of this study (<xref ref-type="bibr" rid="B24">Guyon et al., 2007</xref>).</p>
</sec>
<sec id="s3-3">
<title>3.3 Causal Datasheet for Existing Datasets Example: ALARM</title>
<p>As a third example, a Causal Datasheet for the well-known ALARM dataset was generated for the purpose of validating the estimates being produced by CDG-T (Supplement C). The characteristics of this dataset were approximated, mimicking how a researcher might use the Causal Dataset Generation Tool. The ALARM dataset has 37 variables, and an average of 2.8 levels per variable (<xref ref-type="bibr" rid="B6">Beinlich et al., 1989</xref>). Aside from a few binary variables, most variables have ordinal values. In this test case a sample size of 5,000 was used. Synthetic BNs with similar characteristics were then generated, the exact values used can be found in <xref ref-type="table" rid="T7">Table 7</xref>. Using a method described in <xref ref-type="sec" rid="s2-8">Section 2.8</xref>, we computed information theoretic similarity between the synthetic datasets and the ALARM data; the result is that they are indeed similar with a similarity score of 0.91. This is supportive of the assumption that the expected performance on the ALARM dataset can be reasonably approximated by computing the metrics on the corresponding synthetic datasets. Experiments using these Synthetic BNs were then performed, with the results summarized in the datasheet.</p>
<table-wrap id="T7" position="float">
<label>TABLE 7</label>
<caption>
<p>The values used for each property when creating synthetic BNs (and their associated datasets) for the ALARM datasheet. Combinatorial total of five property values. Each configuration of properties was repeated 10 times. In total, 50 Bayesian networks and datasets were created.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Property</th>
<th align="center">Values</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Variables</td>
<td align="center">40</td>
</tr>
<tr>
<td align="left">Samples</td>
<td align="center">5,000</td>
</tr>
<tr>
<td align="left">Average levels</td>
<td align="center">3</td>
</tr>
<tr>
<td align="left">Structure types</td>
<td align="center">Forest fire, barabasi-albert, IC-DAG, waxman, Small-world</td>
</tr>
<tr>
<td align="left">Maximum in-degree</td>
<td align="center">Uncapped</td>
</tr>
<tr>
<td align="left">&#x03B1;</td>
<td align="center">6</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>While the synthetic datasets are similar to the ALARM dataset, they are not identical. As the ALARM dataset has a known corresponding ground-truth, it can be used to test the limitations of our current approach due to the assumptions we make when generating synthetic datasets.</p>
<p>One such assumption is that when sampling parameters from a Dirichlet distribution we have assumed <italic>&#x3b1;</italic> is uniform; this means (on average) the marginal distributions of the variables will be balanced. Imbalanced marginal distributions can degrade structure learning performance, as information supporting conditional dependence becomes more scarce with the same amount of data.</p>
<p>Comparing the CDG-T estimate generated with the uniform <italic>&#x3b1;</italic> assumption to the actual performance obtained on the ALARM dataset shows general alignment with the PC and GES algorithms, but an overestimation of performance with OrderMCMC (<xref ref-type="fig" rid="F8">Figure 8</xref>). This difference can be observed when looking at V-structure recall (<xref ref-type="fig" rid="F8">Figure 8D</xref>). Our preliminary analysis suggests that when <italic>&#x3b1;</italic> is not assumed to be uniform, such misalignment decreases. Moreover, information theoretic similarity also increases (Supplementary Material D) from 0.91 to 0.99.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>The performance obtained on CDG-T datasets vs. the ALARM dataset over 100 runs. <bold>(A)</bold>: Skeleton Precision; <bold>(B)</bold>: Skeleton Recall; <bold>(C)</bold>: V-structure Precision; <bold>(D)</bold>: V-structure Recall. There is a general alignment on the skeleton estimates, but V-Structure Recall is overestimated for the OrderMCMC methods.</p>
</caption>
<graphic xlink:href="frai-04-612551-g008.tif"/>
</fig>
</sec>
</sec>
<sec id="s4">
<title>4 Discussion</title>
<p>Having a Causal Datasheet that describes the expected performance in recovering ground truth structures for any given dataset can be tremendously valuable to both machine learning scientists and practitioners. We were particularly interested in scenarios where data characteristics are sub-optimal for data-driven causal BN learning, which is often the case for LMIC scenarios. This perspective differs from other evaluative reviews of algorithm in the sense that we are not only concerned with different structural learning algorithms&#x2019; (and score function choices&#x2019;) maximum capacity to recover the ground truth, but also how they differ in more constrained cases (<xref ref-type="bibr" rid="B47">Raghu et al., 2018</xref>). We have shown how Causal Datasheets can aid in the planning of studies that have the analytical goal of causal discovery and inference, and in analysis of studies after existing data have been collected. Our general approach of creating synthetic datasets that approximate the real-world data should accommodate other causal inference methods such as Neyman-Rubin causal models (the potential outcomes framework) in theory (<xref ref-type="bibr" rid="B52">Rubin, 2005</xref>).</p>
<p>In addition to the number of variables and sample size demonstrated in the case studies, we have also observed that extreme imbalance of levels, in-degree and structure type all affect structural learning performance. In practice, even observable characteristics may be beyond the modeler&#x2019;s control. Sometimes the sample size is restricted by resources such as survey budget or similar data have already been collected. Sometimes a variable may be very imbalanced (e.g., very few unhealthy samples vs. many healthy samples). Often, data are collected with specific questions in mind and may not contain all the right variables for another specific outcome of interest. However, upon referring to such Causal Datasheets, there may be scenarios where seemingly imperfect dataset could still yield useful insights, given a tolerable level of error. Moreover, one&#x2019;s tolerance may be different for precision and recall errors. In constrained scenarios, our results suggest that practitioners may be able to increase algorithm performances by additional feature engineering or transformation of the data by reducing the number of variables for example. However, one should be cautioned against too much data processing as it runs the risk of transforming the ground truth represented by the data as well.</p>
<p>We briefly discuss the potential impact of our Causal Datasheet work on algorithmic fairness research. <xref ref-type="bibr" rid="B18">Gebru et al. (2018)</xref> advocated that every dataset is accompanied with a datasheet to promote transparency and accountability, including to highlight if the dataset has unwanted bias toward a particular demographic. It is important for us to understand how and why demographic information, especially protected characteristics (e.g., race, gender, age), influences other variables in a dataset. Causal reasoning has recently been shown to be a powerful tool for understanding sources of algorithmic bias, and for mitigating bias in an algorithmic decision system (<xref ref-type="bibr" rid="B9">Chiappa and Isaac, 2018</xref>; <xref ref-type="bibr" rid="B39">Loftus et al., 2018</xref>; <xref ref-type="bibr" rid="B58">Sharmanska et al., 2020</xref>). Most existing causality-based algorithmic fairness methods require knowledge of the causal graph. One option is to learn causal structure from observational data. It is important to acknowledge that potentially very misleading conclusions might be drawn if incorrect causal structure is used (<xref ref-type="bibr" rid="B31">Kilbertus et al., 2020</xref>). Our Causal Datasheet can be used to help researchers and practitioners assess whether they can have confidence in the inferred structure.</p>
<sec id="s4-1">
<title>4.1 Assumptions and Limitations</title>
<p>While we believe the datasheet has utility in its current form, there are still a number of improvements to be made. Assumptions are made when building the Causal Datasheets. In order to present the results from synthetic experiments as performance which can be expected on empirical data, we must assume that synthetic data can act as a proxy for empirical data. Furthermore, the synthetic data in use can be improved upon; there may be other pertinent data characteristics that we had not considered or considered but incorrectly assumed. These include the assumed ground truth structure types, where the five graph generation algorithms used offer no guarantee of orthogonality. While using known structure types can be an advantage if a practitioner suspects their DAG may be of a particular distribution, using multiple can clearly bias results if they generate in-degree distributions which are too similar. A future direction of research would be to use generative graph models which when seeded with an initial DAG can preserve degree distribution, among other properties (<xref ref-type="bibr" rid="B36">Leskovec and Faloutsos, 2007</xref>). How to form a DAG as a seed in an unbiased and useful way is non-trivial. A high-recall DAG could be used in an attempt to provide an upper-bound on difficulty, under the assumption the in-degree distribution would be at least on par with reality. Alternatively an agreement graph, as in the Intersection-Validation paper, could be used as a seed to provide a DAG with less bias to any one structure learning algorithm. For simplicity we have also assumed that the conditional imbalance parameter applies to the entire dataset, but it is entirely possible that a real dataset has a large variance around the imbalance of parameters. Validating our tool with ALARM demonstrates there are special cases which are not yet entirely modeled in our synthetic data generation. 
The current simplifying assumption of uniform <italic>&#x3b1;</italic> values when sampling parameters from a Dirichlet distribution can clearly lead to overestimation of performance in some scenarios. Development of <italic>&#x3b1;</italic>-estimation techniques, or other methods of incorporating non-uniform <italic>&#x3b1;</italic> values is a clear next step. Some initial work can be found in the supplementary material. Introducing further modeling assumptions, whether by generative graph or <italic>&#x3b1;</italic> estimation techniques, can increase the specificity of the provided estimates. However, introducing bias in this way must be done with caution, as it could yield certain, yet incorrect, performance estimates. A method of determining whether introduced assumptions are correct, and of addressing the gap between real and synthetic data, must be developed. Some initial work on this can again be found in the supplementary material. Others have shown that algorithm outputs are sensitive to hyper-parameters specific to that algorithm. For example, BDeu is a popular score but it is highly sensitive to its only hyper-parameter, the equivalent sample size (<xref ref-type="bibr" rid="B55">Scutari, 2018</xref>). This is part of the reason we included qNML and BIC in the current study as they do not have hyper-parameters. Estimation of hyper-parameters is often not trivial and may be challenging to generalize across a spectrum of real-world data. Additionally, we have only considered BN here, which cannot accommodate cyclical causal relationships. Finally, we have assumed that the input datasets had no latent confounders and the datasets at least meet the interventional sufficiency criteria (<xref ref-type="bibr" rid="B44">Pearl, 2009</xref>; <xref ref-type="bibr" rid="B45">Peters et al., 2017</xref>), which is known to be a problem.</p>
<p>There are practical limitations as well. We had considered data with discrete variables only; however, this approach can be extended to algorithms that deal with continuous variables as well. We did not consider computational power needed for different algorithms. While we bear a faithful optimism that computation power of current hardware will increase to eventually overcome this barrier, this is a useful addition to the Causal Datasheet. Similarly, the computation time to generate the synthetic data sets is also highly dependent on the hardware. However, since it took about 1&#xa0;min to generate 50 synthetic data sets (40 variables, 5,000 samples, five structure types and 10 repetitions) on a workstation equipped with an AMD EPYC 7742 CPU and 256&#xa0;GB of RAM, we think this will not be a big problem for most in the long run as cloud computing solutions become democratized and cheaper. With ten repeats (i.e., <italic>T</italic> &#x3d; 10), our largest set of experiments had 1,750 datasets to generate, learn, and evaluate; taking around 40&#xa0;h to complete. Running times are highly dependent on configuration, as well as the machine being used, and should be selected appropriately for individual circumstances. Whether 10 repeats of each experiment are required, or are sufficient, remains unknown. While the synthetic BN and datasets generation is inexpensive (time-wise), structure learning on the data is not, and by far takes the most time of any component in our datasheet generation pipeline. For example, while data generation for the ALARM dataset takes 1&#xa0;min, the structure learning takes close to an hour. Another clear direction of future work is to perform analysis to determine when enough experiments have taken place to reach some performance convergence. 
We have made assessments based purely on precision and recall of the ground truth, ignoring the fact that our OrderMCMC implementation takes longer than PC and GES; however there may be circumstances where computational speed outweighs the benefit of accuracy gains. Real-world data often come with missingness that is either missing at random (MAR), missing completely at random (MCAR), or missing not at random (MNAR) (<xref ref-type="bibr" rid="B51">Rubin, 1976</xref>). We have developed the capacity to produce synthetic datasets with missingness. Determining missingness characteristics along with the appropriate imputation method for use in the datasheet is a future research direction. Lastly, we were inspired by the problem of inferring causality from global development datasets and have estimated the range of data characteristics subjectively in that domain. By all means, the range of data characteristics considered in this study may be very different for a different sub-domain. For example, the number of variables for agricultural data may be many more than that of a disease treatment survey. We leave these theoretical and practical limitations as potential areas for improvement to further the usage of Bayesian networks in practice.</p>
<p>In summary, a standard practice of reporting projected range of causal discovery and inference performance can help practitioners 1) during the experimental design phase, when they are interested in designing experiments with characteristics suitable for BN analysis, 2) during the analysis phase, when they are interested in choosing optimal structural learning algorithms and assigning confidence to DAGs, and 3) at the policy level, when they must justify their insights generated from BN analysis. We believe that this type of evaluation should be a vital component to a general causal discovery and inference work flow.</p>
</sec>
</sec>
</body>
<back>
<sec id="s5">
<title>Data Availability Statement</title>
<p>The original contributions presented in the study are included in the article/<xref ref-type="sec" rid="s9">Supplementary Material</xref>, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec id="s6">
<title>Author Contributions</title>
<p>All authors listed have made a substantial, direct and intellectual contribution to the work, and approved it for publication.</p>
</sec>
<sec id="s7">
<title>Funding</title>
<p>The authors declare that this study received funding from Surgo Foundation. NQ is supported by the European Research Council (ERC) funding, grant agreement No 851538, and UK EPSRC project EP/P03442X/1.</p>
</sec>
<sec sec-type="COI-statement" id="s8">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<ack>
<p>The author would like to thank Wray Buntine for early discussion of the structural learning algorithms and performance metrics. We would also like to thank the reviewers for their insightful comments, which helped us to significantly improve the paper.</p>
</ack>
<sec id="s9">
<title>Supplementary Material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/frai.2021.612551/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/frai.2021.612551/full&#x23;supplementary-material</ext-link>.</p>
<supplementary-material xlink:href="datasheet1.pdf" id="SM1" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Aguilera</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Fernandez</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Fernandez</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Rumi</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Salmeron</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Bayesian networks in environmental modeling</article-title>. <source>Environ. Model. Softw.</source> <volume>26</volume>, <fpage>1376</fpage>&#x2013;<lpage>1388</lpage>. <pub-id pub-id-type="doi">10.1016/j.envsoft.2011.06.004</pub-id> </citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Alexandros</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Melanie</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2001</year>). <article-title>Model selection via meta-learning: a comparative study</article-title>. <source>Int. J. Artif. Intelligence Tools</source> <volume>10</volume>, <fpage>525</fpage>&#x2013;<lpage>554</lpage>. <pub-id pub-id-type="doi">10.1142/S0218213001000647</pub-id> </citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Andrews</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Ramsey</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Cooper</surname>
<given-names>G. F.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Scoring Bayesian networks of mixed variables</article-title>. <source>Int. J. Data Sci. Analytics</source> <volume>6</volume>, <fpage>3</fpage>&#x2013;<lpage>18</lpage>. <pub-id pub-id-type="doi">10.1007/s41060-017-0085-7</pub-id> </citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Arora</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Boyne</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Slater</surname>
<given-names>J. J.</given-names>
</name>
<name>
<surname>Gupta</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Brenner</surname>
<given-names>D. R.</given-names>
</name>
<name>
<surname>Druzdzel</surname>
<given-names>M. J.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Bayesian networks for risk prediction using real-world data: a tool for precision medicine</article-title>. <source>Value in Health</source> <volume>22</volume>, <fpage>439</fpage>&#x2013;<lpage>445</lpage>. <pub-id pub-id-type="doi">10.1016/j.jval.2019.01.006</pub-id> </citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Barab&#xe1;si</surname>
<given-names>A. L.</given-names>
</name>
<name>
<surname>Albert</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>1999</year>). <article-title>Emergence of scaling in random networks</article-title>. <source>Science</source> <volume>286</volume>, <fpage>509</fpage>&#x2013;<lpage>512</lpage>. <pub-id pub-id-type="doi">10.1126/science.286.5439.509</pub-id> </citation>
</ref>
<ref id="B6">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Beinlich</surname>
<given-names>I. A.</given-names>
</name>
<name>
<surname>Suermondt</surname>
<given-names>H. J.</given-names>
</name>
<name>
<surname>Chavez</surname>
<given-names>R. M.</given-names>
</name>
<name>
<surname>Cooper</surname>
<given-names>G. F.</given-names>
</name>
</person-group> (<year>1989</year>). <article-title>The alarm monitoring system: a case study with two probabilistic inference techniques for belief networks</article-title>. <source>Aime</source>, <volume>89</volume>, <publisher-name>Springer</publisher-name>, <fpage>247</fpage>&#x2013;<lpage>256</lpage>. </citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Binder</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Koller</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Russell</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kanazawa</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>1997</year>). <article-title>Adaptive probabilistic networks with hidden variables</article-title>. <source>Machine Learn.</source> <volume>29</volume>, <fpage>213</fpage>&#x2013;<lpage>244</lpage>. </citation>
</ref>
<ref id="B8">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Buntine</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>1991</year>). &#x201c;<article-title>Theory refinement on Bayesian networks</article-title>,&#x201d; in <conf-name>Proceedings of the seventh conference on Uncertainty in artificial intelligence</conf-name>, <conf-loc>Los Angeles, CA</conf-loc>, <conf-date>April 13, 1991</conf-date> (<publisher-loc>Montreal, Canada</publisher-loc>: <publisher-name>Morgan Kaufmann Publishers Inc.</publisher-name>), <fpage>52</fpage>&#x2013;<lpage>60</lpage>. </citation>
</ref>
<ref id="B9">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Chiappa</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Isaac</surname>
<given-names>W. S.</given-names>
</name>
</person-group> (<year>2018</year>). <source>A causal Bayesian networks viewpoint on fairness</source>. <publisher-loc>Vienna, Austria</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>.</citation>
</ref>
<ref id="B10">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Chickering</surname>
<given-names>D. M.</given-names>
</name>
</person-group> (<year>1995</year>). &#x201c;<article-title>A transformational characterization of equivalent Bayesian network structures</article-title>,&#x201d; in <conf-name>Proceedings of the eleventh conference on Uncertainty in artificial intelligence</conf-name>, <conf-loc>San Francisco, CA</conf-loc>, <conf-date>April 13, 1995</conf-date> (<publisher-loc>Montreal, Canada</publisher-loc>: <publisher-name>Morgan Kaufmann Publishers Inc.</publisher-name>), <fpage>87</fpage>&#x2013;<lpage>98</lpage>. </citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chowkwanyun</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Bayer</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Galea</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>&#x201c;Precision&#x201d; public health &#x2013; between novelty and hype</article-title>. <source>New Engl. J. Med.</source> <volume>379</volume>, <fpage>1398</fpage>&#x2013;<lpage>1401</lpage>. <pub-id pub-id-type="doi">10.1056/NEJMp1806634</pub-id> </citation>
</ref>
<ref id="B12">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Croft</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Marshall</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Allen</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2018</year>). <source>Guide to DHS statistics Tech. Rep., the demographic and health surveys program</source>. <publisher-loc>United States</publisher-loc>: <publisher-name>Agency for International Development</publisher-name>.</citation>
</ref>
<ref id="B13">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Dawid</surname>
<given-names>A. P.</given-names>
</name>
</person-group> (<year>1992</year>). &#x201c;<article-title>Prequential analysis, stochastic complexity and Bayesian inference</article-title>,&#x201d; in <source>Bayesian Statistics</source> (<publisher-loc>London, United Kingdom</publisher-loc>: <publisher-name>Oxford University Press</publisher-name>), <fpage>109</fpage>&#x2013;<lpage>125</lpage>. </citation>
</ref>
<ref id="B14">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>de Jongh</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Druzdzel</surname>
<given-names>M. J.</given-names>
</name>
</person-group> (<year>2009</year>). &#x201c;<article-title>A comparison of structural distance measures for causal Bayesian network models recent advances</article-title>,&#x201d; in <source>Intelligent Information systems, challenging problems of science computer science series</source>, (<publisher-name>Springer</publisher-name>) <fpage>443</fpage>&#x2013;<lpage>456</lpage>. </citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Desmond-Hellmann</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Progress lies in precision</article-title>. <source>Science</source> <volume>353</volume>, <fpage>731</fpage>. <pub-id pub-id-type="doi">10.1126/science.aai7598</pub-id> </citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fabic</surname>
<given-names>S. M.</given-names>
</name>
<name>
<surname>Choi</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Bird</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>A systematic review of demographic and health surveys: data availability and utilization for research</article-title>. <source>Bull. World Health Organ.</source> <volume>90</volume>, <fpage>604</fpage>&#x2013;<lpage>612</lpage>. <pub-id pub-id-type="doi">10.2471/BLT.11.095513</pub-id> </citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Friedman</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Koller</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2003</year>). <article-title>Bayesian approach to structure discovery in Bayesian networks</article-title>. <source>Machine Learn.</source> <volume>50</volume>, <fpage>95</fpage>&#x2013;<lpage>125</lpage>. <pub-id pub-id-type="doi">10.1023/A:1020249912095</pub-id> </citation>
</ref>
<ref id="B64">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Gadetsky</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Struminsky</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Robinson</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Quadrianto</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Vetrov</surname>
<given-names>D. P.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201C;<article-title>Low-variance black-box gradient estimates for the plackett-luce distribution</article-title>&#x201d;, in <source>The Thirty-Fourth AAAI Conference on Artificial Intelligence</source> <publisher-name>AAAI Press</publisher-name>, <fpage>10126</fpage>&#x2013;<lpage>10135</lpage>.</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gebru</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Morgenstern</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Vecchione</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Vaughan</surname>
<given-names>J. W.</given-names>
</name>
<name>
<surname>Wallach</surname>
<given-names>H. M.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Datasheets for datasets</article-title>. <source>Computing Res. Repository</source>. <comment>arxiv:1803.09010</comment>. </citation>
</ref>
<ref id="B19">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Gentzel</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Garant</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Jensen</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>The case for evaluating causal models using interventional measures and empirical data</article-title>,&#x201d; in <source>Advances in Neural Information Processing Systems 32</source>. Editors <person-group person-group-type="editor">
<name>
<surname>Wallach</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Larochelle</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Beygelzimer</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>d&#x2019;Alch&#xe9;-Buc</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Fox</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Garnett</surname>
<given-names>R.</given-names>
</name>
</person-group> (<publisher-loc>Boston, MA</publisher-loc>: <publisher-name>Curran Associates, Inc.</publisher-name>), <fpage>11722</fpage>&#x2013;<lpage>11732</lpage>. </citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Glymour</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Spirtes</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Review of causal discovery methods based on graphical models</article-title>. <source>Front. Genet.</source> <volume>10</volume>, <fpage>524</fpage>. <pub-id pub-id-type="doi">10.3389/fgene.2019.00524</pub-id> </citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gogoshin</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Branciamore</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Rodin</surname>
<given-names>A. S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Synthetic data generation with probabilistic Bayesian networks</article-title>. <comment>bioRxiv</comment>. </citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Goodfellow</surname>
<given-names>I. J.</given-names>
</name>
<name>
<surname>Pouget-Abadie</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Mirza</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Warde-Farley</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Ozair</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). <article-title>Generative adversarial networks</article-title>. <comment>arXiv:1406.2661</comment>. </citation>
</ref>
<ref id="B23">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Gr&#xfc;nwald</surname>
<given-names>P. D.</given-names>
</name>
<name>
<surname>Grunwald</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2007</year>). <source>The minimum description length principle</source>. <publisher-loc>Cambridge, MA</publisher-loc>: <publisher-name>MIT press</publisher-name>.</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Guyon</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Aliferis</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Elisseeff</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>Causal feature selection</article-title>. <source>Comput. Methods Feature Selection</source>, <volume>7</volume>, <fpage>63</fpage>&#x2013;<lpage>82</lpage>. <pub-id pub-id-type="doi">10.4018/978-1-7998-5781-5.ch007</pub-id> </citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hausman</surname>
<given-names>D. M.</given-names>
</name>
<name>
<surname>Woodward</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>1999</year>). <article-title>Independence, invariance and the causal Markov condition</article-title>. <source>Br. J. Philos. Sci.</source> <volume>50</volume>, <fpage>521</fpage>&#x2013;<lpage>583</lpage>. <pub-id pub-id-type="doi">10.1093/bjps/50.4.521</pub-id> </citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Heckerman</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Geiger</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Chickering</surname>
<given-names>D. M.</given-names>
</name>
</person-group> (<year>1995</year>). <article-title>Learning Bayesian networks: the combination of knowledge and statistical data</article-title>. <source>Machine Learn.</source> <volume>20</volume>, <fpage>197</fpage>&#x2013;<lpage>243</lpage>. <pub-id pub-id-type="doi">10.1023/A:1022623210503</pub-id> </citation>
</ref>
<ref id="B72">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>V. S.</given-names>
</name>
<name>
<surname>Morris</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Jain</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ramesh</surname>
<given-names>B. M.</given-names>
</name>
<name>
<surname>Kemp</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Blanchard</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Closing the gap on institutional delivery in northern India: a case study of how integrated machine learning approaches can enable precision public health</article-title>. <source>BMJ Global Health</source> <volume>5</volume>, <fpage>e002340</fpage>. <pub-id pub-id-type="doi">10.1136/bmjgh-2020-002340</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ide</surname>
<given-names>J. S.</given-names>
</name>
<name>
<surname>Cozman</surname>
<given-names>F. G.</given-names>
</name>
</person-group> (<year>2002</year>). &#x201c;<article-title>Random generation of Bayesian networks</article-title>,&#x201d; in <source>Brazilian symposium on artificial intelligence</source> (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>366</fpage>&#x2013;<lpage>376</lpage>. </citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kalainathan</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Goudet</surname>
<given-names>O.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Causal discovery toolbox: uncover causal relationships in python</article-title>. <comment>arXiv:1903.02278</comment>. </citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Khan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Hancioglu</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Multiple indicator cluster surveys: delivering robust data on children and women across the globe</article-title>. <source>Stud. Fam. Plann.</source> <volume>50</volume>, <fpage>279</fpage>&#x2013;<lpage>286</lpage>. <pub-id pub-id-type="doi">10.1111/sifp.12103</pub-id> </citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Khoury</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Iademarco</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Riley</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Precision public health for the era of precision medicine</article-title>. <source>Am. J. Prev. Med.</source> <volume>50</volume>, <fpage>398</fpage>&#x2013;<lpage>401</lpage>. <pub-id pub-id-type="doi">10.1016/j.amepre.2015.08.031</pub-id> </citation>
</ref>
<ref id="B31">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Kilbertus</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Ball</surname>
<given-names>P. J.</given-names>
</name>
<name>
<surname>Kusner</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Weller</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Silva</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>The sensitivity of counterfactual fairness to unmeasured confounding</article-title>,&#x201d; in <conf-name>Proceedings of The 35th Uncertainty in Artificial Intelligence Conference</conf-name>, <conf-date>July 2019</conf-date>, <conf-loc>Tel Aviv</conf-loc>. Editors <person-group person-group-type="editor">
<name>
<surname>Adams</surname>
<given-names>R. P.</given-names>
</name>
<name>
<surname>Gogate</surname>
<given-names>V.</given-names>
</name>
</person-group> (<publisher-loc>Tel Aviv, Israel</publisher-loc>: <publisher-name>PMLR: Proceedings of Machine Learning Research</publisher-name>), <volume>Vol. 115</volume>, <fpage>616</fpage>&#x2013;<lpage>626</lpage>. </citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kingma</surname>
<given-names>D. P.</given-names>
</name>
<name>
<surname>Welling</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Auto-encoding variational bayes</article-title>. <comment>arXiv:1312.6114</comment>. </citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kleinberg</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Hripcsak</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>A review of causal inference for biomedical informatics</article-title>. <source>J. Biomed. Inform.</source> <volume>44</volume>, <fpage>1102</fpage>&#x2013;<lpage>1112</lpage>. <pub-id pub-id-type="doi">10.1016/j.jbi.2011.07.001</pub-id> </citation>
</ref>
<ref id="B34">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Kuipers</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Suter</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Moffa</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2018</year>). <source>Efficient sampling and structure learning of Bayesian networks</source>.</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kyrimi</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>McLachlan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Dube</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Fenton</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Bayesian Networks in Healthcare: the chasm between research enthusiasm and clinical adoption</article-title>. <comment>medRxiv</comment>. </citation>
</ref>
<ref id="B36">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Leskovec</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Faloutsos</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2007</year>). &#x201c;<article-title>Scalable modeling of real graphs using kronecker multiplication</article-title>,&#x201d; in <conf-name>Proceedings of the 24th International conference on machine learning</conf-name>, <conf-date>June 2007</conf-date>, <conf-loc>Corvallis, OR</conf-loc> (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>), <fpage>497</fpage>&#x2013;<lpage>504</lpage>. </citation>
</ref>
<ref id="B37">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Leskovec</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kleinberg</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Faloutsos</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2005</year>). &#x201c;<article-title>Graphs over time: densification laws, shrinking diameters and possible explanations</article-title>,&#x201d; in <conf-name>Proceedings of the eleventh ACM SIGKDD international conference on Knowledge discovery in data mining</conf-name>, <conf-date>August 2005</conf-date>, <conf-loc>Chicago, IL</conf-loc>. (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>), <fpage>177</fpage>&#x2013;<lpage>187</lpage>. </citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lewis</surname>
<given-names>F. I.</given-names>
</name>
<name>
<surname>McCormick</surname>
<given-names>B. J.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Revealing the complexity of health determinants in resource-poor settings</article-title>. <source>Am. J. Epidemiol.</source> <volume>176</volume>, <fpage>1051</fpage>&#x2013;<lpage>1059</lpage>. <pub-id pub-id-type="doi">10.1093/aje/kws183</pub-id> </citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Loftus</surname>
<given-names>J. R.</given-names>
</name>
<name>
<surname>Russell</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Kusner</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Silva</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Causal reasoning for algorithmic fairness</article-title>. <comment>arXiv:1805.05859</comment>. </citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Michie</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Spiegelhalter</surname>
<given-names>D. J.</given-names>
</name>
<name>
<surname>Taylor</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>1994</year>). <article-title>Machine learning</article-title>. <source>Neural Stat. Classification</source> <volume>13</volume>, <fpage>1</fpage>&#x2013;<lpage>298</lpage>. <pub-id pub-id-type="doi">10.1080/00401706.1995.10484383</pub-id> </citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Moglia</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Alexander</surname>
<given-names>K. S.</given-names>
</name>
<name>
<surname>Thephavanh</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Thammavong</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Sodahak</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Khounsy</surname>
<given-names>B.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>A Bayesian network model to explore practice change by smallholder rice farmers in Lao pdr</article-title>. <source>Agric. Syst.</source> <volume>164</volume>, <fpage>84</fpage>&#x2013;<lpage>94</lpage>. <pub-id pub-id-type="doi">10.1016/j.agsy.2018.04.004</pub-id> </citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<collab>Pakistan Bureau of Statistics</collab> (<year>2020</year>). <article-title>Pakistan social and living standards measurement survey (PSLM) 2018-19 national/provincial (social report). Tech. rep. Government of Pakistan</article-title>, <comment>Available at: <ext-link ext-link-type="uri" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="https://www.pbs.gov.pk/content/pakistan-social-and-living-standards-measurement">https://www.pbs.gov.pk/content/pakistan-social-and-living-standards-measurement</ext-link>
</comment> </citation>
</ref>
<ref id="B43">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Pearl</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>1995</year>). <source>From Bayesian networks to causal networks</source>. <publisher-loc>Boston, MA</publisher-loc>: <publisher-name>Springer US</publisher-name>, <fpage>157</fpage>&#x2013;<lpage>182</lpage>. </citation>
</ref>
<ref id="B44">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Pearl</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2009</year>). <source>Causality: models, reasoning and inference</source>. <edition>2nd Edn</edition>. <publisher-loc>New York, NY, USA</publisher-loc>: <publisher-name>Cambridge University Press</publisher-name>. </citation>
</ref>
<ref id="B45">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Peters</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Janzing</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Sch&#xf6;lkopf</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2017</year>). <source>Elements of causal inference: foundations and learning algorithms</source>. <publisher-loc>Cambridge, MA</publisher-loc>: <publisher-name>MIT press</publisher-name>.</citation>
</ref>
<ref id="B46">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pourhoseingholi</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Vahedi</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Rahimzadeh</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Sample size calculation in medical studies</article-title>. <source>Gastroenterol. Hepatol. Bed Bench</source> <volume>6</volume>, <fpage>14</fpage>&#x2013;<lpage>17</lpage>. </citation>
</ref>
<ref id="B47">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Raghu</surname>
<given-names>V. K.</given-names>
</name>
<name>
<surname>Poon</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Benos</surname>
<given-names>P. V.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Evaluation of causal structure learning methods on mixed data types</article-title>,&#x201d; in <conf-name>Proceedings of 2018 ACM SIGKDD workshop on causal disocvery</conf-name>, <conf-date>August 2018</conf-date>, <conf-loc>London, United Kingdom</conf-loc>. (<publisher-loc>London, UK</publisher-loc>: <publisher-name>Proceedings of Machine Learning Research</publisher-name>), <volume>92</volume>, <fpage>48</fpage>&#x2013;<lpage>65</lpage>. </citation>
</ref>
<ref id="B48">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ramanan</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Natarajan</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Causal learning from predictive modeling for observational data</article-title>. <source>Front. Big Data</source> <volume>3</volume>, <fpage>535976</fpage>. <pub-id pub-id-type="doi">10.3389/fdata.2020.535976</pub-id> </citation>
</ref>
<ref id="B49">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Requejo-Castro</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Gine-Garriga</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Perez-Foguet</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Exploring the interlinkages of water and sanitation across the 2030 agenda: a bayesian network approach</article-title>,&#x201d; in <conf-name>24th International sustainable development research society conference</conf-name>, <conf-loc>Messina, Italy</conf-loc>, <conf-date>June 13, 2018-June 15, 2018</conf-date>. </citation>
</ref>
<ref id="B50">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rezende</surname>
<given-names>D. J.</given-names>
</name>
<name>
<surname>Mohamed</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Variational inference with normalizing flows</article-title>. <comment>arXiv:1505.05770</comment>. </citation>
</ref>
<ref id="B51">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rubin</surname>
<given-names>D. B.</given-names>
</name>
</person-group> (<year>1976</year>). <article-title>Inference and missing data</article-title>. <source>Biometrika</source> <volume>63</volume>, <fpage>581</fpage>&#x2013;<lpage>592</lpage>. <pub-id pub-id-type="doi">10.1093/biomet/63.3.581</pub-id> </citation>
</ref>
<ref id="B52">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rubin</surname>
<given-names>D. B.</given-names>
</name>
</person-group> (<year>2005</year>). <article-title>Causal inference using potential outcomes</article-title>. <source>J. Am. Stat. Assoc.</source> <volume>100</volume>, <fpage>322</fpage>&#x2013;<lpage>331</lpage>. <pub-id pub-id-type="doi">10.1198/016214504000001880</pub-id> </citation>
</ref>
<ref id="B53">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Schwarz</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>1978</year>). <article-title>Estimating the Dimension of a Model</article-title>. <source>Ann. Stat.</source> <volume>6</volume>, <fpage>461</fpage>&#x2013;<lpage>464</lpage>. <pub-id pub-id-type="doi">10.1214/aos/1176344136</pub-id> </citation>
</ref>
<ref id="B54">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Scutari</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Learning bayesian networks with the bnlearn r package</article-title>. <comment>arXiv:0908.3817</comment>. </citation>
</ref>
<ref id="B55">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Scutari</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Dirichlet bayesian network scores and the maximum relative entropy principle</article-title>. <source>Behaviormetrika</source>, <volume>45</volume>, <fpage>337</fpage>&#x2013;<lpage>362</lpage>. </citation>
</ref>
<ref id="B56">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Scutari</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Graafland</surname>
<given-names>C. E.</given-names>
</name>
<name>
<surname>Guti&#xe9;rrez</surname>
<given-names>J. M.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Who learns better bayesian network structures: Accuracy and speed of structure learning algorithms</article-title>. <source>Int. J. Approximate Reasoning</source> <volume>115</volume>, <fpage>235</fpage>&#x2013;<lpage>253</lpage>. <pub-id pub-id-type="doi">10.1016/j.ijar.2019.10.003</pub-id> </citation>
</ref>
<ref id="B57">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Sgaier</surname>
<given-names>S. K.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>V. S.</given-names>
</name>
<name>
<surname>Charles</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2020</year>). <source>The case for causal AI. Stanford social innovation review (summer issue)</source>, <publisher-loc>Stanford, CA</publisher-loc>: <publisher-name>Stanford Social Innovation Review</publisher-name>, <fpage>50</fpage>&#x2013;<lpage>55</lpage>.</citation>
</ref>
<ref id="B58">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sharmanska</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Hendricks</surname>
<given-names>L. A.</given-names>
</name>
<name>
<surname>Darrell</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Quadrianto</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Contrastive examples for addressing the tyranny of the majority</article-title>. <comment>arXiv:2004.06524</comment>. </citation>
</ref>
<ref id="B59">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Silander</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Kontkanen</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Myllymaki</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>On sensitivity of the map Bayesian network structure to the equivalent sample size parameter</article-title>. <comment>arXiv:1206.5293</comment>. </citation>
</ref>
<ref id="B60">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Silander</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Lepp&#xe4;-Aho</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>J&#xe4;&#xe4;saari</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Roos</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Quotient normalized maximum likelihood criterion for learning Bayesian network structures</article-title>,&#x201d; in <conf-name>International Conference on Artificial Intelligence and Statistics</conf-name>, <conf-date>April 2018</conf-date>, <conf-loc>Playa Blanca, Lanzarote</conf-loc> <fpage>948</fpage>&#x2013;<lpage>957</lpage>. </citation>
</ref>
<ref id="B61">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Smittenaar</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Ramesh</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Jain</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Blanchard</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kemp</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Engl</surname>
<given-names>E.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Bringing greater precision to interactions between community health workers and households to improve maternal and newborn health outcomes in India</article-title>. <source>Global Health Sci. Practice</source> <volume>8</volume>, <fpage>358</fpage>&#x2013;<lpage>371</lpage>. <pub-id pub-id-type="doi">10.9745/GHSP-D-20-00027</pub-id> </citation>
</ref>
<ref id="B62">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Spirtes</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Glymour</surname>
<given-names>C. N.</given-names>
</name>
<name>
<surname>Scheines</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2000</year>). <source>
<italic>Causation, prediction, and search</italic>. Adaptive computation and machine learning</source>. <edition>2nd Edn</edition>. <publisher-loc>Cambridge, Mass</publisher-loc>: <publisher-name>MIT Press</publisher-name>.</citation>
</ref>
<ref id="B63">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tasaki</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Sauerwine</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Hoff</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Toyoshiba</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Gaiteri</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Neto</surname>
<given-names>E. C.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Bayesian network reconstruction using systems genetics data: comparison of mcmc methods</article-title>. <source>Genetics</source> <volume>199</volume>, <fpage>973</fpage>&#x2013;<lpage>989</lpage>. <pub-id pub-id-type="doi">10.1534/genetics.114.172619</pub-id> </citation>
</ref>
<ref id="B65">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>van der Bles</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>van der Linden</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Freeman</surname>
<given-names>A. L.</given-names>
</name>
<name>
<surname>Mitchell</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Galvao</surname>
<given-names>A. B.</given-names>
</name>
<name>
<surname>Zaval</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Communicating uncertainty about facts, numbers and science</article-title>, <source>Royal Society Open Sci.</source>, <volume>6</volume>. <fpage>181870</fpage>. <pub-id pub-id-type="doi">10.1098/rsos.181870</pub-id> </citation>
</ref>
<ref id="B66">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Viinikka</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Eggeling</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Koivisto</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Intersection-validation: a method for evaluating structure learning without ground truth</article-title>,&#x201d; in <conf-name>Proceedings of the twenty-first international conference on artificial intelligence and statistics</conf-name>, <conf-date>April 2018</conf-date>, <conf-loc>Playa Blanca, Lanzarote</conf-loc>. Editors <person-group person-group-type="editor">
<name>
<surname>Storkey</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Perez-Cruz</surname>
<given-names>F.</given-names>
</name>
</person-group> (<publisher-loc>Playa Blanca, Lanzarote, Canary Islands</publisher-loc>: <publisher-name>PMLR Proceedings of Machine Learning Research</publisher-name>), <volume>Vol. 84</volume>, <fpage>1570</fpage>&#x2013;<lpage>1578</lpage>. </citation>
</ref>
<ref id="B67">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Gelfand</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2002</year>). <article-title>A simulation-based approach to Bayesian sample size determination for performance under a given model and for separating models</article-title>. <source>Qual. Eng.</source> <volume>48</volume>, <fpage>505</fpage>&#x2013;<lpage>508</lpage>. <pub-id pub-id-type="doi">10.1214/ss/1030550861</pub-id> </citation>
</ref>
<ref id="B68">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Watts</surname>
<given-names>D. J.</given-names>
</name>
<name>
<surname>Strogatz</surname>
<given-names>S. H.</given-names>
</name>
</person-group> (<year>1998</year>). <article-title>Collective dynamics of &#x2018;small-world&#x2019; networks</article-title>. <source>Nature</source> <volume>393</volume>, <fpage>440</fpage>&#x2013;<lpage>442</lpage>. <pub-id pub-id-type="doi">10.1038/30918</pub-id> </citation>
</ref>
<ref id="B69">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Waxman</surname>
<given-names>B. M.</given-names>
</name>
</person-group> (<year>1988</year>). <article-title>Routing of multipoint connections</article-title>. <source>IEEE J. Selected Areas Commun.</source> <volume>6</volume>, <fpage>1617</fpage>&#x2013;<lpage>1622</lpage>. <pub-id pub-id-type="doi">10.1109/49.12889</pub-id> </citation>
</ref>
<ref id="B70">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wistuba</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Schilling</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Schmidt-Thieme</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Learning data set similarities for hyperparameter optimization initializations</article-title>. <source>MetaSel@PKDD/ECML</source> <volume>145</volume>, <fpage>15</fpage>&#x2013;<lpage>26</lpage>. <pub-id pub-id-type="doi">10.5555/3053836.3053842</pub-id> </citation>
</ref>
<ref id="B71">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Rodrigues</surname>
<given-names>L. O.</given-names>
</name>
<name>
<surname>Narain</surname>
<given-names>N. R.</given-names>
</name>
<name>
<surname>Akmaev</surname>
<given-names>V. R.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Baicis: a novel Bayesian network structural learning algorithm and its comprehensive performance evaluation against open-source software</article-title>. <source>J. Comput. Biol.</source> <volume>27</volume>, <fpage>698</fpage>&#x2013;<lpage>708</lpage>. <pub-id pub-id-type="doi">10.1089/cmb.2019.0210</pub-id> </citation>
</ref>
</ref-list>
</back>
</article>
