<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Genet.</journal-id>
<journal-title>Frontiers in Genetics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Genet.</abbrev-journal-title>
<issn pub-type="epub">1664-8021</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fgene.2019.00239</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Genetics</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Rare Variants Imputation in Admixed Populations: Comparison Across Reference Panels and Bioinformatics Tools</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Sariya</surname> <given-names>Sanjeev</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/628773/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Lee</surname> <given-names>Joseph H.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/111668/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Mayeux</surname> <given-names>Richard</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/5388/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Vardarajan</surname> <given-names>Badri N.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Reyes-Dumeyer</surname> <given-names>Dolly</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/706093/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Manly</surname> <given-names>Jennifer J.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Brickman</surname> <given-names>Adam M.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Lantigua</surname> <given-names>Rafael</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Medrano</surname> <given-names>Martin</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Jimenez-Velazquez</surname> <given-names>Ivonne Z.</given-names></name>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/704109/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Tosto</surname> <given-names>Giuseppe</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/706109/overview"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Taub Institute for Research on Alzheimer&#x2019;s Disease and the Aging Brain, Vagelos College of Physicians and Surgeons, Columbia University</institution>, <addr-line>New York, NY</addr-line>, <country>United States</country></aff>
<aff id="aff2"><sup>2</sup><institution>The Gertrude H. Sergievsky Center, College of Physicians and Surgeons, Columbia University</institution>, <addr-line>New York, NY</addr-line>, <country>United States</country></aff>
<aff id="aff3"><sup>3</sup><institution>Department of Neurology, College of Physicians and Surgeons, New York-Presbyterian Hospital, Columbia University Medical Center</institution>, <addr-line>New York, NY</addr-line>, <country>United States</country></aff>
<aff id="aff4"><sup>4</sup><institution>Medicine College of Physicians and Surgeons, and The Department of Epidemiology, School of Public Health, Columbia University</institution>, <addr-line>New York, NY</addr-line>, <country>United States</country></aff>
<aff id="aff5"><sup>5</sup><institution>School of Medicine, Pontificia Universidad Catolica Madre y Maestra</institution>, <addr-line>Santiago</addr-line>, <country>Dominican Republic</country></aff>
<aff id="aff6"><sup>6</sup><institution>Department of Medicine, Geriatrics Program, University of Puerto Rico School of Medicine</institution>, <addr-line>San Juan</addr-line>, <country>Puerto Rico</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Vinicius Maracaja-Coutinho, Universidad de Chile, Chile</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Peng Zhang, Johns Hopkins University, United States; Daniela Albrecht-Eckardt, BioControl Jena GmbH, Germany</p></fn>
<corresp id="c001">&#x002A;Correspondence: Giuseppe Tosto, <email>gt2260@cumc.columbia.edu</email></corresp>
<fn fn-type="other" id="fn002"><p>This article was submitted to Bioinformatics and Computational Biology, a section of the journal Frontiers in Genetics</p></fn></author-notes>
<pub-date pub-type="epub">
<day>03</day>
<month>04</month>
<year>2019</year>
</pub-date>
<pub-date pub-type="collection">
<year>2019</year>
</pub-date>
<volume>10</volume>
<elocation-id>239</elocation-id>
<history>
<date date-type="received">
<day>06</day>
<month>11</month>
<year>2018</year>
</date>
<date date-type="accepted">
<day>04</day>
<month>03</month>
<year>2019</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2019 Sariya, Lee, Mayeux, Vardarajan, Reyes-Dumeyer, Manly, Brickman, Lantigua, Medrano, Jimenez-Velazquez and Tosto.</copyright-statement>
<copyright-year>2019</copyright-year>
<copyright-holder>Sariya, Lee, Mayeux, Vardarajan, Reyes-Dumeyer, Manly, Brickman, Lantigua, Medrano, Jimenez-Velazquez and Tosto</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<sec><title>Background</title>
<p>Imputation has become a standard approach in genome-wide association studies (GWAS) to infer <italic>in silico</italic> untyped markers. Although feasibility for common variants imputation is well established, we aimed to assess rare and ultra-rare variants&#x2019; imputation in an admixed Caribbean Hispanic population (CH).</p>
</sec>
<sec><title>Methods</title>
<p>We evaluated imputation accuracy in CH (<italic>N</italic> = 1,000), focusing on rare (0.1% &#x2264; minor allele frequency (MAF) &#x2264; 1%) and ultra-rare (MAF &#x003C; 0.1%) variants. We used two reference panels, the Haplotype Reference Consortium (HRC; <italic>N</italic> = 27,165) and 1000 Genome Project (1000G phase 3; <italic>N</italic> = 2,504) and multiple phasing (SHAPEIT, Eagle2) and imputation algorithms (IMPUTE2, MaCH-Admix). To assess imputation quality, we reported: (a) high-quality variant counts according to imputation tools&#x2019; internal indexes (e.g., IMPUTE2 &#x201C;Info&#x201D; &#x2265; 80%); (b) Wilcoxon Signed-Rank Test comparing imputation quality for genotyped variants that were masked and imputed; (c) Cohen&#x2019;s kappa coefficient to test agreement between imputed and whole-exome sequencing (WES) variants; (d) imputation of G206A mutation in the <italic>PSEN1</italic> gene (ultra-rare in the general population and more frequent in CH) followed by confirmation genotyping. We also tested ancestry proportion (European, African and Native American) against WES-imputation mismatches in a Poisson regression fashion.</p>
</sec>
<sec><title>Results</title>
<p>SHAPEIT2 retrieved higher percentage of imputed high-quality variants than Eagle2 (rare: 51.02% vs. 48.60%; ultra-rare 0.66% vs. 0.65%, Wilcoxon <italic>p</italic>-value &#x003C; 0.001). SHAPEIT-IMPUTE2 employing HRC outperformed 1000G (64.50% vs. 59.17%; 1.69% vs. 0.75% for high-quality rare and ultra-rare variants, respectively, Wilcoxon <italic>p</italic>-value &#x003C; 0.001). SHAPEIT-IMPUTE2 outperformed MaCH-Admix. Compared to 1000G, HRC-imputation retrieved a higher number of high-quality rare and ultra-rare variants, despite showing lower agreement between imputed and WES variants (e.g., rare: 98.86% for HRC vs. 99.02% for 1000G). High Kappa (<italic>K</italic> = 0.99) was observed for both reference panels. Twelve G206A mutation carriers were imputed and all validated by confirmation genotyping. African ancestry was associated with higher imputation errors for uncommon and rare variants (<italic>p</italic>-value &#x003C; 1e-05).</p>
</sec>
<sec><title>Conclusion</title>
<p>Reference panels with larger numbers of haplotypes can improve imputation quality for rare and ultra-rare variants in admixed populations such as CH. Ethnic composition is an important predictor of imputation accuracy, with higher African ancestry associated with poorer imputation accuracy.</p>
</sec>
</abstract>
<kwd-group>
<kwd>rare variants</kwd>
<kwd>imputation</kwd>
<kwd>admixed population</kwd>
<kwd>GWAS</kwd>
<kwd>1000G</kwd>
</kwd-group>
<contract-sponsor id="cn001">National Institutes of Health<named-content content-type="fundref-id">10.13039/100000002</named-content></contract-sponsor>
<contract-sponsor id="cn002">BrightFocus Foundation<named-content content-type="fundref-id">10.13039/100006312</named-content></contract-sponsor>
<counts>
<fig-count count="2"/>
<table-count count="4"/>
<equation-count count="0"/>
<ref-count count="36"/>
<page-count count="10"/>
<word-count count="0"/>
</counts>
</article-meta>
</front>
<body>
<sec><title>Introduction</title>
<p>Genome-wide association studies (GWASs) are a major tool to identify common variants associated with complex diseases. GWAS can include 550 K to over 2 M Single Nucleotide Polymorphisms (SNPs) (<xref ref-type="bibr" rid="B10">Ha et al., 2014</xref>) to cover the human genome evenly. Although GWAS has shown to be a robust method to identify disease loci of interest, they rarely point to a causal coding variant. In fact, microarray SNP chips for GWAS are optimally designed to uncover common variants, often associated with small effect sizes mostly located in intronic and intergenic regions. The focus of genetic investigations has since shifted toward rarer alleles with larger effect sizes (<xref ref-type="bibr" rid="B9">Gibson, 2012</xref>). With the changing paradigm, imputation of rare variants has become an important topic to enhance the genome coverage in GWAS. Imputation is a process of inferring untyped SNP markers in the discovery population by using densely typed SNPs in external reference panel(s). These &#x2018;<italic>in silico</italic>&#x2019; markers increase the coverage of association tests while conducting genome-wide association analysis. In addition, large number of SNPs facilitate meta-analysis when merging data from different study cohorts.</p>
<p>The quality of imputation essentially depends on two parameters: available reference datasets and algorithms that employ those reference datasets. Previous studies have shown that imputation quality depends on how well reference panels reflect the study population. To respond to the needs, the 1000 Genome project (1000G), now in its third phase release, has proven to be one of the most frequently used reference panels (<xref ref-type="bibr" rid="B8">Genomes Project et al., 2015</xref>). Using these composite reference panels, a number of studies (<xref ref-type="bibr" rid="B26">Pei et al., 2010</xref>; <xref ref-type="bibr" rid="B13">Howie et al., 2012</xref>; <xref ref-type="bibr" rid="B32">Verma et al., 2014</xref>; <xref ref-type="bibr" rid="B20">Liu et al., 2015</xref>) have compared imputation accuracy using different imputation tools and algorithms, although the results are equivocal. Few studies (<xref ref-type="bibr" rid="B4">Browning and Browning, 2009</xref>; <xref ref-type="bibr" rid="B34">Zheng et al., 2012</xref>, <xref ref-type="bibr" rid="B35">2015</xref>) assessed the impact of reference panel size and input data&#x2019;s features - such as density of SNPs - to impute rare variants, suggesting larger size of reference panels work better. <xref ref-type="bibr" rid="B30">Surakka et al. (2016)</xref> assessed accuracy of imputed SNPs by evaluating rate of false polymorphisms in a Finnish population using global reference panels &#x2013; Haplotype Reference Consortium (HRC) release 1, 1000G phase 1 and a local reference panel. They concluded that higher false positive rate was observed in imputation from global reference panels compared to imputation performed using a local panel. Other studies (<xref ref-type="bibr" rid="B15">Huang et al., 2015</xref>; <xref ref-type="bibr" rid="B6">Das et al., 2016</xref>) found imputation accuracy increases with higher number of haplotypes, specifically for variants with MAF &#x2264; 0.5%. 
For Hispanic populations, <xref ref-type="bibr" rid="B25">Nelson et al. (2016)</xref> compared imputation performances with 1000G phase 1 (<italic>N</italic> = 1,092) vs. 1000G phase 3 (<italic>N</italic> = 2,504), concluding that phase 3 improved accuracy for variants with MAF &#x003C; 1%. Further, <xref ref-type="bibr" rid="B24">Nagy et al. (2017)</xref> showed that HRC reference panel provides new insight for novel variants particularly for rare variants in a family-based Scottish study cohort. Aforementioned studies highlighted the need of a larger sized reference panel to improve imputation quality. <xref ref-type="bibr" rid="B12">Herzig et al. (2018)</xref> assessed tools for haplotype phasing and their impact on imputation in a population isolate of Campora in southern Italy, and showed that SHAPEIT2, SHAPEIT3 and EAGLE2 were highly accurate in phasing; MINIMAC3, IMPUTE4 and IMPUTE2 were found to be reliable for imputation. <xref ref-type="bibr" rid="B29">Roshyara et al. (2014)</xref> compared MaCH-Admix, IMPUTE2, MACH, MACH-Minimac in different ethnicities by evaluating accuracy of correctly imputed SNPs; MaCH-Minimac outperformed SHAPEIT-IMPUTE2 in subsamples of different ethnic groups. These studies demonstrated how the employed imputation algorithm determines the quality of inferred SNPs.</p>
<p>However, no study to our knowledge has evaluated reference panels in tandem with different imputation algorithms to assess imputation quality of inferred SNPs based on MAF in a three-way admixed population. Based on these findings, we assessed imputation quality, focusing on rare and ultra-rare variants, in a large dataset of Caribbean Hispanics (CH) leveraging available GWAS and sequencing data available for our cohort.</p>
</sec>
<sec id="s1" sec-type="materials|methods">
<title>Materials and Methods</title>
<p>We will refer to SNPs with MAF between 1 and 5% as &#x201C;uncommon,&#x201D; 0.1&#x2013;1% as &#x201C;rare,&#x201D; and &#x2264; 0.1% as &#x201C;ultra-rare.&#x201D; We considered SNPs with IMPUTE-Info metric &#x2265; 0.40 as &#x201C;good-quality&#x201D; and &#x2265; 0.80 as &#x201C;high-quality.&#x201D;</p>
<sec><title>GWAS Samples and Genotyping</title>
<p>We selected randomly 1,000 Caribbean Hispanics as part of an original genotyped cohort of 3,138 individuals: genotyped data can be downloaded at dbGaP Study Accession: phs000496.v1.p1. 719 individuals were derived from Estudio Familiar Investigar Genetica de Alzheimer (EFIGA), a study of familial LOAD; and 281 individuals from the multiethnic longitudinal cohort, Washington Heights, Inwood, Columbia Aging Project (WHICAP). The information on study design, recruitment and GWAS methods for the EFIGA and WHICAP study was previously described in <xref ref-type="bibr" rid="B31">Tosto et al. (2015)</xref>.</p>
</sec>
<sec><title>GWAS Quality Control (QC)</title>
<p>Genotyped data underwent quality control using PLINK (v1.90b4.9 64-bit) (<xref ref-type="bibr" rid="B27">Purcell et al., 2007</xref>). Briefly, we excluded SNPs with missing rate &#x2265; 5% followed by exclusion of SNPs with MAF &#x2264; 1%. We then removed SNPs with <italic>P</italic>-value &#x003C; 1e-6 for Hardy-Weinberg Equilibrium. Samples with missing call rate &#x2265; 5% were excluded from analysis.</p>
</sec>
<sec><title>Global Ancestry Estimation and Selection of &#x201C;True Hispanics&#x201D;</title>
<p>Prior to imputation, we estimated global ancestry using the ADMIXTURE (v.1.3.0) software (<xref ref-type="bibr" rid="B1">Alexander et al., 2009</xref>; <xref ref-type="bibr" rid="B36">Zhou et al., 2011</xref>). We conducted supervised admixture analyses using three reference populations: African Yoruba (YRI) and non-Hispanic white of European Ancestry (CEU) from the HAPMAP project as representative of African and European ancestral populations; and eight Surui, 21 Maya, 14 Karitiana, 14 Pima and seven Colombian individuals from the Human Genome Diversity Project (HGDP) were used to represent native American ancestry (<xref ref-type="bibr" rid="B18">Li et al., 2008</xref>). We used &#x223C;80,000 autosomal SNPs that were: (I) genotyped in all three datasets (Caribbean Hispanics, 1000G and HGDP); (II) common (i.e., MAF > 5%); and (III) in linkage equilibrium. Supervised admixture analyses with the three reference populations (YRI, CEU, and Native Americans) revealed that European lineage accounted for most of the ancestral origins (59%), followed by African (33%) and native American ancestry (8%). We then selected only individuals with at least 1% of all three ancestral populations.</p>
</sec>
<sec><title>Reference Panels</title>
<p>HRC reference panel contained over 39M SNPs from 27,165 individuals who participated in 17 different studies (<xref ref-type="table" rid="T1">Table 1</xref>). The data were downloaded from the Wellcome Trust Sanger Institute (WTSI).</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>SNP counts in HRC and 1000G reference panel.</p></caption>
<table cellspacing="5" cellpadding="5" frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left">Reference Panel</th>
<th valign="top" align="left">Individuals</th>
<th valign="top" align="left">Autosomal variants</th>
<th valign="top" align="left">Bi-allelic SNPs</th>
<th valign="top" align="left">Multi-allelic SNPs</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">1000G Phase 3</td>
<td valign="top" align="left">2,504</td>
<td valign="top" align="left">81,706,022</td>
<td valign="top" align="left">77,818,332</td>
<td valign="top" align="left">3,887,690</td>
</tr>
<tr>
<td valign="top" align="left">HRC</td>
<td valign="top" align="left">27,165<sup>&#x2217;</sup></td>
<td valign="top" align="left">39,131,600</td>
<td valign="top" align="left">39,131,600</td>
<td valign="top" align="left">NA</td></tr>
</tbody></table>
<table-wrap-foot>
<attrib><sup>&#x2217;</sup><italic>For Chromosome 1, the number of individuals was 22,691</italic>.</attrib>
</table-wrap-foot>
</table-wrap>
<p>1000G phase 3 reference panel contained over 81M SNPs from 2,504 individuals<sup><xref ref-type="fn" rid="fn01">1</xref></sup>. It includes 26 ethnic groups, with most variants rare, approximately 64 million had MAF &#x003C; 0.5%; approximately 12 million had a MAF between 0.5 and 5%; and approximately eight million have MAF > 5%. In order to perform imputation with MaCH-Admix, 1000G Phase 3 pre-formatted data were downloaded from <ext-link ext-link-type="uri" xlink:href="ftp://yunlianon:anon@rc-ns-ftp.its.unc.edu/ALL.phase3_v5.shapeit2_mvncall_integrated.noSingleton.tgz">ftp://yunlianon:anon@rc-ns-ftp.its.unc.edu/ALL.phase3_v5.shapeit2_mvncall_integrated.noSingleton.tgz</ext-link> that contained over 47M SNPs.</p>
<p>The subsequent analyses were restricted to autosomal chromosomes, only.</p>
</sec>
<sec><title>Phasing and Imputation Procedures</title>
<p>We compared SHAPEIT2 (<xref ref-type="bibr" rid="B7">Delaneau et al., 2013</xref>) and Eagle2 (<xref ref-type="bibr" rid="B21">Loh et al., 2016</xref>) by phasing and then imputing (see next section) a single chromosome (Chromosome 21), using both reference panels. We refer to SHAPEIT2 as SHAPEIT when used in tandem with IMPUTE2 for the remainder of the paper.</p>
<p>Imputation was carried out using two bioinformatics tools: IMPUTE2 (<xref ref-type="bibr" rid="B14">Howie et al., 2009</xref>) and MaCH-Admix (<xref ref-type="bibr" rid="B19">Liu et al., 2013</xref>). For both, imputation quality ranged from 0 to 1, with 0 indicating complete uncertainty in imputed genotypes, and 1 indicating no uncertainty in imputed genotypes.</p>
<sec><title>IMPUTE2 (Version 2.3.2)</title>
<p>IMPUTE2 uses an MCMC algorithm to integrate over the space of possible phase reconstructions for genotype data. We conducted imputation in non-overlapping 1MB chunk regions; chunk coordinates were specified using the &#x201C;<italic>&#x2013;int</italic>&#x201D; option. Other options were used with default parameters (Supplementary Section <xref ref-type="supplementary-material" rid="SM1">S1</xref>). Briefly, we used a default 250KB buffer region to avoid quality deterioration on the ends of chunk region. We set the &#x201C;-Ne&#x201D; value, which scales linkage disequilibrium and recombination error rate, to 2000, as suggested for robust imputation.</p>
</sec>
<sec><title>MaCH-Admix</title>
<p>We used MaCH-Admix because it uses a method based on IBS matching in a piecewise manner. The method breaks genomic region under investigation into small pieces and finds reference haplotypes that best represent every small piece, for each target individual separately. MaCH-Admix imputes in three steps: phasing, estimation of model parameter that includes error rate and recombination rate and lastly, haplotype-based imputation. MaCH-Admix (version Beta 2.0.185) was run on default parameters of 30 rounds, 100 states (&#x2013;autoFlip flag). Details can be found in Supplementary Section <xref ref-type="supplementary-material" rid="SM1">S1</xref>. We initially compared performance between MaCH-Admix and IMPUTE2 using the 1000G reference panel for Chromosome 21 only. We then proceeded to impute all remaining chromosomes with the tool that performed better.</p>
</sec></sec>
<sec><title>Imputation Performance Metrics</title>
<p>IMPUTE2 uses &#x201C;Info&#x201D; parameter to report imputation quality that measures relative statistical information about SNP allele frequency from imputed data. It reflects the information in imputed genotypes relative to the information if only the allele frequency were known. &#x201C;Info&#x201D; metric is used to filter poorly imputed SNPs from IMPUTE2 and is reported for all imputed SNPs. In addition, IMPUTE2 uses an internal metric known as R<sup>2</sup>, reported for genotyped SNPs only: it measures squared correlation between genotyped SNPs and the same SNPs that have been first masked internally and then imputed. MaCH-Admix uses <italic>Rsq</italic> to report imputation quality. The R<sup>2</sup> metric is also known as variance ratio, calculated as proportion of empirically observed variance (based on the imputation) to the expected binomial variance p(1-p), where p is the minor allele frequency. A threshold of 0.30 is recommended to filter out poorly imputed SNPs.</p>
<p>Despite quality measures from IMPUTE2 and MaCH-Admix being highly correlated (<xref ref-type="bibr" rid="B22">Marchini and Howie, 2010</xref>), we calculated a <italic>r2hat</italic> score to generate a single common metric to assess imputation quality across the software (<xref ref-type="bibr" rid="B11">Hancock et al., 2012</xref>) (v109)<sup><xref ref-type="fn" rid="fn02">2</xref></sup>.</p>
<p>We compared performance of MaCH-Admix and SHAPEIT-IMPUTE2 by: (a) Reporting raw SNP counts based on quality (MaCH-Admix &#x201C;Rsq&#x201D; and IMPUTE2 &#x201C;Info&#x201D;); (b) Comparing <italic>r2hat</italic> for overlapping imputed SNPs from both tools; (c) Conducting a Wilcoxon Signed-Rank Test (R v3.4.2) on <italic>r2hat</italic> value of overlapping SNPs.</p>
<p>We compared performance of Eagle2 and SHAPEIT2 phasing tools in tandem with IMPUTE2 as imputation tools across reference panels by: (a) Comparing their respective IMPUTE2 <italic>R</italic><sup>2</sup>: (b) Conducting a Wilcoxon Signed-Rank Test on <italic>R</italic><sup>2</sup> value; (c) Reporting raw counts of imputed SNPs based on IMPUTE2 &#x201C;Info&#x201D; metric and stratified by MAF bins (e.g., common, rare, ultra-rare).</p>
<p>In all comparisons, the MAFs are estimated from imputed data according to the reference panel employed. We retained monomorphic SNPs in our analyses for several reasons. A monomorphic SNP in one study might not be monomorphic in other cohorts. This has profound effects, for example, when performing meta-analysis across different studies. In addition, monomorphic SNPs provide information about MAF across studies. Without the information it is difficult to tell, for instance, if a SNP is monomorphic or failed quality control in that study.</p>
</sec>
<sec><title>Agreement Between Imputed and Sequence Data</title>
<p>To further test the quality of imputation &#x2013; without relying on software&#x2019;s internal metrics (i.e., &#x201C;Info&#x201D; and <italic>R</italic><sup>2</sup>) &#x2013; we calculated genotype concordance between imputed and WES data using the VCF-compare tool (v0.1.14-12-gcdb80b8) (<xref ref-type="bibr" rid="B5">Danecek et al., 2011</xref>). First, we converted posterior probabilities obtained from imputation into genotype data using the PLINK software (v1.90b4.9) by applying a threshold of 0.9 (Supplementary Section <xref ref-type="supplementary-material" rid="SM1">S1</xref>), such that SNPs that failed on this criterion were left uncalled. For example, an imputed SNP with P(<italic>G</italic> = 0,1,2) = (0.01,0.9,0.09) would be called as a &#x2018;1&#x2019; (heterozygous), whereas an imputed SNP with P(<italic>G</italic> = 0,1,2) = (0.2, 0.6, 0.2) would be left uncalled. We restricted the comparison to overlapping SNPs between HRC, 1000G reference panels and whole-exome sequencing (WES) data for Chromosome 14 only, on SNPs with 0% missingness (plink &#x2013;missing flag) in WES data. We also assessed variants&#x2019; agreement according to different MAF bins for &#x201C;high-quality&#x201D; (&#x201C;Info&#x201D; &#x2265; 0.8) SNPs. The output resulted in the number of variant &#x201C;mismatches,&#x201D; i.e., the count of alleles not matching between imputed and sequenced variants per individual. Work-flow for VCF-compare can be found in <xref ref-type="supplementary-material" rid="SM1">Supplementary Figure S1</xref>. To measure interrater reliability we computed Cohen&#x2019;s kappa coefficient (<xref ref-type="bibr" rid="B23">McHugh, 2012</xref>) for both the reference panels against WES data. Kappa coefficient &#x2264; 0 indicates no agreement, 0.01&#x2013;0.20 as none to slight, 0.21&#x2013;0.40 as fair, 0.41&#x2013;0.60 as moderate, 0.61&#x2013;0.80 as substantial, and 0.81&#x2013;1.00 as almost perfect agreement. 
Work-flow for Cohen&#x2019;s kappa coefficient calculation can be found in <xref ref-type="supplementary-material" rid="SM1">Supplementary Figure S2</xref>.</p>
</sec>
<sec><title>Effects of Ancestry on Imputation Quality</title>
<p>To assess how ancestry affected imputation quality, we conducted a Poisson regression using R. We used percentage of global ancestry (European (CEU), Native (NAT) and African (YRI)) as predictors, and total number of mismatches as the outcome; analyses were restricted to &#x201C;high-quality&#x201D; SNPs, only.</p>
</sec>
<sec><title>Imputation of G206A Mutation in PSEN1</title>
<p>To evaluate imputation performance of a specific rare variant, we examined a founder mutation, p.Gly206Ala (G206A - rs63750082) in the <italic>PSEN1</italic> gene (PSEN1-G206A) (<xref ref-type="bibr" rid="B3">Athan et al., 2001</xref>; <xref ref-type="bibr" rid="B17">Lee et al., 2015</xref>). The PSEN1-G206A mutation is a rare variant observed primarily in Puerto Ricans with familial early onset Alzheimer&#x2019;s disease (EOAD), but it is rare in Puerto Ricans and other populations with late-onset Alzheimer&#x2019;s disease (LOAD) (<xref ref-type="bibr" rid="B2">Arnold et al., 2013</xref>). The mutation was present in the 1000G phase 3 reference panel with an allele frequency of 0.001, but was absent in the HRC reference panel. To verify the imputed genotypes, individuals who were found to carry the PSEN1-G206A mutation based on 1000G-imputation were genotyped using the KASP genotyping technology by LGC genomics<sup><xref ref-type="fn" rid="fn03">3</xref></sup>, which uses allele-specific PCR for SNP calling. Agreement between imputed and genotype data for the PSEN1-G206A mutation was then assessed. We also tested the effect on imputation quality based on different IMPUTE2-parameters settings, more specifically by modifying the chunk size (i.e., 1 MB vs. 5 MB).</p>
</sec>
</sec>
<sec><title>Results</title>
<sec><title>Comparison of Phasing Tools: Eagle2 vs. SHAPEIT2</title>
<p>To select the optimal tool for phasing, we compared SHAPEIT2 with Eagle2 using Chromosome 21 with 13,066 genotyped SNPs by performing subsequent imputation with IMPUTE2 on phased outputs, and using both reference panels. We found SHAPEIT2 better than Eagle2 when evaluated based on mean <italic>R</italic><sup>2</sup> and &#x201C;Info&#x201D; metric using either the reference panels. For instance, using the 1000G, we observed higher mean <italic>R</italic><sup>2</sup> for data phased with SHAPEIT2 as compared to Eagle2 (0.92 vs. 0.91; Wilcoxon <italic>p</italic>-value &#x003C; 0.001). Similarly, when HRC panel was employed, mean <italic>R</italic><sup>2</sup> of 0.89 was observed for SHAPEIT2 against 0.85 for Eagle2 (Wilcoxon Signed-Rank test <italic>p</italic>-value &#x003C; 0.001).</p>
<p>SNP count comparison details can be found in <xref ref-type="supplementary-material" rid="SM1">Supplementary Tables S1</xref>, <xref ref-type="supplementary-material" rid="SM1">S2</xref>. Regardless of the reference panel employed, we observed higher percentage of &#x201C;high-quality&#x201D; rare and ultra-rare SNPs for SHAPEIT-IMPUTE2 than Eagle2-IMPUTE2. For instance, 1000G-imputation retrieved 51.02% of &#x201C;high-quality&#x201D; rare SNPs using SHAPEIT-IMPUTE2 vs. 48.38% with Eagle2-IMPUTE2. Detailed comparisons for different MAF bins and quality threshold can be found in Supplementary Section <xref ref-type="supplementary-material" rid="SM1">S2</xref>. Nevertheless, we found Eagle2 faster than SHAPEIT2 when computation times were compared; for instance, with HRC Eagle2 was &#x223C;6 times faster than SHAPEIT2 (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table S3</xref>). We therefore imputed the remaining chromosomes on phased output from SHAPEIT2. Comparison of phasing tools by assessing switch error rate was beyond the scope of this paper due to limited resources, e.g., availability of phased reference panel for an admixed population.</p>
</sec>
<sec><title>MaCH-Admix vs. IMPUTE2</title>
<p>We found that SHAPEIT-IMPUTE2 performed better than MaCH-Admix. For Chromosome 21, we imputed 1,104,648 and 646,594 SNPs for SHAPEIT-IMPUTE2 and MaCH-Admix, respectively; 549,091 SNPs were overlapping. For SHAPEIT-IMPUTE2 we observed 446,591 bi-allelic SNPs with &#x201C;Info&#x201D; &#x2265; 0.40, in contrast with 598,943 SNPs with Rsq &#x2265; 0.30 from MaCH-Admix (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table S4</xref>). SNP counts for different MAF bins based on platform-specific quality index can be found in <xref ref-type="supplementary-material" rid="SM1">Supplementary Table S5</xref>. When the two outputs were compared in terms of <italic>r2hat</italic>, SHAPEIT-IMPUTE2 showed a higher average <italic>r2hat</italic> of 0.62 against 0.36 from MaCH-Admix (Wilcoxon Signed-Rank test <italic>p</italic>-value &#x003C; 0.001). Also, MaCH-Admix was 109 times slower than IMPUTE2 (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table S6</xref>); thus, comparisons between different panels using MaCH-Admix were excluded due to limited resources. For the remainder of this manuscript, we focused on imputation employing SHAPEIT-IMPUTE2 only.</p>
</sec>
<sec><title>Comparison Between HRC and 1000G Using SHAPEIT-IMPUTE2</title>
<p>Using SHAPEIT-IMPUTE2, we imputed 81,240,392 and 38,532,090 SNPs across all autosomal chromosomes with 1000G and HRC reference panels, respectively (<xref ref-type="table" rid="T2">Table 2</xref>).</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Type of imputed SNPs across reference panels.</p></caption>
<table cellspacing="5" cellpadding="5" frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left">Reference Panel</th>
<th valign="top" align="center" colspan="3">Multi-allelic SNPs<hr/></th>
<th valign="top" align="center" colspan="3">Bi-allelic SNPs<hr/></th>
<th valign="top" align="center" colspan="3">Total SNPs<hr/></th>
</tr>
<tr>
<td valign="top" align="left"></td>
<th valign="top" align="left">Total SNPs</th>
<th valign="top" align="left">Info &#x2265; 0.40 (%)</th>
<th valign="top" align="left">Info &#x2265; 0.80 (%)</th>
<th valign="top" align="left">Total SNPs</th>
<th valign="top" align="left">Info &#x2265; 0.40 (%)</th>
<th valign="top" align="left">Info &#x2265; 0.80 (%)</th>
<th valign="top" align="left">Total SNPs</th>
<th valign="top" align="left">Info &#x2265; 0.40 (%)</th>
<th valign="top" align="left">Info &#x2265; 0.80 (%)</th>
</tr>
<tr>
<td valign="top" align="left" colspan="9">All SNPs</td>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">1000G</td>
<td valign="top" align="left">3,319,815</td>
<td valign="top" align="left">2,586,342 (77.90)</td>
<td valign="top" align="left">2,061,295 (62.09)</td>
<td valign="top" align="left">77,920,577</td>
<td valign="top" align="left">31,423,926 (40.32)</td>
<td valign="top" align="left">23,468,086 (30.11)</td>
<td valign="top" align="left">81,240,392</td>
<td valign="top" align="left">34,010,268 (41.86)</td>
<td valign="top" align="left">25,529,381 (31.42)</td>
</tr>
<tr>
<td valign="top" align="left">HRC</td>
<td valign="top" align="left">NA</td>
<td valign="top" align="left">NA</td>
<td valign="top" align="left">NA</td>
<td valign="top" align="left">38,532,090</td>
<td valign="top" align="left">23,436,980 (60.82)</td>
<td valign="top" align="left">18,833,790 (48.87)</td>
<td valign="top" align="left">38,532,090</td>
<td valign="top" align="left">23,436,980 (60.82)</td>
<td valign="top" align="left">18,833,790 (48.87)</td>
</tr>
<tr>
<td valign="top" align="left" colspan="9"><bold>SNPs overlapping HRC and 1000G</bold></td>
</tr>
<tr>
<td valign="top" align="left">1000G</td>
<td valign="top" align="left">NA</td>
<td valign="top" align="left">NA</td>
<td valign="top" align="left">NA</td>
<td valign="top" align="left">30,090,251</td>
<td valign="top" align="left">22,631,112 (75.21)</td>
<td valign="top" align="left">18,408,585 (61.17)</td>
<td valign="top" align="left">30,090,251</td>
<td valign="top" align="left">22,631,112 (75.21)</td>
<td valign="top" align="left">18,408,585 (61.17)</td>
</tr>
<tr>
<td valign="top" align="left">HRC</td>
<td valign="top" align="left">NA</td>
<td valign="top" align="left">NA</td>
<td valign="top" align="left">NA</td>
<td valign="top" align="left">30,090,251</td>
<td valign="top" align="left">22,438,268 (74.56)</td>
<td valign="top" align="left">18,395,036 (61.13)</td>
<td valign="top" align="left">30,090,251</td>
<td valign="top" align="left">22,438,268 (74.56)</td>
<td valign="top" align="left">18,395,036 (61.13)</td></tr>
</tbody>
</table>
</table-wrap>
<p>Overall, we observed slightly higher mean <italic>R</italic><sup>2</sup> with 1000G than with HRC panel (0.94 vs. 0.92; Wilcoxon <italic>p</italic>-value &#x003C; 0.001). Nevertheless, when the analyses were restricted to only &#x201C;good-&#x201D; and &#x201C;high-quality&#x201D; SNPs, HRC consistently performed better: 60.82% of HRC-imputed SNPs were &#x201C;good-quality&#x201D; and 48.87% were &#x201C;high-quality&#x201D; (Wilcoxon Signed-Rank test <italic>p</italic>-value &#x003C; 0.001). In contrast, 40.32% of 1000G-imputed SNPs were &#x201C;good-quality&#x201D; and 30.11% were &#x201C;high-quality.&#x201D;</p>
<p>Further, we evaluated performance for uncommon, rare and ultra-rare SNPs. For &#x201C;good-&#x201D; and &#x201C;high-quality&#x201D; SNPs, HRC outperformed 1000G. For example, HRC panel produced 62.85% of &#x201C;high-quality&#x201D; rare SNPs, whereas 1000G had 53.83% (<xref ref-type="table" rid="T3">Table 3</xref>). When average imputation &#x201C;Info&#x201D; quality was evaluated, HRC-imputation again performed better than with 1000G (Wilcoxon <italic>p</italic>-value &#x003C; 0.001) (<xref ref-type="fig" rid="F1">Figure 1</xref>).</p>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>SNP Counts for all Bi-allelic uncommon, rare and ultra-rare SNPs.</p></caption>
<table cellspacing="5" cellpadding="5" frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left">MAF</th>
<th valign="top" align="center" colspan="3">1000G<hr/></th>
<th valign="top" align="center" colspan="3">HRC<hr/></th>
</tr>
<tr>
<td valign="top" align="left"></td>
<th valign="top" align="left">Info &#x2265; 0</th>
<th valign="top" align="left">Info &#x2265; 0.40 (%)</th>
<th valign="top" align="left">Info &#x2265; 0.80 (%)</th>
<th valign="top" align="left">Info &#x2265; 0</th>
<th valign="top" align="left">Info &#x2265; 0.40 (%)</th>
<th valign="top" align="left">Info &#x2265; 0.80 (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left" colspan="7"><bold>All SNPs</bold></td></tr>
<tr>
<td valign="top" align="left">(1&#x2013;5%)</td>
<td valign="top" align="left">6,025,281</td>
<td valign="top" align="left">5,989,223 (98.90)</td>
<td valign="top" align="left">5,441,982 (90.31)</td>
<td valign="top" align="left">5,434,996</td>
<td valign="top" align="left">5,421,257 (99.74)</td>
<td valign="top" align="left">5,061,904 (93.13)</td>
</tr>
<tr>
<td valign="top" align="left">(0.1&#x2013;1%)</td>
<td valign="top" align="left">20,249,058</td>
<td valign="top" align="left">16,881,286 (83.36)</td>
<td valign="top" align="left">10,901,789 (53.83)</td>
<td valign="top" align="left">11,780,671</td>
<td valign="top" align="left">10,931,924 (92.79)</td>
<td valign="top" align="left">7,404,808 (62.85)</td>
</tr>
<tr>
<td valign="top" align="left">(0&#x2013;0.1%)</td>
<td valign="top" align="left">44,562,205</td>
<td valign="top" align="left">1,490,434 (3.34)</td>
<td valign="top" align="left">242,717 (0.544)</td>
<td valign="top" align="left">15,055,433</td>
<td valign="top" align="left">828,256 (5.50)</td>
<td valign="top" align="left">174,673 (1.16)</td>
</tr>
<tr>
<td valign="top" align="left" colspan="7"><bold>SNPs overlapping HRC and 1000G</bold></td></tr>
<tr>
<td valign="top" align="left">(1&#x2013;5%)</td>
<td valign="top" align="left">5,624,956</td>
<td valign="top" align="left">5,604,308 (99.63)</td>
<td valign="top" align="left">5,148,285 (91.52)</td>
<td valign="top" align="left">5,396,207</td>
<td valign="top" align="left">5,385,364 (99.79)</td>
<td valign="top" align="left">5,037,187 (93.34)</td>
</tr>
<tr>
<td valign="top" align="left">(0.1&#x2013;1%)</td>
<td valign="top" align="left">11,875,603</td>
<td valign="top" align="left">10,442,603 (87.93)</td>
<td valign="top" align="left">7,027,312 (59.17)</td>
<td valign="top" align="left">10,945,899</td>
<td valign="top" align="left">10,268,136 (93.80)</td>
<td valign="top" align="left">7,060,908 (64.50)</td>
</tr>
<tr>
<td valign="top" align="left">(0&#x2013;0.1%)</td>
<td valign="top" align="left">6,314,479</td>
<td valign="top" align="left">312,967 (4.95)</td>
<td valign="top" align="left">47,614 (0.75)</td>
<td valign="top" align="left">7,519,807</td>
<td valign="top" align="left">560,043 (7.44)</td>
<td valign="top" align="left">127,423 (1.69)</td></tr>
</tbody>
</table>
</table-wrap>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>Comparison of average Info quality between HRC and 1000G reference panel for all autosomal chromosomes.</p></caption>
<graphic xlink:href="fgene-10-00239-g001.tif"/>
</fig>
<p>Next, we restricted our analyses to <italic>overlapping</italic> SNPs across the two reference panels only, based on their chromosome and position mapping, reference and non-reference alleles. For &#x201C;good-&#x201D; and &#x201C;high-quality&#x201D; SNPs, imputation in both panels performed similarly (<xref ref-type="table" rid="T2">Table 2</xref>). When restricted to uncommon, rare and ultra-rare SNPs, we observed a higher percentage of &#x201C;good-&#x201D; and &#x201C;high-quality&#x201D; SNPs for HRC panel as compared to 1000G reference panel (<xref ref-type="table" rid="T3">Table 3</xref>). For example, 7.44% of HRC-imputed ultra-rare SNPs were &#x201C;good-quality&#x201D; vs. 4.95% with the 1000G. Similarly, 1.69% of HRC-imputed ultra-rare SNPs were &#x201C;high-quality&#x201D; vs. 0.75% with the 1000G. Further, Wilcoxon test on &#x201C;Info&#x201D; value of &#x201C;high-quality&#x201D; ultra-rare SNPs (2,972) again showed better performance when HRC was employed vs. 1000G (<italic>P</italic>-value &#x003C; 0.001). Complete list of counts and percentages across reference panels, MAF bins and quality score can be found in <xref ref-type="table" rid="T3">Table 3</xref>.</p>
</sec>
<sec><title>The Case of G206A and the Effect of Chromosomal Chunk Size on Imputation Quality</title>
<p>SNP rs63750082 is absent from the HRC panel; therefore, no imputation was achieved with that panel. Using 1000G reference panel, 12 individuals were imputed as G206A carriers. SNP rs63750082 was imputed with an IMPUTE2 &#x201C;Info&#x201D; score of 0.48 using 1MB as chromosomal region parameter. When we increased the chunk size to 5MB, IMPUTE-Info score drastically improved to 0.94 (<xref ref-type="fig" rid="F2">Figure 2</xref>). Those patients labeled as mutation-carriers according to imputation were then genotyped: all 12 were confirmed to be G206A carriers, therefore achieving a perfect imputation prediction (100% agreement) for that specific SNP.</p>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>Comparison of average Info on CHR14: 70&#x2013;75 MB (5 MB) vs. 73&#x2013;74 MB (1 MB) region.</p></caption>
<graphic xlink:href="fgene-10-00239-g002.tif"/>
</fig>
</sec>
<sec><title>Genotype Concordance and Kappa Coefficient</title>
<p>Out of the 1,000 individuals included in our study, 262 had whole exome sequencing (WES) data available (<xref ref-type="bibr" rid="B28">Raghavan et al., 2018</xref>). We had 14,157 overlapping SNPs in WES, HRC and 1000G reference panels with 0% missingness in WES data on Chromosome 14; SNPs imputed with each reference panel were compared against WES data separately. When concordance was evaluated, HRC panel performed slightly worse, despite showing a higher number of &#x201C;high-quality&#x201D; variants as compared to 1000G (<xref ref-type="table" rid="T4">Table 4</xref>). Using 1000G, we observed 3,542 rare and 35 ultra-rare &#x201C;high-quality&#x201D; SNPs; across 262 samples, we counted 1,245 &#x007B;[1,245/(3,542 &#x00D7; 262)] &#x00D7; 100 = 0.13%&#x007D; and 10 (0.10%) mismatches for rare and ultra-rare, respectively. Using HRC, we retrieved 3,759 rare and 93 ultra-rare &#x201C;high-quality&#x201D; variants; we observed 2,439 (0.24%) and 32 (0.13%) mismatches for rare and ultra-rare variants, respectively. Details about pipeline can be found in Supplementary Section <xref ref-type="supplementary-material" rid="SM1">S3</xref>.</p>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>Comparison for mismatch counts and Kappa (<italic>K</italic>) for HRC and 1000G using WES data on Chromosome 14.</p></caption>
<table cellspacing="5" cellpadding="5" frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left">MAF</th>
<th valign="top" align="center" colspan="4">1000G</th>
<th valign="top" align="center" colspan="4">HRC</th>
</tr>
<tr>
<td valign="top" align="left"></td>
<th valign="top" align="center" colspan="4">Info &#x2265; 0.80<hr/></th>
<th valign="top" align="center" colspan="4">Info &#x2265; 0.80<hr/></th>
</tr>
<tr>
<td valign="top" align="left"></td>
<th valign="top" align="left">SNP</th>
<th valign="top" align="left">Total SNPs in all persons<sup>&#x2217;</sup></th>
<th valign="top" align="left">Mismatch</th>
<th valign="top" align="left">Kappa <italic>(K)</italic></th>
<th valign="top" align="left">SNP</th>
<th valign="top" align="left">Total SNPs in all persons<sup>&#x2217;</sup></th>
<th valign="top" align="left">Mismatch</th>
<th valign="top" align="left">Kappa <italic>(K)</italic></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">(1&#x2013;5%)</td>
<td valign="top" align="left">2,354</td>
<td valign="top" align="left">610,550</td>
<td valign="top" align="left">7,397 (1.22%)</td>
<td valign="top" align="left">0.99</td>
<td valign="top" align="left">2,264</td>
<td valign="top" align="left">587,961</td>
<td valign="top" align="left">8,963 (1.52%)</td>
<td valign="top" align="left">0.99</td>
</tr>
<tr>
<td valign="top" align="left">(0.1&#x2013;1%)</td>
<td valign="top" align="left">3,542</td>
<td valign="top" align="left">926,109</td>
<td valign="top" align="left">1,245 (0.13%)</td>
<td valign="top" align="left">0.99</td>
<td valign="top" align="left">3,759</td>
<td valign="top" align="left">982,734</td>
<td valign="top" align="left">2,439 (0.24%)</td>
<td valign="top" align="left">0.99</td>
</tr>
<tr>
<td valign="top" align="left">(0&#x2013;0.1%)</td>
<td valign="top" align="left">35</td>
<td valign="top" align="left">9,163</td>
<td valign="top" align="left">10 (0.10%)</td>
<td valign="top" align="left">0.99</td>
<td valign="top" align="left">93</td>
<td valign="top" align="left">24,348</td>
<td valign="top" align="left">32 (0.13%)</td>
<td valign="top" align="left">0.99</td></tr>
</tbody></table>
<table-wrap-foot>
<attrib><sup>&#x2217;</sup><italic>Value is less than 262 &#x00D7; SNP count because genotypes imputed with poor posterior probability failed to be converted from .gen to PLINK format</italic>.</attrib>
</table-wrap-foot>
</table-wrap>
<p>Next, we computed Cohen&#x2019;s kappa coefficient (<italic>K</italic>) for 14,157 imputed SNPs common in WES and the two reference panels. For both HRC and 1000G-imputation, we observed Kappa (<italic>K</italic>) of &#x223C;0.99 for both rare and ultra-rare &#x201C;high-quality&#x201D; variants (<xref ref-type="table" rid="T4">Table 4</xref>). Details about pipeline can be found in Supplementary Section <xref ref-type="supplementary-material" rid="SM1">S4</xref>.</p>
</sec>
<sec><title>Effects of Ancestry on Imputation Quality</title>
<p>We evaluated the effect of individual ancestral component separately on SNP mismatches for Chromosome 14 on 262 individuals. For both reference panels we found that higher African ancestry (YRI) was associated with higher number of mismatches (<xref ref-type="supplementary-material" rid="SM1">Supplementary Table S7</xref>). For instance, with 1000G reference panel, for rare variants (&#x201C;Info&#x201D; &#x2265; 0.80), we observed an estimate of 1.46 (<italic>P</italic>-value &#x003C; 0.001) for YRI component (indicating that each unit increase in YRI ancestry results in 1.46 additional mismatches). Details on confidence intervals and robust standard errors can be found in <xref ref-type="supplementary-material" rid="SM1">Supplementary Table S7</xref> and Supplementary Section <xref ref-type="supplementary-material" rid="SM1">S5</xref>. We did not observe a significant effect of ancestry on &#x201C;high-quality&#x201D; ultra-rare variants in either panel.</p>
</sec>
</sec>
<sec><title>Discussion</title>
<p>This study examined imputation performances in a cohort of Caribbean Hispanics, focusing on uncommon, rare and ultra-rare variants, by comparing different phasing and imputation tools, as well as evaluating the effects of different reference panels. Overall, uncommon and rare variants can be well imputed in this population, characterized by a unique genetic background. Caribbean Hispanics are admixed with 59% of their genetic component from European, 32% African, and 8% Native American ancestry (<xref ref-type="bibr" rid="B31">Tosto et al., 2015</xref>). Due to their genetic makeup and unique linkage disequilibrium patterns, admixed populations offer a unique opportunity in studying complex diseases. First, disease prevalence varies across ethnic groups (<xref ref-type="bibr" rid="B16">Igartua et al., 2015</xref>) and certain admixed populations show higher incidence rates and prevalence (e.g., Alzheimer&#x2019;s disease, diabetes etc.) or lower ones (e.g., multiple sclerosis). Second, variants that are ethnic-specific may explain a higher prevalence of the disease of interest in admixed groups.</p>
<p>In the present study, we examined multiple parameters of imputation using the Caribbean Hispanics population. First, we found that imputation using SHAPEIT-IMPUTE2 phasing generated better results than Eagle2-IMPUTE2, and SHAPEIT-IMPUTE2 is superior to MaCH-Admix in terms of imputation performances and process time.</p>
<p>Using SHAPEIT-IMPUTE2, 1000G SNPs outnumbered HRC panel because of the higher number of SNPs included in the reference panel itself. However, when we restricted our analyses to overlapping &#x201C;good-&#x201D; and &#x201C;high-quality&#x201D; SNPs (i.e., those variants that most likely would be included in association analyses), HRC-imputation outperformed 1000G, with higher imputation quality. The superior performance of HRC over 1000G was confirmed also when we focused on uncommon, rare and ultra-rare SNPs only. Our findings confirm data in the literature, i.e., reference panels with a higher number of haplotypes perform better in different scenarios.</p>
<p>Additional investigations are needed in order to apply our findings to other admixed and non-admixed populations.</p>
<p>Overall, higher quality of imputation for rare and ultra-rare variants was also confirmed when we tested results against sequencing data. Finally, higher YRI global ancestry was found to significantly impair SNP imputation, suggesting that imputation quality decreases with increased African ancestry.</p>
<p>Lastly, SHAPEIT-IMPUTE2 with 1000G reference panel was successful in identifying G206A mutation carriers. We also noticed that imputation quality drastically improved when imputation was conducted using large (5MB) chunk size as compared to small (1MB) chunks. This seems to contradict a previous observation: <xref ref-type="bibr" rid="B33">Zhang et al. (2011)</xref> studied the effect of window size on imputation in an African-American cohort. They concluded that a window size of 1MB could be considered acceptable. Possible explanations for these different results might be the more complex admixture of CH compared to AA (three-way vs. two-way admixed) and a more complex LD pattern for the G206A region. Ultimately, we recommend considering a wider window size to achieve high-quality imputation in specific variants that fail under default settings.</p>
<p>This work has limitations. First, we could only carry out the comparison between the two reference panels by restricting the analyses to overlapping variants, limiting our observation to a subset of the variants included in the 1000G panel. This is a result of the HRC composition, which is composed of several studies and ended up including only a consensus number of variants. Second, we tested the agreement between imputed and sequenced variants in a smaller subset of individuals that had both GWAS and WES data available.</p>
</sec>
<sec><title>Data Availability</title>
<p>The datasets for this manuscript are not yet publicly available; the data will soon be available through the dbGaP website. Requests to access the datasets should be directed to gt2260@cumc.columbia.edu.</p>
</sec>
<sec><title>Ethics Statement</title>
<p>All participants provided written informed consent. Ethical approval for this study was obtained from the Columbia University committee.</p>
</sec>
<sec><title>Author Contributions</title>
<p>SS and GT conceived and designed the study. SS, GT, JL, BV, RM, MM, RL, IJ-V, JM, AB, and DR-D acquired and analyzed the data and drafted the manuscript or figures.</p>
</sec>
<sec><title>Conflict of Interest Statement</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
</body>
<back>
<fn-group>
<fn fn-type="financial-disclosure">
<p><bold>Funding.</bold> This study was supported by funding from the National Institute on Aging [R21AG054832 (GT); 5R37AG015473 and RF1AG015473 (RM); R56 AG051876 and R01 AG058918 (JL)] and the BrightFocus Foundation [A2015633S (JL)].</p>
</fn>
</fn-group>
<ack>
<p>We thank the EFIGA study participants and the EFIGA research and support staff for their contributions to this study.</p>
</ack>
<sec sec-type="supplementary material">
<title>Supplementary Material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fgene.2019.00239/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fgene.2019.00239/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Table_1.DOCX" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Alexander</surname> <given-names>D. H.</given-names></name> <name><surname>Novembre</surname> <given-names>J.</given-names></name> <name><surname>Lange</surname> <given-names>K.</given-names></name></person-group> (<year>2009</year>). <article-title>Fast model-based estimation of ancestry in unrelated individuals.</article-title> <source><italic>Genome Res.</italic></source> <volume>19</volume> <fpage>1655</fpage>&#x2013;<lpage>1664</lpage>. <pub-id pub-id-type="doi">10.1101/gr.094052.109</pub-id> <pub-id pub-id-type="pmid">19648217</pub-id></citation></ref>
<ref id="B2"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Arnold</surname> <given-names>S. E.</given-names></name> <name><surname>Vega</surname> <given-names>I. E.</given-names></name> <name><surname>Karlawish</surname> <given-names>J. H.</given-names></name> <name><surname>Wolk</surname> <given-names>D. A.</given-names></name> <name><surname>Nunez</surname> <given-names>J.</given-names></name> <name><surname>Negron</surname> <given-names>M.</given-names></name><etal/></person-group> (<year>2013</year>). <article-title>Frequency and clinicopathological characteristics of presenilin 1 Gly206Ala mutation in Puerto Rican Hispanics with dementia.</article-title> <source><italic>J. Alzheimers Dis.</italic></source> <volume>33</volume> <fpage>1089</fpage>&#x2013;<lpage>1095</lpage>. <pub-id pub-id-type="doi">10.3233/JAD-2012-121570</pub-id> <pub-id pub-id-type="pmid">23114514</pub-id></citation></ref>
<ref id="B3"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Athan</surname> <given-names>E. S.</given-names></name> <name><surname>Williamson</surname> <given-names>J.</given-names></name> <name><surname>Ciappa</surname> <given-names>A.</given-names></name> <name><surname>Santana</surname> <given-names>V.</given-names></name> <name><surname>Romas</surname> <given-names>S. N.</given-names></name> <name><surname>Lee</surname> <given-names>J. H.</given-names></name><etal/></person-group> (<year>2001</year>). <article-title>A founder mutation in presenilin 1 causing early-onset Alzheimer disease in unrelated Caribbean Hispanic families.</article-title> <source><italic>JAMA</italic></source> <volume>286</volume> <fpage>2257</fpage>&#x2013;<lpage>2263</lpage>. <pub-id pub-id-type="doi">10.1001/jama.286.18.2257</pub-id> <pub-id pub-id-type="pmid">11710891</pub-id></citation></ref>
<ref id="B4"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Browning</surname> <given-names>B. L.</given-names></name> <name><surname>Browning</surname> <given-names>S. R.</given-names></name></person-group> (<year>2009</year>). <article-title>A unified approach to genotype imputation and haplotype-phase inference for large data sets of trios and unrelated individuals.</article-title> <source><italic>Am. J. Hum. Genet.</italic></source> <volume>84</volume> <fpage>210</fpage>&#x2013;<lpage>223</lpage>. <pub-id pub-id-type="doi">10.1016/j.ajhg.2009.01.005</pub-id> <pub-id pub-id-type="pmid">19200528</pub-id></citation></ref>
<ref id="B5"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Danecek</surname> <given-names>P.</given-names></name> <name><surname>Auton</surname> <given-names>A.</given-names></name> <name><surname>Abecasis</surname> <given-names>G.</given-names></name> <name><surname>Albers</surname> <given-names>C. A.</given-names></name> <name><surname>Banks</surname> <given-names>E.</given-names></name> <name><surname>DePristo</surname> <given-names>M. A.</given-names></name><etal/></person-group> (<year>2011</year>). <article-title>The variant call format and VCFtools.</article-title> <source><italic>Bioinformatics</italic></source> <volume>27</volume> <fpage>2156</fpage>&#x2013;<lpage>2158</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btr330</pub-id> <pub-id pub-id-type="pmid">21653522</pub-id></citation></ref>
<ref id="B6"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Das</surname> <given-names>S.</given-names></name> <name><surname>Forer</surname> <given-names>L.</given-names></name> <name><surname>Schonherr</surname> <given-names>S.</given-names></name> <name><surname>Sidore</surname> <given-names>C.</given-names></name> <name><surname>Locke</surname> <given-names>A. E.</given-names></name> <name><surname>Kwong</surname> <given-names>A.</given-names></name><etal/></person-group> (<year>2016</year>). <article-title>Next-generation genotype imputation service and methods.</article-title> <source><italic>Nat. Genet.</italic></source> <volume>48</volume> <fpage>1284</fpage>&#x2013;<lpage>1287</lpage>. <pub-id pub-id-type="doi">10.1038/ng.3656</pub-id> <pub-id pub-id-type="pmid">27571263</pub-id></citation></ref>
<ref id="B7"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Delaneau</surname> <given-names>O.</given-names></name> <name><surname>Zagury</surname> <given-names>J. F.</given-names></name> <name><surname>Marchini</surname> <given-names>J.</given-names></name></person-group> (<year>2013</year>). <article-title>Improved whole-chromosome phasing for disease and population genetic studies.</article-title> <source><italic>Nat. Methods</italic></source> <volume>10</volume> <fpage>5</fpage>&#x2013;<lpage>6</lpage>. <pub-id pub-id-type="doi">10.1038/nmeth.2307</pub-id> <pub-id pub-id-type="pmid">23269371</pub-id></citation></ref>
<ref id="B8"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Genomes Project</surname> <given-names>C.</given-names></name> <name><surname>Auton</surname> <given-names>A.</given-names></name> <name><surname>Brooks</surname> <given-names>L. D.</given-names></name> <name><surname>Durbin</surname> <given-names>R. M.</given-names></name> <name><surname>Garrison</surname> <given-names>E. P.</given-names></name> <name><surname>Kang</surname> <given-names>H. M.</given-names></name><etal/></person-group> (<year>2015</year>). <article-title>A global reference for human genetic variation.</article-title> <source><italic>Nature</italic></source> <volume>526</volume> <fpage>68</fpage>&#x2013;<lpage>74</lpage>. <pub-id pub-id-type="doi">10.1038/nature15393</pub-id> <pub-id pub-id-type="pmid">26432245</pub-id></citation></ref>
<ref id="B9"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gibson</surname> <given-names>G.</given-names></name></person-group> (<year>2012</year>). <article-title>Rare and common variants: twenty arguments.</article-title> <source><italic>Nat. Rev. Genet.</italic></source> <volume>13</volume> <fpage>135</fpage>&#x2013;<lpage>145</lpage>. <pub-id pub-id-type="doi">10.1038/nrg3118</pub-id> <pub-id pub-id-type="pmid">22251874</pub-id></citation></ref>
<ref id="B10"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ha</surname> <given-names>N. T.</given-names></name> <name><surname>Freytag</surname> <given-names>S.</given-names></name> <name><surname>Bickeboeller</surname> <given-names>H.</given-names></name></person-group> (<year>2014</year>). <article-title>Coverage and efficiency in current SNP chips.</article-title> <source><italic>Eur. J. Hum. Genet.</italic></source> <volume>22</volume> <fpage>1124</fpage>&#x2013;<lpage>1130</lpage>. <pub-id pub-id-type="doi">10.1038/ejhg.2013.304</pub-id> <pub-id pub-id-type="pmid">24448550</pub-id></citation></ref>
<ref id="B11"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hancock</surname> <given-names>D. B.</given-names></name> <name><surname>Levy</surname> <given-names>J. L.</given-names></name> <name><surname>Gaddis</surname> <given-names>N. C.</given-names></name> <name><surname>Bierut</surname> <given-names>L. J.</given-names></name> <name><surname>Saccone</surname> <given-names>N. L.</given-names></name> <name><surname>Page</surname> <given-names>G. P.</given-names></name><etal/></person-group> (<year>2012</year>). <article-title>Assessment of genotype imputation performance using 1000 Genomes in African American studies.</article-title> <source><italic>PLoS One</italic></source> <volume>7</volume>:<issue>e50610</issue>. <pub-id pub-id-type="doi">10.1371/journal.pone.0050610</pub-id> <pub-id pub-id-type="pmid">23226329</pub-id></citation></ref>
<ref id="B12"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Herzig</surname> <given-names>A. F.</given-names></name> <name><surname>Nutile</surname> <given-names>T.</given-names></name> <name><surname>Babron</surname> <given-names>M. C.</given-names></name> <name><surname>Ciullo</surname> <given-names>M.</given-names></name> <name><surname>Bellenguez</surname> <given-names>C.</given-names></name> <name><surname>Leutenegger</surname> <given-names>A. L.</given-names></name></person-group> (<year>2018</year>). <article-title>Strategies for phasing and imputation in a population isolate.</article-title> <source><italic>Genet. Epidemiol.</italic></source> <volume>42</volume> <fpage>201</fpage>&#x2013;<lpage>213</lpage>. <pub-id pub-id-type="doi">10.1002/gepi.22109</pub-id> <pub-id pub-id-type="pmid">29319195</pub-id></citation></ref>
<ref id="B13"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Howie</surname> <given-names>B.</given-names></name> <name><surname>Fuchsberger</surname> <given-names>C.</given-names></name> <name><surname>Stephens</surname> <given-names>M.</given-names></name> <name><surname>Marchini</surname> <given-names>J.</given-names></name> <name><surname>Abecasis</surname> <given-names>G. R.</given-names></name></person-group> (<year>2012</year>). <article-title>Fast and accurate genotype imputation in genome-wide association studies through pre-phasing.</article-title> <source><italic>Nat. Genet.</italic></source> <volume>44</volume> <fpage>955</fpage>&#x2013;<lpage>959</lpage>. <pub-id pub-id-type="doi">10.1038/ng.2354</pub-id> <pub-id pub-id-type="pmid">22820512</pub-id></citation></ref>
<ref id="B14"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Howie</surname> <given-names>B. N.</given-names></name> <name><surname>Donnelly</surname> <given-names>P.</given-names></name> <name><surname>Marchini</surname> <given-names>J.</given-names></name></person-group> (<year>2009</year>). <article-title>A flexible and accurate genotype imputation method for the next generation of genome-wide association studies.</article-title> <source><italic>PLoS Genet.</italic></source> <volume>5</volume>:<issue>e1000529</issue>. <pub-id pub-id-type="doi">10.1371/journal.pgen.1000529</pub-id> <pub-id pub-id-type="pmid">19543373</pub-id></citation></ref>
<ref id="B15"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>J.</given-names></name> <name><surname>Howie</surname> <given-names>B.</given-names></name> <name><surname>McCarthy</surname> <given-names>S.</given-names></name> <name><surname>Memari</surname> <given-names>Y.</given-names></name> <name><surname>Walter</surname> <given-names>K.</given-names></name> <name><surname>Min</surname> <given-names>J. L.</given-names></name><etal/></person-group> (<year>2015</year>). <article-title>Improved imputation of low-frequency and rare variants using the UK10K haplotype reference panel.</article-title> <source><italic>Nat. Commun.</italic></source> <volume>6</volume>:<issue>8111</issue>. <pub-id pub-id-type="doi">10.1038/ncomms9111</pub-id> <pub-id pub-id-type="pmid">26368830</pub-id></citation></ref>
<ref id="B16"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Igartua</surname> <given-names>C.</given-names></name> <name><surname>Myers</surname> <given-names>R. A.</given-names></name> <name><surname>Mathias</surname> <given-names>R. A.</given-names></name> <name><surname>Pino-Yanes</surname> <given-names>M.</given-names></name> <name><surname>Eng</surname> <given-names>C.</given-names></name> <name><surname>Graves</surname> <given-names>P. E.</given-names></name><etal/></person-group> (<year>2015</year>). <article-title>Ethnic-specific associations of rare and low-frequency DNA sequence variants with asthma.</article-title> <source><italic>Nat. Commun.</italic></source> <volume>6</volume>:<issue>5965</issue>. <pub-id pub-id-type="doi">10.1038/ncomms6965</pub-id> <pub-id pub-id-type="pmid">25591454</pub-id></citation></ref>
<ref id="B17"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lee</surname> <given-names>J. H.</given-names></name> <name><surname>Cheng</surname> <given-names>R.</given-names></name> <name><surname>Vardarajan</surname> <given-names>B.</given-names></name> <name><surname>Lantigua</surname> <given-names>R.</given-names></name> <name><surname>Reyes-Dumeyer</surname> <given-names>D.</given-names></name> <name><surname>Ortmann</surname> <given-names>W.</given-names></name><etal/></person-group> (<year>2015</year>). <article-title>Genetic modifiers of age at onset in carriers of the G206A mutation in PSEN1 with familial Alzheimer disease among Caribbean Hispanics.</article-title> <source><italic>JAMA Neurol.</italic></source> <volume>72</volume> <fpage>1043</fpage>&#x2013;<lpage>1051</lpage>. <pub-id pub-id-type="doi">10.1001/jamaneurol.2015.1424</pub-id> <pub-id pub-id-type="pmid">26214276</pub-id></citation></ref>
<ref id="B18"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>J. Z.</given-names></name> <name><surname>Absher</surname> <given-names>D. M.</given-names></name> <name><surname>Tang</surname> <given-names>H.</given-names></name> <name><surname>Southwick</surname> <given-names>A. M.</given-names></name> <name><surname>Casto</surname> <given-names>A. M.</given-names></name> <name><surname>Ramachandran</surname> <given-names>S.</given-names></name><etal/></person-group> (<year>2008</year>). <article-title>Worldwide human relationships inferred from genome-wide patterns of variation.</article-title> <source><italic>Science</italic></source> <volume>319</volume> <fpage>1100</fpage>&#x2013;<lpage>1104</lpage>. <pub-id pub-id-type="doi">10.1126/science.1153717</pub-id> <pub-id pub-id-type="pmid">18292342</pub-id></citation></ref>
<ref id="B19"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>E. Y.</given-names></name> <name><surname>Li</surname> <given-names>M.</given-names></name> <name><surname>Wang</surname> <given-names>W.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name></person-group> (<year>2013</year>). <article-title>MaCH-admix: genotype imputation for admixed populations.</article-title> <source><italic>Genet. Epidemiol.</italic></source> <volume>37</volume> <fpage>25</fpage>&#x2013;<lpage>37</lpage>. <pub-id pub-id-type="doi">10.1002/gepi.21690</pub-id> <pub-id pub-id-type="pmid">23074066</pub-id></citation></ref>
<ref id="B20"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>Q.</given-names></name> <name><surname>Cirulli</surname> <given-names>E. T.</given-names></name> <name><surname>Han</surname> <given-names>Y.</given-names></name> <name><surname>Yao</surname> <given-names>S.</given-names></name> <name><surname>Liu</surname> <given-names>S.</given-names></name> <name><surname>Zhu</surname> <given-names>Q.</given-names></name></person-group> (<year>2015</year>). <article-title>Systematic assessment of imputation performance using the 1000 Genomes reference panels.</article-title> <source><italic>Brief. Bioinform.</italic></source> <volume>16</volume> <fpage>549</fpage>&#x2013;<lpage>562</lpage>. <pub-id pub-id-type="doi">10.1093/bib/bbu035</pub-id> <pub-id pub-id-type="pmid">25246238</pub-id></citation></ref>
<ref id="B21"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Loh</surname> <given-names>P. R.</given-names></name> <name><surname>Danecek</surname> <given-names>P.</given-names></name> <name><surname>Palamara</surname> <given-names>P. F.</given-names></name> <name><surname>Fuchsberger</surname> <given-names>C.</given-names></name> <name><surname>Reshef</surname> <given-names>Y. A.</given-names></name> <name><surname>Finucane</surname> <given-names>H. K.</given-names></name><etal/></person-group> (<year>2016</year>). <article-title>Reference-based phasing using the Haplotype Reference Consortium panel.</article-title> <source><italic>Nat. Genet.</italic></source> <volume>48</volume> <fpage>1443</fpage>&#x2013;<lpage>1448</lpage>. <pub-id pub-id-type="doi">10.1038/ng.3679</pub-id> <pub-id pub-id-type="pmid">27694958</pub-id></citation></ref>
<ref id="B22"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Marchini</surname> <given-names>J.</given-names></name> <name><surname>Howie</surname> <given-names>B.</given-names></name></person-group> (<year>2010</year>). <article-title>Genotype imputation for genome-wide association studies.</article-title> <source><italic>Nat. Rev. Genet.</italic></source> <volume>11</volume> <fpage>499</fpage>&#x2013;<lpage>511</lpage>. <pub-id pub-id-type="doi">10.1038/nrg2796</pub-id> <pub-id pub-id-type="pmid">20517342</pub-id></citation></ref>
<ref id="B23"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>McHugh</surname> <given-names>M. L.</given-names></name></person-group> (<year>2012</year>). <article-title>Interrater reliability: the kappa statistic.</article-title> <source><italic>Biochem. Med.</italic></source> <volume>22</volume> <fpage>276</fpage>&#x2013;<lpage>282</lpage>. <pub-id pub-id-type="doi">10.11613/BM.2012.031</pub-id></citation></ref>
<ref id="B24"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Nagy</surname> <given-names>R.</given-names></name> <name><surname>Boutin</surname> <given-names>T. S.</given-names></name> <name><surname>Marten</surname> <given-names>J.</given-names></name> <name><surname>Huffman</surname> <given-names>J. E.</given-names></name> <name><surname>Kerr</surname> <given-names>S. M.</given-names></name> <name><surname>Campbell</surname> <given-names>A.</given-names></name><etal/></person-group> (<year>2017</year>). <article-title>Exploration of haplotype research consortium imputation for genome-wide association studies in 20,032 Generation Scotland participants.</article-title> <source><italic>Genome Med.</italic></source> <volume>9</volume>:<issue>23</issue>. <pub-id pub-id-type="doi">10.1186/s13073-017-0414-4</pub-id> <pub-id pub-id-type="pmid">28270201</pub-id></citation></ref>
<ref id="B25"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Nelson</surname> <given-names>S. C.</given-names></name> <name><surname>Stilp</surname> <given-names>A. M.</given-names></name> <name><surname>Papanicolaou</surname> <given-names>G. J.</given-names></name> <name><surname>Taylor</surname> <given-names>K. D.</given-names></name> <name><surname>Rotter</surname> <given-names>J. I.</given-names></name> <name><surname>Thornton</surname> <given-names>T. A.</given-names></name><etal/></person-group> (<year>2016</year>). <article-title>Improved imputation accuracy in Hispanic/Latino populations with larger and more diverse reference panels: applications in the Hispanic Community Health Study/Study of Latinos (HCHS/SOL).</article-title> <source><italic>Hum. Mol. Genet.</italic></source> <volume>25</volume> <fpage>3245</fpage>&#x2013;<lpage>3254</lpage>. <pub-id pub-id-type="doi">10.1093/hmg/ddw174</pub-id> <pub-id pub-id-type="pmid">27346520</pub-id></citation></ref>
<ref id="B26"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pei</surname> <given-names>Y. F.</given-names></name> <name><surname>Zhang</surname> <given-names>L.</given-names></name> <name><surname>Li</surname> <given-names>J.</given-names></name> <name><surname>Deng</surname> <given-names>H. W.</given-names></name></person-group> (<year>2010</year>). <article-title>Analyses and comparison of imputation-based association methods.</article-title> <source><italic>PLoS One</italic></source> <volume>5</volume>:<issue>e10827</issue>. <pub-id pub-id-type="doi">10.1371/journal.pone.0010827</pub-id> <pub-id pub-id-type="pmid">20520814</pub-id></citation></ref>
<ref id="B27"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Purcell</surname> <given-names>S.</given-names></name> <name><surname>Neale</surname> <given-names>B.</given-names></name> <name><surname>Todd-Brown</surname> <given-names>K.</given-names></name> <name><surname>Thomas</surname> <given-names>L.</given-names></name> <name><surname>Ferreira</surname> <given-names>M. A.</given-names></name> <name><surname>Bender</surname> <given-names>D.</given-names></name><etal/></person-group> (<year>2007</year>). <article-title>PLINK: a tool set for whole-genome association and population-based linkage analyses.</article-title> <source><italic>Am. J. Hum. Genet.</italic></source> <volume>81</volume> <fpage>559</fpage>&#x2013;<lpage>575</lpage>. <pub-id pub-id-type="doi">10.1086/519795</pub-id> <pub-id pub-id-type="pmid">17701901</pub-id></citation></ref>
<ref id="B28"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Raghavan</surname> <given-names>N. S.</given-names></name> <name><surname>Brickman</surname> <given-names>A. M.</given-names></name> <name><surname>Andrews</surname> <given-names>H.</given-names></name> <name><surname>Manly</surname> <given-names>J. J.</given-names></name> <name><surname>Schupf</surname> <given-names>N.</given-names></name> <name><surname>Lantigua</surname> <given-names>R.</given-names></name><etal/></person-group> (<year>2018</year>). <article-title>Whole-exome sequencing in 20,197 persons for rare variants in Alzheimer&#x2019;s disease.</article-title> <source><italic>Ann. Clin. Transl. Neurol.</italic></source> <volume>5</volume> <fpage>832</fpage>&#x2013;<lpage>842</lpage>. <pub-id pub-id-type="doi">10.1002/acn3.582</pub-id> <pub-id pub-id-type="pmid">30847376</pub-id></citation></ref>
<ref id="B29"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Roshyara</surname> <given-names>N. R.</given-names></name> <name><surname>Kirsten</surname> <given-names>H.</given-names></name> <name><surname>Horn</surname> <given-names>K.</given-names></name> <name><surname>Ahnert</surname> <given-names>P.</given-names></name> <name><surname>Scholz</surname> <given-names>M.</given-names></name></person-group> (<year>2014</year>). <article-title>Impact of pre-imputation SNP-filtering on genotype imputation results.</article-title> <source><italic>BMC Genet.</italic></source> <volume>15</volume>:<issue>88</issue>. <pub-id pub-id-type="doi">10.1186/s12863-014-0088-5</pub-id> <pub-id pub-id-type="pmid">25112433</pub-id></citation></ref>
<ref id="B30"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Surakka</surname> <given-names>I.</given-names></name> <name><surname>Sarin</surname> <given-names>A.-P.</given-names></name> <name><surname>Ruotsalainen</surname> <given-names>S. E.</given-names></name> <name><surname>Durbin</surname> <given-names>R.</given-names></name> <name><surname>Salomaa</surname> <given-names>V.</given-names></name> <name><surname>Daly</surname> <given-names>M. J.</given-names></name><etal/></person-group> (<year>2016</year>). <article-title>The rate of false polymorphisms introduced when imputing genotypes from global imputation panels.</article-title> <source><italic>bioRxiv</italic></source> [Preprint]. <pub-id pub-id-type="doi">10.1101/080770</pub-id></citation></ref>
<ref id="B31"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tosto</surname> <given-names>G.</given-names></name> <name><surname>Fu</surname> <given-names>H.</given-names></name> <name><surname>Vardarajan</surname> <given-names>B. N.</given-names></name> <name><surname>Lee</surname> <given-names>J. H.</given-names></name> <name><surname>Cheng</surname> <given-names>R.</given-names></name> <name><surname>Reyes-Dumeyer</surname> <given-names>D.</given-names></name><etal/></person-group> (<year>2015</year>). <article-title>F-box/LRR-repeat protein 7 is genetically associated with Alzheimer&#x2019;s disease.</article-title> <source><italic>Ann. Clin. Transl. Neurol.</italic></source> <volume>2</volume> <fpage>810</fpage>&#x2013;<lpage>820</lpage>. <pub-id pub-id-type="doi">10.1002/acn3.223</pub-id> <pub-id pub-id-type="pmid">26339675</pub-id></citation></ref>
<ref id="B32"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Verma</surname> <given-names>S. S.</given-names></name> <name><surname>de Andrade</surname> <given-names>M.</given-names></name> <name><surname>Tromp</surname> <given-names>G.</given-names></name> <name><surname>Kuivaniemi</surname> <given-names>H.</given-names></name> <name><surname>Pugh</surname> <given-names>E.</given-names></name> <name><surname>Namjou-Khales</surname> <given-names>B.</given-names></name><etal/></person-group> (<year>2014</year>). <article-title>Imputation and quality control steps for combining multiple genome-wide datasets.</article-title> <source><italic>Front. Genet.</italic></source> <volume>5</volume>:<issue>370</issue>. <pub-id pub-id-type="doi">10.3389/fgene.2014.00370</pub-id> <pub-id pub-id-type="pmid">25566314</pub-id></citation></ref>
<ref id="B33"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>B.</given-names></name> <name><surname>Zhi</surname> <given-names>D.</given-names></name> <name><surname>Zhang</surname> <given-names>K.</given-names></name> <name><surname>Gao</surname> <given-names>G.</given-names></name> <name><surname>Limdi</surname> <given-names>N. N.</given-names></name> <name><surname>Liu</surname> <given-names>N.</given-names></name></person-group> (<year>2011</year>). <article-title>Practical consideration of genotype imputation: sample size, window size, reference choice, and untyped rate.</article-title> <source><italic>Stat. Interface</italic></source> <volume>4</volume> <fpage>339</fpage>&#x2013;<lpage>352</lpage>. <pub-id pub-id-type="doi">10.4310/SII.2011.v4.n3.a8</pub-id> <pub-id pub-id-type="pmid">22308193</pub-id></citation></ref>
<ref id="B34"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zheng</surname> <given-names>H. F.</given-names></name> <name><surname>Ladouceur</surname> <given-names>M.</given-names></name> <name><surname>Greenwood</surname> <given-names>C. M.</given-names></name> <name><surname>Richards</surname> <given-names>J. B.</given-names></name></person-group> (<year>2012</year>). <article-title>Effect of genome-wide genotyping and reference panels on rare variants imputation.</article-title> <source><italic>J. Genet. Genom.</italic></source> <volume>39</volume> <fpage>545</fpage>&#x2013;<lpage>550</lpage>. <pub-id pub-id-type="doi">10.1016/j.jgg.2012.07.002</pub-id> <pub-id pub-id-type="pmid">23089364</pub-id></citation></ref>
<ref id="B35"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zheng</surname> <given-names>H. F.</given-names></name> <name><surname>Rong</surname> <given-names>J. J.</given-names></name> <name><surname>Liu</surname> <given-names>M.</given-names></name> <name><surname>Han</surname> <given-names>F.</given-names></name> <name><surname>Zhang</surname> <given-names>X. W.</given-names></name> <name><surname>Richards</surname> <given-names>J. B.</given-names></name><etal/></person-group> (<year>2015</year>). <article-title>Performance of genotype imputation for low frequency and rare variants from the 1000 Genomes.</article-title> <source><italic>PLoS One</italic></source> <volume>10</volume>:<issue>e0116487</issue>. <pub-id pub-id-type="doi">10.1371/journal.pone.0116487</pub-id> <pub-id pub-id-type="pmid">25621886</pub-id></citation></ref>
<ref id="B36"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhou</surname> <given-names>H.</given-names></name> <name><surname>Alexander</surname> <given-names>D.</given-names></name> <name><surname>Lange</surname> <given-names>K.</given-names></name></person-group> (<year>2011</year>). <article-title>A quasi-Newton acceleration for high-dimensional optimization algorithms.</article-title> <source><italic>Stat. Comput.</italic></source> <volume>21</volume> <fpage>261</fpage>&#x2013;<lpage>273</lpage>. <pub-id pub-id-type="doi">10.1007/s11222-009-9166-3</pub-id> <pub-id pub-id-type="pmid">21359052</pub-id></citation></ref>
</ref-list>
<fn-group>
<fn id="fn01"><label>1</label><p><ext-link ext-link-type="uri" xlink:href="https://mathgen.stats.ox.ac.uk/impute/1000GP_Phase3.tgz">https://mathgen.stats.ox.ac.uk/impute/1000GP_Phase3.tgz</ext-link></p></fn>
<fn id="fn02"><label>2</label><p><ext-link ext-link-type="uri" xlink:href="http://csg.sph.umich.edu/yli/r2_hat.v107.tgz">http://csg.sph.umich.edu/yli/r2_hat.v107.tgz</ext-link></p></fn>
<fn id="fn03"><label>3</label><p><ext-link ext-link-type="uri" xlink:href="https://www.lgcgroup.com">https://www.lgcgroup.com</ext-link></p></fn>
</fn-group>
</back>
</article>