<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" article-type="research-article" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Comput. Sci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Computer Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Comput. Sci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2624-9898</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fcomp.2025.1710121</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Enhancing RAPTOR with semantic chunking and adaptive graph clustering</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Liu</surname>
<given-names>Yan</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Xie</surname>
<given-names>Xiaodong</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3205809"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wan</surname>
<given-names>Xin</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3280405"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Pan</surname>
<given-names>Yi</given-names>
</name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Cheng</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/632926"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>College of Computer Science and Technology, Huaqiao University</institution>, <city>Xiamen</city>, <country country="CN">China</country></aff>
<aff id="aff2"><label>2</label><institution>School of Computer Science and Engineering, Changsha University</institution>, <city>Changsha</city>, <country country="CN">China</country></aff>
<author-notes>
<corresp id="c001"><label>&#x002A;</label>Correspondence: Xiaodong Xie, <email xlink:href="mailto:Xiaodongxie@hqu.edu.cn">Xiaodongxie@hqu.edu.cn</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-01-12">
<day>12</day>
<month>01</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>7</volume>
<elocation-id>1710121</elocation-id>
<history>
<date date-type="received">
<day>21</day>
<month>09</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>06</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>15</day>
<month>12</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2026 Liu, Xie, Wan, Pan and Wang.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Liu, Xie, Wan, Pan and Wang</copyright-holder>
<license>
<ali:license_ref start_date="2026-01-12">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec id="sec1001">
<title>Introduction</title>
<p>While Retrieval-Augmented Generation (RAG) enhances language models, its application to long documents is often hampered by simplistic retrieval strategies that fail to capture hierarchical context. Although the RAPTOR framework addresses this through a recursive tree-structured approach, its effectiveness is constrained by semantic fragmentation from fixed-token chunking and a static clustering methodology that is suboptimal for organizing the hierarchy.</p>
</sec>
<sec id="sec1002">
<title>Methods</title>
<p>In this paper, we propose a comprehensive two-stage enhancement framework to address these limitations. We first employ Semantic Segmentation to generate coherent foundational leaf nodes, and subsequently introduce an Adaptive Graph Clustering (AGC) strategy. This strategy leverages the Leiden algorithm with a novel layer-aware dual-adaptive parameter mechanism to dynamically tailor clustering granularity.</p>
</sec>
<sec id="sec1003">
<title>Results</title>
<p>Extensive experiments on the narrative QuALITY benchmark and the scientific Qasper dataset demonstrate the robustness and domain generalization of our framework. Our full model achieves a peak accuracy of 65.5% on QuALITY and demonstrates superior semantic validity on Qasper, significantly outperforming the baseline. Comparative ablation studies further reveal that our graph-topological approach outperforms traditional distance-based, density-based, and distribution-based clustering methods. Additionally, our approach constructs a dramatically more compact hierarchy, reducing the number of required summary nodes by up to 76%.</p>
</sec>
<sec id="sec1004">
<title>Discussion</title>
<p>This work underscores the critical importance of a holistic, semantic-first approach to building more effective and efficient retrieval trees for complex RAG tasks.</p>
</sec>
</abstract>
<kwd-group>
<kwd>adaptive clustering</kwd>
<kwd>graph clustering</kwd>
<kwd>hierarchical retrieval</kwd>
<kwd>RAPTOR</kwd>
<kwd>retrieval-augmented generation (RAG)</kwd>
<kwd>semantic segmentation</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This study was supported by Fujian Province (2024HZ022013), Xiamen (XJK2025-1-2), and Quanzhou (2025QZNS001, 2023GZ5).</funding-statement>
</funding-group>
<counts>
<fig-count count="2"/>
<table-count count="4"/>
<equation-count count="3"/>
<ref-count count="24"/>
<page-count count="11"/>
<word-count count="7926"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Software</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec1">
<label>1</label>
<title>Introduction</title>
<p>Retrieval-Augmented Generation (RAG) has emerged as a powerful paradigm for enhancing Large Language Models (LLMs) (<xref ref-type="bibr" rid="ref5">Brown et al., 2020</xref>; <xref ref-type="bibr" rid="ref9">Chowdhery et al., 2023</xref>) by providing them with external, up-to-date, and verifiable knowledge (<xref ref-type="bibr" rid="ref16">Lewis et al., 2020</xref>). This approach mitigates issues of hallucination and allows LLMs to reason over information not present in their training data (<xref ref-type="bibr" rid="ref14">Jiang et al., 2020</xref>). However, as the length and complexity of source documents increase, standard RAG systems face significant challenges (<xref ref-type="bibr" rid="ref4">Barnett et al., 2024</xref>), primarily due to the fixed context window of LLMs and the difficulty of identifying relevant information scattered across long texts (<xref ref-type="bibr" rid="ref17">Liu et al., 2024</xref>).</p>
<p>The RAPTOR framework (<xref ref-type="bibr" rid="ref22">Sarthi et al., 2024</xref>) introduced an innovative solution to this problem by proposing a tree-structured, hierarchical approach to document representation. Through a recursive &#x201C;embed-cluster-summarize&#x201D; process, RAPTOR creates a multi-layered abstraction of the text, enabling efficient retrieval of information at varying levels of granularity, from specific details to high-level themes. This architecture has demonstrated significant potential for long-document question answering.</p>
<p>Despite its novel design, the effectiveness of the RAPTOR tree is contingent upon two fundamental stages, both of which present opportunities for significant improvement. The first is the leaf node generation. RAPTOR&#x2019;s reliance on a fixed-token chunking strategy is oblivious to the semantic boundaries of the text, often resulting in the fragmentation of coherent logical units. This creates a weak and semantically disjointed foundation for the entire tree. The second limitation lies in the hierarchical clustering process itself. The use of conventional clustering algorithms, such as Gaussian Mixture Models (GMM), with static parameters is often suboptimal for the complex, non-spherical manifolds of text embeddings. A rigid clustering strategy struggles to adapt to the different levels of semantic abstraction required at different depths of the tree.</p>
<p>To address these dual limitations, we propose a comprehensive, two-stage enhancement framework for RAPTOR. First, we replace the fixed-token chunking with a semantic segmentation strategy, ensuring that the foundational leaf nodes are semantically coherent and self-contained. Second, we introduce a novel adaptive graph clustering methodology. This approach leverages the state-of-the-art Leiden algorithm for community detection and incorporates a layer-aware dual-adaptive parameter strategy, which dynamically adjusts the clustering granularity to match the level of abstraction at each layer of the tree.</p>
<p>The main contributions of this work are as follows:</p>
<list list-type="bullet">
<list-item>
<p>We introduce a holistic, two-stage enhancement framework that optimizes both the foundational leaf nodes and the internal hierarchical structure of the RAPTOR tree.</p>
</list-item>
<list-item>
<p>We demonstrate that employing semantic segmentation for initial chunking provides a superior foundation, leading to significant performance improvements in downstream retrieval tasks.</p>
</list-item>
<list-item>
<p>We design and implement an adaptive graph clustering algorithm that constructs a more compact, efficient, and semantically meaningful hierarchy, showing a strong synergistic effect when combined with high-quality leaf nodes.</p>
</list-item>
<list-item>
<p>Through extensive experiments on both the narrative QuALITY benchmark and the scientific Qasper dataset, we validate the robustness and domain generalization of our framework. Our results show that the full model consistently outperforms the original RAPTOR baseline. Furthermore, a comparative ablation study against distance-based (Agglomerative) and density-based (HDBSCAN) clustering methods demonstrates the superior efficacy of our graph-topological approach in organizing complex semantic information.</p>
</list-item>
</list>
<p>The remainder of this paper is organized as follows: Section 2 reviews related work in text chunking and clustering. Section 3 details our proposed two-stage methodology. Section 4 presents our experimental setup, results, and analysis. Finally, Section 5 concludes the paper and discusses future work.</p>
</sec>
<sec id="sec2">
<label>2</label>
<title>Background</title>
<sec id="sec3">
<label>2.1</label>
<title>The chunking challenges in RAG</title>
<p>The performance of RAG systems hinges on how documents are segmented into chunks. An effective strategy must balance two competing demands:</p>
<list list-type="simple">
<list-item>
<p><italic>Relevance</italic>: Small chunks improve retrieval precision by reducing noise.</p>
</list-item>
<list-item>
<p><italic>Contextual Integrity</italic>: Overly fine-grained chunks lose logical connections between paragraphs.</p>
</list-item>
</list>
<p>Existing methods attempt to address this trade-off with varying limitations. Fixed-size chunking simply splits text by token count, often breaking semantic units (<xref ref-type="bibr" rid="ref24">Zhang et al., 2023</xref>). While recursive chunking relies on heuristic, rule-based delimiters like paragraphs or sentences, semantic chunking methods (<xref ref-type="bibr" rid="ref7">Chen et al., 2024a</xref>) leverage embeddings to quantify coherence through cosine similarity, dynamically aligning chunks with topic boundaries.</p>
<p>Traditional semantic chunking methods mainly rely on lexical cohesion (<xref ref-type="bibr" rid="ref13">Hearst, 1997</xref>) to detect discourse boundaries. However, these methods perform poorly on text paragraphs with rich lexical variations but consistent themes. With the development of deep learning, modern text segmentation techniques have generally shifted towards semantic representation methods based on pre-trained language models (PLMs). The core idea within this paradigm is to encode text units (such as sentences or paragraphs) into high dimensional dense embedding vectors (<xref ref-type="bibr" rid="ref15">Karpukhin et al., 2020</xref>) through language models like BERT or Sentence-BERT (<xref ref-type="bibr" rid="ref21">Reimers and Gurevych, 2019</xref>; <xref ref-type="bibr" rid="ref19">Nair et al., 2023</xref>). These embedding vectors map the semantic content of the text to specific coordinates in the vector space, enabling semantic associations to be quantified through the geometric relationships between vectors. When quantifying the semantic coherence between adjacent text units, cosine similarity has become a de-facto standard metric. It measures the consistency in semantic direction between two embedding vectors by calculating the cosine of the angle between them. Its formal definition is as follows:</p>
<disp-formula id="E1">
<mml:math id="M1">
<mml:msub>
<mml:mi mathvariant="italic">sim</mml:mi>
<mml:mi>cos</mml:mi>
</mml:msub>
<mml:mo stretchy="true">(</mml:mo>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x00B7;</mml:mo>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mo stretchy="true">&#x2016;</mml:mo>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="true">&#x2016;</mml:mo>
<mml:mo stretchy="true">&#x2016;</mml:mo>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="true">&#x2016;</mml:mo>
</mml:mrow>
</mml:mfrac>
</mml:math>
</disp-formula>
<p>The key advantage of cosine similarity lies in its magnitude invariance. This means that it only focuses on the direction of vectors (i.e., the theme), while ignoring differences in vector magnitude caused by factors such as sentence length or lexical complexity. This makes it particularly robust when comparing texts with varying levels of detail but consistent themes.</p>
<p>Based on this, researchers typically use cosine distance, which is defined as</p>
<disp-formula id="E2">
<mml:math id="M2">
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi mathvariant="italic">sim</mml:mi>
<mml:mi>cos</mml:mi>
</mml:msub>
<mml:mo stretchy="true">(</mml:mo>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>v</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo>,</mml:mo>
</mml:math>
</disp-formula>
<p>transforming the similarity problem into a distance metric. A smaller distance value indicates a high degree of semantic continuity. When the distance value exceeds a certain preset threshold or a local peak occurs, it is considered that semantic discontinuity has occurred. Based on this signal, the algorithm can infer the boundaries of paragraphs at the corresponding positions. Although other metrics such as Euclidean distance (L2 Norm) can also be used, due to their sensitivity to vector magnitude, they are not as widely applied in the field of text semantic analysis as cosine distance. This segmentation strategy based on semantic distance has become one of the mainstream techniques for text chunking in current long document understanding, information retrieval, and Retrieval Augmented Generation (RAG) systems.</p>
</sec>
<sec id="sec4">
<label>2.2</label>
<title>Clustering methods for text representation</title>
<p>Clustering is a fundamental unsupervised learning technique for organizing text documents by grouping semantically similar items (<xref ref-type="bibr" rid="ref2">Aggarwal and Zhai, 2012</xref>). The choice of algorithm is crucial as it directly influences the quality of the resulting topical hierarchy.</p>
<p>A prevalent category of algorithms, including K-Means and Gaussian Mixture Models (GMMs), operates on geometric or distributional assumptions. These methods aim to partition the embedding space into clusters that are geometrically compact or fit a predefined probability distribution. However, they often presuppose convex or ellipsoidal cluster shapes, a constraint that is frequently violated by the complex, manifold-like structures of thematic groups in textual data (<xref ref-type="bibr" rid="ref18">McInnes et al., 2018</xref>). Furthermore, their efficacy can be limited in the high-dimensional spaces of modern text embeddings, where geometric assumptions may not hold (<xref ref-type="bibr" rid="ref1">Aggarwal et al., 2001</xref>; <xref ref-type="bibr" rid="ref3">Aljaloud et al., 2024</xref>).</p>
<p>An alternative and more robust paradigm is graph-based clustering, often framed as community detection. Advanced topic modeling frameworks like BERTopic (<xref ref-type="bibr" rid="ref12">Grootendorst, 2022</xref>) similarly leverage the rich representations from deep embeddings to uncover complex structures, moving beyond simple geometric assumptions. This approach models documents as nodes in a graph, with edge weights representing semantic similarity. The objective is to identify densely interconnected communities of nodes. This method is agnostic to cluster shape and is thus highly effective at uncovering complex thematic structures. The Leiden algorithm (<xref ref-type="bibr" rid="ref23">Traag et al., 2019</xref>) represents the state-of-the-art in this domain, recognized for its efficiency and its ability to yield well-connected, high-quality communities, making it particularly suitable for discovering latent topics in large text corpora.</p>
</sec>
<sec id="sec5">
<label>2.3</label>
<title>RAPTOR system</title>
<p>To address the challenge of long-document understanding, the RAPTOR system (<xref ref-type="bibr" rid="ref22">Sarthi et al., 2024</xref>; <xref ref-type="bibr" rid="ref6">Cao and Wang, 2022</xref>) introduces a tree-structured indexing approach. It hierarchically organizes information through a recursive &#x201C;embed-cluster-summarize&#x201D; process. The system first generates leaf nodes from initial text chunks, and then recursively groups them using clustering algorithms such as Gaussian Mixture Models (GMM). The nodes in each cluster are then summarized by a large language model to form a parent node at a higher level of abstraction (<xref ref-type="bibr" rid="ref11">Gidi and Cohen, 2022</xref>). This architecture effectively creates a multi-layered semantic hierarchy, from fine-grained details to high-level themes.</p>
<p>While this framework is powerful, its performance is highly dependent on the quality of both its foundational leaf nodes and its structural integrity. This exposes two potential limitations. First, its reliance on a fixed-token chunking strategy can fragment semantically coherent text units, compromising the quality of the leaf nodes. Second, as discussed in Section 2.2, the use of a conventional, distribution-based clustering algorithm like GMM may not optimally capture the complex, non-spherical thematic structures often present in text embedding spaces.</p>
<p>These limitations in both the leaf node generation and the hierarchical clustering stages motivate our work. In this paper, we propose a two-stage enhancement to build more semantically robust and structurally sound retrieval trees.</p>
</sec>
</sec>
<sec id="sec6">
<label>3</label>
<title>An enhanced RAPTOR tree construction framework</title>
<p>In this section, we present our two-stage framework for enhancing the RAPTOR tree construction process. Our approach is designed to build a more semantically robust and structurally coherent retrieval tree by optimizing both the foundational leaf node generation and the subsequent hierarchical clustering. <xref ref-type="fig" rid="fig1">Figure 1</xref> illustrates the overall workflow of our proposed method.</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>The architecture of the enhanced RAPTOR framework. <bold>(A)</bold> Semantic chunking mechanism: illustrates the &#x201C;Leaf Node Generation&#x201D; stage using semantic segmentation. The system calculates the cosine similarity between adjacent sentence embeddings. A segmentation boundary (indicated by the scissor icons) is established only when the similarity drops below the predefined semantic threshold <italic>&#x03C4;</italic> (e.g., 0.7). This dynamic strategy preserves &#x201C;coherent logical units,&#x201D; effectively preventing the &#x201C;context fragmentation&#x201D; often caused by fixed-token chunking. <bold>(B)</bold> Dual-adaptive graph clustering: depicts the construction of the hierarchical structure using the Leiden algorithm driven by a layer-aware dual-adaptive parameter strategy. As the hierarchy ascends from the bottom (Layer 0) to the top: 1. The <italic>neighbor parameter</italic> (<italic>k</italic>) increases linearly (blue arrow) to expand the topological receptive field and capture broader global relationships. 2. The <italic>resolution parameter</italic> (<inline-formula>
<mml:math id="M3">
<mml:mi>&#x03B3;</mml:mi>
</mml:math>
</inline-formula>) decreases linearly (purple arrow) to coarsen granularity and encourage high-level thematic aggregation. Detected communities are summarized by an LLM Agent to form parent nodes for the subsequent layer. <bold>(C)</bold> The enhanced tree: shows the final &#x201C;multi-layered semantic hierarchy.&#x201D; This structure integrates the semantically robust leaf nodes from Panel <bold>A</bold> with the optimized topological clusters from Panel <bold>B</bold>, creating a compact and efficient index for top-down retrieval.</p>
</caption>
<graphic xlink:href="fcomp-07-1710121-g001.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Diagram of the Enhanced RAPTOR Framework with three main components: A) Semantic Chunking Mechanism shows cosine similarity graph with a threshold line and scissor icons cutting sentence nodes. B) Dual-Adaptive Graph Clustering illustrates a pyramid structure with layers, summarization agents, and parameters k and &#x03B3;. C) The Enhanced Tree displays a hierarchical diagram with a root node expanding into multiple levels of sentence nodes.</alt-text>
</graphic>
</fig>
<sec id="sec7">
<label>3.1</label>
<title>Leaf node generation via semantic segmentation</title>
<p>The structural integrity and retrieval accuracy of the entire RAPTOR tree are fundamentally dependent on the quality of its foundational leaf nodes. The original method employs a fixed-token chunking strategy (e.g., 100 tokens per chunk), which, while simple, is oblivious to the underlying semantic structure of the text. This can lead to the fragmentation of coherent logical units, severely impacting the performance of subsequent clustering, summarization, and retrieval tasks.</p>
<p>This limitation is starkly illustrated by an example from the QuALITY dataset, in the text &#x201C;LOST IN TRANSLATION By LARRY M. HARRIS.&#x201D; The fixed-token chunking method partitions a single, causally-linked conversation into three separate chunks (107, 108, and 109). This division severs the logical connection between the premises of the conversation and its conclusion. Consequently, a retrieval query is likely to fetch only the chunk containing the final conclusion (109), while missing the crucial context from the preceding chunks. This results in an incomplete and misleading context for the language model, leading directly to an incorrect answer for the associated question.</p>
<p>To overcome this critical issue, we propose a semantic segmentation strategy for initial node generation. Instead of relying on arbitrary token counts, this method identifies and preserves semantically coherent blocks of text. The core of this approach is to partition the document based on its intrinsic thematic shifts. The process first decomposes the input text into sentences and generates their corresponding vector embeddings. It then iteratively calculates the semantic distance between adjacent sentence embeddings to detect topic boundaries. A new chunk is formed whenever this distance exceeds a predefined threshold, <italic>&#x03C4;</italic>, or a maximum token limit is reached.</p>
<p>When applied to the aforementioned example, our semantic segmentation method correctly groups the entire related conversation into a single, cohesive chunk. By preserving the logical integrity of the text, this approach provides the language model with the complete context necessary for accurate inference. As a result, our enhanced model successfully provides the correct answer to the question that the original RAPTOR failed. The detailed chunking results for this specific example, comparing both methods, can be found in <xref ref-type="supplementary-material" rid="SM1">Appendices A, B</xref>.</p>
<p>This semantic-first approach ensures that each leaf node represents a coherent and self-contained unit of information, providing a high-quality foundation for the subsequent clustering stage. The complete process is formalized in <xref ref-type="statement" rid="algo1">Algorithm 1</xref>.</p>
<statement id="algo1" content-type="algorithm">
<title>ALGORITHM 1 Semantic segment algorithm.</title>
<p>
<fig>
<graphic xlink:href="fcomp-07-1710121-g003.tif" position="anchor">
<alt-text content-type="machine-generated">Flowchart for text chunking algorithm. It details steps for splitting an input string into sentence sequences, obtaining embedding vectors, and appending to result sets based on semantic drift and token count limits. The process ends with outputting the list of text chunks.</alt-text>
</graphic></fig></p>
</statement>
</sec>
<sec id="sec8">
<label>3.2</label>
<title>Construction of the hierarchical structure</title>
<p>With a robust foundation of semantically coherent leaf nodes established in Stage 1, we proceed to construct the tree&#x2019;s internal hierarchical structure. This stage introduces a novel methodology that replaces conventional clustering techniques with a more sophisticated and adaptive graph-based approach, designed to better capture the complex relational structure of textual data.</p>
<sec id="sec9">
<label>3.2.1</label>
<title>Graph-based clustering via community detection</title>
<p>Traditional clustering algorithms, such as GMM, operate under geometric or distributional assumptions that often fail to adequately model the complex, non-spherical manifolds where text embeddings reside. To overcome this, we reframe the clustering problem as a community detection task.</p>
<p>For any given layer of nodes, we first construct a k-Nearest Neighbor (k-NN) graph. In this structure, nodes represent text units (chunks or summaries), and edges signify the semantic proximity between them. The choice of cosine similarity as the edge weighting metric is deliberate. In high-dimensional embedding spaces, Euclidean distance becomes less discriminative due to the curse of dimensionality (<xref ref-type="bibr" rid="ref1">Aggarwal et al., 2001</xref>) and is often sensitive to vector magnitude, which correlates with sentence length rather than meaning. In contrast, cosine similarity captures the directional alignment of semantic vectors. This ensures that our graph topology relies purely on thematic consistency independent of text length, providing a robust foundation for community detection.</p>
<p>We then employ the Leiden algorithm (<xref ref-type="bibr" rid="ref23">Traag et al., 2019</xref>) to partition this graph. Unlike traditional methods, Leiden is agnostic to cluster shape and guarantees well-connected communities. Specifically, we utilize the RBConfigurationVertexPartition method, which optimizes a Potts model and allows for precise control over community density&#x2014;a feature we exploit in our adaptive strategy. Each detected community is then treated as a single cluster, and its constituent nodes are summarized by a Large Language Model to form a parent node in the subsequent, higher layer of the tree.</p>
</sec>
<sec id="sec10">
<label>3.2.2</label>
<title>Dual-adaptive strategy for multi-resolution clustering</title>
<p>A central innovation of our framework is the recognition that a single, static clustering granularity is suboptimal for a multi-layered hierarchy. We introduce a dual-adaptive strategy to dynamically adjust the resolution of the clustering process in correspondence with the level of semantic abstraction.</p>
<p>Our guiding hypothesis is that different layers of the tree demand different notions of semantic proximity. At lower layers, containing specific granular content, the system must prioritize local, strong connections to form tight thematic clusters. Conversely, at higher layers composed of abstract summaries, the system must expand its scope to identify broader, long-range relationships that connect disparate sub-topics into a cohesive whole.</p>
<p>To implement this multi-resolution clustering, we first dynamically adjust the number of neighbors, k, as a linear function of the tree&#x2019;s layer depth (layer_id):</p>
<disp-formula id="E3">
<mml:math id="M4">
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mtext mathvariant="italic">current</mml:mtext>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mtext mathvariant="italic">base</mml:mtext>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:mo stretchy="true">(</mml:mo>
<mml:mtext mathvariant="italic">layer</mml:mtext>
<mml:mo>_</mml:mo>
<mml:mi mathvariant="italic">id</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mtext mathvariant="italic">step</mml:mtext>
</mml:msub>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</disp-formula>
<p>In this formulation, <inline-formula>
<mml:math id="M5">
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mtext mathvariant="italic">base</mml:mtext>
</mml:msub>
</mml:math>
</inline-formula> defines the initial fine-grained connectivity for the leaf layer, while <inline-formula>
<mml:math id="M6">
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mtext mathvariant="italic">step</mml:mtext>
</mml:msub>
</mml:math>
</inline-formula> controls the rate at which the topological search radius expands as the tree ascends. We employ this linear progression strategy as a heuristic to model the &#x201C;Cone of Abstraction&#x201D; inherent in document hierarchies. As the tree ascends, the semantic &#x201C;field of view&#x201D; required to aggregate sub-topics naturally widens. While more complex functions could be hypothesized, a linear increase represents the most parsimonious and robust assumption for general discourse structures, providing a stable expansion of the receptive field without introducing the overfitting risks associated with higher-order hyperparameters.</p>
<p>Complementing this topological adaptation, we simultaneously introduce a dynamic resolution parameter, <italic>&#x03B3;</italic>, for the Leiden algorithm. While the k-value dictates the connectivity of the graph, &#x03B3; controls the granularity of the community detection itself. We initialize &#x03B3; at a higher value to strictly partition local details at the bottom and linearly decay it to encourage the merging of broader communities at higher levels.</p>
<p>This dual-adaptive mechanism ensures that the structural organization of the tree is contextually sensitive to the level of abstraction at each layer, resulting in a more logically sound and semantically meaningful hierarchy. Integrating the graph-based community detection detailed in Section 3.2.1 with the adaptive parameter strategy proposed above, we present the comprehensive workflow for our tree construction. This iterative process, which transforms semantic leaf nodes into a unified hierarchical structure, is formalized in <xref ref-type="statement" rid="algo2">Algorithm 2</xref>.</p>
<statement id="algo2" content-type="algorithm">
<title>ALGORITHM 2 Adaptive graph clustering and hierarchy construction.</title>
<p>
<fig>
<graphic xlink:href="fcomp-07-1710121-g004.tif" position="anchor">
<alt-text content-type="machine-generated">Flowchart showing a hierarchical tree construction algorithm. Input includes leaf nodes and parameters for neighbors and resolution. The output is a hierarchical tree. Steps: 1) Initialize tree with nodes and depth zero. 2) Update parameters for dual-adaptive process, construct graph, and detect community, then summarize the next layer. Move to the next layer. 3) Output tree, ending the algorithm.</alt-text>
</graphic></fig></p>
</statement>
</sec>
</sec>
</sec>
<sec id="sec11">
<label>4</label>
<title>Experiments</title>
<sec id="sec12">
<label>4.1</label>
<title>Experimental setup</title>
<p>To rigorously evaluate the proposed framework&#x2019;s performance and generalization capabilities, we conducted experiments across two distinct datasets representing different domains and task formats.</p>
<sec id="sec13">
<label>4.1.1</label>
<title>Datasets</title>
<p><italic>QuALITY (Narrative Long-Context Understanding)</italic>: We utilize the QuALITY dataset (<xref ref-type="bibr" rid="ref20">Pang et al., 2022</xref>) as our primary benchmark for evaluating narrative comprehension. This dataset consists of long-form documents (average 5&#x202F;k tokens) with complex, cross-paragraph questions requiring reasoning over disparate parts of the text. The task is formatted as multiple-choice question answering.</p>
<p><italic>Qasper (Scientific Literature QA)</italic>: To assess the model&#x2019;s domain generalization and robustness in processing highly structured, logic-dense text, we extend our evaluation to the Qasper dataset (<xref ref-type="bibr" rid="ref10">Dasigi et al., 2021</xref>). Qasper focuses on information-seeking questions over full-text computer science research papers. Unlike QuALITY, Qasper requires open-ended question answering, challenging the retrieval system to synthesize precise answers from technical content containing formulas, figures, and complex citations.</p>
</sec>
<sec id="sec14">
<label>4.1.2</label>
<title>Evaluation metrics</title>
<p>Given the differing nature of the tasks, we employ task-specific metrics:</p>
<list list-type="simple">
<list-item>
<p><italic>Accuracy (for QuALITY)</italic>: Following standard benchmarks, we report Accuracy as the primary metric for the multiple-choice questions in QuALITY.</p>
</list-item>
<list-item>
<p><italic>Lexical and Semantic Metrics (for Qasper)</italic>: For the open-ended generation tasks in Qasper, we adopt a dual-faceted evaluation strategy:</p>
</list-item>
</list>
<list list-type="bullet">
<list-item>
<p><italic>Lexical Overlap Metrics</italic>: We utilize Token F1 Score, ROUGE-1, and ROUGE-L to quantify the surface-level lexical match between the generated answers and the ground truth. ROUGE-1 assesses information coverage (unigram overlap), while ROUGE-L evaluates structural coherence (longest common subsequence).</p>
</list-item>
<list-item>
<p><italic>LLM Score (Semantic Evaluation)</italic>: Recognizing that lexical overlap metrics may penalize semantically correct but differently phrased answers, we introduce a model-based metric, LLM Score. We employ DeepSeek-V3 as an expert evaluator to rate the generated answer against the gold reference on a 5-point Likert scale (1: Bad to 5: Perfect). This metric specifically prioritizes information completeness and logical correctness over mere string matching. The specific evaluation prompt used is detailed in <xref ref-type="supplementary-material" rid="SM1">Appendix C</xref>.</p>
</list-item>
</list>
</sec>
<sec id="sec15">
<label>4.1.3</label>
<title>Implementation details</title>
<p><italic>Models</italic>: We utilize the BAAI/bge-m3 model (<xref ref-type="bibr" rid="ref8">Chen et al., 2024b</xref>) for all text embeddings. The deepseek-v3-0324 model is accessed via its official API to perform both the summarization of clusters and the final question-answering tasks. The specific prompt templates used for these tasks are detailed in <xref ref-type="supplementary-material" rid="SM1">Appendix C</xref>.</p>
<p><italic>Clustering Configuration</italic>: For the adaptive graph clustering stage, we employed the Leiden algorithm utilizing the RBConfigurationVertexPartition method to optimize the community structure. The graph construction relies on a k-Nearest Neighbor (k-NN) approach where edges are weighted by the cosine similarity between node embeddings. To implement our dual-adaptive parameter strategy, we configured the parameters as follows:</p>
<list list-type="bullet">
<list-item>
<p><italic>Adaptive Neighbors (k)</italic>: We set the initial neighbor count <inline-formula>
<mml:math id="M7">
<mml:msub>
<mml:mi mathvariant="normal">k</mml:mi>
<mml:mtext>base</mml:mtext>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mn>15</mml:mn>
</mml:math>
</inline-formula> for the leaf layer, increasing by <inline-formula>
<mml:math id="M8">
<mml:msub>
<mml:mi mathvariant="normal">k</mml:mi>
<mml:mtext>step</mml:mtext>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mn>5</mml:mn>
</mml:math>
</inline-formula> for each subsequent layer to expand the topological receptive field.</p>
</list-item>
<list-item>
<p><italic>Adaptive Resolution</italic> (<inline-formula>
<mml:math id="M9">
<mml:mi>&#x03B3;</mml:mi>
</mml:math>
</inline-formula>): We initialized the resolution parameter at <inline-formula>
<mml:math id="M10">
<mml:msub>
<mml:mi>&#x03B3;</mml:mi>
<mml:mtext>base</mml:mtext>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mn>1.0</mml:mn>
</mml:math>
</inline-formula> and linearly decayed it by <inline-formula>
<mml:math id="M11">
<mml:msub>
<mml:mi>&#x03B3;</mml:mi>
<mml:mtext>step</mml:mtext>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mn>0.2</mml:mn>
</mml:math>
</inline-formula> per layer (minimum 0.1) to encourage broader semantic aggregation at higher levels.</p>
</list-item>
<list-item>
<p><italic>Constraints</italic>: To manage the context window limits of the summarization model, we imposed a strict maximum cluster size of 100 nodes. Any community exceeding this threshold was recursively re-clustered using the same adaptive logic.</p>
</list-item>
</list>
<p>To ensure reproducibility, we fixed the random seed to 224 for all sampling, clustering, and embedding processes.</p>
</sec>
<sec id="sec16">
<label>4.1.4</label>
<title>Comparative configurations</title>
<p>We evaluate three distinct configurations to isolate the contributions of each component:</p>
<list list-type="bullet">
<list-item>
<p><italic>Original RAPTOR (Baseline)</italic>: Employs fixed-token chunking (100 tokens) and its default GMM-based clustering.</p>
</list-item>
<list-item>
<p><italic>RAPTOR + SC</italic>: Integrates our semantic chunking (SC) method. We test a range of semantic thresholds <italic>&#x03C4;</italic> &#x2208; {0.3, 0.4, 0.5, 0.6, 0.7, 0.8}.</p>
</list-item>
<list-item>
<p><italic>Our Full Model (RAPTOR + SC + AGC)</italic>: Our complete model, combining semantic chunking with our adaptive graph clustering (AGC) algorithm, utilizing the dual-adaptive parameter settings described above. We also evaluate a variant <italic>(Fixed Chunking + AGC)</italic> on the Qasper dataset to verify the independent effectiveness of the graph clustering algorithm.</p>
</list-item>
</list>
</sec>
</sec>
<sec id="sec17">
<label>4.2</label>
<title>Performance on narrative long-context QA (QuALITY)</title>
<sec id="sec18">
<label>4.2.1</label>
<title>Impact of semantic segmentation</title>
<p>To isolate the effect of our first contribution, we first replace RAPTOR&#x2019;s fixed-token chunking with our semantic segmentation approach. We conducted a parameter sweep across various semantic thresholds (<italic>&#x03C4;</italic>) to identify the optimal configuration. The results are presented in <xref ref-type="fig" rid="fig2">Figure 2</xref>.</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Model accuracy on the QuALITY dataset as a function of the semantic segmentation threshold. The dashed line represents the Original RAPTOR baseline.</p>
</caption>
<graphic xlink:href="fcomp-07-1710121-g002.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Line graph titled "Accuracy Comparison of RAPTOR + SC vs. Original RAPTOR" showing accuracy percentages on the Y-axis and semantic segmentation thresholds on the X-axis. Original RAPTOR accuracy is marked by a red dashed line at 49.67%. A blue line with error bars represents RAPTOR + SC accuracy, increasing from approximately 45% to 55% before dropping at higher thresholds.</alt-text>
</graphic>
</fig>
<p>The results illustrate a distinct non-monotonic relationship between model performance and the chosen semantic threshold. Accuracy improves steadily as <italic>&#x03C4;</italic> increases from 0.3, peaking at &#x03C4;&#x202F;=&#x202F;0.7 with an accuracy of 55.17%, before declining at <italic>&#x03C4;</italic>&#x202F;=&#x202F;0.8. This peak represents a significant 5.5 percentage point improvement over the fixed-token baseline (49.67%).</p>
<p>The peak performance at <italic>&#x03C4;</italic>&#x202F;=&#x202F;0.7 suggests that this threshold represents an optimal equilibrium point between granularity and coherence:</p>
<list list-type="bullet">
<list-item>
<p><italic>Below 0.7 (Over-segmentation)</italic>: At lower thresholds (<italic>&#x03C4;</italic>&#x202F;&#x2264;&#x202F;0.6), the segmentation algorithm is overly sensitive to minor lexical changes. This triggers excessive splitting, shattering coherent logical units&#x2014;such as a narrative event or a premise-conclusion pair&#x2014;into disjoint fragments. This fragmentation forces the retrieval system to piece together scattered context, significantly increasing the risk of missing critical links required for complex reasoning.</p>
</list-item>
<list-item>
<p><italic>Above 0.7 (Semantic Drift)</italic>: Conversely, at higher thresholds (e.g., <italic>&#x03C4;</italic>&#x202F;=&#x202F;0.8), the chunking becomes too lenient. The algorithm fails to detect subtle topic shifts, allowing distinct, unrelated themes to merge into noisy, multi-topical blocks. This semantic drift dilutes the specific embedding of the leaf node, making precise retrieval more difficult.</p>
</list-item>
<list-item>
<p><italic>Optimal (&#x03C4;&#x202F;=&#x202F;0.7)</italic>: Therefore, &#x03C4;&#x202F;=&#x202F;0.7 appears to align most closely with the natural semantic pulse of human-written text. It effectively captures complete reasoning chains within a single node while maintaining thematic purity, providing a high-quality foundation for the subsequent clustering stage.</p>
</list-item>
</list>
<p>This analysis validates the general effectiveness of the semantic chunking approach and identifies <inline-formula>
<mml:math id="M12">
<mml:mi mathvariant="normal">&#x03C4;</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>0.7</mml:mn>
</mml:math>
</inline-formula> as its optimal operating point. In the following section, we will evaluate our full model, which incorporates adaptive graph clustering, across this same range of thresholds to assess its cumulative impact and consistency.</p>
</sec>
<sec id="sec19">
<label>4.2.2</label>
<title>Combined effect with adaptive graph clustering</title>
<p>Having established the efficacy of semantic chunking, we now evaluate our full model, which integrates adaptive graph clustering (AGC) on top of this foundation. To provide a comprehensive comparison, we test both the RAPTOR + SC model and Our Full Model (RAPTOR + SC&#x202F;+&#x202F;AGC) across the full range of semantic thresholds. The results are presented in <xref ref-type="table" rid="tab1">Table 1</xref>.</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>Performance comparison across different semantic thresholds (<italic>&#x03C4;</italic>).</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Semantic threshold (&#x03C4;)</th>
<th align="center" valign="top">RAPTOR + SC (accuracy, %)</th>
<th align="center" valign="top">Our full model (accuracy, %)</th>
<th align="center" valign="top">Performance gain from AGC (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">0.3</td>
<td align="center" valign="middle">46.33</td>
<td align="center" valign="middle">38.50</td>
<td align="center" valign="middle">&#x2212;7.83</td>
</tr>
<tr>
<td align="left" valign="middle">0.4</td>
<td align="center" valign="middle">45.50</td>
<td align="center" valign="middle">45.00</td>
<td align="center" valign="middle">&#x2212;0.50</td>
</tr>
<tr>
<td align="left" valign="middle">0.5</td>
<td align="center" valign="middle">49.00</td>
<td align="center" valign="middle">51.00</td>
<td align="center" valign="middle">+2.00</td>
</tr>
<tr>
<td align="left" valign="middle">0.6</td>
<td align="center" valign="middle">51.83</td>
<td align="center" valign="middle">57.50</td>
<td align="center" valign="middle">+5.67</td>
</tr>
<tr>
<td align="left" valign="middle"><bold>0.7</bold></td>
<td align="center" valign="middle"><bold>55.17</bold></td>
<td align="center" valign="middle"><bold>65.50</bold></td>
<td align="center" valign="middle">+10.33</td>
</tr>
<tr>
<td align="left" valign="middle">0.8</td>
<td align="center" valign="middle">48.33</td>
<td align="center" valign="middle">64.00</td>
<td align="center" valign="middle"><bold>+15.67</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Bold values indicate the highest accuracy achieved in each column.</p>
</table-wrap-foot>
</table-wrap>
<p>The results reveal several crucial insights:</p>
<list list-type="simple">
<list-item>
<p><italic>Consistent Improvement in Optimal Range</italic>: When the semantic threshold is within a reasonable range (<italic>&#x03C4;</italic>&#x202F;&#x2265;&#x202F;0.5), our full model consistently outperforms the RAPTOR + SC model. This demonstrates that the adaptive graph clustering provides a significant performance enhancement when operating on a foundation of well-formed, coherent leaf nodes.</p>
</list-item>
<list-item>
<p><italic>Peak Performance and Synergistic Effect</italic>: The performance of our full model also peaks at <italic>&#x03C4;</italic>&#x202F;=&#x202F;0.7, reaching a final accuracy of 65.5%. At this optimal operating point, the introduction of AGC yields an absolute performance gain of 10.33 percentage points over semantic chunking alone. This substantial improvement strongly suggests a synergistic effect: the adaptive graph clustering algorithm is able to fully capitalize on the high-quality semantic chunks, leading to a much more effective retrieval hierarchy than either enhancement could achieve in isolation.</p>
</list-item>
<list-item>
<p><italic>Behavior at Extreme Thresholds</italic>: At lower thresholds (<italic>&#x03C4;</italic>&#x202F;&#x003C;&#x202F;0.5), where semantic chunking leads to over-segmentation, the performance of the full model degrades. This is expected, as even a superior clustering algorithm cannot effectively group overly fragmented and context-poor leaf nodes. Interestingly, at a very high threshold (<italic>&#x03C4;</italic>&#x202F;=&#x202F;0.8), while the performance of RAPTOR + SC drops, our full model maintains a high accuracy. This suggests that the robust graph clustering mechanism may be more resilient to the noise introduced by slightly over-lenient chunking compared to the default GMM.</p>
</list-item>
</list>
<p>Overall, our complete model, integrating both enhancements, significantly outperforms the original RAPTOR baseline (49.67%) by a margin of 15.83 percentage points, confirming the substantial value of our two-stage optimization framework.</p>
</sec>
<sec id="sec20">
<label>4.2.3</label>
<title>Impact of clustering strategy (ablation study)</title>
<p>To validate the necessity of our proposed Adaptive Graph Clustering (AGC), we conducted an ablation study comparing it against other prevalent clustering algorithms. To ensure a fair comparison, all methods were evaluated using the same high-quality leaf nodes generated by Semantic Chunking with the optimal threshold (<italic>&#x03C4;</italic>&#x202F;=&#x202F;0.7).</p>
<p>We compared the following clustering methodologies:</p>
<list list-type="simple">
<list-item>
<p><italic>Agglomerative Clustering</italic>: A standard bottom-up hierarchical approach (distance threshold&#x202F;=&#x202F;0.9).</p>
</list-item>
<list-item>
<p><italic>HDBSCAN</italic>: A density-based algorithm (min_cluster_size&#x202F;=&#x202F;2) known for handling noise.</p>
</list-item>
<list-item>
<p><italic>Gaussian Mixture Models (GMM)</italic>: The probabilistic clustering method used in the original RAPTOR framework.</p>
</list-item>
<list-item>
<p><italic>Adaptive Graph Clustering (Ours)</italic>: Our proposed graph-based community detection method.</p>
</list-item>
</list>
<p>The results are summarized in <xref ref-type="table" rid="tab2">Table 2</xref>.</p>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>Accuracy comparison of different clustering algorithms on the QuALITY dataset (fixed semantic chunking &#x03C4;&#x202F;=&#x202F;0.7).</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Clustering algorithm</th>
<th align="center" valign="top">Methodology type</th>
<th align="center" valign="top">Accuracy (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Agglomerative (<italic>t</italic> =&#x202F;0.9)</td>
<td align="center" valign="top">Distance-based (hierarchical)</td>
<td align="center" valign="top">47.50</td>
</tr>
<tr>
<td align="left" valign="top">HDBSCAN (min_size&#x202F;=&#x202F;2)</td>
<td align="center" valign="top">Density-based</td>
<td align="center" valign="top">49.00</td>
</tr>
<tr>
<td align="left" valign="middle">GMM (RAPTOR baseline)</td>
<td align="center" valign="middle">Distribution-based (Probabilistic)</td>
<td align="center" valign="middle">55.17</td>
</tr>
<tr>
<td align="left" valign="top">Adaptive graph clustering</td>
<td align="center" valign="top">Graph-based (topological)</td>
<td align="center" valign="top">65.50</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The substantial performance gap between the algorithms highlights the critical role of structural organization in hierarchical retrieval:</p>
<list list-type="simple">
<list-item>
<p><italic>Failure of Distance and Density Metrics</italic>: Both Agglomerative Clustering (47.50%) and HDBSCAN (49.00%) underperformed significantly, falling below the GMM baseline. Agglomerative clustering suffers from the rigidity of fixed distance thresholds in high-dimensional embedding spaces. Similarly, HDBSCAN&#x2019;s mechanism of classifying sparse data points as &#x201C;noise&#x201D; is detrimental in the RAG context, as outliers often contain unique, query-specific details that are essential for retrieval. Discarding them leads to information loss.</p>
</list-item>
<list-item>
<p><italic>Limitations of Geometric Assumptions</italic>: While GMM (55.17%) performs respectably due to its soft-clustering nature, it is constrained by the assumption that semantic topics form spherical Gaussian distributions&#x2014;a simplification that often fails to capture the complex, irregular manifold of natural language representations.</p>
</list-item>
<list-item>
<p><italic>Superiority of Graph Topology</italic>: Our Adaptive Graph Clustering (65.50%) outperforms the next best method (GMM) by over 10 percentage points. This improvement stems from the method&#x2019;s ability to model semantic relationships as a topological graph structure rather than geometric clusters. By leveraging the Leiden algorithm with our dual-adaptive strategy, it preserves the connectivity of the semantic manifold and ensures that every node is meaningfully integrated into the hierarchy, avoiding both the information loss of density methods and the rigid assumptions of geometric methods.</p>
</list-item>
</list>
</sec>
</sec>
<sec id="sec21">
<label>4.3</label>
<title>Generalization on scientific literature (Qasper)</title>
<p>To investigate the domain generalization capabilities of our framework, we extended our evaluation from the narrative texts of QuALITY to the highly technical and logic-dense scientific papers of the Qasper dataset. This experiment aims to verify whether our proposed enhancements&#x2014;Semantic Chunking (SC) and Adaptive Graph Clustering (AGC)&#x2014;maintain their effectiveness in retrieval scenarios that require synthesizing information from complex academic discourse.</p>
<sec id="sec22">
<label>4.3.1</label>
<title>Performance comparison</title>
<p>We evaluated the Original RAPTOR baseline, an ablation model utilizing Fixed Chunking with AGC, and our Full Model across a range of semantic thresholds (<italic>&#x03C4;</italic>). <xref ref-type="table" rid="tab3">Table 3</xref> presents the performance comparison using both lexical overlap metrics (Token F1, ROUGE) and the semantic-aware LLM Score.</p>
<table-wrap position="float" id="tab3">
<label>Table 3</label>
<caption>
<p>Performance comparison on the Qasper dataset.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Model configuration</th>
<th align="center" valign="top">Token F1</th>
<th align="center" valign="top">ROUGE-1 (F1)</th>
<th align="center" valign="top">ROUGE-L (F1)</th>
<th align="center" valign="top">LLM score (1&#x2013;5)</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Original RAPTOR (Baseline)</td>
<td align="center" valign="top">6.45</td>
<td align="center" valign="top">8.10</td>
<td align="center" valign="top">5.90</td>
<td align="center" valign="top">3.08</td>
</tr>
<tr>
<td align="left" valign="top">Fixed Chunking + AGC</td>
<td align="center" valign="top"><bold>7.45</bold></td>
<td align="center" valign="top">9.41</td>
<td align="center" valign="top">7.46</td>
<td align="center" valign="top">3.23</td>
</tr>
<tr>
<td align="left" valign="top">Full Model (&#x03C4;&#x202F;=&#x202F;0.3)</td>
<td align="center" valign="top">7.09</td>
<td align="center" valign="top">9.67</td>
<td align="center" valign="top">7.41</td>
<td align="center" valign="top">3.21</td>
</tr>
<tr>
<td align="left" valign="top">Full Model (&#x03C4;&#x202F;=&#x202F;0.4)</td>
<td align="center" valign="top">6.51</td>
<td align="center" valign="top">8.99</td>
<td align="center" valign="top">6.72</td>
<td align="center" valign="top">3.22</td>
</tr>
<tr>
<td align="left" valign="top">Full Model (&#x03C4;&#x202F;=&#x202F;0.5)</td>
<td align="center" valign="top">6.00</td>
<td align="center" valign="top">8.51</td>
<td align="center" valign="top">6.47</td>
<td align="center" valign="top">3.24</td>
</tr>
<tr>
<td align="left" valign="top">Full Model (&#x03C4;&#x202F;=&#x202F;0.6)</td>
<td align="center" valign="top">7.14</td>
<td align="center" valign="top"><bold>9.83</bold></td>
<td align="center" valign="top"><bold>7.84</bold></td>
<td align="center" valign="top">3.22</td>
</tr>
<tr>
<td align="left" valign="top">Full Model (&#x03C4;&#x202F;=&#x202F;0.7)</td>
<td align="center" valign="top">6.88</td>
<td align="center" valign="top">9.17</td>
<td align="center" valign="top">7.18</td>
<td align="center" valign="top"><bold>3.26</bold></td>
</tr>
<tr>
<td align="left" valign="top">Full Model (&#x03C4;&#x202F;=&#x202F;0.8)</td>
<td align="center" valign="top">7.04</td>
<td align="center" valign="top">9.50</td>
<td align="center" valign="top">7.17</td>
<td align="center" valign="top">3.25</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The &#x201C;LLM Score&#x201D; (1&#x2013;5) evaluates semantic accuracy and completeness as rated by an expert LLM judge. The highest scores in each category are highlighted in bold.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="sec23">
<label>4.3.2</label>
<title>Analysis of results</title>
<p>The experimental results on Qasper reveal three critical insights regarding the structural and semantic advantages of our framework:</p>
<list list-type="simple">
<list-item>
<p><italic>Efficacy of Adaptive Graph Clustering</italic>: Comparing the Original RAPTOR baseline with the Fixed Chunking + AGC ablation model demonstrates the independent contribution of our clustering algorithm. Even without semantic segmentation, replacing GMM with Adaptive Graph Clustering significantly improves performance across all metrics, raising the LLM Score from 3.08 to 3.23 and Token F1 from 6.45% to 7.45%. This confirms that the graph-based hierarchical structure is intrinsically better suited for organizing the complex, non-spherical topic manifolds found in scientific literature, resulting in better information retrieval regardless of the chunking strategy.</p>
</list-item>
<list-item>
<p><italic>Robustness of the Optimal Threshold (&#x03C4;&#x202F;=&#x202F;0.7)</italic>: Consistent with our findings on the QuALITY dataset (Section 4.2), the Full Model achieves its highest semantic performance (LLM Score: 3.26) at a threshold of <italic>&#x03C4;</italic>&#x202F;=&#x202F;0.7. This recurrence suggests that &#x03C4;&#x202F;=&#x202F;0.7 represents a robust equilibrium point for text segmentation across different domains, effectively balancing the granularity required for detail retrieval with the coherence needed for logical reasoning.</p>
</list-item>
<list-item>
<p><italic>Divergence between Lexical and Semantic Metrics</italic>: A notable observation in <xref ref-type="table" rid="tab3">Table 3</xref> is the divergence between exact-match metrics (Token F1, ROUGE) and the semantic LLM Score. While Fixed Chunking + AGC achieves the highest Token F1 (7.45%), it falls short in the LLM Score (3.23) compared to the Full Model at &#x03C4;&#x202F;=&#x202F;0.7 (3.26). This discrepancy highlights the limitation of N-gram overlap metrics in complex QA tasks. Fixed-token chunking often severs semantic dependencies (e.g., separating a hypothesis from its result), leading to retrieved contexts that contain correct keywords (high F1) but lack logical continuity. In contrast, our semantic segmentation ensures that retrieval units are self-contained logical blocks. Although this may result in slightly lower surface-level lexical overlap, it provides the generation model with a more coherent context, enabling it to synthesize answers that are semantically superior and more logically accurate, as reflected by the expert LLM evaluation.</p>
</list-item>
</list>
<p>In conclusion, the Qasper experiments validate the domain generalization of our framework. By prioritizing semantic integrity through segmentation and structural optimization through graph clustering, our model outperforms the baseline in generating high-quality, logic-driven answers for scientific queries.</p>
</sec>
</sec>
<sec id="sec24">
<label>4.4</label>
<title>Computational cost analysis</title>
<p>To assess the practical implications of our proposed enhancements, we conducted a detailed analysis of the computational costs associated with the tree construction process. We logged time and token consumption across three different document lengths (~6&#x202F;k, ~18&#x202F;k, and ~65&#x202F;k tokens) for both the RAPTOR + SC and our Full Model configurations. The key findings are summarized in <xref ref-type="table" rid="tab4">Table 4</xref>, focusing on the most challenging ~65&#x202F;k token document as a representative case.</p>
<table-wrap position="float" id="tab4">
<label>Table 4</label>
<caption>
<p>Computational cost comparison for a ~65&#x202F;k token document at the optimal threshold (&#x03C4;&#x202F;=&#x202F;0.7) versus the fixed-token baseline.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Metric</th>
<th align="center" valign="top">Original RAPTOR (fixed-token)</th>
<th align="center" valign="top">RAPTOR + SC (&#x03C4;&#x202F;=&#x202F;0.7)</th>
<th align="center" valign="top">Our full model (&#x03C4;&#x202F;=&#x202F;0.7)</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">Initial Embedding Cost (Stage 1)</td>
<td/>
<td/>
<td/>
</tr>
<tr>
<td align="left" valign="middle">time_stage1_sent_embed</td>
<td align="center" valign="middle">N/A</td>
<td align="center" valign="middle">71.64</td>
<td align="center" valign="middle">67.86</td>
</tr>
<tr>
<td align="left" valign="middle">tokens_stage1_sent_embed</td>
<td align="center" valign="middle">N/A</td>
<td align="center" valign="middle">81,311</td>
<td align="center" valign="middle">86,617</td>
</tr>
<tr>
<td align="left" valign="middle">Tree Structure Complexity</td>
<td/>
<td/>
<td/>
</tr>
<tr>
<td align="left" valign="middle">num_leaf_nodes</td>
<td align="center" valign="middle">726</td>
<td align="center" valign="middle">714</td>
<td align="center" valign="middle">714</td>
</tr>
<tr>
<td align="left" valign="middle">num_summary_nodes</td>
<td align="center" valign="middle">134</td>
<td align="center" valign="middle">141</td>
<td align="center" valign="middle">34</td>
</tr>
<tr>
<td align="left" valign="middle">Tree Build Cost (Stage 3)</td>
<td/>
<td/>
<td/>
</tr>
<tr>
<td align="left" valign="middle">time_stage3_tree_build_s</td>
<td align="center" valign="middle">118.78</td>
<td align="center" valign="middle">131.0</td>
<td align="center" valign="middle">261.23</td>
</tr>
<tr>
<td align="left" valign="middle">tokens_to_summarize</td>
<td align="center" valign="middle">95,001</td>
<td align="center" valign="middle">92,927</td>
<td align="center" valign="middle">73,282</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Based on the empirical data and theoretical modeling, we analyze the cost-performance dynamics from three perspectives:</p>
<list list-type="simple">
<list-item>
<p><italic>Empirical Cost Breakdown</italic>: The introduction of semantic chunking incurs an upfront computational cost (Stage 1), increasing the initial processing time from negligible in the baseline to approximately 68&#x202F;s. This is due to the necessity of embedding all sentences to detect semantic boundaries. However, this investment yields a dramatic return in structural efficiency. The Adaptive Graph Clustering (AGC) constructs a significantly more compact hierarchy, requiring only 34 summary nodes compared to 141 in the baseline, a 76% reduction. Consequently, the token consumption for LLM summarization (Stage 3), which is typically the most expensive component of the RAPTOR tree construction, drops from 95,001 to 73,282 tokens.</p>
</list-item>
<list-item>
<p><italic>Theoretical Complexity Analysis</italic>: To understand the scalability of our approach, we analyze the time complexity with respect to the document length <italic>N</italic> (in tokens). Let C denote the number of chunks, where <inline-formula>
<mml:math id="M13">
<mml:mi mathvariant="normal">C</mml:mi>
<mml:mo>&#x221D;</mml:mo>
<mml:mi mathvariant="normal">N</mml:mi>
</mml:math>
</inline-formula>.</p>
</list-item>
</list>
<list list-type="bullet">
<list-item>
<p><italic>Embedding</italic> (<inline-formula>
<mml:math id="M14">
<mml:mi mathvariant="normal">O</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>N</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</inline-formula>): Our semantic segmentation requires passing the full text through the embedding model, introducing a linear complexity <inline-formula>
<mml:math id="M15">
<mml:mi mathvariant="normal">O</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>N</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</inline-formula>. This explains the upfront time cost observed.</p>
</list-item>
<list-item>
<p><italic>Clustering</italic> (<inline-formula>
<mml:math id="M16">
<mml:mi>O</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:msup>
<mml:mi>C</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</inline-formula>): The graph construction involves a k-NN search, theoretically scaling as <inline-formula>
<mml:math id="M17">
<mml:mi>O</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:msup>
<mml:mi>C</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</inline-formula>, followed by the Leiden algorithm with near-linear complexity <inline-formula>
<mml:math id="M18">
<mml:mi>O</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</inline-formula>. While <inline-formula>
<mml:math id="M19">
<mml:mi>O</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:msup>
<mml:mi>C</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</inline-formula> appears computationally intensive, C represents chunks rather than tokens (e.g., C&#x202F;&#x2248;&#x202F;700 for N&#x202F;=&#x202F;65&#x202F;k). Thus, the actual computation time is trivial compared to LLM inference.</p>
</list-item>
<list-item>
<p><italic>Summarization</italic> (<inline-formula>
<mml:math id="M20">
<mml:mi>O</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi mathvariant="italic">sum</mml:mi>
</mml:msub>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</inline-formula>): The dominant factor in total latency is the LLM summarization, scaling as <inline-formula>
<mml:math id="M21">
<mml:mi>O</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi mathvariant="italic">sum</mml:mi>
</mml:msub>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</inline-formula>, where <inline-formula>
<mml:math id="M22">
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi mathvariant="italic">sum</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> is the total number of summary nodes. Our Dual-Adaptive strategy minimizes <inline-formula>
<mml:math id="M23">
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi mathvariant="italic">sum</mml:mi>
</mml:msub>
</mml:math>
</inline-formula>, effectively reducing the coefficient of the most expensive term in the total cost equation.</p>
<list list-type="simple">
<list-item>
<p><italic>The Strategic Trade-off: Prioritizing Structure for Efficiency</italic></p>
</list-item>
</list>
</list-item>
</list>
<p>While our Full Model incurs a higher upfront time cost due to semantic embedding and graph construction, this represents a deliberate optimization of the RAPTOR framework: allocating more resources to the low-latency structuring phase to improve the efficiency of the high-latency generation phase.</p>
<list list-type="bullet">
<list-item>
<p><italic>Cost-Efficiency</italic>: By leveraging the Leiden algorithm to construct a denser hierarchy, we exchange a modest increase in CPU-based clustering time for a favorable reduction in API token consumption. This shift effectively lowers the computational burden on the most expensive component of the pipeline, the LLM summarization.</p>
</list-item>
<list-item>
<p><italic>Information Density</italic>: The enhanced clustering process serves to improve semantic coherence. Instead of summarizing text fragments that may be arbitrarily segmented, our method guides the LLM to process well-grouped, thematically related communities. This likely mitigates the propagation of noise and increases the informational value of each generated summary node.</p>
</list-item>
<list-item>
<p><italic>Performance ROI</italic>: Crucially, this investment in structural integrity translates into substantial retrieval improvements. The peak accuracy gain of 15.83% over the RAPTOR baseline suggests that a refined tree structure is highly beneficial for complex reasoning tasks, effectively justifying the additional preprocessing overhead.</p>
</list-item>
</list>
</sec>
</sec>
<sec id="sec25">
<label>5</label>
<title>Conclusion and future work</title>
<sec id="sec26">
<label>5.1</label>
<title>Conclusion</title>
<p>In this work, we enhanced the RAPTOR framework by addressing limitations in context fragmentation and hierarchical organization. We proposed a two-stage approach integrating <italic>Semantic Segmentation</italic> to preserve logical units and <italic>Adaptive Graph Clustering (AGC)</italic> to optimize tree topology.</p>
<p>Extensive evaluations on both the <italic>QuALITY</italic> (narrative) and <italic>Qasper</italic> (scientific) datasets demonstrate the robustness and generalization of our method.</p>
<list list-type="simple">
<list-item>
<p><italic>Performance</italic>: Our model achieved a peak accuracy of 65.5% on QuALITY 1 and demonstrated superior semantic validity (LLM Score: 3.26) on the Qasper benchmark, consistently peaking at a semantic threshold of <italic>&#x03C4;</italic>&#x202F;=&#x202F;0.7.</p>
</list-item>
<list-item>
<p><italic>Structural Efficacy</italic>: Ablation studies confirm that our graph-topological approach significantly outperforms traditional distance-based (Agglomerative), density-based (HDBSCAN), and distribution-based (GMM) clustering methods.</p>
</list-item>
</list>
<p>These results underscore the efficacy of a &#x201C;semantic-first&#x201D; strategy, proving that optimizing both foundational leaf nodes and structural organization yields a more coherent and efficient retrieval hierarchy for complex RAG tasks.</p>
</sec>
<sec id="sec27">
<label>5.2</label>
<title>Future work</title>
<p>While our proposed framework has demonstrated significant improvements on both narrative and scientific datasets, the critical role of the clustering structure revealed in our experiments suggests several promising avenues for future research:</p>
<list list-type="simple">
<list-item>
<p><italic>Optimization of Adaptive Parameter Strategies</italic>: Our ablation studies confirmed that the topological structure of the retrieval tree is a decisive factor in performance. Currently, our dual-adaptive strategy employs a heuristic linear function to adjust the neighbor count (k) and resolution (<inline-formula>
<mml:math id="M24">
<mml:mi>&#x03B3;</mml:mi>
</mml:math>
</inline-formula>). Future work should investigate non-linear adaptation schemes (e.g., exponential or logarithmic scaling) to better model the &#x201C;Cone of Abstraction.&#x201D; Furthermore, we propose exploring data-driven adaptation, where clustering parameters are dynamically tuned based on the intrinsic density or manifold curvature of the specific document&#x2019;s embeddings, rather than relying on fixed layer-based rules.</p>
</list-item>
<list-item>
<p><italic>Automated Hyperparameter Tuning</italic>: Our experiments identified <italic>&#x03C4;</italic>&#x202F;=&#x202F;0.7 as a robust threshold across domains. However, manual grid search is inefficient for diverse real-world applications. Developing a lightweight, unsupervised metric to automatically estimate the optimal segmentation threshold (&#x03C4;) and clustering density for unseen domains would be a significant advancement.</p>
</list-item>
<list-item>
<p><italic>Scalability and Efficiency</italic>: Although effective, the exact k-NN graph construction in our approach incurs a computational cost of <inline-formula>
<mml:math id="M25">
<mml:mi>O</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:msup>
<mml:mi>C</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</inline-formula>. Integrating Approximate Nearest Neighbor (ANN) algorithms, such as HNSW, could dramatically accelerate graph construction with negligible accuracy loss, making the framework scalable to massive corpora.</p>
</list-item>
<list-item>
<p><italic>Impact of Embedding Manifolds</italic>: Since graph topology is derived from embedding similarities, the choice of the embedding model fundamentally dictates the cluster quality. Future research should systematically evaluate how different embedding architectures (e.g., dense vs. sparse, general vs. domain-specific) interact with our graph clustering algorithms to further optimize the semantic structure.</p>
</list-item>
</list>
</sec>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec28">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. The QuALITY dataset can be found at: <ext-link xlink:href="https://github.com/nyu-mll/QuALITY" ext-link-type="uri">https://github.com/nyu-mll/QuALITY</ext-link>. The Qasper dataset can be found at: <ext-link xlink:href="https://allenai.org/data/qasper" ext-link-type="uri">https://allenai.org/data/qasper</ext-link>. The source code presented in this study is publicly available at: <ext-link xlink:href="https://github.com/Xin5643/Graph-raptor" ext-link-type="uri">https://github.com/Xin5643/Graph-raptor</ext-link>.</p>
</sec>
<sec sec-type="author-contributions" id="sec29">
<title>Author contributions</title>
<p>YL: Conceptualization, Writing &#x2013; original draft, Project administration. XX: Conceptualization, Methodology, Writing &#x2013; review &#x0026; editing, Supervision. XW: Software, Visualization, Writing &#x2013; review &#x0026; editing, Validation. YP: Writing &#x2013; review &#x0026; editing, Data curation, Formal analysis. CW: Funding acquisition, Resources, Writing &#x2013; review &#x0026; editing.</p>
</sec>
<sec sec-type="COI-statement" id="sec30">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="sec31">
<title>Generative AI statement</title>
<p>The author(s) declared that Generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="sec32">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="sec33">
<title>Supplementary material</title>
<p>The Supplementary material for this article can be found online at: <ext-link xlink:href="https://www.frontiersin.org/articles/10.3389/fcomp.2025.1710121/full#supplementary-material" ext-link-type="uri">https://www.frontiersin.org/articles/10.3389/fcomp.2025.1710121/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Data_Sheet_1.docx" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Aggarwal</surname><given-names>C. C.</given-names></name> <name><surname>Hinneburg</surname><given-names>A.</given-names></name> <name><surname>Keim</surname><given-names>D. A.</given-names></name></person-group> (<year>2001</year>). <article-title>On the surprising behavior of distance metrics in high dimensional space</article-title>. In <conf-name>Database Theory &#x2014; ICDT 2001: 8th International Conference London, UK, January 4&#x2013;6, 2001 Proceedings 8</conf-name>, pp. <fpage>420</fpage>&#x2013;<lpage>434</lpage>.</mixed-citation></ref>
<ref id="ref2"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Aggarwal</surname><given-names>C. C.</given-names></name> <name><surname>Zhai</surname><given-names>C.</given-names></name></person-group> (eds.) (<year>2012</year>). &#x201C;<article-title>A survey of text clustering algorithms</article-title>&#x201D; in <source>Mining text data</source> (<publisher-loc>Boston, MA</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>77</fpage>&#x2013;<lpage>128</lpage>.</mixed-citation></ref>
<ref id="ref3"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Aljaloud</surname><given-names>A. S.</given-names></name> <name><surname>Al-Dhelaan</surname><given-names>A. M.</given-names></name> <name><surname>Al-Rodhaan</surname><given-names>M. A.</given-names></name></person-group> (<year>2024</year>). <article-title>Deep clustering: a comprehensive survey</article-title>. <source>IEEE Trans. Neural Networks Learn. Syst.</source> <volume>36</volume>, <fpage>5858</fpage>&#x2013;<lpage>5878</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TNNLS.2024.3403155</pub-id></mixed-citation></ref>
<ref id="ref4"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Barnett</surname><given-names>S.</given-names></name> <name><surname>Cohn</surname><given-names>T.</given-names></name> <name><surname>Baldwin</surname><given-names>T.</given-names></name></person-group> (<year>2024</year>). Seven failure points when engineering a retrieval augmented generation system. arXiv:2401.05856 [cs.CL]. doi: <pub-id pub-id-type="doi">10.1145/3644815.3644945</pub-id></mixed-citation></ref>
<ref id="ref5"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Brown</surname><given-names>T.</given-names></name> <name><surname>Mann</surname><given-names>B.</given-names></name> <name><surname>Ryder</surname><given-names>N.</given-names></name> <name><surname>Subbiah</surname><given-names>M.</given-names></name> <name><surname>Kaplan</surname><given-names>J. D.</given-names></name> <name><surname>Dhariwal</surname><given-names>P.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Language models are few-shot learners</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>33</volume>, <fpage>1877</fpage>&#x2013;<lpage>1901</lpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2005.14165</pub-id></mixed-citation></ref>
<ref id="ref6"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Cao</surname><given-names>S.</given-names></name> <name><surname>Wang</surname><given-names>L.</given-names></name></person-group> (<year>2022</year>). <article-title>HIBRIDS: attention with hierarchical biases for structure-aware long document summarization</article-title>. <conf-name>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics, Dublin, Ireland</conf-name>, pp. <fpage>786</fpage>&#x2013;<lpage>807</lpage>.</mixed-citation></ref>
<ref id="ref7"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Chen</surname><given-names>J.</given-names></name> <name><surname>Goldberg</surname><given-names>Y.</given-names></name> <name><surname>Zbib</surname><given-names>R.</given-names></name></person-group> (<year>2024a</year>). From chunks to propositions: meaning-based content representation for RAG. arXiv:2405.02503 [cs.CL].</mixed-citation></ref>
<ref id="ref8"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Chen</surname><given-names>J.</given-names></name> <name><surname>Xiao</surname><given-names>S.</given-names></name> <name><surname>Zhang</surname><given-names>P.</given-names></name> <name><surname>Luo</surname><given-names>K.</given-names></name> <name><surname>Lian</surname><given-names>D.</given-names></name> <name><surname>Liu</surname><given-names>Z.</given-names></name></person-group> (<year>2024b</year>). BGE M3-embedding: multi-lingual, multi-functionality, multi-granularity text embeddings through self-knowledge distillation. arXiv:2402.03216 [cs.CL]. doi: <pub-id pub-id-type="doi">10.18653/v1/2024.findings-acl.137</pub-id></mixed-citation></ref>
<ref id="ref9"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chowdhery</surname><given-names>A.</given-names></name> <name><surname>Narang</surname><given-names>S.</given-names></name> <name><surname>Devlin</surname><given-names>J.</given-names></name> <name><surname>Bosma</surname><given-names>M.</given-names></name> <name><surname>Mishra</surname><given-names>G.</given-names></name> <name><surname>Roberts</surname><given-names>A.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>PaLM: scaling language modeling with pathways</article-title>. <source>J. Mach. Learn. Res.</source> <volume>24</volume>, <fpage>1</fpage>&#x2013;<lpage>113</lpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2204.02311</pub-id></mixed-citation></ref>
<ref id="ref10"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Dasigi</surname><given-names>P.</given-names></name> <name><surname>Lo</surname><given-names>K.</given-names></name> <name><surname>Beltagy</surname><given-names>I.</given-names></name> <name><surname>Cohan</surname><given-names>A.</given-names></name> <name><surname>Smith</surname><given-names>N. A.</given-names></name> <name><surname>Gardner</surname><given-names>M.</given-names></name></person-group> (<year>2021</year>). <article-title>A dataset of natural language queries, answers, and citations over NLP papers</article-title>. <conf-name>Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Online</conf-name>, pp. <fpage>235</fpage>&#x2013;<lpage>245</lpage>.</mixed-citation></ref>
<ref id="ref11"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Gidi</surname><given-names>C.</given-names></name> <name><surname>Cohen</surname><given-names>S. B.</given-names></name></person-group> (<year>2022</year>). <article-title>Query-focused abstractive summarization: a survey</article-title>. <conf-name>Proceedings of the 29th International Conference on Computational Linguistics</conf-name>, pp. <fpage>3236</fpage>&#x2013;<lpage>3248</lpage>.</mixed-citation></ref>
<ref id="ref12"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Grootendorst</surname><given-names>M.</given-names></name></person-group> (<year>2022</year>). BERTopic: neural topic modeling with a class-based TF-IDF procedure. arXiv [Preprint]. <italic>arXiv:2203.05794</italic>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2203.05794</pub-id></mixed-citation></ref>
<ref id="ref13"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hearst</surname><given-names>M. A.</given-names></name></person-group> (<year>1997</year>). <article-title>Texttiling: a quantitative approach to discourse segmentation</article-title>. <source>Comput. Linguist.</source> <volume>23</volume>, <fpage>33</fpage>&#x2013;<lpage>73</lpage>.</mixed-citation></ref>
<ref id="ref14"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Jiang</surname><given-names>Z.</given-names></name> <name><surname>Xu</surname><given-names>F. F.</given-names></name> <name><surname>Araki</surname><given-names>J.</given-names></name> <name><surname>Neubig</surname><given-names>G.</given-names></name></person-group> (<year>2020</year>). <article-title>How can we know what language models know?</article-title> <source>Trans. Assoc. Comput. Linguist.</source> <volume>8</volume>, <fpage>423</fpage>&#x2013;<lpage>438</lpage>. doi: <pub-id pub-id-type="doi">10.1162/tacl_a_00324</pub-id></mixed-citation></ref>
<ref id="ref15"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Karpukhin</surname><given-names>V.</given-names></name> <name><surname>Oguz</surname><given-names>B.</given-names></name> <name><surname>Min</surname><given-names>S.</given-names></name> <name><surname>Lewis</surname><given-names>P.</given-names></name> <name><surname>Wu</surname><given-names>L.</given-names></name> <name><surname>Edunov</surname><given-names>S.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Dense passage retrieval for open-domain question answering</article-title>. <conf-name>Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing</conf-name>, pp. <fpage>6769</fpage>&#x2013;<lpage>6781</lpage>.</mixed-citation></ref>
<ref id="ref16"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lewis</surname><given-names>P.</given-names></name> <name><surname>Perez</surname><given-names>E.</given-names></name> <name><surname>Piktus</surname><given-names>A.</given-names></name> <name><surname>Petroni</surname><given-names>F.</given-names></name> <name><surname>Karpukhin</surname><given-names>V.</given-names></name> <name><surname>Goyal</surname><given-names>N.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Retrieval-augmented generation for knowledge-intensive NLP tasks</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>33</volume>, <fpage>9459</fpage>&#x2013;<lpage>9474</lpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2005.11401</pub-id></mixed-citation></ref>
<ref id="ref17"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname><given-names>N. F.</given-names></name> <name><surname>Lin</surname><given-names>K.</given-names></name> <name><surname>Hewitt</surname><given-names>J.</given-names></name> <name><surname>Paranjape</surname><given-names>A.</given-names></name> <name><surname>Bevilacqua</surname><given-names>M.</given-names></name> <name><surname>Petroni</surname><given-names>F.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Lost in the middle: how language models use long contexts</article-title>. <source>Trans. Assoc. Comput. Linguist.</source> <volume>12</volume>, <fpage>157</fpage>&#x2013;<lpage>173</lpage>. doi: <pub-id pub-id-type="doi">10.1162/tacl_a_00638</pub-id>, <pub-id pub-id-type="pmid">40316710</pub-id></mixed-citation></ref>
<ref id="ref18"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>McInnes</surname><given-names>L.</given-names></name> <name><surname>Healy</surname><given-names>J.</given-names></name> <name><surname>Melville</surname><given-names>J.</given-names></name></person-group> (<year>2018</year>). <article-title>UMAP: uniform manifold approximation and projection for dimension reduction</article-title>. <source>J. Open Source Softw.</source> <volume>3</volume>:<fpage>861</fpage>. doi: <pub-id pub-id-type="doi">10.21105/joss.00861</pub-id></mixed-citation></ref>
<ref id="ref19"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Nair</surname><given-names>I.</given-names></name> <name><surname>Garimella</surname><given-names>A.</given-names></name> <name><surname>Srinivasan</surname><given-names>B. V.</given-names></name> <name><surname>Modani</surname><given-names>N.</given-names></name> <name><surname>Chhaya</surname><given-names>N.</given-names></name> <name><surname>Karanam</surname><given-names>S.</given-names></name></person-group> (<year>2023</year>). <article-title>A neural CRF-based hierarchical approach for linear text segmentation</article-title>. <conf-name>Findings of the Association for Computational Linguistics: EACL 2023, Dubrovnik, Croatia</conf-name>, pp. <fpage>883</fpage>&#x2013;<lpage>893</lpage>.</mixed-citation></ref>
<ref id="ref20"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Pang</surname><given-names>R. Y.</given-names></name> <name><surname>Parrish</surname><given-names>A.</given-names></name> <name><surname>Joshi</surname><given-names>N.</given-names></name> <name><surname>Nangia</surname><given-names>N.</given-names></name> <name><surname>Phang</surname><given-names>J.</given-names></name> <name><surname>Chen</surname><given-names>A.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>QuALITY: question answering with long input texts, yes!</article-title> <conf-name>Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Seattle, WA</conf-name>, pp. <fpage>5336</fpage>&#x2013;<lpage>5358</lpage>.</mixed-citation></ref>
<ref id="ref21"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Reimers</surname><given-names>N.</given-names></name> <name><surname>Gurevych</surname><given-names>I.</given-names></name></person-group> (<year>2019</year>). <article-title>Sentence-BERT: sentence embeddings using Siamese BERT networks</article-title>. <conf-name>Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, Hong Kong, China</conf-name>, pp. <fpage>3982</fpage>&#x2013;<lpage>3992</lpage>.</mixed-citation></ref>
<ref id="ref22"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Sarthi</surname><given-names>P.</given-names></name> <name><surname>Abdullah</surname><given-names>S.</given-names></name> <name><surname>Tuli</surname><given-names>A.</given-names></name> <name><surname>Khanna</surname><given-names>S.</given-names></name> <name><surname>Goldie</surname><given-names>A.</given-names></name> <name><surname>Manning</surname><given-names>C. D.</given-names></name></person-group> (<year>2024</year>). <article-title>RAPTOR: recursive abstractive processing for tree-organized retrieval</article-title>. <conf-name>Proceedings of the Twelfth International Conference on Learning Representations, Vienna, Austria</conf-name>.</mixed-citation></ref>
<ref id="ref23"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Traag</surname><given-names>V. A.</given-names></name> <name><surname>Waltman</surname><given-names>L.</given-names></name> <name><surname>van Eck</surname><given-names>N. J.</given-names></name></person-group> (<year>2019</year>). <article-title>From Louvain to Leiden: guaranteeing well-connected communities</article-title>. <source>Sci. Rep.</source> <volume>9</volume>:<fpage>5233</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41598-019-41695-z</pub-id>, <pub-id pub-id-type="pmid">30914743</pub-id></mixed-citation></ref>
<ref id="ref24"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Zhang</surname><given-names>S.</given-names></name> <name><surname>Wan</surname><given-names>D.</given-names></name> <name><surname>Bansal</surname><given-names>M.</given-names></name></person-group> (<year>2023</year>). <article-title>Extractive is not faithful: an investigation of broad unfaithfulness problems in extractive summarization</article-title>. <conf-name>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics, Toronto, Canada</conf-name>, pp. <fpage>2153</fpage>&#x2013;<lpage>2174</lpage>.</mixed-citation></ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2664194/overview">Marlon Santiago Vi&#x00F1;&#x00E1;n-Lude&#x00F1;a</ext-link>, Catholic University of the North, Chile</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3264569/overview">Esmaeil Narimissa</ext-link>, University of Liverpool, United Kingdom</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3266774/overview">Zihan Wang</ext-link>, Helmholtz Association of German Research Centres (HZ), Germany</p>
</fn>
</fn-group>
</back>
</article>