<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Artif. Intell.</journal-id>
<journal-title>Frontiers in Artificial Intelligence</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Artif. Intell.</abbrev-journal-title>
<issn pub-type="epub">2624-8212</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/frai.2021.725321</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Artificial Intelligence</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Modelling Speaker Attribution in Narrative Texts With Biased and Bias-Adjustable Neural Networks</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>D&#x000F6;nicke</surname> <given-names>Tillmann</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1248113/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Varachkina</surname> <given-names>Hanna</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="author-notes" rid="fn002"><sup>&#x02020;</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Weimer</surname> <given-names>Anna Mareike</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="author-notes" rid="fn002"><sup>&#x02020;</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>G&#x000F6;deke</surname> <given-names>Luisa</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="author-notes" rid="fn002"><sup>&#x02020;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1245297/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Barth</surname> <given-names>Florian</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="fn002"><sup>&#x02020;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1594141/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Gittel</surname> <given-names>Benjamin</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="author-notes" rid="fn003"><sup>&#x02021;</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Holler</surname> <given-names>Anke</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="author-notes" rid="fn003"><sup>&#x02021;</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Sporleder</surname> <given-names>Caroline</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="fn003"><sup>&#x02021;</sup></xref>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>G&#x000F6;ttingen Centre for Digital Humanities, University of G&#x000F6;ttingen</institution>, <addr-line>G&#x000F6;ttingen</addr-line>, <country>Germany</country></aff>
<aff id="aff2"><sup>2</sup><institution>Department of German Philology, University of G&#x000F6;ttingen</institution>, <addr-line>G&#x000F6;ttingen</addr-line>, <country>Germany</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Tommaso Caselli, University of Groningen, Netherlands</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: F. Amilcar Cardoso, University of Coimbra, Portugal; Roisin Loughran, Dundalk Institute of Technology, Ireland</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Tillmann D&#x000F6;nicke <email>tillmann.doenicke&#x00040;uni-goettingen.de</email></corresp>
<fn fn-type="other" id="fn001"><p>This article was submitted to Language and Computation, a section of the journal Frontiers in Artificial Intelligence</p></fn>
<fn fn-type="equal" id="fn002"><p>&#x02020;These authors have contributed equally to this work</p></fn>
<fn fn-type="equal" id="fn003"><p>&#x02021;These authors share senior authorship</p></fn></author-notes>
<pub-date pub-type="epub">
<day>03</day>
<month>02</month>
<year>2022</year>
</pub-date>
<pub-date pub-type="collection">
<year>2021</year>
</pub-date>
<volume>4</volume>
<elocation-id>725321</elocation-id>
<history>
<date date-type="received">
<day>15</day>
<month>06</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>21</day>
<month>12</month>
<year>2021</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2022 D&#x000F6;nicke, Varachkina, Weimer, G&#x000F6;deke, Barth, Gittel, Holler and Sporleder.</copyright-statement>
<copyright-year>2022</copyright-year>
<copyright-holder>D&#x000F6;nicke, Varachkina, Weimer, G&#x000F6;deke, Barth, Gittel, Holler and Sporleder</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<p>Literary narratives regularly contain passages that different readers attribute to different speakers: a character, the narrator, or the author. Since literary narratives are highly ambiguous constructs, it is often impossible to decide between diverging attributions of a specific passage by hermeneutic means. Instead, we hypothesise that attribution decisions are often influenced by annotator bias, in particular an annotator&#x00027;s literary preferences and beliefs. We present first results on the correlation between the literary attitudes of an annotator and their attribution choices. In a second set of experiments, we present a neural classifier that is capable of imitating individual annotators as well as a common-sense annotator, and reaches accuracies of up to 88% (which improves the majority baseline by 23%).</p></abstract>
<kwd-group>
<kwd>narrative understanding</kwd>
<kwd>annotation</kwd>
<kwd>bias</kwd>
<kwd>questionnaire</kwd>
<kwd>subjectivity</kwd>
<kwd>text classification</kwd>
</kwd-group>
<counts>
<fig-count count="2"/>
<table-count count="12"/>
<equation-count count="0"/>
<ref-count count="42"/>
<page-count count="12"/>
<word-count count="9522"/>
</counts>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1. Introduction</title>
<p>Humans have different habits when it comes to reading and interpreting literature. Therefore, human annotators bring different assumptions and beliefs to the annotation task and introduce an <italic>annotator bias</italic> (e.g., Geva et al., <xref ref-type="bibr" rid="B13">2019</xref>) in the data creation. Annotator bias has been studied in regard to several phenomena of natural language understanding (see Geva et al., <xref ref-type="bibr" rid="B13">2019</xref>; Akhtar et al., <xref ref-type="bibr" rid="B2">2020</xref>; Kuwatly et al., <xref ref-type="bibr" rid="B23">2020</xref>), but not yet in the literary domain. This is surprising, because a basic problem in literary studies is the inability to hermeneutically examine the influence of the recipients&#x00027; world knowledge on the reception process. Since literary narratives are highly ambiguous constructs, it is often impossible to choose between diverging interpretations of a specific passage using the hermeneutic approach, which is also reflected in the annotation of literary phenomena. Following Sober&#x000F3;n et al. (<xref ref-type="bibr" rid="B37">2013</xref>), we view annotation disagreements as a valuable source of information that allows us to explore readers&#x00027; perceptions of texts and the factors influencing these. Concretely, we will show that disagreements regarding the annotation of literary phenomena are not randomly distributed but fall into discernible patterns that can be attributed to differing literature-specific preferences and implicit beliefs of readers. To this end, we compute correlations between questionnaire-based data on readers&#x00027; preferences and beliefs, and the annotations produced by them. In a second step, we develop a bias-adjusted (and bias-adjustable) classifier that takes into account literature-specific attitudes of annotators, and show that this outperforms an annotator-agnostic classifier.</p>
<p>Section Theoretical Background introduces the concrete literary phenomenon we are dealing with: the attribution of so-called reflective passages to a character, the narrator, or the author of a narrative text of fiction. As the concept of a &#x0201C;reflective passage&#x0201D; is not yet well formalised in literary theory, we annotated three phenomena which we consider to be strong indicators for reflective passages: comment (Bonheim, <xref ref-type="bibr" rid="B6">1975</xref>), non-fictional speech (Konrad, <xref ref-type="bibr" rid="B20">2017</xref>) and generalisation (Leslie and Lerner, <xref ref-type="bibr" rid="B24">2016</xref>). Annotators had to identify these and attribute them to one or more of the three attribution classes (character/narrator/author). In the third section, we introduce our corpus (German narrative fiction from 1650 to 1950) as well as our questionnaire on literary attitudes (which expands the Literary Response Questionnaire from Miall and Kuiken, <xref ref-type="bibr" rid="B26">1995</xref>), we describe procedures of data preparation, feature selection and specify our neural classifier including the different conditions it has been trained under. In the fourth section, we report our results concerning a) correlations between literary attitudes of the annotators and the annotations they produce and b) the classification of attributions by our models. In the last section, we give a short overview of existing work that relates to our study, discuss our results and point out further routes for research.</p>
</sec>
<sec id="s2">
<title>2. Theoretical Background</title>
<sec>
<title>2.1. Reflective Passages</title>
<p>Narrative texts consist of sentences and passages that serve different functions in discourse. Some sentences convey plot elements, describe scenes, or consist of direct character speech. There are also passages that are characterised in particular by the fact that no action is reproduced in them, but rather the impression of a narrative pause (i.e., a pause in the plot) is created. These passages, also called &#x0201C;theoretical sentences&#x0201D; (Martinez and Scheffel, <xref ref-type="bibr" rid="B25">2016</xref>, p. 105) or &#x0201C;essayistic passages&#x0201D; (Gittel, <xref ref-type="bibr" rid="B14">2015</xref>), are often regarded as comments on the fictional world. Often, but not always, they include generalising statements. One example is found in <italic>Geschichte des Agathon</italic> (Wieland, <xref ref-type="bibr" rid="B41">2012</xref>):</p>
<list list-type="simple">
<list-item><p>(1) <italic>Der Gebrauch der Sprache h&#x000F6;rt auf, wenn sich die Seelen einander unmittelbar mitteilen, sich unmittelbar anschauen und ber&#x000FC;hren, und in einem Augenblick mehr empfinden, als die Zunge der Musen selbst in ganzen Jahren auszusprechen verm&#x000F6;chte</italic>.</p>
<p>&#x02018;The use of language ceases when souls communicate directly with each other, look at each other and touch each other directly, and feel more in an instant than the tongue of the muses itself could express in whole years.&#x00027;</p></list-item>
</list>
<p>This sentence, that clearly does not advance the plot, can be understood in different ways. On the one hand, it can be understood as a description of the characters&#x00027; feelings in the fictional situation, and on the other hand, as an assertion about the connection of souls in general, even outside the fictional world. Both readings share that they are based on processes of reflection. We call those passages that represent a narrative pause and at the same time contain a generalisation, comment on the events of the story and/or suggest theses about the real world, &#x0201C;reflective passages&#x0201D; (see Gittel, <xref ref-type="bibr" rid="B15">2022</xref>). We thus associate the phenomena generalisation, comment, and non-fictional speech with reflective passages.</p>
</sec>
<sec>
<title>2.2. Uncertain Attribution</title>
<p>It applies to both spoken and written language that an utterance can only be understood properly if it is clear who takes responsibility for the conveyed information. This may seem trivial for every-day language, since the interlocutors most often share direct contact in exchange of information. Nevertheless, even in every-day language, speakers use techniques to convey that someone else needs to be understood as the original speaker of an upcoming information, e.g., by using inquits like <italic>she said</italic>, or informal expressions such as <italic>you won&#x00027;t believe what my daughter said to me</italic>. However, in narrative fictional texts the communication framework is much more complex, since the communication is multi-layered. This is due to the fact that, by creating a fictional narrative, the author invites the reader to imagine a fictive narrator that tells the story. According to this (recently contested) narratological standard-view (e.g., Currie, <xref ref-type="bibr" rid="B7">2010</xref>; K&#x000F6;ppe and St&#x000FC;hring, <xref ref-type="bibr" rid="B21">2011</xref>) there are two levels of communication: 1) a text-external level where an author communicates with the reader and 2) a text-internal, &#x0201C;fictional&#x0201D; level where a fictive narrator introduces certain characters which may communicate with the narrator or with other characters. The narrator is the speaker who reports the plot of a story, describes the scenery and provides background information on the characters. It can, but does not have to, appear explicitly in the text, e.g., by expressing itself in a self-referential manner or by making comments about the reported events. It can also appear in a personalised manner (e.g., as homodiegetic narrator) or as a named character in the story. In a story, the narrator and characters can be speakers, but the boundaries might be fluid:</p>
<list list-type="simple">
<list-item><p>(2) &#x0226B;<italic><underline>Es gibt keinen Sandmann</underline></italic><italic>, mein liebes Kind</italic>&#x0226A;<italic>; erwiderte die Mutter</italic>, &#x0226B;<italic>wenn ich sage, der Sandmann kommt, so will das nur hei&#x000DF;en, ihr seid schl&#x000E4;frig und k&#x000F6;nnt die Augen nicht offen behalten, als h&#x000E4;tte man euch Sand hineingestreut</italic>.&#x0226A; (Hoffmann, <xref ref-type="bibr" rid="B19">2012</xref>)</p>
<p>&#x02018;&#x0201C;<underline>There is no sandman</underline>, my dear child,&#x0201D; replied the mother, &#x0201C;when I say that the sandman is coming, it only means that you are sleepy and cannot keep your eyes open, as if sand had been put into them.&#x0201D;&#x02019;</p></list-item>
</list>
<p>For the introductory verbum dicendi (<italic>erwiderte</italic> &#x02018;replied&#x00027;), the narrator is the speaker, reporting what the character (the mother) was doing&#x02014;in this case, that the mother said something. The direct speech itself is character-attributed, since the mother is the speaker and the narrator does not intervene with her speech here&#x02014;which is indicated by the quotation marks. Whenever a character is speaking directly, we define the character as the attributable speaker. However, it is important to note that certain literary storytelling techniques qualify automatically as character-attributed, e.g., inner monologues or streams of consciousness. In the next example, on the other hand, the narrator is the speaker by reproducing the character&#x00027;s speech indirectly:</p>
<list list-type="simple">
<list-item><p>(3) <italic>Stechlins Eintritt ins Regiment fiel so ziemlich mit dem Regierungsantritt Friedrich Wilhelms IV. zusammen, und wenn er dessen erw&#x000E4;hnte, so hob er, sich selbst persiflierend, gerne hervor</italic>, &#x0226B;<italic><underline>da&#x000DF; alles Gro&#x000DF;e seine Begleiterscheinungen habe</underline></italic>&#x0226A;. (Fontane, <xref ref-type="bibr" rid="B12">2012</xref>)</p>
<p>&#x02018;Stechlin&#x00027;s entry into the regiment pretty much coincided with Friedrich Wilhelm IV&#x00027;s accession to power, and when he mentioned it, he liked to point out, satirising himself, &#x0201C;<underline>that everything great has its side effects</underline>.&#x0201D;&#x02019;</p></list-item>
</list>
<p>Therein, the speakers overlap: the character who said something and the narrator reformulating it. Whenever narrator and character overlap, we understand this as uncertain attribution. We find such combined attributions in story-telling techniques such as indirect speech, stream of consciousness, and free indirect discourse. Thereby, the reader is not confronted with pure character speech but with a version of it, which is impacted by the narrator&#x00027;s point of view.</p>
<p>Up to this point we were only confronted with the text-internal communication. Now, we are turning to the text-external communication. Let us take a look at example (1) of a reflective passage again which does not contain characters&#x00027; speech. The content of the statement can be understood as a description of the fictional world. So in this understanding the narrator is the speaker who is conveying the information. Considering the story was invented by an author, the utterance might also be interpreted as a statement about the real world (of which the author is part). In this case, not only characters of the fictional world would be described, but persons in general. If a reader understands the passage here as non-fictional assertive speech, we must assume that the information can no longer be attributed to the narrator alone, since the narrator is a construct and knows nothing about the real world. One might assume that only the author knows about the real world and therefore needs to be at least one of the associated speakers here.</p>
<p>So let us summarise: If more than one speaker is attributed for one passage, this indicates one of the following cases or a combination of both: 1) the content of the passage is either rendered and cannot be unambiguously attributed to the narrator or a character alone and 2) the passage leads the reader to believe that it conveys the author&#x00027;s assertions or hypotheses. We assume that the multi-layered communication of narrative texts, especially in reflective passages, is characterised by differing speakers and thus by uncertain attribution. We speak of uncertain attribution whenever it is unresolvable whether the speaker (i.e., character, narrator and in certain cases the author) is identifiable as the entity to whom the information can be unambiguously attributed. Presumptively, the attribution of text passages to authors is especially impacted by literary theoretical beliefs and world knowledge of the recipients, particularly knowledge about the author. This problem cannot be solved hermeneutically (Schmid, <xref ref-type="bibr" rid="B34">2011</xref>, p. 131 f.), but will be investigated empirically through our annotation.</p>
</sec>
</sec>
<sec sec-type="materials" id="s3">
<title>3. Materials</title>
<sec>
<title>3.1. Corpus and Annotation Guidelines</title>
<p>We currently construct a diachronic corpus of German fictional literature from 1650 to 1950. As of now, we annotated 10 texts (2,701 sentences / 61,979 tokens). CATMA<xref ref-type="fn" rid="fn0001"><sup>1</sup></xref> appeared to be most suitable for annotating fictional texts and became our tool of choice. In order to create a versatile dataset and save resources, we annotate only the beginning of every text (usually about the first 200 sentences).</p>
<p>In this study, we are interested in annotation disagreements that are grounded in the annotators&#x00027; literary preferences and implicit beliefs. We hypothesise that this applies to attribution, as discussed in the previous section, but not necessarily to the annotation of comment, generalisation, or non-fictional speech. For these phenomena, annotation disagreements are more likely to arise from <italic>textual</italic> (e.g., syntactic or semantic) ambiguity or vagueness, which is typical for natural language. Attribution is context-sensitive but not restricted to certain types of text passages, and is thus best viewed as a second annotation dimension that can be combined with various phenomena<xref ref-type="fn" rid="fn0002"><sup>2</sup></xref>.</p>
<p>Our annotation procedure consists of three steps: 1. annotation of reflective passages, 2. creation of a gold standard, and 3. annotation of speaker attribution.</p>
<sec>
<title>3.1.1. Annotation of Reflective Passages</title>
<p>First the annotators identify the three phenomena generalisation, comment, and non-fictional speech, which we consider to be potential indicators of reflective passages (see section Reflective Passages). We developed detailed annotation guidelines for these three phenomena [see D&#x000F6;nicke et al. (<xref ref-type="bibr" rid="B10">2021</xref>) for generalisation, and our annotation guidelines in Barth et al. (<xref ref-type="bibr" rid="B5">2021</xref>) for the other phenomena], which require reflective passages to be annotated at the clause or supra-clause level. If it is appropriate based on the phenomenon, a passage can comprise a single clause, several clauses or sentences, or even whole paragraphs. Furthermore, passages might be nested or overlap as we will see below.</p>
<p>Our annotators are students with a background in German philology and literature. All of them have several months of experience in annotating for our project and can thus be seen as expert annotators. We have six annotators overall, which means two annotators per phenomenon for a text. <xref ref-type="table" rid="T1">Table 1</xref> shows Fleiss&#x00027; inter-annotator agreement coefficient &#x003BA; (Fleiss et al., <xref ref-type="bibr" rid="B11">2003</xref>) for all three phenomena and averaged over texts. All phenomena are, as typical for natural-language annotation tasks, affected by textual ambiguity and therefore to some degree subject to the reader&#x00027;s interpretation. For comment, this is even more the case than for the other two phenomena, which explains the only moderate agreement in the comment annotation whereas generalisation and non-fictional speech are annotated with substantial agreement. The comparatively high standard deviations presumably result from the varying complexity of the literary texts and the language variant in which a text is written; we also change the constellation of annotators for every text, which leads to variation in inter-annotator agreement.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Clause-level Fleiss&#x00027; &#x003BA; for the phenomena marking reflective passages.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th valign="top" align="left"><bold>Phenomenon</bold></th>
<th valign="top" align="center"><bold><bold>&#x003BC;(&#x003BA;)</bold></bold></th>
<th valign="top" align="center"><bold><bold>&#x003C3;(&#x003BA;)</bold></bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Generalisation</td>
<td valign="top" align="center">0.69</td>
<td valign="top" align="center">0.20</td>
</tr>
<tr>
<td valign="top" align="left">Comment</td>
<td valign="top" align="center">0.47</td>
<td valign="top" align="center">0.16</td>
</tr>
<tr>
<td valign="top" align="left">Non-fictional speech</td>
<td valign="top" align="center">0.70</td>
<td valign="top" align="center">0.16</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><italic>&#x003BC; is the average agreement over all texts; &#x003C3; is the corresponding standard deviation</italic>.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec>
<title>3.1.2. Creation of a Gold Standard</title>
<p>Having verified that comment, generalisation, and non-fictional speech can indeed be identified with moderate to substantial agreement, we next create a gold standard for this annotation that will then serve as the input for the attribution annotation. The gold standard is created through an adjudication step in which two researchers inspect the annotated texts, focusing on the passages that were labelled by at least one annotator. The adjudicators discuss the annotations, aiming to identify one prevalent interpretation (or, in exceptional cases, mark passages as textually ambiguous if two interpretations are equally plausible), and create the gold standard annotation.</p>
</sec>
<sec>
<title>3.1.3. Annotation of Speaker Attribution</title>
<p>Finally, the annotators attribute the identified segments of the gold standard passages, using our attribution categories (i.e., character, narrator, and/or author). Attribution annotation is also performed on clause-level. As we are interested in investigating annotator bias for this task, we provide annotation guidelines which give annotators a lot of freedom to use their own judgement when attributing a passage. Specifically, the guidelines (Barth et al., <xref ref-type="bibr" rid="B5">2021</xref>) only specify how labels should be assigned given an annotator&#x00027;s interpretation but they do not provide guidance on how to arrive at an interpretation. We also do not create a gold standard for attribution as we are interested in variation not homogeneity.</p>
<p>Example (4) shows an annotation example from <italic>Ein Kampf um Rom</italic> (Dahn, <xref ref-type="bibr" rid="B8">2012</xref>) with three overlapping reflective passages (in the gold standard): <graphic xlink:href="frai-04-725321-i0001.tif"/></p>
<p>There is a generalisation passage covering one clause, a comment passage covering three clauses, and a passage of non-fictional speech covering two clauses<xref ref-type="fn" rid="fn0003"><sup>3</sup></xref>. One of our annotators assigned the attribution label &#x0201C;character&#x0201D; to all of the three involved clauses (punctuation at span boundaries must not be annotated, according to our guidelines). The first clause <italic>endlich rief Licinius</italic> &#x02018;finally Licinius shouted&#x02019; is not annotated with attribution because it is not part of a reflective passage.</p>
<p>As of now, the entire corpus contains 1,712 reflective passages, which consist of 3,565 unique clauses<xref ref-type="fn" rid="fn0004"><sup>4</sup></xref>.</p>
</sec>
</sec>
<sec>
<title>3.2. Data Preparation</title>
<p>Since attribution labels are assigned on clause-level, we prepare our corpus data for a classification task which is to predict the labels for a specific clause in a certain context. As context of a clause, we use the sentence that contains the clause as well as its preceding and succeeding sentence. Since one and the same context possibly contains several annotated clauses, for which the annotators give individual attribution labels, the initial sample format is context &#x021A6; {clause &#x021A6; (annotator &#x021A6; labels)}. To reduce the input size for the neural classifier (see section Method), we only keep contexts with a maximum of 100 tokens, totalling to 1,058 contexts (79% of all 1,340 unique contexts). We randomly split the contexts into a training set, a development set and a test set which have 80%, 10% and 10% of the data.</p>
<p>In a second step, we construct samples of the form (context, clause) &#x021A6; {annotator &#x021A6; labels}. Transforming the data into this format increases the sample size since each context contributes as many samples as it has annotated clauses (see <xref ref-type="table" rid="T2">Table 2</xref>). For example, the first two annotated clauses in (4) appear within the same sentence and thus have the same context, which happens to be the entire text of the example (one additional preceding and succeeding sentence). Therefore, this context contributes two samples. The third annotated clause in (4) receives another context with the context window moved one sentence to the right. The training, development and test sets then contain 1,897 (78%), 267 (11%), and 254 (11%) clauses, respectively.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Number of samples in training, development and test set.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th valign="top" align="left"><bold>Sample format</bold></th>
<th valign="top" align="right"><bold>Train set</bold></th>
<th valign="top" align="right"><bold>Dev set</bold></th>
<th valign="top" align="right"><bold>Test set</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">context &#x021A6; {clause &#x021A6; (annotator &#x021A6; labels)}</td>
<td valign="top" align="right">846</td>
<td valign="top" align="right">106</td>
<td valign="top" align="right">106</td>
</tr>
<tr>
<td valign="top" align="left">(context, clause) &#x021A6; {annotator &#x021A6; labels}</td>
<td valign="top" align="right">1,897</td>
<td valign="top" align="right">267</td>
<td valign="top" align="right">254</td>
</tr>
<tr>
<td valign="top" align="left">(context, clause, annotator) &#x021A6; labels</td>
<td valign="top" align="right">11,382</td>
<td valign="top" align="right">1,602</td>
<td valign="top" align="right">1,524</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Finally, we construct samples of the form (context, clause, annotator) &#x021A6; labels, which increases the sample size by a factor of 6 (i.e., the number of annotators). The distribution of labels in the data is shown in <xref ref-type="table" rid="T3">Table 3</xref>.</p>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Label distributions in training, development and test set.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th/>
<th valign="top" align="center"><bold>Character</bold></th>
<th valign="top" align="center"><bold>Narrator</bold></th>
<th valign="top" align="center"><bold>Author</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Train set</td>
<td valign="top" align="center">0.46</td>
<td valign="top" align="center">0.61</td>
<td valign="top" align="center">0.20</td>
</tr>
<tr>
<td valign="top" align="left">Dev set</td>
<td valign="top" align="center">0.44</td>
<td valign="top" align="center">0.63</td>
<td valign="top" align="center">0.24</td>
</tr>
<tr>
<td valign="top" align="left">Test set</td>
<td valign="top" align="center">0.47</td>
<td valign="top" align="center">0.60</td>
<td valign="top" align="center">0.18</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec>
<title>3.3. Questionnaire</title>
<p>To determine correlations between literary beliefs and annotators&#x00027; annotation behaviour, we constructed a questionnaire which focuses on revealing biases regarding literary beliefs. The questionnaire is composed of two parts: part A consists of the Literary Response Questionnaire (LRQ) from Miall and Kuiken (<xref ref-type="bibr" rid="B26">1995</xref>), a psychological questionnaire that captures seven different aspects of readers&#x00027; orientation towards literary text. Questions in it refer to insight, empathy, imagery vividness, leisure escape, concern with the author, story-driven reading, and rejection of literary values. The LRQ is supplemented by part B, which we developed specifically for this work. We aim to capture biases towards literary texts, especially with respect to all attribution categories (character, narrator, author). We assume that literary beliefs have a great influence on annotation behaviour because, for example, annotating the author as a speaker is more or less likely depending on literary background. Therefore, part B of the questionnaire asks about (implicit) theoretical literary beliefs related to the narrator, the author as a decisive criterion for reading a text, the text as mean of communication with the author or authorial intention, narrator theories, implicit authorial instances, the relationship between the author and characters, and the relationship between the author and the narrator.</p>
<p>Since the focus of our research lies on German literature and our annotators are all German native speakers, we prepared part B of the questionnaire in German and also translated part A into German (the complete questionnaire is contained in the <xref ref-type="supplementary-material" rid="SM1">Supplementary Materials</xref>). The entire questionnaire consists of 14 thematically coherent groups of questions (A1,..., A7; B1,..., B7) and a group of filler questions (B8), with a total number of 94 questions. When the questionnaire was given to the annotators, questions appeared in randomised order; at first the questions from part A, followed by the questions from part B, but without letting the participants know when part B started. All answers are given on a Likert scale from 1 (&#x0201C;I totally disagree&#x0201D;) to 5 (&#x0201C;I fully agree&#x0201D;).</p>
</sec>
</sec>
<sec id="s4">
<title>4. Methods and Results</title>
<sec>
<title>4.1. Bias Analysis</title>
<sec>
<title>4.1.1. Method</title>
<p>Our first experiment aims at finding correlations between implicit literary convictions of the annotators and annotations that they produce. Therefore, we randomly split the 1,897 training clauses into 18 batches of size 100 (omitting 97 samples) and compute the distribution of the labels for each batch&#x02013;annotator pair (see <xref ref-type="table" rid="T4">Table 4</xref>). We then extend each row with the questionnaire answers of the corresponding annotator as well as the mean values for each question group. We use this data to calculate correlations for each label and each question or question group, e.g., by calculating the correlation between the columns for the label &#x0201C;character&#x0201D; and the question A1F1. For the computation, we use Pearson&#x00027;s correlation coefficient.</p>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>Excerpt from the data for the correlation experiment: Label distributions of a specific annotator in a specific batch, and questionnaire answers of the corresponding annotator.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th/>
<th/>
<th valign="top" align="center" colspan="3" style="border-bottom: thin solid #000000;"><bold>Labels</bold></th>
<th valign="top" align="center" colspan="3" style="border-bottom: thin solid #000000;"><bold>Questions</bold></th>
<th valign="top" align="center" colspan="3" style="border-bottom: thin solid #000000;"><bold>Group means</bold></th>
</tr>
<tr>
<th valign="top" align="left"><bold>Batch</bold></th>
<th valign="top" align="left"><bold>Annotator</bold></th>
<th valign="top" align="left"><bold>Character</bold></th>
<th valign="top" align="left"><bold>Narrator</bold></th>
<th valign="top" align="left"><bold>Author</bold></th>
<th valign="top" align="left"><bold>A1F1</bold></th>
<th valign="top" align="left"><bold>&#x02026;</bold></th>
<th valign="top" align="left"><bold>B8F5</bold></th>
<th valign="top" align="left"><bold>A1</bold></th>
<th valign="top" align="left"><bold>&#x02026;</bold></th>
<th valign="top" align="left"><bold>B7</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">&#x00023;1</td>
<td valign="top" align="left">An1</td>
<td valign="top" align="left">0.45</td>
<td valign="top" align="left">0.60</td>
<td valign="top" align="left">0.24</td>
<td valign="top" align="left">4</td>
<td valign="top" align="left">&#x02026;</td>
<td valign="top" align="left">3</td>
<td valign="top" align="left">3.8</td>
<td valign="top" align="left">&#x02026;</td>
<td valign="top" align="left">4.0</td>
</tr>
<tr>
<td valign="top" align="left">&#x00023;1</td>
<td valign="top" align="left">An2</td>
<td valign="top" align="left">0.56</td>
<td valign="top" align="left">0.50</td>
<td valign="top" align="left">0.20</td>
<td valign="top" align="left">2</td>
<td valign="top" align="left">&#x02026;</td>
<td valign="top" align="left">2</td>
<td valign="top" align="left">3.0</td>
<td valign="top" align="left">&#x02026;</td>
<td valign="top" align="left">5.0</td>
</tr>
<tr>
<td valign="top" align="left">&#x022EE;</td>
<td valign="top" align="left">&#x022EE;</td>
<td valign="top" align="left">&#x022EE;</td>
<td valign="top" align="left">&#x022EE;</td>
<td valign="top" align="left">&#x022EE;</td>
<td valign="top" align="left">&#x022EE;</td>
<td valign="top" align="left">&#x022F1;</td>
<td valign="top" align="left">&#x022EE;</td>
<td valign="top" align="left">&#x022EE;</td>
<td valign="top" align="left">&#x022F1;</td>
<td valign="top" align="left">&#x022EE;</td>
</tr>
<tr>
<td valign="top" align="left">&#x00023;18</td>
<td valign="top" align="left">An5</td>
<td valign="top" align="left">0.45</td>
<td valign="top" align="left">0.56</td>
<td valign="top" align="left">0.02</td>
<td valign="top" align="left">4</td>
<td valign="top" align="left">&#x02026;</td>
<td valign="top" align="left">2</td>
<td valign="top" align="left">3.2</td>
<td valign="top" align="left">&#x02026;</td>
<td valign="top" align="left">3.7</td>
</tr>
<tr>
<td valign="top" align="left">&#x00023;18</td>
<td valign="top" align="left">An6</td>
<td valign="top" align="left">0.55</td>
<td valign="top" align="left">0.59</td>
<td valign="top" align="left">0.04</td>
<td valign="top" align="left">4</td>
<td valign="top" align="left">&#x02026;</td>
<td valign="top" align="left">2</td>
<td valign="top" align="left">3.0</td>
<td valign="top" align="left">&#x02026;</td>
<td valign="top" align="left">4.0</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec>
<title>4.1.2. Results</title>
<p><xref ref-type="table" rid="T5">Table 5</xref> shows Pearson correlation coefficients between labels and features (i.e., questions or question groups), calculated over all 18 &#x000D7; 6 batch&#x02013;annotator combinations. Since we are interested in features that show significant differences between annotators, we exclude features whose answers show a low variance among annotators. For example, question A1F4 received the answers (4, 2, 4, 4, 4, 4) by our annotators, which corresponds to a variance of 0.67. We only keep features with a variance greater than 0.8, where 0.8 is the variance of the combinations {(1, 1, 2, 2, 3, 3), (2, 2, 3, 3, 4, 4), (3, 3, 4, 4, 5, 5)} (and their permutations).</p>
<table-wrap position="float" id="T5">
<label>Table 5</label>
<caption><p>Correlations between labels and questions with an absolute value of 0.5 and higher.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th valign="top" align="left"><bold>Label</bold></th>
<th valign="top" align="left"><bold>Question</bold></th>
<th valign="top" align="left"><bold>Question group</bold></th>
<th valign="top" align="center"><bold>Variance</bold></th>
<th valign="top" align="center"><bold>Correlation coefficient</bold></th>
<th valign="top" align="center"><bold>Rank</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Character</td>
<td valign="top" align="left">A3F5</td>
<td valign="top" align="left">Imagery</td>
<td valign="top" align="center">1.81</td>
<td valign="top" align="center">&#x02212;0.62</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">Character</td>
<td valign="top" align="left">A5F9</td>
<td valign="top" align="left">Concern with author</td>
<td valign="top" align="center">1.58</td>
<td valign="top" align="center">&#x02212;0.59</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">Character</td>
<td valign="top" align="left">A2F4</td>
<td valign="top" align="left">Empathy</td>
<td valign="top" align="center">1.47</td>
<td valign="top" align="center">&#x02212;0.57</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">Character</td>
<td valign="top" align="left">B5F1</td>
<td valign="top" align="left">Implicit author</td>
<td valign="top" align="center">0.81</td>
<td valign="top" align="center">&#x02212;0.57</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">Character</td>
<td valign="top" align="left">A1F5</td>
<td valign="top" align="left">Insight</td>
<td valign="top" align="center">0.89</td>
<td valign="top" align="center">&#x02212;0.56</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">Character</td>
<td valign="top" align="left">A7F4</td>
<td valign="top" align="left">Rejection of literary values</td>
<td valign="top" align="center">1.14</td>
<td valign="top" align="center">0.56</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">Character</td>
<td valign="top" align="left">A1F10</td>
<td valign="top" align="left">Insight</td>
<td valign="top" align="center">1.22</td>
<td valign="top" align="center">&#x02212;0.54</td>
<td/>
</tr>
<tr style="border-bottom: thin solid #000000;">
<td valign="top" align="left">Character</td>
<td valign="top" align="left">A7F8</td>
<td valign="top" align="left">Rejection of literary values</td>
<td valign="top" align="center">1.47</td>
<td valign="top" align="center">0.52</td>
<td/>
</tr> <tr>
<td valign="top" align="left">Narrator</td>
<td valign="top" align="left">A5F2</td>
<td valign="top" align="left">Concern with author</td>
<td valign="top" align="center">0.89</td>
<td valign="top" align="center">0.62</td>
<td valign="top" align="center">10</td>
</tr>
<tr>
<td valign="top" align="left">Narrator</td>
<td valign="top" align="left">B3F1</td>
<td valign="top" align="left">Text as message of the author</td>
<td valign="top" align="center">1.14</td>
<td valign="top" align="center">0.56</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">Narrator</td>
<td valign="top" align="left">A1F13</td>
<td valign="top" align="left">Insight</td>
<td valign="top" align="center">0.81</td>
<td valign="top" align="center">0.51</td>
<td/>
</tr>
<tr style="border-bottom: thin solid #000000;">
<td valign="top" align="left">Narrator</td>
<td valign="top" align="left">B1F3</td>
<td valign="top" align="left">Preferences for manifest narrators</td>
<td valign="top" align="center">0.81</td>
<td valign="top" align="center">0.50</td>
<td/>
</tr> <tr>
<td valign="top" align="left">Author</td>
<td valign="top" align="left">A5F5</td>
<td valign="top" align="left">Concern with author</td>
<td valign="top" align="center">1.89</td>
<td valign="top" align="center">0.89</td>
<td valign="top" align="center">1</td>
</tr>
<tr>
<td valign="top" align="left">Author</td>
<td valign="top" align="left">A4F6</td>
<td valign="top" align="left">Leisure escape</td>
<td valign="top" align="center">1.14</td>
<td valign="top" align="center">0.83</td>
<td valign="top" align="center">2</td>
</tr>
<tr>
<td valign="top" align="left">Author</td>
<td valign="top" align="left">A2F5</td>
<td valign="top" align="left">Empathy</td>
<td valign="top" align="center">1.22</td>
<td valign="top" align="center">0.82</td>
<td valign="top" align="center">3</td>
</tr>
<tr>
<td valign="top" align="left">Author</td>
<td valign="top" align="left">A3F3</td>
<td valign="top" align="left">Imagery</td>
<td valign="top" align="center">1.22</td>
<td valign="top" align="center">&#x02212;0.79</td>
<td valign="top" align="center">4</td>
</tr>
<tr>
<td valign="top" align="left">Author</td>
<td valign="top" align="left">B5F3</td>
<td valign="top" align="left">Implicit author</td>
<td valign="top" align="center">0.92</td>
<td valign="top" align="center">0.71</td>
<td valign="top" align="center">5</td>
</tr>
<tr>
<td valign="top" align="left">Author</td>
<td valign="top" align="left">B1F3</td>
<td valign="top" align="left">Preferences for manifest narrators</td>
<td valign="top" align="center">0.81</td>
<td valign="top" align="center">0.67</td>
<td valign="top" align="center">6</td>
</tr>
<tr>
<td valign="top" align="left">Author</td>
<td valign="top" align="left">B1F1</td>
<td valign="top" align="left">Preferences for manifest narrators</td>
<td valign="top" align="center">0.81</td>
<td valign="top" align="center">0.66</td>
<td valign="top" align="center">7</td>
</tr>
<tr>
<td valign="top" align="left">Author</td>
<td valign="top" align="left">A7F8</td>
<td valign="top" align="left">Rejection of literary values</td>
<td valign="top" align="center">1.47</td>
<td valign="top" align="center">0.65</td>
<td valign="top" align="center">8</td>
</tr>
<tr>
<td valign="top" align="left">Author</td>
<td valign="top" align="left">A1F13</td>
<td valign="top" align="left">Insight</td>
<td valign="top" align="center">0.81</td>
<td valign="top" align="center">0.63</td>
<td valign="top" align="center">9</td>
</tr>
<tr>
<td valign="top" align="left">Author</td>
<td valign="top" align="left">A6F3</td>
<td valign="top" align="left">Story-driven reading</td>
<td valign="top" align="center">1.56</td>
<td valign="top" align="center">0.62</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">Author</td>
<td valign="top" align="left">A4F3</td>
<td valign="top" align="left">Leisure escape</td>
<td valign="top" align="center">0.89</td>
<td valign="top" align="center">0.61</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">Author</td>
<td valign="top" align="left">A4F2</td>
<td valign="top" align="left">Leisure escape</td>
<td valign="top" align="center">1.56</td>
<td valign="top" align="center">0.58</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">Author</td>
<td valign="top" align="left">A6F8</td>
<td valign="top" align="left">Story-driven reading</td>
<td valign="top" align="center">0.92</td>
<td valign="top" align="center">0.54</td>
<td/>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><italic>The rank column indicates the overall top-10 features</italic>.</p>
</table-wrap-foot>
</table-wrap>
<p>Correlations with an absolute value &#x02265; 0.5 can be considered as high. It can be seen that the label &#x0201C;author&#x0201D; shows a high correlation with more questions (13 questions) than the label &#x0201C;character&#x0201D; (8 questions), which in turn shows a high correlation with more questions than the label &#x0201C;narrator&#x0201D; (4 questions). This is in line with what we would expect: Whether the author is interpreted as taking responsibility for the information conveyed in a passage is typically not explicitly signalled in the text and, as such, it leaves much room for diverging annotation decisions based on the annotator&#x00027;s preferences and beliefs. Neither any of the filler questions (from B8) nor any of the question-group means reaches high correlation coefficients with any label.</p>
<p>The top-10 questions with the highest correlations are taken as features in some of the classification experiments. Almost all of these features correlate with the label &#x0201C;author&#x0201D; and only one with the label &#x0201C;narrator&#x0201D; (A5F2), scoring only marginally higher than A3F5 for &#x0201C;character&#x0201D; and A6F3 for &#x0201C;author&#x0201D;. To investigate the impact of each annotator on the correlation results, we calculate correlation coefficients for six groups, where one of the annotators is excluded (see <xref ref-type="table" rid="T6">Table 6</xref>). One can observe that if one of the annotators is missing, the top-10 features notably change. They change most if annotator 5 is excluded. Among the top-10 features, questions A5F5 and A2F5 remain consistent in all constellations, questions A4F6 and B5F3 mostly behave equally, while A1F13 and A5F2 are less consistent and are mostly removed from the top-10 if one of the annotators is excluded.</p>
<table-wrap position="float" id="T6">
<label>Table 6</label>
<caption><p>Most correlated features (questions) when one of the annotators is excluded: &#x0201C;&#x0002B;&#x0201D; if the feature remains among the top-10 correlated features, &#x0201C;&#x02013;&#x0201D; if it gets removed.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th valign="top" align="left"><bold>Excluded</bold></th>
<th valign="top" align="center" colspan="10" style="border-bottom: thin solid #000000;"><bold>Overall top-10 features</bold></th>
</tr>
<tr>
<th valign="top" align="left"><bold>Annotator</bold></th>
<th valign="top" align="center"><bold>A5F5</bold></th>
<th valign="top" align="center"><bold>A4F6</bold></th>
<th valign="top" align="center"><bold>A2F5</bold></th>
<th valign="top" align="center"><bold>A3F3</bold></th>
<th valign="top" align="center"><bold>B5F3</bold></th>
<th valign="top" align="center"><bold>B1F3</bold></th>
<th valign="top" align="center"><bold>B1F1</bold></th>
<th valign="top" align="center"><bold>A7F8</bold></th>
<th valign="top" align="center"><bold>A1F13</bold></th>
<th valign="top" align="center"><bold>A5F2</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">An1</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">&#x02013;</td>
</tr>
<tr>
<td valign="top" align="left">An2</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">&#x02013;</td>
</tr>
<tr>
<td valign="top" align="left">An3</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">&#x02013;</td>
</tr>
<tr>
<td valign="top" align="left">An4</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">&#x02013;</td>
</tr>
<tr>
<td valign="top" align="left">An5</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">&#x02013;</td>
</tr>
<tr>
<td valign="top" align="left">An6</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x02013;</td>
<td valign="top" align="center">&#x0002B;</td>
<td valign="top" align="center">&#x0002B;</td>
</tr>
</tbody>
</table>
</table-wrap>
<p><xref ref-type="table" rid="T7">Table 7</xref> shows the top-10 correlating features and the individual answers of the annotators. One can see that annotator 5 shows an extreme value (i.e., single-lowest or single-highest value) for 8 of the 10 questions. This explains why there is the most significant change when excluding annotator 5 in <xref ref-type="table" rid="T6">Table 6</xref>: in 7 cases, the feature does not pass the variance filter anymore.</p>
<table-wrap position="float" id="T7">
<label>Table 7</label>
<caption><p>Overall top-10 questions with the answers given by the annotators.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th valign="top" align="left"><bold>Rank</bold></th>
<th valign="top" align="left"><bold>Question</bold></th>
<th valign="top" align="left"><bold>Text (English version)</bold></th>
<th valign="top" align="center" colspan="6" style="border-bottom: thin solid #000000;"><bold>Answers</bold></th>
</tr>
<tr>
<th/>
<th/>
<th/>
<th valign="top" align="left"><bold>An1</bold></th>
<th valign="top" align="left"><bold>An2</bold></th>
<th valign="top" align="left"><bold>An3</bold></th>
<th valign="top" align="left"><bold>An4</bold></th>
<th valign="top" align="left"><bold>An5</bold></th>
<th valign="top" align="left"><bold>An6</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">1</td>
<td valign="top" align="left">A5F5</td>
<td valign="top" align="left">When reading I usually try to identify an author&#x00027;s distinctive themes.</td>
<td valign="top" align="left">5</td>
<td valign="top" align="left">4</td>
<td valign="top" align="left">3</td>
<td valign="top" align="left">4</td>
<td valign="top" align="left">1</td>
<td valign="top" align="left">5</td>
</tr>
<tr>
<td valign="top" align="left">2</td>
<td valign="top" align="left">A4F6</td>
<td valign="top" align="left">While reading I completely forget what time it is.</td>
<td valign="top" align="left">4</td>
<td valign="top" align="left">4</td>
<td valign="top" align="left">3</td>
<td valign="top" align="left">3</td>
<td valign="top" align="left">1</td>
<td valign="top" align="left">4</td>
</tr>
<tr>
<td valign="top" align="left">3</td>
<td valign="top" align="left">A2F5</td>
<td valign="top" align="left">I actively try to project myself into the role of fictional characters, almost as if I were preparing to act in a play.</td>
<td valign="top" align="left">4</td>
<td valign="top" align="left">3</td>
<td valign="top" align="left">2</td>
<td valign="top" align="left">2</td>
<td valign="top" align="left">1</td>
<td valign="top" align="left">4</td>
</tr>
<tr>
<td valign="top" align="left">4</td>
<td valign="top" align="left">A3F3</td>
<td valign="top" align="left">I sometimes think I could draw a map of the places I have read about in a work of fiction.</td>
<td valign="top" align="left">3</td>
<td valign="top" align="left">2</td>
<td valign="top" align="left">3</td>
<td valign="top" align="left">1</td>
<td valign="top" align="left">4</td>
<td valign="top" align="left">1</td>
</tr>
<tr>
<td valign="top" align="left">5</td>
<td valign="top" align="left">B5F3</td>
<td valign="top" align="left">When reading a literary text, I bring to my mind that my idea of the author does not necessarily resemble the real author.</td>
<td valign="top" align="left">4</td>
<td valign="top" align="left">3</td>
<td valign="top" align="left">2</td>
<td valign="top" align="left">2</td>
<td valign="top" align="left">1</td>
<td valign="top" align="left">3</td>
</tr>
<tr>
<td valign="top" align="left">6</td>
<td valign="top" align="left">B1F3</td>
<td valign="top" align="left">I like to read novels in which the narrator often comes to the fore and the story takes a back seat.</td>
<td valign="top" align="left">5</td>
<td valign="top" align="left">4</td>
<td valign="top" align="left">4</td>
<td valign="top" align="left">4</td>
<td valign="top" align="left">2</td>
<td valign="top" align="left">4</td>
</tr>
<tr>
<td valign="top" align="left">7</td>
<td valign="top" align="left">B1F1</td>
<td valign="top" align="left">I like to read novels that have a first-person narrator.</td>
<td valign="top" align="left">4</td>
<td valign="top" align="left">4</td>
<td valign="top" align="left">4</td>
<td valign="top" align="left">5</td>
<td valign="top" align="left">2</td>
<td valign="top" align="left">4</td>
</tr>
<tr>
<td valign="top" align="left">8</td>
<td valign="top" align="left">A7F8</td>
<td valign="top" align="left">Works of literature often seem to make the issues of life more complicated than they actually are.</td>
<td valign="top" align="left">1</td>
<td valign="top" align="left">3</td>
<td valign="top" align="left">1</td>
<td valign="top" align="left">1</td>
<td valign="top" align="left">1</td>
<td valign="top" align="left">4</td>
</tr>
<tr>
<td valign="top" align="left">9</td>
<td valign="top" align="left">A1F13</td>
<td valign="top" align="left">Literature often gives special emphasis to those things that make a moral point.</td>
<td valign="top" align="left">4</td>
<td valign="top" align="left">4</td>
<td valign="top" align="left">2</td>
<td valign="top" align="left">4</td>
<td valign="top" align="left">2</td>
<td valign="top" align="left">3</td>
</tr>
<tr>
<td valign="top" align="left">10</td>
<td valign="top" align="left">A5F2</td>
<td valign="top" align="left">In reading I like to focus on what is distinctive about the author&#x00027;s style.</td>
<td valign="top" align="left">5</td>
<td valign="top" align="left">3</td>
<td valign="top" align="left">3</td>
<td valign="top" align="left">4</td>
<td valign="top" align="left">2</td>
<td valign="top" align="left">3</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Overall, our results suggest that there is indeed a certain amount of interdependence between annotator&#x00027;s beliefs and preferences (as captured by the questionnaire) and their attribution choices. Some findings seem plausible, others are more difficult to explain. For example, features like A5F5 which focus on the author are likely to have an influence on the annotation behaviour; while one cannot think of a straightforward connection between features like A4F6 and the annotation.</p>
<p>Given that we only have six annotators, we refrain from carrying out a deeper analysis at this point. However, we will make use of the findings when selecting features for the machine learning experiments in the following section.</p>
</sec>
</sec>
<sec>
<title>4.2. Neural Attribution Classification</title>
<sec>
<title>4.2.1. Method</title>
<p>We implement a neural classifier using Keras<xref ref-type="fn" rid="fn0005"><sup>5</sup></xref>, which is trained in different conditions, yielding several individual models. The architecture of our models is shown in <xref ref-type="fig" rid="F1">Figure 1</xref>. The context tokens of a sample are vectorised with a pretrained German BERT model<xref ref-type="fn" rid="fn0006"><sup>6</sup></xref>. To encode the clause of the sample, we add an additional dimension to the BERT embeddings, which has a value of 1 for tokens that belong to the clause and a value of 0 for tokens that constitute the context. The sequence of (768&#x0002B;1)-dimensional embeddings is fed to a 20-dimensional BiLSTM layer. Here, we use a dropout rate of 0.2 for the inputs and no dropout for the recurrent state. The output of the BiLSTM is then concatenated with a feature vector that encodes the annotator of the sample and depends on the training condition (see below). The concatenated input is fed to a 20-dimensional dense layer with ReLU activation and an L2 regularisation factor of 10<sup>&#x02212;4</sup>. Since the network should learn a multi-class multi-label classification, we use sigmoid activation in the output layer and binary cross-entropy as loss function. We use the Adam optimisation with a learning rate of 10<sup>&#x02212;4</sup>. Each model is trained with a batch size of 6 for 30 epochs<xref ref-type="fn" rid="fn0007"><sup>7</sup></xref>.</p>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>Architecture of the neural network with annotator encoding (An), BERT embeddings, and clause encodings (Cl).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-04-725321-g0001.tif"/>
</fig>
<p>First of all, we distinguish three training-set conditions (keywords for later reference are bold):</p>
<list list-type="order">
<list-item><p>We train six individual models for the <bold>unaggregated</bold> training sets of the individual annotators (1,897 training samples each). Each individual model should learn to make predictions for one specific annotator.</p></list-item>
<list-item><p>We train one model for the <bold>aggregated</bold> training sets of all annotators (11,382 training samples). This model should learn to make predictions depending on a specified annotator.</p></list-item>
<list-item><p>We train six models for the aggregated training sets of all but one annotators, i.e., one annotator is excluded during training (9,485 training samples each). These models should be tested in a <bold>cross-validated</bold> fashion to evaluate the predictions for an unseen annotator.</p></list-item>
</list>
<p>When training on unaggregated sets, no annotator encoding is required since all labels belong to the same annotator. In the other two conditions, we experiment with different annotator encodings:</p>
<list list-type="order">
<list-item><p>We use <bold>no encodings</bold> at all (as for the unaggregated training sets). This leads to possibly conflicting samples, since there could be samples with identical input (clause and context but no annotator) but different labels (from different annotators).</p></list-item>
<list-item><p>We use 6-dimensional <bold>one-hot</bold> encodings for the annotators.</p></list-item>
<list-item><p>We use 108-dimensional <bold>questionnaire</bold> encodings, combining the 94 values received from the questionnaire and the 14 mean values for every question group. Hereby, the values of the questionnaire, which originally lie in [1, 5], are shifted to [&#x02212;2, 2], so that the value 0 indicates neutrality.</p></list-item>
<list-item><p>We only use a top-10 <bold>selection</bold> of those questionnaire features that show the highest correlation with any label in the training set (see section Bias Analysis for the correlation results).</p></list-item>
<list-item><p>For the cross-validated condition, we recalculate the correlations on the training set without the excluded annotator and use an adjusted <bold>selection*</bold> of the top-10 correlating features.</p></list-item>
</list>
<p>When it comes to prediction, we investigate two conditions for the aggregated models:</p>
<list list-type="order">
<list-item><p>All annotator encodings are <bold>given</bold> to the model as in training. In this condition, the model should predict the labels of a specified annotator.</p></list-item>
<list-item><p>All annotator encodings are <bold>zeroed</bold>, i.e., the model is only given zero vectors. Here, the model should predict the labels that a common-sense or neutral annotator would assign.</p></list-item>
</list>
</sec>
<sec>
<title>4.2.2. Results</title>
<p>We use binary (i.e., micro-averaged) accuracy (Acc) as evaluation measure for our models. Since the labels in our dataset are not distributed uniformly, we calculate the majority baseline for comparison. The majority baseline is the maximal accuracy that a model could achieve if it predicted the same output for all test samples. As secondary evaluation measures, we compute micro-averaged (Mic) and macro-averaged (Mac) f-score.</p>
<p>In a first experiment, we test whether the context representations are expressive enough to learn attribution categories at all. For this, we train a model with one-hot annotator encodings. <xref ref-type="table" rid="T8">Table 8</xref> shows that the model correctly predicts 96% of the labels from seen samples (i.e., the training set). As typical for NLP tasks, the accuracy on unseen samples is lower, namely 85% on the development and 88% on the test set. However, this still outperforms the majority baseline by &#x02265;20%.</p>
<table-wrap position="float" id="T8">
<label>Table 8</label>
<caption><p>Performances on the aggregated training, development and test sets after training on the aggregated training set with one-hot annotator encodings.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th/>
<th valign="top" align="center" colspan="3" style="border-bottom: thin solid #000000;"><bold>Train set</bold></th>
<th valign="top" align="center" colspan="3" style="border-bottom: thin solid #000000;"><bold>Dev set</bold></th>
<th valign="top" align="center" colspan="3" style="border-bottom: thin solid #000000;"><bold>Test set</bold></th>
</tr>
<tr>
<th valign="top" align="left"><bold>Annotator encoding</bold></th>
<th valign="top" align="left"><bold>Acc</bold></th>
<th valign="top" align="left"><bold>Mic</bold></th>
<th valign="top" align="left"><bold>Mac</bold></th>
<th valign="top" align="left"><bold>Acc</bold></th>
<th valign="top" align="left"><bold>Mic</bold></th>
<th valign="top" align="left"><bold>Mac</bold></th>
<th valign="top" align="left"><bold>Acc</bold></th>
<th valign="top" align="left"><bold>Mic</bold></th>
<th valign="top" align="left"><bold>Mac</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Majority baseline</td>
<td valign="top" align="left">0.65</td>
<td valign="top" align="left">&#x02013;</td>
<td valign="top" align="left">&#x02013;</td>
<td valign="top" align="left">0.65</td>
<td valign="top" align="left">&#x02013;</td>
<td valign="top" align="left">&#x02013;</td>
<td valign="top" align="left">0.65</td>
<td valign="top" align="left">&#x02013;</td>
<td valign="top" align="left">&#x02013;</td>
</tr>
<tr>
<td valign="top" align="left">One-hot given</td>
<td valign="top" align="left">0.96</td>
<td valign="top" align="left">0.95</td>
<td valign="top" align="left">0.93</td>
<td valign="top" align="left">0.85</td>
<td valign="top" align="left">0.83</td>
<td valign="top" align="left">0.81</td>
<td valign="top" align="left">0.88</td>
<td valign="top" align="left">0.85</td>
<td valign="top" align="left">0.80</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In a second set of experiments, we test whether the classifier captures similarities between annotators. A model that is trained on one of the unaggregated training sets for a specific annotator should achieve the highest accuracy on the corresponding unaggregated test set of the same annotator. Indeed, <xref ref-type="table" rid="T9">Table 9</xref> shows that the highest accuracy for each annotator is achieved on the corresponding test set.</p>
<table-wrap position="float" id="T9">
<label>Table 9</label>
<caption><p>Accuracies on the unaggregated test sets after training on the unaggregated training sets.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th/>
<th valign="top" align="center" colspan="6" style="border-bottom: thin solid #000000;"><bold>Test set</bold></th>
</tr>
<tr>
<th valign="top" align="left"><bold>Train set</bold></th>
<th valign="top" align="center"><bold>An1</bold></th>
<th valign="top" align="center"><bold>An2</bold></th>
<th valign="top" align="center"><bold>An3</bold></th>
<th valign="top" align="center"><bold>An4</bold></th>
<th valign="top" align="center"><bold>An5</bold></th>
<th valign="top" align="center"><bold>An6</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">An1</td>
<td valign="top" align="center"><bold>0.87</bold></td>
<td valign="top" align="center">0.78</td>
<td valign="top" align="center">0.76</td>
<td valign="top" align="center">0.85</td>
<td valign="top" align="center">0.76</td>
<td valign="top" align="center">0.72</td>
</tr>
<tr>
<td valign="top" align="left">An2</td>
<td valign="top" align="center">0.77</td>
<td valign="top" align="center"><bold>0.87</bold></td>
<td valign="top" align="center">0.77</td>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">0.76</td>
<td valign="top" align="center">0.79</td>
</tr>
<tr>
<td valign="top" align="left">An3</td>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center">0.78</td>
<td valign="top" align="center"><bold>0.86</bold></td>
<td valign="top" align="center">0.77</td>
<td valign="top" align="center">0.86</td>
<td valign="top" align="center">0.77</td>
</tr>
<tr>
<td valign="top" align="left">An4</td>
<td valign="top" align="center">0.85</td>
<td valign="top" align="center">0.77</td>
<td valign="top" align="center">0.75</td>
<td valign="top" align="center"><bold>0.86</bold></td>
<td valign="top" align="center">0.78</td>
<td valign="top" align="center">0.75</td>
</tr>
<tr>
<td valign="top" align="left">An5</td>
<td valign="top" align="center">0.77</td>
<td valign="top" align="center">0.80</td>
<td valign="top" align="center">0.81</td>
<td valign="top" align="center">0.78</td>
<td valign="top" align="center"><bold>0.87</bold></td>
<td valign="top" align="center">0.74</td>
</tr>
<tr>
<td valign="top" align="left">An6</td>
<td valign="top" align="center">0.78</td>
<td valign="top" align="center">0.83</td>
<td valign="top" align="center">0.78</td>
<td valign="top" align="center">0.79</td>
<td valign="top" align="center">0.76</td>
<td valign="top" align="center"><bold>0.88</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><italic>Row names and column names represent annotators. Bold values mark accuracies for each test set after training on the corresponding training set</italic>.</p>
</table-wrap-foot>
</table-wrap>
<p>In more general words, each model should achieve higher accuracies on test sets of annotators that are more similar to the annotator of the training set. The similarity of two annotators can be measured with Krippendorff&#x00027;s inter-annotator-agreement coefficient &#x003B1; (Krippendorff, <xref ref-type="bibr" rid="B22">2018</xref>, pp. 221&#x02013;250), using the MASI distance (Passonneau, <xref ref-type="bibr" rid="B30">2006</xref>) for multi-label annotations. <xref ref-type="table" rid="T10">Table 10</xref> shows inter-annotator agreements for all pairs of annotators. The Pearson correlation between <xref ref-type="table" rid="T9">Tables 9</xref> and <xref ref-type="table" rid="T10">10</xref> is 0.88, indicating that the more similar the test annotator is to the training annotator, the more accurate are the model&#x00027;s predictions.</p>
<table-wrap position="float" id="T10">
<label>Table 10</label>
<caption><p>Pairwise Krippendorff&#x00027;s &#x003B1; on the training set.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th/>
<th valign="top" align="center" colspan="6" style="border-bottom: thin solid #000000;"><bold>Annotator</bold></th>
</tr>
<tr>
<th valign="top" align="left"><bold>Annotator</bold></th>
<th valign="top" align="center"><bold>An1</bold></th>
<th valign="top" align="center"><bold>An2</bold></th>
<th valign="top" align="center"><bold>An3</bold></th>
<th valign="top" align="center"><bold>An4</bold></th>
<th valign="top" align="center"><bold>An5</bold></th>
<th valign="top" align="center"><bold>An6</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">An1</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.68</td>
<td valign="top" align="center">0.64</td>
<td valign="top" align="center">0.90</td>
<td valign="top" align="center">0.56</td>
<td valign="top" align="center">0.67</td>
</tr>
<tr>
<td valign="top" align="left">An2</td>
<td valign="top" align="center">0.68</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.62</td>
<td valign="top" align="center">0.67</td>
<td valign="top" align="center">0.62</td>
<td valign="top" align="center">0.72</td>
</tr>
<tr>
<td valign="top" align="left">An3</td>
<td valign="top" align="center">0.64</td>
<td valign="top" align="center">0.62</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.63</td>
<td valign="top" align="center">0.73</td>
<td valign="top" align="center">0.60</td>
</tr>
<tr>
<td valign="top" align="left">An4</td>
<td valign="top" align="center">0.90</td>
<td valign="top" align="center">0.67</td>
<td valign="top" align="center">0.63</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.58</td>
<td valign="top" align="center">0.68</td>
</tr>
<tr>
<td valign="top" align="left">An5</td>
<td valign="top" align="center">0.56</td>
<td valign="top" align="center">0.62</td>
<td valign="top" align="center">0.73</td>
<td valign="top" align="center">0.58</td>
<td valign="top" align="center">1.00</td>
<td valign="top" align="center">0.50</td>
</tr>
<tr>
<td valign="top" align="left">An6</td>
<td valign="top" align="center">0.67</td>
<td valign="top" align="center">0.72</td>
<td valign="top" align="center">0.60</td>
<td valign="top" align="center">0.68</td>
<td valign="top" align="center">0.50</td>
<td valign="top" align="center">1.00</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><italic>Row names and column names represent annotators. Note that Krippendorff&#x00027;s &#x003B1; is symmetric</italic>.</p>
</table-wrap-foot>
</table-wrap>
<p><xref ref-type="fig" rid="F2">Figure 2</xref> further shows a hierarchical clustering of all annotators on the training set using 1 &#x02212; &#x003B1; as distance metric and the unweighted pair group method with arithmetic mean (UPGMA; Sokal et al., <xref ref-type="bibr" rid="B38">1958</xref>) as clustering method. We can see that there are sub-groups of annotators, e.g., annotator 1 and annotator 4 annotated similarly (&#x003B1; &#x0003D; 0.90). Consequently, a model trained on annotator 1&#x00027;s training set makes the second-best predictions on annotator 4&#x00027;s test set (the best predictions are still made by a model trained on annotator 4&#x00027;s training set), and vice versa.</p>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>Clustering of annotators using the pairwise disagreement (1 &#x02212; &#x003B1;) on the training set.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-04-725321-g0002.tif"/>
</fig>
<p>The main set of experiments compares the performances when using different annotator encodings. The results are shown in <xref ref-type="table" rid="T11">Table 11</xref>. As we have already seen in <xref ref-type="table" rid="T8">Table 8</xref>, a model with one-hot encodings achieves an accuracy of 88% when annotator encodings are given during prediction. When they are zeroed, the accuracy drops to 84%, which is the same accuracy as a model with no annotator encodings achieves. This indicates that the one-hot model correctly interprets the zero vector as common-sense annotator, although it has not seen any such vector during training. The models with questionnaire and selection encodings achieve accuracies of 87 and 88%, respectively, when annotator encodings are given but behave quite differently when the encodings are zeroed: The accuracy for the questionnaire model drops to 77% whereas the accuracy for the selection model only drops to 84%. We theorise that the differences in performance can be explained as follows: The 108-dimensional questionnaire encodings presumably contain a lot of dimensions with lower variance between the annotators, making the questionnaire encodings more similar to each other than the selection encodings. This makes the encodings less distinctive which could explain the lower performance of 87% when compared to that of e.g., one-hot encodings. Furthermore, the zero vector represents a neutral annotator, i.e., an annotator that would answer all questions in the questionnaire with &#x0201C;neither agree nor disagree&#x0201D;; this is not necessarily the average or common-sense of our six annotators. The selection encodings, on the other hand, are based on distinctive features, i.e., features that are low (&#x0003C; 0) for some annotators and high (&#x0003E; 0) for other annotators. The zero vector then represents a non-extreme annotator that is likely to be similar to the average of our six annotators. 
Hence, the selection encodings perform as well as one-hot encodings.</p>
<table-wrap position="float" id="T11">
<label>Table 11</label>
<caption><p>Performances on the aggregated test set after training on the aggregated training set with different annotator encodings, and with given or zeroed annotator encodings at prediction time.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th valign="top" align="left" colspan="2"><bold>Annotator encoding</bold></th>
<th valign="top" align="left"><bold>Acc</bold></th>
<th valign="top" align="center"><bold>Mic</bold></th>
<th valign="top" align="center"><bold>Mac</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">No encoding</td>
<td valign="top" align="left">&#x02013;</td>
<td valign="top" align="center">0.84</td>
<td valign="top" align="center">0.81</td>
<td valign="top" align="center">0.74</td>
</tr>
<tr>
<td valign="top" align="left">One-hot</td>
<td valign="top" align="left">Given</td>
<td valign="top" align="center">0.88</td>
<td valign="top" align="center">0.85</td>
<td valign="top" align="center">0.80</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">Zeroed</td>
<td valign="top" align="center">0.84</td>
<td valign="top" align="center">0.81</td>
<td valign="top" align="center">0.75</td>
</tr>
<tr>
<td valign="top" align="left">Questionnaire</td>
<td valign="top" align="left">Given</td>
<td valign="top" align="center">0.87</td>
<td valign="top" align="center">0.84</td>
<td valign="top" align="center">0.78</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">Zeroed</td>
<td valign="top" align="center">0.77</td>
<td valign="top" align="center">0.73</td>
<td valign="top" align="center">0.66</td>
</tr>
<tr>
<td valign="top" align="left">Selection</td>
<td valign="top" align="left">Given</td>
<td valign="top" align="center">0.88</td>
<td valign="top" align="center">0.85</td>
<td valign="top" align="center">0.79</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">Zeroed</td>
<td valign="top" align="center">0.84</td>
<td valign="top" align="center">0.81</td>
<td valign="top" align="center">0.73</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In a classification scenario where all annotators are seen in training, one-hot encodings would be sufficient as encoding of choice. In a last experiment, we investigate the effect of the type of annotator encoding if we want to make predictions for an annotator unseen in training. <xref ref-type="table" rid="T12">Table 12</xref> shows cross-validation results for questionnaire and selection encodings, with no encoding as baseline. The baseline accuracies range from 77 to 84% with an average of 81%. This result cannot be outperformed by questionnaire encodings, which only show a greater standard deviation with accuracies ranging from 76 to 85%. Selection encodings, on the other hand, are able to improve the performance to accuracies of 82% on average. However, the selection is determined for all annotators, including the test annotator, in the first place, whereas in a real-world scenario one could determine the selection* on the training annotators&#x00027; annotations only. When doing so, the accuracy decreases to the level of the baseline. Interestingly, the results with no encodings show a parallel to those with selection* encodings, and the results with questionnaire encodings show a parallel to those with selection encodings. We do not have a straightforward explanation for this observation at hand. As a conclusion of this experiment, one can say that none of the possible encodings (no encoding, questionnaire, selection*) is more suitable than the others for predicting the labels of an unseen annotator.</p>
<table-wrap position="float" id="T12">
<label>Table 12</label>
<caption><p>Cross-validation performances on the unaggregated test sets after training on the aggregated training sets.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th valign="top" align="left" colspan="2"><bold>Annotator encoding</bold></th>
<th valign="top" align="left"><bold>An1</bold></th>
<th valign="top" align="center"><bold>An2</bold></th>
<th valign="top" align="center"><bold>An3</bold></th>
<th valign="top" align="center"><bold>An4</bold></th>
<th valign="top" align="center"><bold>An5</bold></th>
<th valign="top" align="center"><bold>An6</bold></th>
<th valign="top" align="center"><bold>&#x003BC;</bold></th>
<th valign="top" align="center"><bold>&#x003C3;</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">No encoding</td>
<td valign="top" align="left">&#x02013;</td>
<td valign="top" align="center">0.83</td>
<td valign="top" align="center">0.84</td>
<td valign="top" align="center">0.81</td>
<td valign="top" align="center">0.81</td>
<td valign="top" align="center">0.82</td>
<td valign="top" align="center">0.77</td>
<td valign="top" align="center">0.81</td>
<td valign="top" align="center">0.02</td>
</tr>
<tr>
<td valign="top" align="left">Questionnaire</td>
<td valign="top" align="left">Given</td>
<td valign="top" align="center">0.83</td>
<td valign="top" align="center">0.82</td>
<td valign="top" align="center">0.76</td>
<td valign="top" align="center">0.85</td>
<td valign="top" align="center">0.81</td>
<td valign="top" align="center">0.81</td>
<td valign="top" align="center">0.81</td>
<td valign="top" align="center">0.03</td>
</tr>
<tr>
<td valign="top" align="left">Selection</td>
<td valign="top" align="left">Given</td>
<td valign="top" align="center">0.83</td>
<td valign="top" align="center">0.85</td>
<td valign="top" align="center">0.76</td>
<td valign="top" align="center">0.81</td>
<td valign="top" align="center">0.85</td>
<td valign="top" align="center">0.81</td>
<td valign="top" align="center">0.82</td>
<td valign="top" align="center">0.03</td>
</tr>
<tr>
<td valign="top" align="left">Selection*</td>
<td valign="top" align="left">Given</td>
<td valign="top" align="center">0.82</td>
<td valign="top" align="center">0.83</td>
<td valign="top" align="center">0.82</td>
<td valign="top" align="center">0.81</td>
<td valign="top" align="center">0.81</td>
<td valign="top" align="center">0.77</td>
<td valign="top" align="center">0.81</td>
<td valign="top" align="center">0.02</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><italic>The training samples of the test annotator (column name) were not included in the aggregated training set. The right-most columns show the average (&#x003BC;) and standard deviation (&#x003C3;) across the six runs of each row</italic>.</p>
</table-wrap-foot>
</table-wrap>
<p>We provide more detailed classification results, e.g., f-scores for some of the mentioned experiments or performances of all models on the unaggregated test sets, in the <xref ref-type="supplementary-material" rid="SM1">Supplementary Materials</xref>.</p>
</sec>
</sec>
</sec>
<sec sec-type="discussion" id="s5">
<title>5. Discussion</title>
<p>In recent years, more and more researchers have begun to study annotator bias. Most of these studies are focused on the effect of bias on the quality of annotated data. This is particularly important if the annotations are done in a crowd-sourcing scenario, where the identification of spammers is crucial (cf. Sober&#x000F3;n et al., <xref ref-type="bibr" rid="B37">2013</xref>; Paun et al., <xref ref-type="bibr" rid="B31">2018</xref>). Other studies looked into bias effects that may arise from how the annotation task is formulated, especially in the areas of natural language inference and generation (Amidei et al., <xref ref-type="bibr" rid="B3">2018</xref>; Gururangan et al., <xref ref-type="bibr" rid="B17">2018</xref>; Tsuchiya, <xref ref-type="bibr" rid="B39">2018</xref>). An area where annotator bias is particularly relevant is hate speech detection and several papers have looked into different types of bias. For instance, Kuwatly et al. (<xref ref-type="bibr" rid="B23">2020</xref>) investigate how different user demographics influence hate speech annotations, while Wich et al. (<xref ref-type="bibr" rid="B40">2020</xref>) look at the impact of political biases.</p>
<p>Annotator bias that arises when annotating linguistic phenomena has also received a lot of attention (Morris and Hirst, <xref ref-type="bibr" rid="B28">2004</xref>; Morris, <xref ref-type="bibr" rid="B27">2010</xref>; Rohde et al., <xref ref-type="bibr" rid="B33">2016</xref>; Scholman and Demberg, <xref ref-type="bibr" rid="B35">2017</xref>). It has been shown that for linguistic annotations, annotator certainty is often not correlated with annotation variation (Poesio and Artstein, <xref ref-type="bibr" rid="B32">2005</xref>; Nedoluzhko and M&#x000ED;rovsk&#x000FD;, <xref ref-type="bibr" rid="B29">2013</xref>; Andresen et al., <xref ref-type="bibr" rid="B4">2020</xref>), indicating that disagreements between annotators may be influenced by annotator preferences. Annotation bias in literature has only rarely been discussed, which is surprising given that the literature analysis is often seen as inherently open to interpretation and divergence of opinion (Hammond et al., <xref ref-type="bibr" rid="B18">2013</xref>). One exception is a study by Gius and Jacke (<xref ref-type="bibr" rid="B16">2017</xref>), who classify annotation disagreements into four classes: misinterpretations, deficient category definitions, categories which depend on preliminary analyses, and textual ambiguities or polyvalence. Only the latter are seen as adequate reasons for disagreement. From an application perspective, the work that comes closest to our present study is Hammond et al. (<xref ref-type="bibr" rid="B18">2013</xref>), which is concerned with tracking attribution ambiguity for free indirect discourse in Virginia Woolf&#x00027;s <italic>To the lighthouse</italic> (<xref ref-type="bibr" rid="B42">1927</xref>). However, that paper describes work in progress and no results are given for the annotation experiment. Details on the planned machine-learning study are also sketchy.</p>
<p>From a machine-learning and computational-modelling perspective, several studies have shown that annotation bias can harm the performance of classifiers trained on the data (cf. Gururangan et al., <xref ref-type="bibr" rid="B17">2018</xref>; Tsuchiya, <xref ref-type="bibr" rid="B39">2018</xref>) and that annotator information can improve performance. For example, working on hate speech detection, Akhtar et al. (<xref ref-type="bibr" rid="B2">2020</xref>) divide annotators into two groups, building on their earlier work on measuring polarisation in hate speech annotation (Akhtar et al., <xref ref-type="bibr" rid="B1">2019</xref>). They train a classifier for each annotator group and also build an ensemble classifier, which labels an instance as hate speech whenever one of the individual classifiers did so. They find that the latter outperforms the former. Working on various natural language understanding tasks, Geva et al. (<xref ref-type="bibr" rid="B13">2019</xref>) take a slightly different approach and&#x02014;similarly to us&#x02014;include identifiers for individual annotators in the input feature vector for a neural-network architecture. They show that this improves performance compared to an annotator-agnostic classifier but the model is not able to generalise to unseen annotators. Note, however, that Geva et al.&#x00027;s data comes from natural language inference and question answering, where annotations are not simple labels but complete sentences, which will inevitably carry a stronger annotator signal.</p>
<p>In this study, we combine previous research on annotator bias (and the reasons for it) with work on modelling bias computationally and apply it to a domain in which annotation bias has so far been under-researched, namely literature.</p>
<p>Specifically, we address the task of automatic speaker attribution in fictional narrative texts. This is a challenging task as the attribution of one or several speakers to a text passage does not solely depend on the text itself but also on the person who reads and interprets it&#x02014;which constitutes the phenomenon of uncertain attribution. In consequence, a classifier has to be trained on both text representations and features that capture a reader&#x00027;s bias to become fully capable of the task. The first question we pursued was to find such bias features by correlating the attribution annotations of six annotators with the answers they gave in an extended version of Miall and Kuiken (<xref ref-type="bibr" rid="B26">1995</xref>)&#x00027;s Literary Response Questionnaire. We found that the attribution category &#x0201C;author&#x0201D; is, in comparison to &#x0201C;narrator&#x0201D; and &#x0201C;character&#x0201D;, subject to the most bias features that were tested. Since narrator and character speech is at least partially marked in the surface text, whereas author speech requires an additional interpretation (except for passages in a preface or the like), we claim (following e.g., Sch&#x000F6;nert, <xref ref-type="bibr" rid="B36">2014</xref>) that this category is more subjective than the other two, which is underpinned by the correlation results.</p>
<p>We experimented with a neural classifier that is trained on pairs of a text representation and a bias feature vector, and thus can learn to make biased predictions. We showed 1) that the neural architecture is capable of learning similarities and differences between sub-groups of annotators, and 2) that a single model can learn to produce accurate predictions for individual annotators as well as 3) average or common-sense predictions for an unspecified or unseen annotator. Although the nature of speaker attribution does not suggest creating a gold-standard annotation, we are confident that one can use these common-sense predictions for follow-up applications and analyses. For example, one could automatically label an extended diachronic text corpus with speaker attribution and analyse the distribution of the three categories over time.</p>
<p>In future work, we plan to extend the set of annotators. In the present study, we only looked at six annotators, who can be seen as expert annotators in that they were all (advanced) students of German literature and, moreover, had 6 months to a whole year of experience at annotating the literary categories in our project. In a follow-up study we intend to also look at lay annotators, for example recruited through crowd sourcing. Moving to lay annotators would allow us to take into account a larger group of annotators and thereby hopefully shed more light on the effect of literary preferences on annotation decisions. Furthermore, it would enable us to compare two different groups of annotators and investigate, for example, whether annotator bias tends to be stronger for one of the groups.</p>
</sec>
<sec sec-type="data-availability" id="s6">
<title>Data Availability Statement</title>
<p>The datasets and scripts generated for this study can be found in the repository <ext-link ext-link-type="uri" xlink:href="https://gitlab.gwdg.de/mona/neural-attribution">https://gitlab.gwdg.de/mona/neural-attribution</ext-link>.</p>
</sec>
<sec id="s7">
<title>Ethics Statement</title>
<p>Ethical review and approval was not required for the study on human participants in accordance with the local legislation and institutional requirements. The patients/participants provided their written informed consent to participate in this study.</p>
</sec>
<sec id="s8">
<title>Author Contributions</title>
<p>The questionnaire was translated and extended by TD, BG, AW, and FB. TD and HV conducted the questionnaire-correlation experiment. CS and TD planned and designed the classification experiments. TD supervised the questionnaire experiment, prepared the corpus and questionnaire data, implemented the neural networks, and conducted the classification experiments. AH, BG, LG, AW, and FB developed annotation guidelines for attribution. AW supervised the annotation process. LG, AW, FB, HV, and TD created the gold standard for reflective passages. All authors together developed and discussed the general outline of the study as well as the conceptualization and formalization of the relevant literary phenomena and wrote sections of the manuscript, contributed to manuscript revision, read, and approved the submitted version.</p>
</sec>
<sec sec-type="funding-information" id="s9">
<title>Funding</title>
<p>This work was funded by Volkswagen Foundation (TD, AW, LG, BG, AH, and CS), by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) &#x02013; 424264086 (HV, FB, BG, AH, and CS), and by Nieders&#x000E4;chsisches Vorab.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x00027;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
</body>
<back>
<ack>
<p>In addition to our funders, Volkswagen Foundation, Deutsche Forschungsgemeinschaft and Nieders&#x000E4;chsisches Vorab, we cordially thank our research assistants: Friederike Altmann, Jan Lau, Jonas Lipski, Evelyn Ovsjannikov, Noreen Scheffel, Ruben van Wijk, and Marina Wurzbacher.</p></ack>
<sec sec-type="supplementary-material" id="s11">
<title>Supplementary Material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/frai.2021.725321/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/frai.2021.725321/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Data_Sheet_1.PDF" id="SM1" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Akhtar</surname> <given-names>S.</given-names></name> <name><surname>Basile</surname> <given-names>V.</given-names></name> <name><surname>Patti</surname> <given-names>V.</given-names></name></person-group> (<year>2019</year>). <article-title>A new measure of polarization in the annotation of hate speech</article-title>, in <source>AI*IA 2019-Advances in Artificial Intelligence</source>, eds <person-group person-group-type="editor"><name><surname>Alviano</surname> <given-names>M.</given-names></name> <name><surname>Greco</surname> <given-names>G.</given-names></name> <name><surname>Scarcello</surname> <given-names>F.</given-names></name></person-group> (<publisher-loc>Rende</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>), <fpage>588</fpage>&#x02013;<lpage>603</lpage>.</citation>
</ref>
<ref id="B2">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Akhtar</surname> <given-names>S.</given-names></name> <name><surname>Basile</surname> <given-names>V.</given-names></name> <name><surname>Patti</surname> <given-names>V.</given-names></name></person-group> (<year>2020</year>). <article-title>Modeling annotator perspective and polarized opinions to improve hate speech detection</article-title>, in <source>Proceedings of the 8th AAAI Conference on Human Computation and Crowdsourcing (HCOMP-20)</source> (<publisher-loc>Hilversum</publisher-loc>), <fpage>151</fpage>&#x02013;<lpage>154</lpage>.</citation>
</ref>
<ref id="B3">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Amidei</surname> <given-names>J.</given-names></name> <name><surname>Piwek</surname> <given-names>P.</given-names></name> <name><surname>Willis</surname> <given-names>A.</given-names></name></person-group> (<year>2018</year>). <article-title>Rethinking the agreement in human evaluation tasks</article-title>, in <source>Proceedings of the 27th International Conference on Computational Linguistics</source> (<publisher-loc>Santa Fe, NM</publisher-loc>), <fpage>3318</fpage>&#x02013;<lpage>3329</lpage>. <pub-id pub-id-type="pmid">25756943</pub-id></citation></ref>
<ref id="B4">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Andresen</surname> <given-names>M.</given-names></name> <name><surname>Vauth</surname> <given-names>M.</given-names></name> <name><surname>Zinsmeister</surname> <given-names>H.</given-names></name></person-group> (<year>2020</year>). <article-title>Modeling ambiguity with many annotators and self-assessments of annotator certainty</article-title>, in <source>Proceedings of the 14th Linguistic Annotation Workshop</source> (<publisher-loc>Barcelona</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>), <fpage>48</fpage>&#x02013;<lpage>59</lpage>.</citation>
</ref>
<ref id="B5">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Barth</surname> <given-names>F.</given-names></name> <name><surname>D&#x000F6;nicke</surname> <given-names>T.</given-names></name> <name><surname>Gittel</surname> <given-names>B.</given-names></name> <name><surname>G&#x000F6;deke</surname> <given-names>L.</given-names></name> <name><surname>Hofmann</surname> <given-names>A. M.</given-names></name> <name><surname>Holler</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2021</year>). <source>MONACO: Modes of Narration and Attribution Corpus</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://gitlab.gwdg.de/mona/korpus-public">https://gitlab.gwdg.de/mona/korpus-public</ext-link></citation>
</ref>
<ref id="B6">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bonheim</surname> <given-names>H.</given-names></name></person-group> (<year>1975</year>). <article-title>Theory of narrative modes</article-title>. <source>Semiotica</source> <volume>14</volume>, <fpage>329</fpage>&#x02013;<lpage>344</lpage>. <pub-id pub-id-type="doi">10.1515/semi.1975.14.4.329</pub-id></citation>
</ref>
<ref id="B7">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Currie</surname> <given-names>G.</given-names></name></person-group> (<year>2010</year>). <source>Narratives and Narrators: A Philosophy of Stories</source>. <publisher-loc>New York, NY</publisher-loc>: <publisher-name>Oxford Scholarship Online</publisher-name>.</citation>
</ref>
<ref id="B8">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Dahn</surname> <given-names>F.</given-names></name></person-group> (<year>2012</year>). <article-title>Kampf um Rom</article-title>, in <source>TextGrid Repository</source> (Digitale Bibliothek). Available online at: <ext-link ext-link-type="uri" xlink:href="https://hdl.handle.net/11858/00-1734-0000-0002-6894-3">https://hdl.handle.net/11858/00-1734-0000-0002-6894-3</ext-link></citation>
</ref>
<ref id="B9">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>D&#x000F6;nicke</surname> <given-names>T.</given-names></name></person-group> (<year>2020</year>). <article-title>Clause-level tense, mood, voice and modality tagging for German</article-title>, in <source>Proceedings of the 19th International Workshop on Treebanks and Linguistic Theories</source> (<publisher-loc>D&#x000FC;sseldorf</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>), <fpage>1</fpage>&#x02013;<lpage>17</lpage>.</citation>
</ref>
<ref id="B10">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>D&#x000F6;nicke</surname> <given-names>T.</given-names></name> <name><surname>G&#x000F6;deke</surname> <given-names>L.</given-names></name> <name><surname>Varachkina</surname> <given-names>H.</given-names></name></person-group> (<year>2021</year>). <article-title>Annotating quantified phenomena in complex sentence structures using the example of generalising statements in literary texts</article-title>, in <source>Proceedings of the 17th Joint ACL - ISO Workshop on Interoperable Semantic Annotation</source> (<publisher-loc>Groningen</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>), <fpage>20</fpage>&#x02013;<lpage>32</lpage>.</citation>
</ref>
<ref id="B11">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Fleiss</surname> <given-names>J. L.</given-names></name> <name><surname>Levin</surname> <given-names>B.</given-names></name> <name><surname>Paik</surname> <given-names>M. C.</given-names></name></person-group> (<year>2003</year>). <source>The Measurement of Interrater Agreement, Chapter 18, 3rd Edn</source>. <publisher-loc>Hoboken, NJ</publisher-loc>: <publisher-name>John Wiley &#x00026; Sons</publisher-name>.</citation>
</ref>
<ref id="B12">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Fontane</surname> <given-names>T.</given-names></name></person-group> (<year>2012</year>). <article-title>Der Stechlin</article-title>, in <source>TextGrid Repository</source> (Digitale Bibliothek). Available online at: <ext-link ext-link-type="uri" xlink:href="https://hdl.handle.net/11858/00-1734-0000-0002-AECF-D">https://hdl.handle.net/11858/00-1734-0000-0002-AECF-D</ext-link></citation>
</ref>
<ref id="B13">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Geva</surname> <given-names>M.</given-names></name> <name><surname>Goldberg</surname> <given-names>Y.</given-names></name> <name><surname>Berant</surname> <given-names>J.</given-names></name></person-group> (<year>2019</year>). <article-title>Are we modeling the task or the annotator? An investigation of annotator bias in natural language understanding datasets</article-title>, in <source>Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)</source> (<publisher-loc>Hong Kong</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>), <fpage>1161</fpage>&#x02013;<lpage>1166</lpage>.</citation>
</ref>
<ref id="B14">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gittel</surname> <given-names>B.</given-names></name></person-group> (<year>2015</year>). <article-title>Essayismus als Fiktionalisierung von unsicheres Wissen prozessierender Reflexion</article-title>. <source>Scientia Poetica</source> <volume>19</volume>, <fpage>136</fpage>&#x02013;<lpage>171</lpage>. <pub-id pub-id-type="doi">10.1515/scipo-2015-0106</pub-id></citation>
</ref>
<ref id="B15">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Gittel</surname> <given-names>B.</given-names></name></person-group> (<year>2022</year>). <article-title>Reflexive Passagen in fiktionaler Literatur. &#x000DC;berlegungen zu ihrer Identifikation und Funktion am Beispiel von Wielands &#x0201C;Geschichte des Agathon&#x0201D; und Goethes &#x0201C;Wahlverwandtschaften&#x0201D;</article-title>. <source>Euphorion</source> <fpage>116</fpage>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://www.winter-verlag.de/de/detail/c3743/Euphorion/">https://www.winter-verlag.de/de/detail/c3743/Euphorion/</ext-link></citation>
</ref>
<ref id="B16">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gius</surname> <given-names>E.</given-names></name> <name><surname>Jacke</surname> <given-names>J.</given-names></name></person-group> (<year>2017</year>). <article-title>The hermeneutic profit of annotation: On preventing and fostering disagreement in literary analysis</article-title>. <source>Int. J. Humanities Arts Comput</source>. <volume>11</volume>, <fpage>233</fpage>&#x02013;<lpage>254</lpage>. <pub-id pub-id-type="doi">10.3366/ijhac.2017.0194</pub-id></citation>
</ref>
<ref id="B17">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Gururangan</surname> <given-names>S.</given-names></name> <name><surname>Swayamdipta</surname> <given-names>S.</given-names></name> <name><surname>Levy</surname> <given-names>O.</given-names></name> <name><surname>Schwartz</surname> <given-names>R.</given-names></name> <name><surname>Bowman</surname> <given-names>S.</given-names></name> <name><surname>Smith</surname> <given-names>N. A.</given-names></name></person-group> (<year>2018</year>). <article-title>Annotation artifacts in natural language inference data</article-title>, in <source>Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Vol. 2 (Short Papers)</source> (<publisher-loc>New Orleans</publisher-loc>), <fpage>107</fpage>&#x02013;<lpage>112</lpage>.</citation>
</ref>
<ref id="B18">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Hammond</surname> <given-names>A.</given-names></name> <name><surname>Brooke</surname> <given-names>J.</given-names></name> <name><surname>Hirst</surname> <given-names>G.</given-names></name></person-group> (<year>2013</year>). <article-title>A tale of two cultures: bringing literary analysis and computational linguistics together</article-title>, in <source>Proceedings of the Workshop on Computational Linguistics for Literature</source> (<publisher-loc>Atlanta</publisher-loc>), <fpage>1</fpage>&#x02013;<lpage>8</lpage>.</citation>
</ref>
<ref id="B19">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Hoffmann</surname> <given-names>E. T. A.</given-names></name></person-group> (<year>2012</year>). <article-title>Der Sandmann</article-title>, in <source>TextGrid Repository</source> (Digitale Bibliothek). Available online at: <ext-link ext-link-type="uri" xlink:href="https://hdl.handle.net/11858/00-1734-0000-0003-6A94-2">https://hdl.handle.net/11858/00-1734-0000-0003-6A94-2</ext-link></citation>
</ref>
<ref id="B20">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Konrad</surname> <given-names>E.-M.</given-names></name></person-group> (<year>2017</year>). <article-title>Signposts of factuality: on genuine assertions in fictional literature</article-title>, in <source>Art and Belief</source>, eds <person-group person-group-type="editor"><name><surname>Sullivan-Bissett</surname> <given-names>E.</given-names></name> <name><surname>Bradley</surname> <given-names>H.</given-names></name> <name><surname>Noordhof</surname> <given-names>P.</given-names></name></person-group> (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>Oxford Scholarship Online</publisher-name>), <fpage>42</fpage>&#x02013;<lpage>62</lpage>.</citation>
</ref>
<ref id="B21">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>K&#x000F6;ppe</surname> <given-names>T.</given-names></name> <name><surname>St&#x000FC;hring</surname> <given-names>J.</given-names></name></person-group> (<year>2011</year>). <article-title>Against pan-narrator theories</article-title>. <source>J. Literary Semantics</source> <volume>40</volume>, <fpage>59</fpage>&#x02013;<lpage>80</lpage>. <pub-id pub-id-type="doi">10.1515/jlse.2011.004</pub-id></citation>
</ref>
<ref id="B22">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Krippendorff</surname> <given-names>K.</given-names></name></person-group> (<year>2018</year>). <source>Content Analysis: An Introduction to Its Methodology, 3rd Edn</source>. <publisher-loc>Thousand Oaks, CA</publisher-loc>: <publisher-name>Sage</publisher-name>.</citation>
</ref>
<ref id="B23">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Kuwatly</surname> <given-names>H. A.</given-names></name> <name><surname>Wich</surname> <given-names>M.</given-names></name> <name><surname>Groh</surname> <given-names>G.</given-names></name></person-group> (<year>2020</year>). <article-title>Identifying and measuring annotator bias based on annotators&#x00027; demographic characteristics</article-title>, in <source>Proceedings of the Fourth Workshop on Online Abuse and Harms</source> (<publisher-name>Association for Computational Linguistics</publisher-name>), <fpage>184</fpage>&#x02013;<lpage>190</lpage>.</citation>
</ref>
<ref id="B24">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Leslie</surname> <given-names>S.-J.</given-names></name> <name><surname>Lerner</surname> <given-names>A.</given-names></name></person-group> (<year>2016</year>). <article-title>Generic generalizations</article-title>, in <source>The Stanford Encyclopedia of Philosophy, Winter 2016 Edn</source>, eds <person-group person-group-type="editor"><name><surname>Zalta</surname> <given-names>E. N.</given-names></name></person-group> (<publisher-name>Metaphysics Research Lab; Stanford University</publisher-name>).</citation>
</ref>
<ref id="B25">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Martinez</surname> <given-names>M.</given-names></name> <name><surname>Scheffel</surname> <given-names>M.</given-names></name></person-group> (<year>2016</year>). <source>Einf&#x000FC;hrung in die Erz&#x000E4;hltheorie</source>. <publisher-loc>M&#x000FC;nchen</publisher-loc>: <publisher-name>C. H. Beck</publisher-name>.</citation>
</ref>
<ref id="B26">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Miall</surname> <given-names>D. S.</given-names></name> <name><surname>Kuiken</surname> <given-names>D.</given-names></name></person-group> (<year>1995</year>). <article-title>Aspects of literary response: a new questionnaire</article-title>. <source>Res. Teach. English</source> <volume>29</volume>, <fpage>37</fpage>&#x02013;<lpage>58</lpage>.</citation>
</ref>
<ref id="B27">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Morris</surname> <given-names>J.</given-names></name></person-group> (<year>2010</year>). <article-title>Individual differences in the interpretation of text: Implications for information science</article-title>. <source>J. Am. Soc. Inf. Sci. Technol</source>. <volume>61</volume>, <fpage>141</fpage>&#x02013;<lpage>149</lpage>. <pub-id pub-id-type="doi">10.1002/asi.21222</pub-id><pub-id pub-id-type="pmid">25855820</pub-id></citation></ref>
<ref id="B28">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Morris</surname> <given-names>J.</given-names></name> <name><surname>Hirst</surname> <given-names>G.</given-names></name></person-group> (<year>2004</year>). <article-title>The subjectivity of lexical cohesion in text</article-title>, in <source>AAAI Spring Symposium-Technical Report 20</source>.</citation>
</ref>
<ref id="B29">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Nedoluzhko</surname> <given-names>A.</given-names></name> <name><surname>M&#x000ED;rovsk&#x000FD;</surname> <given-names>J.</given-names></name></person-group> (<year>2013</year>). <article-title>Annotators&#x00027; certainty and disagreements in coreference and bridging annotation in Prague dependency treebank</article-title>, in <source>Proceedings of the Second International Conference on Dependency Linguistics (DepLing 2013)</source> (<publisher-loc>Prague</publisher-loc>), <fpage>236</fpage>&#x02013;<lpage>243</lpage>.</citation>
</ref>
<ref id="B30">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Passonneau</surname> <given-names>R.</given-names></name></person-group> (<year>2006</year>). <article-title>Measuring agreement on set-valued items (MASI) for semantic and pragmatic annotation</article-title>, in <source>5th International Conference on Language Resources and Evaluation, LREC 2006</source> (<publisher-loc>Genoa</publisher-loc>), <fpage>831</fpage>&#x02013;<lpage>836</lpage>.</citation>
</ref>
<ref id="B31">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Paun</surname> <given-names>S.</given-names></name> <name><surname>Carpenter</surname> <given-names>B.</given-names></name> <name><surname>Chamberlain</surname> <given-names>J.</given-names></name> <name><surname>Hovy</surname> <given-names>D.</given-names></name> <name><surname>Kruschwitz</surname> <given-names>U.</given-names></name> <name><surname>Poesio</surname> <given-names>M.</given-names></name></person-group> (<year>2018</year>). <article-title>Comparing Bayesian models of annotation</article-title>. <source>Trans. Assoc. Comput. Linguist</source>. <volume>6</volume>, <fpage>571</fpage>&#x02013;<lpage>585</lpage>. <pub-id pub-id-type="doi">10.1162/tacl_a_00040</pub-id><pub-id pub-id-type="pmid">15598354</pub-id></citation></ref>
<ref id="B32">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Poesio</surname> <given-names>M.</given-names></name> <name><surname>Artstein</surname> <given-names>R.</given-names></name></person-group> (<year>2005</year>). <article-title>The reliability of anaphoric annotation, reconsidered: taking ambiguity into account</article-title>, in <source>Proceedings of the Workshop on Frontiers in Corpus Annotations II: Pie in the Sky</source> (<publisher-loc>Ann Arbor</publisher-loc>), <fpage>76</fpage>&#x02013;<lpage>83</lpage>.</citation>
</ref>
<ref id="B33">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Rohde</surname> <given-names>H.</given-names></name> <name><surname>Dickinson</surname> <given-names>A.</given-names></name> <name><surname>Schneider</surname> <given-names>N.</given-names></name> <name><surname>Clark</surname> <given-names>C. N. L.</given-names></name> <name><surname>Louis</surname> <given-names>A.</given-names></name> <name><surname>Webber</surname> <given-names>B.</given-names></name></person-group> (<year>2016</year>). <article-title>Filling in the blanks in understanding discourse adverbials: consistency, conflict, and context-dependence in a crowdsourced elicitation task</article-title>, in <source>Proceedings of the 10th Linguistic Annotation Workshop held in conjunction with ACL 2016 (LAW-X 2016)</source> (<publisher-loc>Berlin</publisher-loc>), <fpage>49</fpage>&#x02013;<lpage>58</lpage>.</citation>
</ref>
<ref id="B34">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Schmid</surname> <given-names>W.</given-names></name></person-group> (<year>2011</year>). <article-title>Erz&#x000E4;hlstimme</article-title>, in <source>Grundbegriffe der Erz&#x000E4;hlanalyse, Handbuch Erz&#x000E4;hlliteratur. Theorie, Analyse, Geschichte, Chapter B</source>, ed <person-group person-group-type="editor"><name><surname>Martinez</surname> <given-names>M.</given-names></name></person-group> (<publisher-loc>Weimar</publisher-loc>: <publisher-name>Verlag J.B. Metzler, Stuttgart</publisher-name>), <fpage>131</fpage>&#x02013;<lpage>138</lpage>.</citation>
</ref>
<ref id="B35">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Scholman</surname> <given-names>M.</given-names></name> <name><surname>Demberg</surname> <given-names>V.</given-names></name></person-group> (<year>2017</year>). <article-title>Crowdsourcing discourse interpretations: on the influence of context and the reliability of a connective insertion task</article-title>, in <source>Proceedings of the 11th Linguistic Annotation Workshop</source> (<publisher-loc>Valencia</publisher-loc>), <fpage>24</fpage>&#x02013;<lpage>33</lpage>.</citation>
</ref>
<ref id="B36">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Sch&#x000F6;nert</surname> <given-names>J.</given-names></name></person-group> (<year>2014</year>). <source>Handbook of Narratology. Vol. I</source>. <publisher-loc>Berlin; Boston</publisher-loc>: <publisher-name>de Gruyter</publisher-name>.</citation>
</ref>
<ref id="B37">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sober&#x000F3;n</surname> <given-names>G.</given-names></name> <name><surname>Aroyo</surname> <given-names>L.</given-names></name> <name><surname>Welty</surname> <given-names>C.</given-names></name> <name><surname>Inel</surname> <given-names>O.</given-names></name> <name><surname>Lin</surname> <given-names>H.</given-names></name> <name><surname>Overmeen</surname> <given-names>M.</given-names></name></person-group> (<year>2013</year>). <article-title>Measuring crowd truth: Disagreement metrics combined with worker behavior filters</article-title>, in <source>Proceedings of the 1st International Conference on Crowdsourcing the Semantic Web&#x02013;Vol</source> (<publisher-loc>Sydney</publisher-loc>), <volume>1030</volume>, <fpage>45</fpage>&#x02013;<lpage>58</lpage>.</citation>
</ref>
<ref id="B38">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Sokal</surname> <given-names>R. R.</given-names></name> <name><surname>Michener</surname> <given-names>C. D.</given-names></name> <name><surname>Kansas</surname> <given-names>U.</given-names></name></person-group> (<year>1958</year>). <source>A Statistical Method for Evaluating Systematic Relationships, Vol. 38 of University of Kansas Science Bulletin</source>. <publisher-loc>Lawrence</publisher-loc>: <publisher-name>University of Kansas</publisher-name>.</citation>
</ref>
<ref id="B39">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Tsuchiya</surname> <given-names>M.</given-names></name></person-group> (<year>2018</year>). <article-title>Performance impact caused by hidden bias of training data for recognizing textual entailment</article-title>, in <source>Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)</source>.</citation>
</ref>
<ref id="B40">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Wich</surname> <given-names>M.</given-names></name> <name><surname>Bauer</surname> <given-names>J.</given-names></name> <name><surname>Groh</surname> <given-names>G.</given-names></name></person-group> (<year>2020</year>). <article-title>Impact of politically biased data on hate speech classification</article-title>, in <source>Proceedings of the Fourth Workshop on Online Abuse and Harms</source>, <fpage>54</fpage>&#x02013;<lpage>64</lpage>.</citation>
</ref>
<ref id="B41">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Wieland</surname> <given-names>C. M.</given-names></name></person-group> (<year>2012</year>). <article-title>Geschichte des Agathon</article-title>, in <source>TextGrid Repository</source> (Digitale Bibliothek). Available online at: <ext-link ext-link-type="uri" xlink:href="https://hdl.handle.net/11858/00-1734-0000-0005-A691-2">https://hdl.handle.net/11858/00-1734-0000-0005-A691-2</ext-link></citation>
</ref>
<ref id="B42">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Woolf</surname> <given-names>V.</given-names></name></person-group> (<year>1927</year>). <source>To the Lighthouse</source>. <publisher-loc>London</publisher-loc>: <publisher-name>Hogarth</publisher-name>.</citation>
</ref>
</ref-list>
<fn-group>
<fn id="fn0001"><p><sup>1</sup><ext-link ext-link-type="uri" xlink:href="https://catma.de/">https://catma.de/</ext-link></p></fn>
<fn id="fn0002"><p><sup>2</sup>We hypothesise that the annotation decisions for attribution are largely independent from the exact nature of the phenomenon to be attributed, i.e., when deciding whether the content of a text passage should be attributed to a character, the narrator, or the author it does not matter too much whether the attributable phenomenon is &#x0201C;comment&#x0201D;, &#x0201C;non-fictional speech&#x0201D;, or &#x0201C;generalisation&#x0201D;.</p></fn>
<fn id="fn0003"><p><sup>3</sup>Clause segmentation is performed with the clausizer presented in D&#x000F6;nicke (<xref ref-type="bibr" rid="B9">2020</xref>). The manually created clause-level annotations are then automatically mapped to the detected clauses.</p></fn>
<fn id="fn0004"><p><sup>4</sup>The corpus and annotation guidelines are published in Barth et al. (<xref ref-type="bibr" rid="B5">2021</xref>).</p></fn>
<fn id="fn0005"><p><sup>5</sup><ext-link ext-link-type="uri" xlink:href="https://keras.io">https://keras.io</ext-link></p></fn>
<fn id="fn0006"><p><sup>6</sup><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/dbmdz/bert-base-german-cased">https://huggingface.co/dbmdz/bert-base-german-cased</ext-link></p></fn>
<fn id="fn0007"><p><sup>7</sup>We used the development set for manual tuning of the parameters described here.</p></fn>
</fn-group>
</back>
</article>