Skip to content

Commit ed4ad30

Browse files
Fix Dutch noun chunks to skip overlapping spans (#11275)
* Add test for overlapping noun chunks
* Skip overlapping noun chunks
* Update spacy/tests/lang/nl/test_noun_chunks.py

Co-authored-by: Sofie Van Landeghem <[email protected]>
Co-authored-by: Sofie Van Landeghem <[email protected]>
1 parent 231a178 commit ed4ad30

File tree

2 files changed

+24
-5
lines changed

2 files changed

+24
-5
lines changed

spacy/lang/nl/syntax_iterators.py

Lines changed: 7 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
4040
span_label = doc.vocab.strings.add("NP")
4141

4242
# Only NOUNS and PRONOUNS matter
43+
end_span = -1
4344
for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)):
4445
# For NOUNS
4546
# Pick children from syntactic parse (only those with certain dependencies)
@@ -58,15 +59,17 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
5859
children_i = [c.i for c in children] + [word.i]
5960

6061
start_span = min(children_i)
61-
end_span = max(children_i) + 1
62-
yield start_span, end_span, span_label
62+
if start_span >= end_span:
63+
end_span = max(children_i) + 1
64+
yield start_span, end_span, span_label
6365

6466
# PRONOUNS only if it is the subject of a verb
6567
elif word.pos == PRON:
6668
if word.dep in pronoun_deps:
6769
start_span = word.i
68-
end_span = word.i + 1
69-
yield start_span, end_span, span_label
70+
if start_span >= end_span:
71+
end_span = word.i + 1
72+
yield start_span, end_span, span_label
7073

7174

7275
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}

spacy/tests/lang/nl/test_noun_chunks.py

Lines changed: 17 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
1-
from spacy.tokens import Doc
21
import pytest
2+
from spacy.tokens import Doc
3+
from spacy.util import filter_spans
34

45

56
@pytest.fixture
@@ -207,3 +208,18 @@ def test_chunking(nl_sample, nl_reference_chunking):
207208
"""
208209
chunks = [s.text.lower() for s in nl_sample.noun_chunks]
209210
assert chunks == nl_reference_chunking
211+
212+
213+
@pytest.mark.issue(10846)
214+
def test_no_overlapping_chunks(nl_vocab):
215+
# fmt: off
216+
doc = Doc(
217+
nl_vocab,
218+
words=["Dit", "programma", "wordt", "beschouwd", "als", "'s", "werelds", "eerste", "computerprogramma"],
219+
deps=["det", "nsubj:pass", "aux:pass", "ROOT", "mark", "det", "fixed", "amod", "xcomp"],
220+
heads=[1, 3, 3, 3, 8, 8, 5, 8, 3],
221+
pos=["DET", "NOUN", "AUX", "VERB", "SCONJ", "DET", "NOUN", "ADJ", "NOUN"],
222+
)
223+
# fmt: on
224+
chunks = list(doc.noun_chunks)
225+
assert filter_spans(chunks) == chunks

0 commit comments

Comments
 (0)