Fix Dutch noun chunks to skip overlapping spans (#11275)

adrianeboyd · svlandeg · web-flow · commit ed4ad309e6dd · 2022-08-10T09:49:08.000+02:00
* Add test for overlapping noun chunks

* Skip overlapping noun chunks

* Update spacy/tests/lang/nl/test_noun_chunks.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
diff --git a/spacy/lang/nl/syntax_iterators.py b/spacy/lang/nl/syntax_iterators.py
@@ -40,6 +40,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
     span_label = doc.vocab.strings.add("NP")
 
     # Only NOUNS and PRONOUNS matter
+    end_span = -1
     for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)):
         # For NOUNS
         # Pick children from syntactic parse (only those with certain dependencies)
@@ -58,15 +59,17 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
             children_i = [c.i for c in children] + [word.i]
 
             start_span = min(children_i)
-            end_span = max(children_i) + 1
-            yield start_span, end_span, span_label
+            if start_span >= end_span:
+                end_span = max(children_i) + 1
+                yield start_span, end_span, span_label
 
         # PRONOUNS only if it is the subject of a verb
         elif word.pos == PRON:
             if word.dep in pronoun_deps:
                 start_span = word.i
-                end_span = word.i + 1
-                yield start_span, end_span, span_label
+                if start_span >= end_span:
+                    end_span = word.i + 1
+                    yield start_span, end_span, span_label
 
 
 SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
diff --git a/spacy/tests/lang/nl/test_noun_chunks.py b/spacy/tests/lang/nl/test_noun_chunks.py
@@ -1,5 +1,6 @@
-from spacy.tokens import Doc
 import pytest
+from spacy.tokens import Doc
+from spacy.util import filter_spans
 
 
 @pytest.fixture
@@ -207,3 +208,18 @@ def test_chunking(nl_sample, nl_reference_chunking):
     """
     chunks = [s.text.lower() for s in nl_sample.noun_chunks]
     assert chunks == nl_reference_chunking
+
+
+@pytest.mark.issue(10846)
+def test_no_overlapping_chunks(nl_vocab):
+    # fmt: off
+    doc = Doc(
+        nl_vocab,
+        words=["Dit", "programma", "wordt", "beschouwd", "als", "'s", "werelds", "eerste", "computerprogramma"],
+        deps=["det", "nsubj:pass", "aux:pass", "ROOT", "mark", "det", "fixed", "amod", "xcomp"],
+        heads=[1, 3, 3, 3, 8, 8, 5, 8, 3],
+        pos=["DET", "NOUN", "AUX", "VERB", "SCONJ", "DET", "NOUN", "ADJ", "NOUN"],
+    )
+    # fmt: on
+    chunks = list(doc.noun_chunks)
+    assert filter_spans(chunks) == chunks