Skip to content

Commit 24cc300

Browse files
authored
Search: respect spacing from block elements when indexing (#11658)
HTML tags can be divided into two categories: inline and block elements. Inline elements do not start on a new line, while block elements do. This gives block elements an implicit spacing that inline elements lack. If two tags are next to each other and at least one of them is a block element, there will be a space between them; if both tags are inline elements, there will be no space between them.
1 parent 14e4353 commit 24cc300

File tree

3 files changed

+60
-1
lines changed

3 files changed

+60
-1
lines changed

readthedocs/search/parsers.py

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,45 @@ class GenericParser:
1515
# Limit that matches the ``index.mapping.nested_objects.limit`` ES setting.
1616
max_inner_documents = 10000
1717

18+
# Block level elements have an implicit line break before and after them.
19+
# List taken from: https://www.w3schools.com/html/html_blocks.asp.
20+
block_level_elements = [
21+
"address",
22+
"article",
23+
"aside",
24+
"blockquote",
25+
"canvas",
26+
"dd",
27+
"div",
28+
"dl",
29+
"dt",
30+
"fieldset",
31+
"figcaption",
32+
"figure",
33+
"footer",
34+
"form",
35+
"h1",
36+
"h2",
37+
"h3",
38+
"h4",
39+
"h5",
40+
"h6",
41+
"header",
42+
"hr",
43+
"li",
44+
"main",
45+
"nav",
46+
"noscript",
47+
"ol",
48+
"p",
49+
"pre",
50+
"section",
51+
"table",
52+
"tfoot",
53+
"ul",
54+
"video",
55+
]
56+
1857
def __init__(self, version):
1958
self.version = version
2059
self.project = self.version.project
@@ -334,7 +373,12 @@ def _parse_section_content(self, tag, *, depth=0):
334373
)
335374

336375
if content:
337-
contents.append(content)
376+
is_block_level_element = next_tag.tag in self.block_level_elements
377+
if is_block_level_element:
378+
# Add a line break before and after a block level element.
379+
contents.append(f"\n{content}\n")
380+
else:
381+
contents.append(content)
338382
next_tag = next_tag.next
339383

340384
return self._parse_content("".join(contents)), section_found

readthedocs/search/tests/data/sphinx/in/page.html

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,16 @@ <h2>Footnotes and domains<a class="headerlink" href="#footnotes-and-domains" tit
166166
</div>
167167
<!-- End of footnote -->
168168

169+
<!-- Definition list -->
170+
<section id="development">
171+
<h2>Development<a class="headerlink" href="#development" title="Permalink to this heading"></a></h2>
172+
<dl class="simple">
173+
<!-- NOTE: leave this as a single line to test a bug related to spacing -->
174+
<dt><a class="reference internal" href="contributing.html"><span class="doc">Contributing</span></a></dt><dd><p>How to contribute changes to the theme.</p></dd>
175+
</dl>
176+
</section>
177+
<!-- End of definition list -->
178+
169179
</div>
170180
</main>
171181
</body>

readthedocs/search/tests/data/sphinx/out/page.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,11 @@
5252
"title": "Footnotes and domains",
5353
"content": ""
5454
},
55+
{
56+
"content": "Contributing How to contribute changes to the theme.",
57+
"id": "development",
58+
"title": "Development"
59+
},
5560
{
5661
"id": "subsub-title",
5762
"title": "Subsub title",

0 commit comments

Comments
 (0)