Skip to content

Commit 89b456e

Browse files
committed
✨ NEW: Port abbr plugin (#14)
1 parent 4601e15 commit 89b456e

File tree

5 files changed

+200
-0
lines changed

5 files changed

+200
-0
lines changed

docs/index.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,12 @@ html_string = md.render("some *Markdown*")
109109
.. autofunction:: mdit_py_plugins.amsmath.amsmath_plugin
110110
```
111111

112+
## Abbreviation
113+
114+
```{eval-rst}
115+
.. autofunction:: mdit_py_plugins.abbr.abbr_plugin
116+
```
117+
112118
## MyST plugins
113119

114120
`myst_blocks` and `myst_role` plugins are also available, for utilisation by the [MyST renderer](https://myst-parser.readthedocs.io/en/latest/using/syntax.html)

mdit_py_plugins/abbr/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .index import abbr_plugin # noqa: F401

mdit_py_plugins/abbr/index.py

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
# Enclose abbreviations in <abbr> tags
2+
#
3+
4+
from typing import List
5+
6+
from markdown_it import MarkdownIt
7+
from markdown_it.common.utils import escapeRE, arrayReplaceAt
8+
from markdown_it.rules_block import StateBlock
9+
from markdown_it.token import Token
10+
11+
import re
12+
13+
# ASCII characters in Cc, Sc, Sm, Sk categories we should terminate on;
14+
# you can check character classes here:
15+
# http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
16+
OTHER_CHARS = r" \r\n$+<=>^`|~"
17+
18+
UNICODE_PUNCT_RE = r"[!-#%-\*,-\/:;\?@\[-\]_\{\}\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4E\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD803[\uDF55-\uDF59]|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC8\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDC4B-\uDC4F\uDC5B\uDC5D\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDE60-\uDE6C\uDF3C-\uDF3E]|\uD806[\uDC3B\uDE3F-\uDE46\uDE9A-\uDE9C\uDE9E-\uDEA2]|\uD807[\uDC41-\uDC45\uDC70\uDC71\uDEF7\uDEF8]|\uD809[\uDC70-\uDC74]|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]|\uD81B[\uDE97-\uDE9A]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]|\uD83A[\uDD5E\uDD5F]"
19+
UNICODE_SPACE_RE = r"[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]"
20+
21+
def abbr_plugin(md: MarkdownIt) -> None:
    """Plugin ported from
    `markdown-it-abbr <https://github.com/markdown-it/markdown-it-abbr>`__.

    An abbreviation is declared on a line of its own, using a label in
    square brackets prefixed by ``*`` and followed by a colon and the
    expansion:

    .. code-block:: md

        *[HTML]: HyperText Markup Language

    Two rules are registered on ``md``:

    - the block rule ``abbr_def`` (inserted before ``reference``, and allowed
      to terminate ``paragraph`` and ``reference``), which collects the
      definitions;
    - the core rule ``abbr_replace`` (inserted after ``linkify``), which wraps
      occurrences of the defined labels in ``abbr_open``/``abbr_close``
      tokens.
    """
    md.block.ruler.before(
        "reference", "abbr_def", abbr_def, {"alt": ["paragraph", "reference"]}
    )
    md.core.ruler.after("linkify", "abbr_replace", abbr_replace)
34+
35+
# ## RULES ##
36+
37+
def abbr_def(state: StateBlock, startLine: int, endLine: int, silent: bool) -> bool:
    """Block rule: parse one abbreviation definition ``*[label]: title``.

    The line must start (after its indentation shift) with ``*[``, contain a
    closing ``]`` with no unescaped ``[`` before it, and have ``:`` directly
    after the ``]``. A backslash inside the label escapes the next character.

    :param state: block parser state; ``bMarks``, ``tShift``, ``eMarks``,
        ``srcCharCode``, ``src`` and ``env`` are read, ``env`` and ``line``
        are written.
    :param startLine: index of the line to examine.
    :param endLine: unused; part of the block-rule signature.
    :param silent: when true, only report whether the line looks like a
        definition, without recording anything.
    :return: ``True`` if the line was recognised (and, unless ``silent``,
        consumed), ``False`` otherwise.
    """
    pos = state.bMarks[startLine] + state.tShift[startLine]
    lineEnd = state.eMarks[startLine]

    # need at least "*[" plus one more character on the line
    if pos + 2 >= lineEnd:
        return False

    if state.srcCharCode[pos] != 0x2A:  # /* * */
        return False
    pos += 1
    if state.srcCharCode[pos] != 0x5B:  # /* [ */
        return False
    pos += 1

    labelStart = pos
    labelEnd = None

    while pos < lineEnd:
        ch = state.srcCharCode[pos]
        if ch == 0x5B:  # /* [ */ - nested brackets are not allowed
            return False
        if ch == 0x5D:  # /* ] */
            labelEnd = pos
            break
        if ch == 0x5C:  # /* \ */ - skip the escaped character as well
            pos += 1
        pos += 1

    if labelEnd is None:
        return False
    # The bounds check keeps "]" as the very last character of the input from
    # indexing past the end of srcCharCode.
    if labelEnd + 1 >= lineEnd or state.srcCharCode[labelEnd + 1] != 0x3A:  # /* : */
        return False

    if silent:
        return True

    # Unescape "\x" -> "x" so the stored label equals the text it abbreviates.
    label = re.sub(r"\\(.)", r"\1", state.src[labelStart:labelEnd])
    title = state.src[labelEnd + 2 : lineEnd].strip()

    if not label:
        return False
    if not title:
        return False

    # Keys carry a ":" prefix (kept from the upstream JavaScript plugin);
    # abbr_replace relies on it. The first definition of a label wins.
    abbreviations = state.env.setdefault("abbreviations", {})
    abbreviations.setdefault(":" + label, title)

    state.line = startLine + 1
    return True
85+
86+
def abbr_replace(state: StateBlock) -> None:
    """Core rule: wrap occurrences of defined abbreviations in ``<abbr>`` tokens.

    For every ``inline`` token in ``state.tokens``, each ``text`` child is
    searched for the labels stored in ``state.env["abbreviations"]``. A label
    only counts when it is delimited on both sides by the start/end of the
    text, punctuation, whitespace or one of ``OTHER_CHARS``. Each hit is
    replaced by ``abbr_open`` (with a ``title`` attribute), a ``text`` token
    holding the label, and ``abbr_close``; the surrounding text is kept in
    separate ``text`` tokens.

    Does nothing when no abbreviations are defined (key missing or mapping
    empty).

    NOTE(review): this is registered as a *core* rule, so ``state`` is
    presumably a core state rather than a ``StateBlock``; the annotation is
    kept as in the original - confirm.
    """
    abbreviations = state.env.get("abbreviations")
    if not abbreviations:
        # An empty alternation would match everywhere and then fail the
        # title lookup, so bail out early.
        return

    # Keys are stored with a leading ":"; strip it. Longest labels go first so
    # the alternation prefers e.g. "HTML5" over "HTML".
    labels = sorted((key[1:] for key in abbreviations), key=len, reverse=True)
    alternation = "|".join(escapeRE(label) for label in labels)

    # cheap pre-check used to skip text tokens without any label at all
    regSimple = re.compile(f"(?:{alternation})")

    otherChars = "".join(escapeRE(ch) for ch in OTHER_CHARS)
    boundary = f"{UNICODE_PUNCT_RE}|{UNICODE_SPACE_RE}|[{otherChars}]"
    # group 1: leading delimiter, group 2: the label, group 3: trailing delimiter
    reg = re.compile(f"(^|{boundary})({alternation})($|{boundary})")

    for blockToken in state.tokens:
        if blockToken.type != "inline":
            continue
        tokens = blockToken.children
        if tokens is None:
            continue

        # Walk backwards: splicing new tokens in at index i only shifts
        # positions after i, which have already been visited.
        for i in range(len(tokens) - 1, -1, -1):
            currentToken = tokens[i]
            if currentToken.type != "text":
                continue

            text = currentToken.content
            if regSimple.search(text) is None:
                continue

            nodes: List[Token] = []
            pos = 0  # end of the text already emitted into ``nodes``

            while pos < len(text):
                match = reg.search(text, pos)
                if match is None:
                    break

                labelStart, labelEnd = match.span(2)
                label = match.group(2)

                if labelStart > 0:
                    # text between the previous hit and this label,
                    # including the leading delimiter
                    token = Token("text", "", 0)
                    token.content = text[pos:labelStart]
                    nodes.append(token)

                token = Token("abbr_open", "abbr", 1)
                token.attrSet("title", abbreviations[":" + label])
                nodes.append(token)

                token = Token("text", "", 0)
                token.content = label
                nodes.append(token)

                token = Token("abbr_close", "abbr", -1)
                nodes.append(token)

                # Resume right after the label so its trailing delimiter can
                # serve as the leading delimiter of the next hit.
                pos = labelEnd

            if not nodes:
                continue

            if pos < len(text):
                token = Token("text", "", 0)
                token.content = text[pos:]
                nodes.append(token)

            # replace current node
            blockToken.children = tokens = arrayReplaceAt(tokens, i, nodes)

tests/fixtures/abbr.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
2+
Simple abbreviation
3+
.
4+
*[HTML]: HyperText Markup Language
5+
*[W3C]: World Wide Web Consortium
6+
7+
The HTML specification is maintained by the W3C.
8+
.
9+
<p>The <abbr title="HyperText Markup Language">HTML</abbr> specification is maintained by the <abbr title="World Wide Web Consortium">W3C</abbr>.</p>
10+
.

tests/test_abbr.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
from pathlib import Path
2+
3+
from markdown_it import MarkdownIt
4+
from markdown_it.utils import read_fixture_file
5+
import pytest
6+
7+
from mdit_py_plugins.abbr import abbr_plugin
8+
9+
FIXTURE_PATH = Path(__file__).parent
10+
11+
12+
@pytest.mark.parametrize(
    "line,title,input,expected",
    read_fixture_file(FIXTURE_PATH.joinpath("fixtures", "abbr.md")),
)
def test_all(line, title, input, expected):
    """Render each fixture case with the abbr plugin and compare the HTML.

    ``line`` and ``title`` come from the fixture file and are not used in the
    body; trailing whitespace is ignored on both sides of the comparison.
    """
    parser = MarkdownIt("commonmark").use(abbr_plugin)
    parser.options["xhtmlOut"] = False
    rendered = parser.render(input)
    # printed so the rendered HTML is available in pytest's captured output
    print(rendered)
    assert rendered.rstrip() == expected.rstrip()

0 commit comments

Comments
 (0)