|
| 1 | +# Enclose abbreviations in <abbr> tags |
| 2 | +# |
| 3 | + |
| 4 | +from typing import List |
| 5 | + |
| 6 | +from markdown_it import MarkdownIt |
| 7 | +from markdown_it.common.utils import escapeRE, arrayReplaceAt |
| 8 | +from markdown_it.rules_block import StateBlock |
| 9 | +from markdown_it.token import Token |
| 10 | + |
| 11 | +import re |
| 12 | + |
# ASCII characters in Cc, Sc, Sm, Sk categories we should terminate on;
# you can check character classes here:
# http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
#
# abbr_replace escapes each of these characters and puts them in a regex
# character class which, alongside UNICODE_PUNCT_RE and UNICODE_SPACE_RE,
# is accepted as the delimiter on either side of an abbreviation.
OTHER_CHARS = r" \r\n$+<=>^`|~"
| 17 | + |
| 18 | +UNICODE_PUNCT_RE = r"[!-#%-\*,-\/:;\?@\[-\]_\{\}\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4E\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD803[\uDF55-\uDF59]|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC8\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDC4B-\uDC4F\uDC5B\uDC5D\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDE60-\uDE6C\uDF3C-\uDF3E]|\uD806[\uDC3B\uDE3F-\uDE46\uDE9A-\uDE9C\uDE9E-\uDEA2]|\uD807[\uDC41-\uDC45\uDC70\uDC71\uDEF7\uDEF8]|\uD809[\uDC70-\uDC74]|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]|\uD81B[\uDE97-\uDE9A]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]|\uD83A[\uDD5E\uDD5F]" |
| 19 | +UNICODE_SPACE_RE = r"[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]" |
| 20 | + |
def abbr_plugin(md: MarkdownIt) -> None:
    """Plugin ported from
    `markdown-it-abbr <https://github.com/markdown-it/markdown-it-abbr>`__.

    Registers two rules on ``md``:

    * the block rule ``abbr_def`` (inserted before ``reference``), which
      collects abbreviation definitions;
    * the core rule ``abbr_replace`` (inserted after ``linkify``), which
      wraps each defined abbreviation found in text with ``abbr`` tokens
      carrying the definition as their ``title`` attribute.

    A definition is ``*[label]: title`` on a single line; the colon after
    the closing bracket is required.

    .. code-block:: md

        *[HTML]: HyperText Markup Language

        The HTML specification

    :param md: the parser instance to extend.
    """
    md.block.ruler.before(
        "reference", "abbr_def", abbr_def, {"alt": ["paragraph", "reference"]}
    )
    md.core.ruler.after("linkify", "abbr_replace", abbr_replace)
| 34 | + |
| 35 | +# ## RULES ## |
| 36 | + |
| 37 | +def abbr_def(state: StateBlock, startLine: int, endLine: int, silent: bool): |
| 38 | + pos = state.bMarks[startLine] + state.tShift[startLine] |
| 39 | + max = state.eMarks[startLine] |
| 40 | + |
| 41 | + if (pos + 2) >= max: |
| 42 | + return False |
| 43 | + |
| 44 | + if state.srcCharCode[pos] != 0x2A: # /* * */ |
| 45 | + return False |
| 46 | + pos += 1 |
| 47 | + if state.srcCharCode[pos] != 0x5B: # /* [ */ |
| 48 | + return False |
| 49 | + pos += 1 |
| 50 | + |
| 51 | + labelStart = pos |
| 52 | + labelEnd = None |
| 53 | + |
| 54 | + while pos < max: |
| 55 | + ch = state.srcCharCode[pos] |
| 56 | + if ch == 0x5B: # /* [ */ |
| 57 | + return False |
| 58 | + elif ch == 0x5D: # /* ] */ |
| 59 | + labelEnd = pos |
| 60 | + break |
| 61 | + elif ch == 0x5C: # /* \ */ |
| 62 | + pos += 1 |
| 63 | + pos += 1 |
| 64 | + |
| 65 | + if labelEnd is None or state.srcCharCode[labelEnd + 1] != 0x3A: |
| 66 | + return False |
| 67 | + |
| 68 | + if silent: |
| 69 | + return True |
| 70 | + |
| 71 | + label = state.src[labelStart : labelEnd].replace("\\\\", "") |
| 72 | + title = state.src[labelEnd + 2 : max].strip() |
| 73 | + |
| 74 | + if len(label) == 0: |
| 75 | + return False |
| 76 | + if len(title) == 0: |
| 77 | + return False |
| 78 | + if "abbreviations" not in state.env: |
| 79 | + state.env["abbreviations"] = {} |
| 80 | + if (":" + label) not in state.env["abbreviations"]: |
| 81 | + state.env["abbreviations"][":" + label] = title |
| 82 | + |
| 83 | + state.line = startLine + 1 |
| 84 | + return True |
| 85 | + |
def abbr_replace(state: StateBlock):
    """Core rule: wrap defined abbreviations inside text tokens.

    For every ``inline`` token in ``state.tokens``, each child ``text`` token
    is searched for the labels stored in ``state.env["abbreviations"]``. A
    label only counts when it is delimited on both sides by the start/end of
    the text, a character matched by ``UNICODE_PUNCT_RE`` or
    ``UNICODE_SPACE_RE``, or one of ``OTHER_CHARS``. Each hit is replaced by
    ``abbr_open`` / ``text`` / ``abbr_close`` tokens, with the definition as
    the ``title`` attribute of ``abbr_open``.

    Nothing happens when no abbreviations are defined.

    NOTE(review): the parameter is annotated ``StateBlock`` but the rule is
    registered on ``md.core.ruler``; only ``state.env`` and ``state.tokens``
    are used here. Presumably the core state is what is passed; confirm.

    :param state: parser state whose ``tokens`` are rewritten in place.
    """
    # An empty mapping would produce an empty alternation that matches the
    # empty string, so treat it the same as "no abbreviations defined".
    if "abbreviations" not in state.env or not state.env["abbreviations"]:
        return
    abbreviations = state.env["abbreviations"]

    # Keys carry a leading ":"; strip it and try longer labels first so the
    # longest abbreviation wins when one is a prefix of another.
    labels = sorted((key[1:] for key in abbreviations), key=len, reverse=True)
    alternation = "|".join(escapeRE(label) for label in labels)

    # cheap pre-check pattern: does the text mention any label at all?
    regSimple = re.compile(f"(?:{alternation})")

    otherChars = "".join(escapeRE(ch) for ch in OTHER_CHARS)
    boundary = f"{UNICODE_PUNCT_RE}|{UNICODE_SPACE_RE}|[{otherChars}]"
    # group 1: leading delimiter, group 2: the label, group 3: trailing delimiter
    reg = re.compile(f"(^|{boundary})({alternation})($|{boundary})")

    for blockToken in state.tokens:
        if blockToken.type != "inline":
            continue
        tokens = blockToken.children
        if not tokens:
            continue

        # scan from the end, so indices still to be visited stay valid when
        # a token is replaced by several new ones
        for i in range(len(tokens) - 1, -1, -1):
            currentToken = tokens[i]
            if currentToken.type != "text":
                continue

            text = currentToken.content

            # fast regexp run to determine whether there are any abbreviated
            # words in the current token
            if regSimple.search(text) is None:
                continue

            nodes: List[Token] = []
            pos = 0

            while pos < len(text):
                match = reg.search(text, pos)
                if match is None:
                    break

                # text between the previous hit and this label, including
                # the leading delimiter
                if match.start() > 0 or match.group(1):
                    token = Token("text", "", 0)
                    token.content = text[pos : match.end(1)]
                    nodes.append(token)

                token = Token("abbr_open", "abbr", 1)
                token.attrSet("title", abbreviations[":" + match.group(2)])
                nodes.append(token)

                token = Token("text", "", 0)
                token.content = match.group(2)
                nodes.append(token)

                token = Token("abbr_close", "abbr", -1)
                nodes.append(token)

                # resume right after the label: its trailing delimiter may
                # also be the leading delimiter of the next abbreviation
                pos = match.end(2)

            if not nodes:
                continue

            if pos < len(text):
                token = Token("text", "", 0)
                token.content = text[pos:]
                nodes.append(token)

            # replace current node
            blockToken.children = tokens = arrayReplaceAt(tokens, i, nodes)
0 commit comments