Skip to content

Commit ef35794

Browse files
docs: add validation consistency audit and fix Spanish README
- Fix Spanish documentation entry point links and stack consistency
- Add docs consistency validator for links, encoding, emojis, stack references, and bilingual counterparts
- Add npm run docs:audit command
1 parent 5bdd538 commit ef35794

3 files changed

Lines changed: 425 additions & 83 deletions

File tree

Lines changed: 327 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,327 @@
1+
#!/usr/bin/env python3
2+
"""
3+
BMAD-METHOD Utility Script: validate_docs_consistency.py
4+
5+
Audits Markdown/HTML documentation for enterprise documentation consistency.
6+
7+
Checks covered:
8+
- Broken local Markdown/HTML links and anchors.
9+
- Prohibited emoji/pictograph characters in Markdown files (R-14).
10+
- Common mojibake/encoding artifacts (R-03).
11+
- Obsolete or forbidden stack references in docs (R-16/R-20).
12+
- Missing bilingual counterpart files for common English/Spanish doc patterns (R-01).
13+
14+
The script is read-only and exits non-zero when issues are found.
15+
"""
16+
17+
from __future__ import annotations
18+
19+
import argparse
20+
import os
21+
import re
22+
import sys
23+
import unicodedata
24+
from dataclasses import dataclass
25+
from pathlib import Path
26+
from urllib.parse import unquote, urlparse
27+
28+
# Repository root: the third ancestor of this file (parents[0] is the
# directory that contains the script itself).
REPO_ROOT = Path(__file__).resolve().parents[2]

# Audited when no explicit paths are passed on the command line.
DEFAULT_TARGETS = [REPO_ROOT.joinpath(name) for name in ("README.md", "docs")]
30+
31+
# Directory names that are pruned from the directory walk; a directory whose
# name is listed here is never descended into, at any depth.
SKIP_DIRS = {
    # Hidden tool directories.
    ".git",
    ".nx",
    ".venv",
    # Dependency and build-output directories.
    "node_modules",
    "bin",
    "build",
    "dist",
    "obj",
}
41+
42+
# Link extractor with two alternatives:
#   group 1 - destination of a Markdown inline link ``[text](dest)``; the
#             negative lookbehind skips image syntax ``![alt](src)``.
#   group 2 - value of an HTML ``href``/``src`` attribute (case-insensitive).
LOCAL_LINK_RE = re.compile(
    r"(?<!!)\[[^\]]+\]\(([^)]+)\)|(?:href|src)=[\"']([^\"']+)[\"']",
    re.IGNORECASE,
)

# ATX heading: group 1 is the run of 1-6 ``#`` characters, group 2 the heading
# text without trailing whitespace. MULTILINE lets ``^``/``$`` match per line.
HEADING_RE = re.compile(
    r"^(#{1,6})\s+(.+?)\s*$",
    re.MULTILINE,
)
44+
45+
# Substrings whose presence suggests that text was decoded with the wrong
# encoding. Every entry must be non-empty: ``str.find("")`` returns 0 for any
# text, so an empty token would flag every scanned file on line 1.
# NOTE(review): the curly quotes and dashes below are legitimate typographic
# characters as written here; if multi-character mojibake sequences were
# intended instead, restore them - TODO confirm against the upstream file.
MOJIBAKE_TOKENS = (
    "Ã",
    "Â",
    "“",
    "â€",
    "’",
    "–",
    "—",
    # U+FFFD REPLACEMENT CHARACTER: produced when undecodable bytes are read
    # with errors="replace", which is how this script reads its inputs.
    "\ufffd",
)
55+
56+
# (pattern, message) pairs. Each pattern is matched case-insensitively and the
# paired message is reported for every match.
FORBIDDEN_STACK_PATTERNS = [
    (
        re.compile(r"\bPostgreSQL\b", re.IGNORECASE),
        "PostgreSQL reference found. UMS authoritative database is SQL Server unless explicitly marked as external comparison.",
    ),
    (
        re.compile(r"\.NET\s+8\b", re.IGNORECASE),
        ".NET 8 reference found. UMS authoritative backend stack is .NET 10.",
    ),
    (
        re.compile(r"SQL\s+Server\s+2019\b", re.IGNORECASE),
        "SQL Server 2019 reference found. Current project baseline is SQL Server 2022.",
    ),
]
61+
62+
# Inclusive (first, last) code point ranges treated as emoji/pictographs.
EMOJI_RANGES = [
    (0x1F000, 0x1FAFF),  # supplementary-plane symbol and pictograph blocks
    (0x2600, 0x27BF),  # Miscellaneous Symbols through Dingbats
    (0x1F100, 0x1F1FF),  # already contained in the first range
]

# Individual code points flagged in addition to the ranges above.
EXTRA_EMOJI_CODEPOINTS = {
    # Stand-alone symbols.
    0x2B1B,
    0x2B1C,
    0x2B50,
    0x2B55,
    # Variation selectors (text / emoji presentation).
    0xFE0E,
    0xFE0F,
    # Zero-width and invisible formatting characters.
    0x200B,
    0x200C,
    0x200D,
    0x2060,
    0xFEFF,
}
68+
69+
70+
@dataclass(frozen=True)
class Issue:
    """A single immutable finding produced by one of the validators."""

    # Severity label; the validators in this file use "critical" or "warning".
    severity: str
    # Identifier of the documentation rule that was violated, e.g. "R-03".
    rule: str
    # File in which the finding was detected.
    path: Path
    # 1-based line number of the finding inside ``path``.
    line: int
    # Short machine-friendly category such as "broken-link" or "encoding".
    issue_type: str
    # Human-readable description of what was found.
    message: str
    # Suggested remediation shown next to the message.
    recommendation: str
79+
80+
81+
def iter_files(targets: list[Path]) -> list[Path]:
82+
files: list[Path] = []
83+
for target in targets:
84+
if not target.exists():
85+
continue
86+
if target.is_file() and target.suffix.lower() in {".md", ".html"}:
87+
files.append(target)
88+
continue
89+
if target.is_dir():
90+
for root, dirs, names in os.walk(target):
91+
dirs[:] = [d for d in dirs if d not in SKIP_DIRS]
92+
for name in names:
93+
path = Path(root) / name
94+
if path.suffix.lower() in {".md", ".html"}:
95+
files.append(path)
96+
return sorted(set(files))
97+
98+
99+
def line_number(text: str, index: int) -> int:
100+
return text.count("\n", 0, index) + 1
101+
102+
103+
def is_emoji(ch: str) -> bool:
104+
cp = ord(ch)
105+
if cp in EXTRA_EMOJI_CODEPOINTS:
106+
return True
107+
return any(start <= cp <= end for start, end in EMOJI_RANGES)
108+
109+
110+
def slugify_heading(heading: str) -> str:
111+
heading = re.sub(r"<[^>]+>", "", heading)
112+
heading = re.sub(r"[`*_~]", "", heading)
113+
heading = unicodedata.normalize("NFKD", heading)
114+
heading = "".join(ch for ch in heading if not unicodedata.combining(ch))
115+
heading = heading.strip().lower()
116+
heading = re.sub(r"[^a-z0-9\s-]", "", heading)
117+
heading = re.sub(r"\s+", "-", heading)
118+
heading = re.sub(r"-+", "-", heading).strip("-")
119+
return heading
120+
121+
122+
def collect_anchors(text: str) -> set[str]:
123+
anchors: set[str] = set()
124+
seen: dict[str, int] = {}
125+
for match in HEADING_RE.finditer(text):
126+
base = slugify_heading(match.group(2))
127+
if not base:
128+
continue
129+
count = seen.get(base, 0)
130+
slug = base if count == 0 else f"{base}-{count}"
131+
seen[base] = count + 1
132+
anchors.add(slug)
133+
anchors.update(re.findall(r"<a\s+(?:[^>]*?\s+)?name=[\"']([^\"']+)[\"']", text, flags=re.IGNORECASE))
134+
anchors.update(re.findall(r"id=[\"']([^\"']+)[\"']", text, flags=re.IGNORECASE))
135+
return anchors
136+
137+
138+
def normalize_link(raw: str) -> str:
139+
raw = raw.strip()
140+
if raw.startswith("<") and raw.endswith(">"):
141+
raw = raw[1:-1].strip()
142+
return raw
143+
144+
145+
def is_external_or_special(link: str) -> bool:
146+
parsed = urlparse(link)
147+
return bool(parsed.scheme in {"http", "https", "mailto", "tel", "data"}) or link.startswith("#")
148+
149+
150+
def validate_links(path: Path, text: str, file_cache: dict[Path, str]) -> list[Issue]:
151+
issues: list[Issue] = []
152+
for match in LOCAL_LINK_RE.finditer(text):
153+
link = normalize_link(match.group(1) or match.group(2) or "")
154+
if not link or is_external_or_special(link):
155+
continue
156+
if link.startswith(".") or link.startswith("/") or not urlparse(link).scheme:
157+
link_path, _, anchor = link.partition("#")
158+
clean_path = unquote(link_path)
159+
clean_path = clean_path.split("?", 1)[0]
160+
target = (path.parent / clean_path).resolve() if clean_path else path.resolve()
161+
if clean_path and not target.exists():
162+
issues.append(Issue(
163+
"critical",
164+
"R-10/R-13",
165+
path,
166+
line_number(text, match.start()),
167+
"broken-link",
168+
f"Broken local link: {link}",
169+
"Fix the relative path from the current document or remove the stale link.",
170+
))
171+
continue
172+
if anchor and target.suffix.lower() in {".md", ".html"} and target.exists():
173+
target_text = file_cache.get(target)
174+
if target_text is None:
175+
target_text = target.read_text(encoding="utf-8", errors="replace")
176+
file_cache[target] = target_text
177+
if anchor and unquote(anchor).lower() not in collect_anchors(target_text):
178+
issues.append(Issue(
179+
"warning",
180+
"R-10/R-13",
181+
path,
182+
line_number(text, match.start()),
183+
"broken-anchor",
184+
f"Anchor not found: {link}",
185+
"Update the anchor to match the target heading generated by GitHub Markdown.",
186+
))
187+
return issues
188+
189+
190+
def validate_encoding_and_professionalism(path: Path, text: str) -> list[Issue]:
191+
issues: list[Issue] = []
192+
for token in MOJIBAKE_TOKENS:
193+
index = text.find(token)
194+
if index >= 0:
195+
issues.append(Issue(
196+
"critical",
197+
"R-03",
198+
path,
199+
line_number(text, index),
200+
"encoding",
201+
f"Possible mojibake token found: {token}",
202+
"Run cleanup_markdown_encoding.py and review the affected sentence manually.",
203+
))
204+
if path.suffix.lower() == ".md":
205+
for index, ch in enumerate(text):
206+
if is_emoji(ch):
207+
issues.append(Issue(
208+
"warning",
209+
"R-14",
210+
path,
211+
line_number(text, index),
212+
"decorative-character",
213+
f"Prohibited emoji/decorative character found: U+{ord(ch):04X}",
214+
"Remove the emoji/icon and keep the Markdown enterprise-professional.",
215+
))
216+
break
217+
return issues
218+
219+
220+
def validate_stack(path: Path, text: str) -> list[Issue]:
221+
issues: list[Issue] = []
222+
lower_path = str(path).replace("\\", "/").lower()
223+
if "/docs/" not in lower_path and not lower_path.endswith("readme.md"):
224+
return issues
225+
for pattern, message in FORBIDDEN_STACK_PATTERNS:
226+
for match in pattern.finditer(text):
227+
context = text[max(0, match.start() - 100): match.end() + 100].lower()
228+
if "external comparison" in context or "comparación externa" in context:
229+
continue
230+
issues.append(Issue(
231+
"critical",
232+
"R-16/R-20",
233+
path,
234+
line_number(text, match.start()),
235+
"stack-consistency",
236+
message,
237+
"Align the document with .NET 10 + SQL Server 2022 + EF Core, or mark the reference as an explicit external comparison.",
238+
))
239+
return issues
240+
241+
242+
def counterpart_candidates(path: Path) -> list[Path]:
243+
s = str(path)
244+
candidates: list[str] = []
245+
if s.endswith(".es.md"):
246+
candidates.append(s[:-6] + ".md")
247+
elif s.endswith(".md"):
248+
candidates.append(s[:-3] + ".es.md")
249+
candidates.append(s.replace("/product-es/", "/product/"))
250+
candidates.append(s.replace("/product/", "/product-es/"))
251+
candidates.append(s.replace("/project-es/", "/project/"))
252+
candidates.append(s.replace("/project/", "/project-es/"))
253+
candidates.append(s.replace("/requirements-es/", "/requirements/"))
254+
candidates.append(s.replace("/requirements/", "/requirements-es/"))
255+
candidates.append(s.replace("/blueprints-es/", "/blueprints/"))
256+
candidates.append(s.replace("/blueprints/", "/blueprints-es/"))
257+
return [Path(c) for c in candidates if c != s]
258+
259+
260+
def validate_bilingual(files: list[Path]) -> list[Issue]:
261+
file_set = {p.resolve() for p in files if p.suffix.lower() == ".md"}
262+
issues: list[Issue] = []
263+
for path in file_set:
264+
rel = path.relative_to(REPO_ROOT) if path.is_relative_to(REPO_ROOT) else path
265+
rel_s = str(rel).replace("\\", "/")
266+
if not rel_s.startswith("docs/"):
267+
continue
268+
if rel_s.startswith("docs/qa/"):
269+
continue
270+
candidates = [c.resolve() for c in counterpart_candidates(path)]
271+
if not any(c in file_set for c in candidates):
272+
issues.append(Issue(
273+
"warning",
274+
"R-01",
275+
path,
276+
1,
277+
"bilingual-sync",
278+
"No obvious bilingual counterpart found for this documentation file.",
279+
"Create or link the English/Spanish counterpart, or document why the file is intentionally single-language.",
280+
))
281+
return issues
282+
283+
284+
def format_issue(issue: Issue) -> str:
285+
rel = issue.path.relative_to(REPO_ROOT) if issue.path.is_relative_to(REPO_ROOT) else issue.path
286+
return (
287+
f"- [{issue.severity.upper()}] {issue.rule} {rel}:{issue.line}\n"
288+
f" Type: {issue.issue_type}\n"
289+
f" Issue: {issue.message}\n"
290+
f" Fix: {issue.recommendation}"
291+
)
292+
293+
294+
def main() -> int:
295+
parser = argparse.ArgumentParser(description="Validate UMS documentation consistency.")
296+
parser.add_argument("paths", nargs="*", help="Files or directories to audit. Defaults to README.md and docs/.")
297+
parser.add_argument("--no-bilingual", action="store_true", help="Skip bilingual counterpart scan.")
298+
args = parser.parse_args()
299+
300+
targets = [Path(p).resolve() for p in args.paths] if args.paths else DEFAULT_TARGETS
301+
files = iter_files(targets)
302+
file_cache: dict[Path, str] = {}
303+
issues: list[Issue] = []
304+
305+
for path in files:
306+
text = path.read_text(encoding="utf-8", errors="replace")
307+
file_cache[path.resolve()] = text
308+
issues.extend(validate_encoding_and_professionalism(path, text))
309+
issues.extend(validate_stack(path, text))
310+
issues.extend(validate_links(path, text, file_cache))
311+
312+
if not args.no_bilingual:
313+
issues.extend(validate_bilingual(files))
314+
315+
if issues:
316+
print("Documentation consistency audit failed.\n")
317+
for issue in sorted(issues, key=lambda x: (str(x.path), x.line, x.rule, x.issue_type)):
318+
print(format_issue(issue))
319+
print(f"\nTotal issues: {len(issues)}")
320+
return 1
321+
322+
print(f"Documentation consistency audit passed. Scanned {len(files)} files.")
323+
return 0
324+
325+
326+
# Script entry point: the audit result becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())

0 commit comments

Comments
 (0)