|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +BMAD-METHOD Utility Script: validate_docs_consistency.py |
| 4 | +
|
| 5 | +Audits Markdown/HTML documentation for enterprise documentation consistency. |
| 6 | +
|
| 7 | +Checks covered: |
| 8 | +- Broken local Markdown/HTML links and anchors. |
| 9 | +- Prohibited emoji/pictograph characters in Markdown files (R-14). |
| 10 | +- Common mojibake/encoding artifacts (R-03). |
| 11 | +- Obsolete or forbidden stack references in docs (R-16/R-20). |
| 12 | +- Missing bilingual counterpart files for common English/Spanish doc patterns (R-01). |
| 13 | +
|
| 14 | +The script is read-only and exits non-zero when issues are found. |
| 15 | +""" |
| 16 | + |
| 17 | +from __future__ import annotations |
| 18 | + |
| 19 | +import argparse |
| 20 | +import os |
| 21 | +import re |
| 22 | +import sys |
| 23 | +import unicodedata |
| 24 | +from dataclasses import dataclass |
| 25 | +from pathlib import Path |
| 26 | +from urllib.parse import unquote, urlparse |
| 27 | + |
# Third ancestor directory of this script (``parents[2]``). It anchors the
# default scan targets and is used to print repository-relative paths.
# NOTE(review): assumes the script lives two directories below the repository
# root - confirm against the actual location of this file.
REPO_ROOT = Path(__file__).resolve().parents[2]
# Targets audited when no paths are passed on the command line.
DEFAULT_TARGETS = [REPO_ROOT / "README.md", REPO_ROOT / "docs"]

# Directory names that are pruned while walking a target directory.
SKIP_DIRS = {
    ".git",
    ".nx",
    ".venv",
    "bin",
    "build",
    "dist",
    "node_modules",
    "obj",
}

# Group 1: destination of a Markdown link ``[text](dest)``; the negative
# lookbehind excludes image links (``![alt](dest)``).
# Group 2: value of an HTML ``href`` or ``src`` attribute.
LOCAL_LINK_RE = re.compile(r"(?<!!)\[[^\]]+\]\(([^)]+)\)|(?:href|src)=[\"']([^\"']+)[\"']", re.IGNORECASE)
# ATX headings (``#`` to ``######``); group 2 is the heading text.
HEADING_RE = re.compile(r"^(#{1,6})\s+(.+?)\s*$", re.MULTILINE)
| 44 | + |
# Character sequences that typically appear when UTF-8 text has been decoded
# as Windows-1252 ("mojibake"). Every token is spelled with escape sequences
# so that editors, diff viewers and encoding clean-up tools cannot silently
# turn a literal into legitimate punctuation or into an empty string. An empty
# token would match at offset 0 of every file, and a plain curly quote or dash
# would flag correctly encoded text.
MOJIBAKE_TOKENS = (
    "\u00c3",  # lead byte of a mis-decoded two-byte sequence (accented letters)
    "\u00c2",  # stray lead byte, commonly seen before NBSP and symbols
    "\u00e2\u20ac\u0153",  # mis-decoded left double quotation mark (U+201C)
    "\u00e2\u20ac",  # shared prefix of mis-decoded U+20xx punctuation
    "\u00e2\u20ac\u2122",  # mis-decoded right single quotation mark (U+2019)
    "\u00e2\u20ac\u201c",  # mis-decoded en dash (U+2013)
    "\u00e2\u20ac\u201d",  # mis-decoded em dash (U+2014)
    "\ufffd",  # replacement character produced for undecodable bytes
)
| 55 | + |
# (pattern, message) pairs. Each match of a pattern in an in-scope document is
# reported with the paired message unless the surrounding text marks it as an
# external comparison.
FORBIDDEN_STACK_PATTERNS = [
    (re.compile(r"\bPostgreSQL\b", re.IGNORECASE), "PostgreSQL reference found. UMS authoritative database is SQL Server unless explicitly marked as external comparison."),
    (re.compile(r"\.NET\s+8\b", re.IGNORECASE), ".NET 8 reference found. UMS authoritative backend stack is .NET 10."),
    (re.compile(r"SQL\s+Server\s+2019\b", re.IGNORECASE), "SQL Server 2019 reference found. Current project baseline is SQL Server 2022."),
]

# Inclusive code point ranges treated as emoji/pictographs.
EMOJI_RANGES = [
    (0x1F000, 0x1FAFF),
    (0x2600, 0x27BF),
    # Already contained in the first range above; listing it has no extra effect.
    (0x1F100, 0x1F1FF),
]
# Individual code points treated like emoji: a few symbols in the U+2Bxx block
# plus variation selectors (U+FE0E/U+FE0F) and zero-width/invisible characters
# (U+200B, U+200C, U+200D, U+2060, U+FEFF).
EXTRA_EMOJI_CODEPOINTS = {0x2B50, 0x2B55, 0x2B1B, 0x2B1C, 0xFE0F, 0xFE0E, 0x200D, 0x200B, 0x200C, 0x2060, 0xFEFF}
| 68 | + |
| 69 | + |
@dataclass(frozen=True)
class Issue:
    """One immutable finding reported by a validator in this script."""

    severity: str  # the validators in this file use "critical" or "warning"
    rule: str  # rule identifier(s), e.g. "R-03" or "R-10/R-13"
    path: Path  # document in which the finding was detected
    line: int  # 1-based line number of the finding
    issue_type: str  # short machine-friendly category, e.g. "broken-link"
    message: str  # description of what was found
    recommendation: str  # suggested remediation shown after "Fix:"
| 79 | + |
| 80 | + |
def iter_files(targets: list[Path]) -> list[Path]:
    """Return the sorted, de-duplicated ``.md``/``.html`` files under *targets*.

    A target that does not exist is ignored. A file target is included only
    when it has a documentation suffix. A directory target is walked
    recursively, pruning every directory whose name is listed in ``SKIP_DIRS``.
    """
    doc_suffixes = {".md", ".html"}
    found: set[Path] = set()
    for target in targets:
        if not target.exists():
            continue
        if target.is_file():
            if target.suffix.lower() in doc_suffixes:
                found.add(target)
            continue
        if not target.is_dir():
            continue
        for root, dirs, names in os.walk(target):
            # In-place assignment so os.walk does not descend into skipped dirs.
            dirs[:] = [d for d in dirs if d not in SKIP_DIRS]
            found.update(
                candidate
                for candidate in (Path(root) / name for name in names)
                if candidate.suffix.lower() in doc_suffixes
            )
    return sorted(found)
| 97 | + |
| 98 | + |
def line_number(text: str, index: int) -> int:
    """Return the 1-based line number of character offset *index* in *text*."""
    preceding_newlines = text[:index].count("\n")
    return preceding_newlines + 1
| 101 | + |
| 102 | + |
def is_emoji(ch: str) -> bool:
    """Return True when the single character *ch* counts as an emoji.

    A character qualifies when its code point is listed in
    ``EXTRA_EMOJI_CODEPOINTS`` or falls inside one of the inclusive
    ``EMOJI_RANGES``.
    """
    codepoint = ord(ch)
    if codepoint in EXTRA_EMOJI_CODEPOINTS:
        return True
    for low, high in EMOJI_RANGES:
        if low <= codepoint <= high:
            return True
    return False
| 108 | + |
| 109 | + |
def slugify_heading(heading: str) -> str:
    """Convert heading text into the anchor slug used by this validator.

    Steps: drop HTML tags and the Markdown markers `` ` * _ ~ ``, strip
    diacritics (NFKD decomposition, combining marks removed), lowercase, keep
    only ``a-z``, ``0-9``, whitespace and hyphens, turn whitespace runs into a
    hyphen, collapse repeated hyphens and trim hyphens at both ends.

    NOTE(review): GitHub's own anchor generation is believed to keep
    non-ASCII letters, underscores and repeated hyphens - confirm whether
    this divergence is intentional.
    """
    without_markup = re.sub(r"[`*_~]", "", re.sub(r"<[^>]+>", "", heading))
    decomposed = unicodedata.normalize("NFKD", without_markup)
    base_chars = "".join(filter(lambda c: not unicodedata.combining(c), decomposed))
    lowered = base_chars.strip().lower()
    kept = re.sub(r"[^a-z0-9\s-]", "", lowered)
    hyphenated = re.sub(r"\s+", "-", kept)
    return re.sub(r"-+", "-", hyphenated).strip("-")
| 120 | + |
| 121 | + |
def collect_anchors(text: str) -> set[str]:
    """Return every anchor name that *text* is considered to define.

    Heading anchors are the slugified heading texts; a repeated slug receives
    the suffix ``-1``, ``-2``, ... in order of appearance. Values of
    ``<a name="...">`` and of any ``id="..."`` attribute are added verbatim.
    """
    occurrences: dict[str, int] = {}
    anchors: set[str] = set()
    for heading in HEADING_RE.finditer(text):
        base = slugify_heading(heading.group(2))
        if not base:
            continue
        already_seen = occurrences.get(base, 0)
        occurrences[base] = already_seen + 1
        anchors.add(f"{base}-{already_seen}" if already_seen else base)
    explicit_anchor_patterns = (
        r"<a\s+(?:[^>]*?\s+)?name=[\"']([^\"']+)[\"']",
        r"id=[\"']([^\"']+)[\"']",
    )
    for pattern in explicit_anchor_patterns:
        anchors.update(re.findall(pattern, text, flags=re.IGNORECASE))
    return anchors
| 136 | + |
| 137 | + |
def normalize_link(raw: str) -> str:
    """Return the bare link destination extracted from *raw*.

    Surrounding whitespace is trimmed, an optional quoted link title
    (``dest "Title"`` or ``dest 'Title'``) is dropped, and the angle brackets
    of a ``<dest>`` form are removed. Without title handling the title text
    would be treated as part of the path and reported as a broken link.
    """
    raw = raw.strip()
    # A title is only recognised after whitespace and must be fully quoted, so
    # plain destinations (including ones with spaces) pass through untouched.
    titled = re.match(r"""^(<[^>]*>|\S+)\s+(?:"[^"]*"|'[^']*')$""", raw)
    if titled:
        raw = titled.group(1)
    if raw.startswith("<") and raw.endswith(">"):
        raw = raw[1:-1].strip()
    return raw
| 143 | + |
| 144 | + |
def is_external_or_special(link: str) -> bool:
    """Return True for links that must not be checked as local files.

    Covers fragment-only links (``#section``), protocol-relative URLs
    (``//host/path``, which point at another host and would otherwise be
    mistaken for a root-relative path) and the schemes ``http``, ``https``,
    ``mailto``, ``tel`` and ``data``.
    """
    if link.startswith(("#", "//")):
        return True
    return urlparse(link).scheme in {"http", "https", "mailto", "tel", "data"}
| 148 | + |
| 149 | + |
def validate_links(path: Path, text: str, file_cache: dict[Path, str]) -> list[Issue]:
    """Report broken local links and anchors found in *text*.

    Args:
        path: Document being audited; relative links resolve from its folder.
        text: Content of *path*.
        file_cache: Maps resolved paths to file contents. Targets read here
            for anchor checks are added to it.

    Returns:
        A "broken-link" issue for each local target that does not exist and a
        "broken-anchor" issue for each fragment that an existing ``.md`` or
        ``.html`` target does not define.

    Links skipped by ``is_external_or_special`` (external schemes and
    fragment-only links) are not validated.
    """
    issues: list[Issue] = []
    for match in LOCAL_LINK_RE.finditer(text):
        link = normalize_link(match.group(1) or match.group(2) or "")
        if not link or is_external_or_special(link):
            continue
        # Anything that still carries a URL scheme is not a local file path.
        if urlparse(link).scheme and not link.startswith((".", "/")):
            continue
        link_path, _, anchor = link.partition("#")
        # Drop the query string before decoding so an encoded "?" (%3F) in a
        # file name is not mistaken for the start of a query.
        clean_path = unquote(link_path.split("?", 1)[0])
        if not clean_path:
            target = path.resolve()
        elif clean_path.startswith("/"):
            # Root-relative links are anchored at the repository root, not at
            # the root of the filesystem.
            target = (REPO_ROOT / clean_path.lstrip("/")).resolve()
        else:
            target = (path.parent / clean_path).resolve()
        if clean_path and not target.exists():
            issues.append(Issue(
                "critical",
                "R-10/R-13",
                path,
                line_number(text, match.start()),
                "broken-link",
                f"Broken local link: {link}",
                "Fix the relative path from the current document or remove the stale link.",
            ))
            continue
        if not anchor or target.suffix.lower() not in {".md", ".html"} or not target.exists():
            continue
        target_text = file_cache.get(target)
        if target_text is None:
            target_text = target.read_text(encoding="utf-8", errors="replace")
            file_cache[target] = target_text
        fragment = unquote(anchor)
        # Explicit id/name anchors are stored verbatim (case-sensitive) while
        # heading anchors are slugified, so accept the fragment as written,
        # lowercased, or normalised the same way headings are.
        accepted = {fragment, fragment.lower(), slugify_heading(fragment)}
        if accepted.isdisjoint(collect_anchors(target_text)):
            issues.append(Issue(
                "warning",
                "R-10/R-13",
                path,
                line_number(text, match.start()),
                "broken-anchor",
                f"Anchor not found: {link}",
                "Update the anchor to match the target heading generated by GitHub Markdown.",
            ))
    return issues
| 188 | + |
| 189 | + |
def validate_encoding_and_professionalism(path: Path, text: str) -> list[Issue]:
    """Report mojibake tokens and, for Markdown files, the first emoji.

    Each token of ``MOJIBAKE_TOKENS`` is reported at most once, at its first
    occurrence. Only the first emoji/decorative character of a ``.md`` file is
    reported.
    """
    issues: list[Issue] = []
    for token in MOJIBAKE_TOKENS:
        # str.find("") returns 0, so an empty token would flag every file.
        if not token:
            continue
        index = text.find(token)
        if index < 0:
            continue
        issues.append(Issue(
            "critical",
            "R-03",
            path,
            line_number(text, index),
            "encoding",
            f"Possible mojibake token found: {token}",
            "Run cleanup_markdown_encoding.py and review the affected sentence manually.",
        ))
    if path.suffix.lower() == ".md":
        first_emoji = next(((i, ch) for i, ch in enumerate(text) if is_emoji(ch)), None)
        if first_emoji is not None:
            index, ch = first_emoji
            issues.append(Issue(
                "warning",
                "R-14",
                path,
                line_number(text, index),
                "decorative-character",
                f"Prohibited emoji/decorative character found: U+{ord(ch):04X}",
                "Remove the emoji/icon and keep the Markdown enterprise-professional.",
            ))
    return issues
| 218 | + |
| 219 | + |
def validate_stack(path: Path, text: str) -> list[Issue]:
    """Report forbidden technology-stack references in *text*.

    Only documents whose normalised path contains ``/docs/`` or ends with
    ``readme.md`` are checked. A match is ignored when the text within 100
    characters on either side mentions "external comparison" (English or
    Spanish).
    """
    normalized = str(path).replace("\\", "/").lower()
    in_scope = "/docs/" in normalized or normalized.endswith("readme.md")
    if not in_scope:
        return []
    exemption_markers = ("external comparison", "comparación externa")
    findings: list[Issue] = []
    for pattern, message in FORBIDDEN_STACK_PATTERNS:
        for hit in pattern.finditer(text):
            window = text[max(0, hit.start() - 100): hit.end() + 100].lower()
            if any(marker in window for marker in exemption_markers):
                continue
            findings.append(Issue(
                "critical",
                "R-16/R-20",
                path,
                line_number(text, hit.start()),
                "stack-consistency",
                message,
                "Align the document with .NET 10 + SQL Server 2022 + EF Core, or mark the reference as an explicit external comparison.",
            ))
    return findings
| 240 | + |
| 241 | + |
def counterpart_candidates(path: Path) -> list[Path]:
    """Return the paths where the other-language version of *path* may live.

    Two conventions are tried: the ``name.md`` <-> ``name.es.md`` suffix pair
    and the ``<section>/`` <-> ``<section>-es/`` directory pair for the
    sections product, project, requirements and blueprints. Candidates that
    equal *path* itself are omitted.
    """
    # Work on the forward-slash form so the "/section/" substitutions also
    # match on Windows, where str(path) uses backslashes.
    s = path.as_posix()
    candidates: list[str] = []
    if s.endswith(".es.md"):
        candidates.append(s[:-6] + ".md")
    elif s.endswith(".md"):
        candidates.append(s[:-3] + ".es.md")
    for section in ("product", "project", "requirements", "blueprints"):
        candidates.append(s.replace(f"/{section}-es/", f"/{section}/"))
        candidates.append(s.replace(f"/{section}/", f"/{section}-es/"))
    return [Path(c) for c in candidates if c != s]
| 258 | + |
| 259 | + |
def validate_bilingual(files: list[Path]) -> list[Issue]:
    """Warn about ``docs/`` Markdown files without a bilingual counterpart.

    Files outside ``docs/`` and files under ``docs/qa/`` are not checked. A
    counterpart counts when it is among the scanned *files* or exists on disk;
    the on-disk check keeps partial scans (a single file or sub-folder) from
    reporting counterparts that were simply not part of the scan.
    """
    file_set = {p.resolve() for p in files if p.suffix.lower() == ".md"}
    issues: list[Issue] = []
    for path in sorted(file_set):
        rel = path.relative_to(REPO_ROOT) if path.is_relative_to(REPO_ROOT) else path
        rel_s = str(rel).replace("\\", "/")
        if not rel_s.startswith("docs/") or rel_s.startswith("docs/qa/"):
            continue
        candidates = [c.resolve() for c in counterpart_candidates(path)]
        if any(c in file_set or c.is_file() for c in candidates):
            continue
        issues.append(Issue(
            "warning",
            "R-01",
            path,
            1,
            "bilingual-sync",
            "No obvious bilingual counterpart found for this documentation file.",
            "Create or link the English/Spanish counterpart, or document why the file is intentionally single-language.",
        ))
    return issues
| 282 | + |
| 283 | + |
def format_issue(issue: Issue) -> str:
    """Render *issue* as a four-line, human-readable report entry.

    The path is shown relative to ``REPO_ROOT`` when it lies inside it.
    """
    # NOTE(review): whitespace in the reviewed copy of this file appears to be
    # collapsed; confirm the intended indentation of the continuation lines.
    shown = issue.path
    if shown.is_relative_to(REPO_ROOT):
        shown = shown.relative_to(REPO_ROOT)
    report_lines = [
        f"- [{issue.severity.upper()}] {issue.rule} {shown}:{issue.line}",
        f" Type: {issue.issue_type}",
        f" Issue: {issue.message}",
        f" Fix: {issue.recommendation}",
    ]
    return "\n".join(report_lines)
| 292 | + |
| 293 | + |
def main() -> int:
    """Run the documentation audit from the command line.

    Returns:
        0 when no issue is found, 1 when at least one issue is reported.
        An explicitly requested path that does not exist aborts with an
        argparse usage error (exit status 2) instead of being skipped, so a
        mistyped path cannot produce a misleading "passed" result.
    """
    parser = argparse.ArgumentParser(description="Validate UMS documentation consistency.")
    parser.add_argument("paths", nargs="*", help="Files or directories to audit. Defaults to README.md and docs/.")
    parser.add_argument("--no-bilingual", action="store_true", help="Skip bilingual counterpart scan.")
    args = parser.parse_args()

    if args.paths:
        targets = [Path(p).resolve() for p in args.paths]
        missing = [str(t) for t in targets if not t.exists()]
        if missing:
            parser.error("path(s) not found: " + ", ".join(missing))
    else:
        # Default targets stay best-effort: a missing one is simply skipped.
        targets = DEFAULT_TARGETS

    files = iter_files(targets)
    file_cache: dict[Path, str] = {}
    issues: list[Issue] = []

    for path in files:
        text = path.read_text(encoding="utf-8", errors="replace")
        # Cache by resolved path so link validation can reuse the content.
        file_cache[path.resolve()] = text
        issues.extend(validate_encoding_and_professionalism(path, text))
        issues.extend(validate_stack(path, text))
        issues.extend(validate_links(path, text, file_cache))

    if not args.no_bilingual:
        issues.extend(validate_bilingual(files))

    if issues:
        print("Documentation consistency audit failed.\n")
        for issue in sorted(issues, key=lambda x: (str(x.path), x.line, x.rule, x.issue_type)):
            print(format_issue(issue))
        print(f"\nTotal issues: {len(issues)}")
        return 1

    print(f"Documentation consistency audit passed. Scanned {len(files)} files.")
    return 0


if __name__ == "__main__":
    sys.exit(main())
0 commit comments