tools/html_audit.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94

#!/usr/bin/env python3
"""Lightweight HTML audit for common a11y/markup issues."""

from __future__ import annotations

from html.parser import HTMLParser
from pathlib import Path

# Repository root: the directory one level above the folder holding this script.
ROOT = Path(__file__).resolve().parents[1]

# Directory names that exclude a file from the audit when they appear in its path.
SKIP_DIRS = {".git", "partials"}

# Individual files excluded from the audit, written as POSIX-style paths
# relative to ROOT.
SKIP_FILES = {
    "startpage/test.html",
    "test.html",
    "test_jp.html",
    "test_zh.html",
}


class AuditParser(HTMLParser):
    """Collect common accessibility/markup problems from one HTML document.

    Feed a document with ``feed()`` and then read the findings:

    * ``ids`` -- number of occurrences of each non-empty ``id`` value.
    * ``duplicate_ids`` -- ``id`` values that occurred more than once.
    * ``missing_alt`` -- ``src`` of every ``<img>`` without an ``alt``
      attribute.
    * ``missing_iframe_title`` -- ``src`` of every ``<iframe>`` whose
      ``title`` is absent or blank.
    * ``blank_rel`` -- ``href`` of every ``<a target="_blank">`` whose ``rel``
      contains neither the ``noopener`` nor the ``noreferrer`` token.
    """

    # rel tokens that stop the opened page from reaching ``window.opener``;
    # per the HTML Standard ``noreferrer`` implies ``noopener``.
    _SAFE_REL_TOKENS = frozenset({"noopener", "noreferrer"})

    def __init__(self) -> None:
        super().__init__()
        self.ids: dict[str, int] = {}
        self.duplicate_ids: set[str] = set()
        self.missing_alt: list[str] = []
        self.missing_iframe_title: list[str] = []
        self.blank_rel: list[str] = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Inspect one start tag and record any findings on the instance.

        Args:
            tag: Element name as reported by ``HTMLParser``.
            attrs: ``(name, value)`` pairs; ``value`` is ``None`` for
                attributes written without a value.
        """
        # Valueless attributes arrive as None; normalise them to "".
        attr_map = {k.lower(): (v or "") for k, v in attrs}

        ident = attr_map.get("id", "")
        if ident:
            if ident in self.ids:
                self.duplicate_ids.add(ident)
            self.ids[ident] = self.ids.get(ident, 0) + 1

        # Only a *missing* alt is reported: an explicitly empty alt="" is the
        # accepted way to mark a decorative image.
        if tag == "img" and "alt" not in attr_map:
            self.missing_alt.append(attr_map.get("src", ""))

        # A whitespace-only title gives assistive technology nothing to read,
        # so it is treated the same as no title at all.
        if tag == "iframe" and not attr_map.get("title", "").strip():
            self.missing_iframe_title.append(attr_map.get("src", ""))

        # The "_blank" keyword and rel tokens are ASCII case-insensitive, and
        # rel is a whitespace-separated token list, so compare lowercased
        # tokens rather than searching for a substring.
        if tag == "a" and attr_map.get("target", "").lower() == "_blank":
            rel_tokens = set(attr_map.get("rel", "").lower().split())
            if not rel_tokens & self._SAFE_REL_TOKENS:
                self.blank_rel.append(attr_map.get("href", ""))


def main() -> int:
    """Audit every ``*.html`` file under ``ROOT`` and print a report.

    Files whose path below ``ROOT`` passes through a directory named in
    ``SKIP_DIRS``, or whose ROOT-relative POSIX path is listed in
    ``SKIP_FILES``, are not audited. At most ten items are printed per
    finding; the remainder is summarised as a count.

    Returns:
        ``0`` when no issues were found, ``1`` otherwise (the caller uses it
        as the process exit status).
    """
    issues: list[tuple[str, str, list[str]]] = []

    # rglob yields entries in filesystem order; sort so the report is stable
    # from run to run.
    for path in sorted(ROOT.rglob("*.html")):
        # rglob also matches directories whose name ends in ".html".
        if not path.is_file():
            continue

        rel_path = path.relative_to(ROOT)
        # Test only the directories *below* ROOT: matching against the
        # absolute path would skip every file when the checkout itself lives
        # under a directory with one of these names.
        if any(part in SKIP_DIRS for part in rel_path.parts[:-1]):
            continue
        rel = rel_path.as_posix()
        if rel in SKIP_FILES:
            continue

        parser = AuditParser()
        # Best-effort decode: undecodable bytes are dropped rather than
        # aborting the whole audit.
        parser.feed(path.read_text(encoding="utf-8", errors="ignore"))
        # Force processing of any input still buffered by the parser.
        parser.close()

        if parser.duplicate_ids:
            issues.append((rel, "duplicate-ids", sorted(parser.duplicate_ids)))
        if parser.missing_alt:
            issues.append((rel, "img-missing-alt", parser.missing_alt))
        if parser.missing_iframe_title:
            issues.append((rel, "iframe-missing-title", parser.missing_iframe_title))
        if parser.blank_rel:
            issues.append((rel, "target-blank-missing-noopener", parser.blank_rel))

    if not issues:
        print("OK: no audit issues found")
        return 0

    print("HTML audit issues:")
    for rel, kind, items in issues:
        print(f"- {rel}: {kind}")
        for item in items[:10]:
            print(f"  - {item}")
        if len(items) > 10:
            print(f"  - ... ({len(items) - 10} more)")
    return 1


if __name__ == "__main__":
    raise SystemExit(main())