summaryrefslogtreecommitdiff
path: root/tools/link_check.py
blob: 246eaf85d7408cca8a478da7f8464a9eca380b60 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python3
"""Very small internal link checker.

Checks:
- href="/path" and src="/path" for local files
- Only checks local paths (starting with / or relative), skips http(s), mailto, xmpp, onion, etc.

Usage:
  python tools/link_check.py
"""

from __future__ import annotations

import re
from pathlib import Path
from urllib.parse import unquote


# Parent of the directory that contains this (resolved) file. With the script
# invoked as tools/link_check.py this is the tree that is scanned and against
# which "/"-prefixed links are resolved.
ROOT = Path(__file__).resolve().parents[1]

# Directory names whose HTML files are never scanned.
SKIP_DIRS = {".git"}
# HTML files excluded from scanning, written as POSIX paths relative to ROOT.
SKIP_FILES = {
    "test.html",
    "test_jp.html",
    "test_zh.html",
    "startpage/test.html",
}

# Matches href=... / src=... (case-insensitive) with a single- or double-quoted
# value. Group 1 is the quote character, group 2 the attribute value up to the
# next occurrence of that same quote.
RE_URL = re.compile(r"\b(?:href|src)=(['\"])(.*?)\1", re.I)


# A leading URI scheme ("https:", "mailto:", "tel:", ...) as defined by
# RFC 3986 section 3.1: a letter followed by letters, digits, "+", "." or "-".
_RE_SCHEME = re.compile(r"[A-Za-z][A-Za-z0-9+.\-]*:")


def is_external(u: str) -> bool:
    """Return True when *u* must not be checked as a local file.

    A link counts as external when, after trimming surrounding whitespace, it

    - starts with any URI scheme (``http:``, ``https:``, ``mailto:``,
      ``xmpp:``, ``signal:``, ``data:``, ``javascript:``, ``tel:``, ...);
      schemes are matched case-insensitively,
    - is a pure fragment reference (``#...``),
    - is protocol-relative (``//host/...``), or
    - mentions a ``.onion`` host anywhere in the string.

    Args:
        u: Attribute value taken from an ``href``/``src``.

    Returns:
        True for links that point outside the local tree, False for paths
        that should exist on disk.
    """
    u = u.strip()
    return bool(
        _RE_SCHEME.match(u)
        or u.startswith(("#", "//"))
        or ".onion" in u
    )


def normalize(p: Path, url: str) -> Path | None:
    """Map a link found in HTML file *p* to the filesystem path it names.

    Args:
        p: Path of the HTML file that contains the link. Relative links are
            resolved against its parent directory.
        url: Raw attribute value as written in the HTML.

    Returns:
        The resolved absolute path of the link target, or None when the link
        is empty once its fragment and query are removed, or when
        is_external() classifies it as non-local.
    """
    # Fragment first, then query: neither part names a file on disk.
    url = url.split("#", 1)[0].split("?", 1)[0].strip()
    if not url or is_external(url):
        return None

    # Decide how to anchor the link before decoding, so an escaped slash
    # ("%2F") at the start cannot turn a relative link into a rooted one.
    rooted = url.startswith("/")

    # Percent-escapes such as "%20" stand for literal characters in the file
    # name. Decoding happens after the split above so an escaped "#" or "?"
    # stays part of the path. Leading slashes are dropped because joining an
    # absolute component would discard the base directory.
    path = unquote(url).lstrip("/")

    # "/..." links are relative to the site root, everything else to the
    # directory of the file that contains the link.
    base = ROOT if rooted else p.parent
    return (base / path).resolve()


def main() -> int:
    """Scan the HTML files under ROOT and report local links with no target.

    Every ``*.html`` file below ROOT is read, except files inside a directory
    listed in SKIP_DIRS and files listed in SKIP_FILES. Each ``href``/``src``
    value found by RE_URL is mapped to a path with normalize(); links for
    which normalize() returns None are ignored. A link to a directory is
    satisfied only when that directory contains an ``index.html``.

    Returns:
        1 when at least one missing link was printed, otherwise 0. The value
        is meant to be used as the process exit status.
    """
    missing = []
    # Sorted so the report is stable between runs; rglob() yields entries in
    # whatever order the filesystem returns them.
    for html in sorted(ROOT.rglob("*.html")):
        rel_path = html.relative_to(ROOT)
        # Look only at components below ROOT, so the location of the checkout
        # itself can never cause every file to be skipped.
        if any(part in SKIP_DIRS for part in rel_path.parts):
            continue
        if rel_path.as_posix() in SKIP_FILES:
            continue

        # Undecodable bytes are dropped rather than aborting the whole run.
        text = html.read_text(encoding="utf-8", errors="ignore")
        for m in RE_URL.finditer(text):
            url = m.group(2)
            target = normalize(html, url)
            if target is None:
                continue
            # A directory link resolves to the index.html inside it; a
            # directory without one is reported as missing.
            if target.is_dir():
                target = target / "index.html"
            if not target.exists():
                missing.append((str(rel_path), url))

    if missing:
        print("Missing local links:")
        for src, url in missing:
            print(f"- {src}: {url}")
        return 1

    print("OK: no missing local href/src found")
    return 0


# When executed as a script, run the check and use main()'s return value as
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())