diff options
Diffstat (limited to 'tools/translate_pages.py')
| -rw-r--r-- | tools/translate_pages.py | 258 |
1 files changed, 258 insertions, 0 deletions
#!/usr/bin/env python3
# Origin: tools/translate_pages.py (added as a new file, mode 100644, blob 3127d66).
"""Generate zh/jp copies of the site's HTML pages.

This is a best-effort, offline translation helper.

- It copies each *.html to *_zh.html and *_jp.html (same directory).
- It preserves all HTML structure, links, ids, classes.
- It translates only user-visible text nodes and some common attributes.
- It skips anything under a directory listed in ``SKIP_DIRS``
  ("startpage/", "mstartpage/", "partials/").

Notes:
- This is not a static site generator. It only writes additional files.
- Translation quality depends on the dictionaries below.
- HTML is split with regular expressions, not a real parser, so unusual
  markup (e.g. ``>`` inside attribute values or comments) is handled on a
  best-effort basis only.
"""

from __future__ import annotations

import functools
import os
import re
from pathlib import Path


# Repository root: this file lives in <root>/tools/.
ROOT = Path(__file__).resolve().parents[1]


# Directory names that are never translated, at any depth below ROOT.
SKIP_DIRS = {
    "startpage",
    "mstartpage",
    "partials",
}


# Tags whose text content should not be translated.
SKIP_TAGS = {
    "script",
    "style",
    "code",
    "pre",
    "kbd",
    "samp",
}


# Very small phrase dictionaries (hand-tuned for this repo).
# For anything not in the dictionary, we leave the text as-is.
ZH = {
    "Skip to content": "跳至内容",
    "Menu": "菜单",
    "Language": "语言",
    "Home": "首页",
    "StartPage": "StartPage",
    "Blog": "博客",
    "Guestbook": "留言板",
    "Journal": "日志",
    "Diary": "日记",
    "Gaming": "游戏",
    "Bookmarks": "书签",
    "Accounts": "账户",
    "Computers": "电脑设备",
    "Contact": "联系",
    "Welcome": "欢迎",
    "My Current Vibe": "当前氛围",
    "Music": "音乐",
    "Current Blog": "当前博客",
    "Changelog": "更新日志",
    "Friends": "朋友们",
    "Games": "游戏",
    "Countries": "国家",
    "Sponsors / VPNs / Buttons": "赞助商 / VPN / 按钮",
    "Open guestbook": "打开留言板",
    "Loading…": "加载中…",
    "Loading...": "加载中…",
    "Licensed under": "采用",
    "site": "网站",
    "Error": "错误",
}


JA = {
    "Skip to content": "本文へ移動",
    "Menu": "メニュー",
    "Language": "言語",
    "Home": "ホーム",
    "StartPage": "StartPage",
    "Blog": "ブログ",
    "Guestbook": "ゲストブック",
    "Journal": "ジャーナル",
    "Diary": "日記",
    "Gaming": "ゲーム",
    "Bookmarks": "ブックマーク",
    "Accounts": "アカウント",
    "Computers": "コンピューター",
    "Contact": "連絡先",
    "Welcome": "ようこそ",
    "My Current Vibe": "今の雰囲気",
    "Music": "音楽",
    "Current Blog": "現在のブログ",
    "Changelog": "更新履歴",
    "Friends": "友達",
    "Games": "ゲーム",
    "Countries": "国",
    "Sponsors / VPNs / Buttons": "スポンサー / VPN / ボタン",
    "Open guestbook": "ゲストブックを開く",
    "Loading…": "読み込み中…",
    "Loading...": "読み込み中…",
    "Licensed under": "ライセンス:",
    "Error": "エラー",
}


# Attributes whose values are user-visible text and may be translated.
# "aria-labelledby" is intentionally absent: its value is a list of element
# ids, and translating it would break the reference.
ATTR_TRANSLATE = {
    "title",
    "aria-label",
    "alt",
    "placeholder",
}


RE_TAG = re.compile(r"(<[^>]+>)")
RE_TEXT_NODE = re.compile(r"^(\s*)(.*?)(\s*)$", re.S)
RE_ATTR = re.compile(r'(\s)([a-zA-Z_:.-]+)=("[^"]*"|\'[\s\S]*?\')')
# Matches the opening <html ...> tag up to and including its quoted lang value.
RE_HTML_LANG = re.compile(r"(<html\b[^>]*?\slang=)(\"[^\"]*\"|'[^']*')", re.I)


def should_skip_path(p: Path) -> bool:
    """Return True if *p* lies under one of the ``SKIP_DIRS`` directories.

    Args:
        p: A path located below ``ROOT``.

    Raises:
        ValueError: If *p* is not located below ``ROOT``.
    """
    rel = p.relative_to(ROOT)
    return not SKIP_DIRS.isdisjoint(rel.parts)


def _is_word_char(ch: str) -> bool:
    """Return True for ASCII letters, digits and underscore."""
    return ch.isascii() and (ch.isalnum() or ch == "_")


@functools.lru_cache(maxsize=8)
def _phrase_pattern(keys: tuple[str, ...]) -> re.Pattern[str] | None:
    """Build one regex matching any of *keys* as a whole phrase.

    Longer keys come first so that "Current Blog" wins over "Blog" when
    both could match at the same position. A word-boundary guard is added
    on each side of a key that starts/ends with a word character, so that
    "Home" does not match inside "Homepage".

    Returns None when there is no non-empty key.
    """
    alternatives: list[str] = []
    for key in sorted((k for k in keys if k), key=len, reverse=True):
        pattern = re.escape(key)
        if _is_word_char(key[0]):
            pattern = r"(?<![A-Za-z0-9_])" + pattern
        if _is_word_char(key[-1]):
            pattern += r"(?![A-Za-z0-9_])"
        alternatives.append(pattern)
    if not alternatives:
        return None
    return re.compile("|".join(alternatives))


def translate_phrase(s: str, mapping: dict[str, str]) -> str:
    """Translate *s* using *mapping*, leaving unknown text untouched.

    An exact match of the whole string is preferred. Otherwise every
    dictionary phrase found inside *s* is replaced in a single pass
    (longest phrase first, whole words only), so translated output is
    never fed back into another replacement.

    Args:
        s: The text to translate.
        mapping: Source phrase -> translated phrase.

    Returns:
        The translated text, or *s* unchanged if nothing matched.
    """
    # Exact match first
    if s in mapping:
        return mapping[s]

    # Replace common UI tokens inside longer strings (simple, conservative)
    pattern = _phrase_pattern(tuple(mapping))
    if pattern is None:
        return s
    return pattern.sub(lambda m: mapping[m.group(0)], s)


def translate_text_node(text: str, mapping: dict[str, str]) -> str:
    """Translate one HTML text node, preserving surrounding whitespace.

    Nodes that are empty, whitespace-only, or contain no ASCII letter
    (pure punctuation, digits, symbols) are returned unchanged.
    """
    m = RE_TEXT_NODE.match(text)
    if not m:
        return text
    lead, core, tail = m.groups()

    # Nothing to translate without at least one ASCII letter; this also
    # covers empty and whitespace-only nodes.
    if not re.search(r"[A-Za-z]", core):
        return text

    return f"{lead}{translate_phrase(core, mapping)}{tail}"


def tag_name(tag: str) -> str | None:
    """Return the lower-cased element name of *tag*, or None.

    *tag* is a full tag such as ``<div ...>``, ``</div>`` or ``<br/>``.
    None is returned for declarations/comments (``<!...>``), processing
    instructions (``<?...>``) and tags without a name.
    """
    t = tag.strip()[1:-1].strip()
    if not t or t[0] in "!?":
        return None
    if t.startswith("/"):
        t = t[1:].lstrip()
    # Stop at whitespace or "/" so that "<br/>" yields "br", not "br/".
    m = re.match(r"[^\s/>]+", t)
    return m.group(0).lower() if m else None


def translate_attrs(tag: str, mapping: dict[str, str]) -> str:
    """Translate the values of the ``ATTR_TRANSLATE`` attributes in *tag*.

    All other attributes, and the tag's structure and quoting, are left
    exactly as they were.
    """

    def repl(m: re.Match[str]) -> str:
        space, key, val = m.groups()
        if key.lower() not in ATTR_TRANSLATE:
            return m.group(0)
        quote, inner = val[0], val[1:-1]
        new_inner = translate_phrase(inner, mapping)
        # Never emit a value that would terminate the attribute early.
        if new_inner == inner or quote in new_inner:
            return m.group(0)
        return f"{space}{key}={quote}{new_inner}{quote}"

    return RE_ATTR.sub(repl, tag)


def translate_html(src: str, mapping: dict[str, str]) -> str:
    """Translate the text nodes and selected attributes of an HTML document.

    The document is split into tags and the text between them. Text inside
    any ``SKIP_TAGS`` element (script, style, code, ...) is copied verbatim;
    all other text nodes and the translatable attributes of every tag are
    passed through *mapping*.
    """
    out: list[str] = []

    # Number of currently open SKIP_TAGS elements.
    skip_depth = 0
    for part in RE_TAG.split(src):
        if part.startswith("<") and part.endswith(">"):
            if tag_name(part) in SKIP_TAGS:
                if part.startswith("</"):
                    # Tolerate stray closing tags: never go below zero.
                    skip_depth = max(skip_depth - 1, 0)
                else:
                    skip_depth += 1
            out.append(translate_attrs(part, mapping))
        elif skip_depth > 0:
            out.append(part)
        else:
            out.append(translate_text_node(part, mapping))

    return "".join(out)


def _set_html_lang(html: str, lang: str) -> str:
    """Set the ``lang`` attribute of the first ``<html>`` tag, if it has one."""
    return RE_HTML_LANG.sub(lambda m: f'{m.group(1)}"{lang}"', html, count=1)


def write_if_changed(path: Path, content: str) -> None:
    """Write *content* to *path* as UTF-8 unless the file already holds it.

    Skipping identical writes keeps modification times stable for files
    whose translation did not change.
    """
    if path.exists():
        # Undecodable bytes are dropped; such a file then simply compares
        # unequal and is rewritten.
        if path.read_text(encoding="utf-8", errors="ignore") == content:
            return
    path.write_text(content, encoding="utf-8")


def main() -> int:
    """Write ``*_zh.html`` and ``*_jp.html`` next to every page under ROOT.

    Returns:
        Process exit status (always 0).
    """
    for p in sorted(ROOT.rglob("*.html")):
        if should_skip_path(p):
            continue

        # Skip already translated files
        if p.name.endswith(("_zh.html", "_jp.html")):
            continue

        # Best effort: undecodable bytes in the source page are dropped.
        src = p.read_text(encoding="utf-8", errors="ignore")

        for suffix, lang, mapping in (("_zh", "zh", ZH), ("_jp", "ja", JA)):
            translated = _set_html_lang(translate_html(src, mapping), lang)
            write_if_changed(p.with_name(f"{p.stem}{suffix}.html"), translated)

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
