summaryrefslogtreecommitdiff
path: root/tools/translate_pages.py
diff options
context:
space:
mode:
Diffstat (limited to 'tools/translate_pages.py')
-rw-r--r--tools/translate_pages.py258
1 files changed, 258 insertions, 0 deletions
diff --git a/tools/translate_pages.py b/tools/translate_pages.py
new file mode 100644
index 0000000..3127d66
--- /dev/null
+++ b/tools/translate_pages.py
@@ -0,0 +1,258 @@
+#!/usr/bin/env python3
+"""Generate zh/jp copies of all HTML pages (except startpage/).
+
+This is a best-effort, offline translation helper.
+
+- It copies each *.html to *_zh.html and *_jp.html (same directory).
+- It preserves all HTML structure, links, ids, classes.
+- It translates only user-visible text nodes and some common attributes.
+- It skips anything under "startpage/", "mstartpage/", or "partials/".
+
+Notes:
+- This is not a static site generator. It only writes additional files.
+- Translation quality depends on the dictionaries below.
+"""
+
+from __future__ import annotations
+
+import os
+import re
+from pathlib import Path
+
+
# Repository root: the directory one level above the folder holding this script.
ROOT = Path(__file__).resolve().parents[1]


# Directory names that exclude a page when they appear anywhere in its
# path relative to ROOT.
SKIP_DIRS = {"startpage", "mstartpage", "partials"}


# Elements whose text content is copied through without translation.
SKIP_TAGS = {"script", "style", "code", "pre", "kbd", "samp"}
+
+
# Hand-tuned English -> Chinese phrase table for this repo.
# Keys are matched case-sensitively; text without an entry stays as-is.
# Entry order is kept deliberately: dicts iterate in insertion order.
ZH = {
    # Site chrome
    "Skip to content": "跳至内容",
    "Menu": "菜单",
    "Language": "语言",
    # Page and section names
    "Home": "首页",
    "StartPage": "StartPage",
    "Blog": "博客",
    "Guestbook": "留言板",
    "Journal": "日志",
    "Diary": "日记",
    "Gaming": "游戏",
    "Bookmarks": "书签",
    "Accounts": "账户",
    "Computers": "电脑设备",
    "Contact": "联系",
    "Welcome": "欢迎",
    "My Current Vibe": "当前氛围",
    "Music": "音乐",
    "Current Blog": "当前博客",
    "Changelog": "更新日志",
    "Friends": "朋友们",
    "Games": "游戏",
    "Countries": "国家",
    "Sponsors / VPNs / Buttons": "赞助商 / VPN / 按钮",
    # Actions and status text
    "Open guestbook": "打开留言板",
    "Loading…": "加载中…",
    "Loading...": "加载中…",
    "Licensed under": "采用",
    "site": "网站",
    "Error": "错误",
}
+
+
# Hand-tuned English -> Japanese phrase table for this repo.
# Keys are matched case-sensitively; text without an entry stays as-is.
# Entry order is kept deliberately: dicts iterate in insertion order.
JA = {
    # Site chrome
    "Skip to content": "本文へ移動",
    "Menu": "メニュー",
    "Language": "言語",
    # Page and section names
    "Home": "ホーム",
    "StartPage": "StartPage",
    "Blog": "ブログ",
    "Guestbook": "ゲストブック",
    "Journal": "ジャーナル",
    "Diary": "日記",
    "Gaming": "ゲーム",
    "Bookmarks": "ブックマーク",
    "Accounts": "アカウント",
    "Computers": "コンピューター",
    "Contact": "連絡先",
    "Welcome": "ようこそ",
    "My Current Vibe": "今の雰囲気",
    "Music": "音楽",
    "Current Blog": "現在のブログ",
    "Changelog": "更新履歴",
    "Friends": "友達",
    "Games": "ゲーム",
    "Countries": "国",
    "Sponsors / VPNs / Buttons": "スポンサー / VPN / ボタン",
    # Actions and status text
    "Open guestbook": "ゲストブックを開く",
    "Loading…": "読み込み中…",
    "Loading...": "読み込み中…",
    "Licensed under": "ライセンス:",
    "Error": "エラー",
}
+
+
# Attributes whose values are user-visible text and may be translated.
# "aria-labelledby" is intentionally absent: its value is a list of element
# ids, not prose, so rewriting it would break the reference.
ATTR_TRANSLATE = {
    "title",
    "aria-label",
    "alt",
    "placeholder",
}
+
+
# Splits markup into alternating text / "<...>" chunks; the capture group
# makes re.split keep the tag chunks in the result.
RE_TAG = re.compile(r"(<[^>]+>)")
# Captures (leading whitespace, core text, trailing whitespace) of a chunk.
RE_TEXT_NODE = re.compile(r"^(\s*)(.*?)(\s*)$", re.DOTALL)
# Captures (preceding whitespace char, attribute name, quoted value incl. quotes).
RE_ATTR = re.compile(r'(\s)([a-zA-Z_:.-]+)=("[^"]*"|\'[\s\S]*?\')')
+
+
def should_skip_path(p: Path) -> bool:
    """Return True when any component of *p* relative to ROOT is in SKIP_DIRS.

    ``Path.relative_to`` raises ``ValueError`` if *p* is not located under ROOT.
    """
    relative_parts = p.relative_to(ROOT).parts
    return not SKIP_DIRS.isdisjoint(relative_parts)
+
+
def translate_phrase(s: str, mapping: dict[str, str]) -> str:
    """Translate *s* using the phrase table *mapping*.

    An exact match of the whole string wins. Otherwise every occurrence of a
    mapping key inside *s* is replaced in a single left-to-right pass:

    - longer keys are tried first, so "Current Blog" is not pre-empted by
      the shorter "Blog";
    - a key that starts/ends with a word character only matches at a word
      boundary on that side, so "site" does not rewrite "website" and
      "Home" does not rewrite "Homepage";
    - replaced text is never re-scanned, so one translation cannot be
      mangled by another key.

    Matching is case-sensitive. Empty keys are ignored. Text containing no
    key is returned unchanged.
    """
    # Exact match first.
    if s in mapping:
        return mapping[s]

    # Longest first; sorted() is stable, so equal-length keys keep dict order.
    keys = sorted((k for k in mapping if k), key=len, reverse=True)
    if not keys:
        return s

    def bounded(key: str) -> str:
        # Only guard the sides that are word characters: keys such as
        # "Loading..." must still match when followed directly by text.
        head = r"(?<!\w)" if key[0].isalnum() or key[0] == "_" else ""
        tail = r"(?!\w)" if key[-1].isalnum() or key[-1] == "_" else ""
        return f"{head}{re.escape(key)}{tail}"

    pattern = re.compile("|".join(bounded(k) for k in keys))
    # A function replacement avoids backslash/group-reference processing of
    # the translated text.
    return pattern.sub(lambda m: mapping[m.group(0)], s)
+
+
def translate_text_node(text: str, mapping: dict[str, str]) -> str:
    """Translate the core of a text chunk, keeping its surrounding whitespace.

    The chunk is returned untouched when it is blank or contains no ASCII
    letter (pure punctuation, digits, symbols).
    """
    match = RE_TEXT_NODE.match(text)
    if match is None:
        return text

    prefix, body, suffix = match.groups()
    has_letters = re.search(r"[A-Za-z]", body) is not None
    if not body.strip() or not has_letters:
        return text

    return prefix + translate_phrase(body, mapping) + suffix
+
+
def tag_name(tag: str) -> str | None:
    """Return the lower-cased element name of a ``<...>`` chunk.

    Works for opening tags (``<div ...>``), closing tags (``</div>``) and
    self-closing syntax: both ``<br/>`` and ``<br />`` yield ``"br"`` (the
    trailing slash is not part of the name).

    Returns None for declarations and comments (``<!...>``), processing
    instructions (``<?...>``), and chunks with no name at all (``< >``,
    ``</>``).
    """
    inner = tag.strip()[1:-1].strip()
    if not inner or inner[0] in "!?":
        return None
    if inner.startswith("/"):
        inner = inner[1:].lstrip()
    # The name ends at the first whitespace or at the "/" of a self-closing tag.
    found = re.match(r"[^\s/]+", inner)
    return found.group(0).lower() if found else None
+
+
def translate_attrs(tag: str, mapping: dict[str, str]) -> str:
    """Translate the values of the ATTR_TRANSLATE attributes inside one tag.

    Attribute names are compared case-insensitively; the original name
    spelling and quote character are kept. ``aria-labelledby`` is never
    touched because its value is usually an id. Attributes whose value does
    not change are emitted exactly as matched.
    """

    def _rewrite(match: re.Match[str]) -> str:
        whole = match.group(0)
        sep, name, quoted = match.groups()
        lowered = name.lower()
        if lowered == "aria-labelledby" or lowered not in ATTR_TRANSLATE:
            return whole

        delimiter, original = quoted[0], quoted[1:-1]
        translated = translate_phrase(original, mapping)
        if translated == original:
            return whole
        return sep + name + "=" + delimiter + translated + delimiter

    return RE_ATTR.sub(_rewrite, tag)
+
+
def translate_html(src: str, mapping: dict[str, str]) -> str:
    """Return *src* with visible text and selected attributes translated.

    The markup is split with RE_TAG into alternating text and ``<...>``
    chunks. Tag chunks go through ``translate_attrs``; text chunks go through
    ``translate_text_node`` unless they sit inside an element listed in
    SKIP_TAGS (tracked with a nesting counter).

    ``<script>`` and ``<style>`` get stricter handling when they are in
    SKIP_TAGS: their content is raw text, not markup, so anything that merely
    looks like a tag in there (``a<b ... c>d``, HTML inside a JS string) is
    emitted verbatim. It is neither attribute-translated nor allowed to
    change the nesting counter. Raw mode ends at the first chunk that ends
    with the matching ``</script>`` / ``</style>``.
    """
    raw_text_tags = {"script", "style"}
    out: list[str] = []

    skip_depth = 0
    raw_tag: str | None = None  # name of the open script/style element, if any

    for part in RE_TAG.split(src):
        is_tag = part.startswith("<") and part.endswith(">")

        if raw_tag is not None:
            # RE_TAG may have glued script text such as "<b) x(); </script>"
            # into a single chunk, so test how the chunk *ends* rather than
            # what it starts with.
            if is_tag and re.search(
                rf"</{raw_tag}\s*>\Z", part, re.IGNORECASE
            ):
                raw_tag = None
                if skip_depth > 0:
                    skip_depth -= 1
            out.append(part)
            continue

        if not is_tag:
            if skip_depth > 0:
                out.append(part)
            else:
                out.append(translate_text_node(part, mapping))
            continue

        name = tag_name(part)

        # Track nesting of elements whose text must not be translated.
        if name in SKIP_TAGS:
            if part.lstrip().startswith("</"):
                if skip_depth > 0:
                    skip_depth -= 1
            else:
                skip_depth += 1
                if name in raw_text_tags:
                    raw_tag = name

        out.append(translate_attrs(part, mapping))

    return "".join(out)
+
+
def write_if_changed(path: Path, content: str) -> None:
    """Write *content* to *path* as UTF-8 unless the file already holds it.

    The existing file is read as UTF-8 with undecodable bytes dropped; a
    missing file is always written.
    """
    if path.exists():
        current = path.read_text(encoding="utf-8", errors="ignore")
        if current == content:
            return
    path.write_text(content, encoding="utf-8")
+
+
# Value of the lang attribute on the <html> start tag, wherever it sits among
# the other attributes and whichever quote style it uses. The required
# whitespace before "lang" keeps "xml:lang" / "data-lang" from matching.
_RE_HTML_LANG = re.compile(
    r"(<html\b[^>]*?\slang\s*=\s*)(?:\"[^\"]*\"|'[^']*')",
    re.IGNORECASE,
)


def _set_html_lang(html: str, lang: str) -> str:
    """Point the first ``<html ... lang=...>`` attribute at *lang*, if present."""
    return _RE_HTML_LANG.sub(lambda m: f'{m.group(1)}"{lang}"', html, count=1)


def main() -> int:
    """Write ``*_zh.html`` and ``*_jp.html`` next to every HTML page under ROOT.

    Pages for which ``should_skip_path`` is true, and files that are
    themselves generated translations, are left alone. Each output is
    written only when its content differs from what is already on disk.

    Returns:
        0, used as the process exit status.
    """
    # (phrase table, file-name suffix, value for <html lang="...">).
    # The files are named "_jp" but the language code for Japanese is "ja".
    targets = (
        (ZH, "_zh", "zh"),
        (JA, "_jp", "ja"),
    )
    generated_suffixes = tuple(f"{suffix}.html" for _, suffix, _ in targets)

    for p in sorted(ROOT.rglob("*.html")):
        if should_skip_path(p):
            continue

        # Skip already translated files
        if p.name.endswith(generated_suffixes):
            continue

        # Best-effort read: undecodable bytes are dropped rather than
        # aborting the whole run.
        src = p.read_text(encoding="utf-8", errors="ignore")

        for mapping, suffix, lang in targets:
            translated = _set_html_lang(translate_html(src, mapping), lang)
            write_if_changed(p.with_name(f"{p.stem}{suffix}.html"), translated)

    return 0


if __name__ == "__main__":
    raise SystemExit(main())