# kvm/ui/tools/find_duplicate_translations.py

#!/usr/bin/env python3
import argparse
import json
import re
from datetime import datetime
from pathlib import Path
import sys

def flatten_strings(obj, prefix=""):
    """Yield ``(dotted_key, value)`` pairs for every string leaf under *obj*.

    Nested dicts are walked recursively and their keys are joined with ``.``
    to form the dotted key. A string yields a single pair keyed by *prefix*.
    Anything that is neither a dict nor a string (numbers, lists, ``None``,
    ...) produces nothing.
    """
    # A string leaf is a translation target: emit it and stop.
    if isinstance(obj, str):
        yield prefix, obj
        return

    # Non-dict, non-string values are not translation targets.
    if not isinstance(obj, dict):
        return

    for name, child in obj.items():
        # The top level has no prefix, so its keys are used as-is.
        path = name if not prefix else f"{prefix}.{name}"
        yield from flatten_strings(child, path)

def normalize(s, ignore_case=False, trim=False, collapse_ws=False):
    """Return *s* after applying the requested comparison normalizations.

    The steps run in a fixed order, each only when its flag is truthy:
    every run of whitespace is replaced by a single space (``collapse_ws``),
    then surrounding whitespace is stripped (``trim``), then the text is
    lower-cased (``ignore_case``). With no flags set, *s* is returned as-is.
    """
    text = re.sub(r"\s+", " ", s) if collapse_ws else s
    text = text.strip() if trim else text
    return text.lower() if ignore_case else text

def main(argv=None):
    """Report en.json string values that are shared by more than one key.

    Loads the English message catalogue, flattens it to dotted keys, groups
    the string values after the selected normalizations, and writes a JSON
    report listing every value that is used by two or more keys.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) lets argparse read ``sys.argv[1:]``.

    Returns:
        0 once the report has been written.

    Raises:
        SystemExit: With code 2 when the ``--en`` path is not a file;
            argparse also raises it for invalid arguments.
    """
    # Function-scope import: the module itself only imports `datetime`.
    from datetime import timezone

    p = argparse.ArgumentParser(
        description="Find identical translation targets with different keys in en.json"
    )
    p.add_argument(
        "--en", default="./localization/messages/en.json", help="path to en.json"
    )
    p.add_argument(
        "--out",
        default="./reports/duplicate_translation_targets.json",
        help="output report path (JSON)",
    )
    # Every normalization is enabled by default, so a plain store_true flag
    # could never turn it off. Each one gets a paired --no-* switch that
    # writes to the same destination.
    toggles = (
        ("ignore-case", "ignore_case", "ignore case when comparing values"),
        ("trim", "trim", "trim surrounding whitespace before comparing"),
        ("collapse-ws", "collapse_ws", "collapse internal whitespace before comparing"),
    )
    for flag, dest, text in toggles:
        p.add_argument(
            f"--{flag}",
            dest=dest,
            default=True,
            action="store_true",
            help=f"{text} (default)",
        )
        p.add_argument(
            f"--no-{flag}",
            dest=dest,
            action="store_false",
            help=f"do not {text}",
        )
    # Parse the arguments given to main() rather than always reading sys.argv.
    args = p.parse_args(argv)

    en_path = Path(args.en)
    if not en_path.is_file():
        # Diagnostics go to stderr so stdout stays clean for normal output.
        print(f"en.json not found: {en_path}", file=sys.stderr)
        raise SystemExit(2)

    with en_path.open(encoding="utf-8") as f:
        payload = json.load(f)

    entries = list(flatten_strings(payload))
    total_keys = len(entries)

    # normalized value -> (first original spelling seen, keys using it)
    groups = {}
    for key, val in entries:
        norm = normalize(
            val,
            ignore_case=args.ignore_case,
            trim=args.trim,
            collapse_ws=args.collapse_ws,
        )
        _, keys = groups.setdefault(norm, (val, []))
        keys.append(key)

    duplicates = [
        {
            "normalized_value": norm,
            "original_value": original,
            "keys": sorted(keys),
            "count": len(keys),
        }
        for norm, (original, keys) in groups.items()
        if len(keys) > 1
    ]

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # "now" and drop the tzinfo so the timestamp keeps its
    # "YYYY-MM-DDTHH:MM:SS[.ffffff]Z" form.
    generated_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    report = {
        "generated_at": generated_at,
        "en_json": str(en_path),
        "total_string_keys": total_keys,
        # Largest groups first; ties are ordered by normalized value.
        "duplicate_groups": sorted(
            duplicates, key=lambda d: (-d["count"], d["normalized_value"])
        ),
        "duplicate_count": len(duplicates),
    }

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    print(
        f"Wrote {out_path} — total keys: {total_keys}, duplicate groups: {len(duplicates)}"
    )
    return 0

# Script entry point: hand the command-line arguments (program name stripped)
# to main() and use its return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))