Commit multilingual editorial frontend work

2026-04-07 07:36:27 +02:00
parent 0eae030142
commit 77f57cf528
119 changed files with 5255 additions and 220 deletions
@@ -0,0 +1,364 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import csv
+import hashlib
+import json
+import re
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Iterable
+
+
+# File suffixes (compared lower-cased) that build_rows accepts as images.
+SUPPORTED_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".gif"}
+
+
+# Whole-stem overrides used by humanize_stem: when the cleaned, lower-cased
+# filename stem equals a key exactly, the value is returned as the title.
+TOKEN_REPLACEMENTS = {
+    "jr": "Dave Jr",
+    "nova": "Villanova",
+    "meanddave": "Dave And Dave Jr",
+    "meanddave1": "Dave And Dave Jr",
+    "meandjr": "Dave And Dave Jr",
+    "meandjr2": "Dave And Dave Jr",
+    "withdavejr": "With Dave Jr",
+    "withdavejr1": "With Dave Jr",
+    "witherin": "With Erin",
+    "withkevin": "With Kevin",
+    "withmom": "With Mom",
+    "withgrandma": "With Grandma",
+    "jr_brigid": "Dave Jr And Brigid",
+    "jr_grandma": "Dave Jr And Grandma",
+    "jr_nova": "Dave Jr Villanova",
+    "jr_trivia": "Dave Jr Trivia",
+    "four_jr": "Dave Jr At Four",
+    "four_jr2": "Dave Jr At Four",
+    "shirt_nova": "Shirt Villanova",
+    "me_nova": "Me Villanova",
+    "citizen": "Citizen",
+    # NOTE(review): presumably mirrors a misspelled source filename — confirm.
+    "meseum": "Museum",
+    "me_trivia": "Me Trivia",
+}
+
+# Matches a stem that starts with six or more digits followed by "_".
+# is_named_file treats such stems as not being human-named.
+KNOWN_FACEBOOK_PREFIX_RE = re.compile(r"^\d{6,}_")
+
+
+# (keyword set, description) pairs.
+# NOTE(review): no function in the visible part of this file reads this table;
+# infer_description carries its own inline token lists — confirm whether this
+# is still needed or should replace those lists.
+DESCRIPTION_RULES = [
+    ({"villanova", "nova", "jr"}, "Dave Jr in Villanova gear."),
+    ({"meanddave", "meandjr", "withdavejr"}, "Dave with Dave Jr."),
+    ({"dave", "dad", "me"}, "Family moment with Dave."),
+    ({"jr"}, "Family snapshot featuring Dave Jr."),
+    ({"grandma", "mom", "kevin", "erin", "noreen"}, "Family portrait moment."),
+    ({"gift", "gifts"}, "Gift-opening or celebration moment."),
+    ({"school"}, "School-related family snapshot."),
+    ({"thanksgiving"}, "Holiday family gathering moment."),
+    ({"trivia"}, "Trivia and music related family snapshot."),
+    ({"poland"}, "Travel memory from Poland."),
+    ({"museum"}, "Museum or outing snapshot."),
+    ({"fireman"}, "Dress-up or costume moment."),
+]
+
+
+@dataclass
+class MediaRow:
+    """One manifest entry describing a single image file.
+
+    Instances are created by build_rows and serialised with
+    ``dataclasses.asdict`` by the JSON and CSV writers.
+    """
+
+    # File name including extension (``path.name``).
+    filename: str
+    # URL prefix joined with the file name.
+    relative_url: str
+    # Display title produced by humanize_stem.
+    title: str
+    # Caption produced by infer_description.
+    description: str
+    # Sorted, de-duplicated tags produced by infer_tags.
+    tags: list[str]
+    # Hex SHA-1 digest of the file contents.
+    sha1: str
+    # File size as reported by ``stat().st_size``.
+    file_size: int
+    # Name of the first file seen with the same digest, or None if this is it.
+    duplicate_of: str | None = None
+    # Result of is_named_file for this file.
+    named_file: bool = False
+
+
+def slugify(text: str) -> str:
+    text = text.lower()
+    text = re.sub(r"[^a-z0-9]+", "-", text)
+    return text.strip("-")
+
+
+def humanize_stem(stem: str) -> str:
+    cleaned = stem.strip()
+    cleaned = re.sub(r"^\d+", "", cleaned)
+    cleaned = re.sub(r"_\d{6,}.*$", "", cleaned)
+    cleaned = cleaned.replace("(1)", "").replace("(2)", "").replace("(3)", "")
+    cleaned = cleaned.replace("(4)", "").replace("(5)", "").replace("(6)", "").replace("(7)", "")
+    lowered = cleaned.lower().strip()
+    if lowered in TOKEN_REPLACEMENTS:
+        return TOKEN_REPLACEMENTS[lowered]
+
+    lowered = lowered.replace("_", " ").replace("-", " ")
+    lowered = re.sub(r"\s+", " ", lowered).strip()
+    if not lowered:
+        return "Untitled Photo"
+
+    words: list[str] = []
+    for word in lowered.split():
+        if word in {"jr", "jr."}:
+            words.append("Dave Jr")
+        elif word in {"nova", "villanova"}:
+            words.append("Villanova")
+        elif word in {"me", "dave", "kevin", "erin", "noreen", "mom", "grandma", "poland", "museum", "trivia"}:
+            words.append(word.capitalize())
+        else:
+            words.append(word.capitalize())
+    return " ".join(words)
+
+
+def is_named_file(filename: str, title: str) -> bool:
+    stem = Path(filename).stem.lower()
+    if title == "Untitled Photo":
+        return False
+    if stem.startswith("unnamed"):
+        return False
+    if KNOWN_FACEBOOK_PREFIX_RE.match(stem):
+        return False
+    return True
+
+
+def infer_tags(filename: str, title: str) -> list[str]:
+    text = f"{filename} {title}".lower()
+    tags: list[str] = []
+
+    if any(token in text for token in ("jr", "dave jr", "baby")):
+        tags.extend(["child", "dave-jr"])
+    if any(token in text for token in ("me ", " me", "meand", " meand", "dave and")):
+        tags.append("dave")
+    if any(token in text for token in ("villanova", "nova", "rangers")):
+        tags.extend(["villanova", "sports"])
+    if any(token in text for token in ("gift", "gifts", "thanksgiving", "christmas", "may17")):
+        tags.append("holiday")
+    if any(token in text for token in ("grandma", "mom", "kevin", "erin", "noreen", "brigid")):
+        tags.append("family")
+    if any(token in text for token in ("trivia", "music", "marvin gaye")):
+        tags.append("music")
+    if any(token in text for token in ("school", "museum", "poland", "citizen")):
+        tags.append("outing")
+    if any(token in text for token in ("fireman", "dress", "beret", "shirt", "glasses")):
+        tags.append("dress-up")
+    if any(token in text for token in ("haircut", "slide", "tree", "balls", "eating")):
+        tags.append("playtime")
+    if not tags:
+        tags.append("family")
+
+    return sorted(set(tags))
+
+
+def infer_description(filename: str, title: str) -> str:
+    text = f"{filename} {title}".lower()
+    named_file = is_named_file(filename, title)
+
+    if any(token in text for token in ("meanddave", "me and dave", "meandjr", "dave and dave jr")):
+        return "Dave with Dave Jr."
+    if any(token in text for token in ("withdavejr", "with dave jr")):
+        return "Family snapshot with Dave Jr."
+    if any(token in text for token in ("jr_nova", "villanova", "shirt villanova", "nova dave")):
+        return "Villanova-flavored family snapshot."
+    if any(token in text for token in ("gift", "gifts", "thanksgiving", "may17")):
+        return "Family celebration moment."
+    if any(token in text for token in ("fireman", "dress", "beret", "glasses")):
+        return "Dress-up family snapshot."
+    if any(token in text for token in ("school", "museum", "poland", "citizen")):
+        return "Family outing or travel snapshot."
+    if any(token in text for token in ("haircut", "slide", "tree", "balls", "eating")):
+        return "Everyday family moment."
+    if any(token in text for token in ("grandma", "mom", "kevin", "erin", "noreen", "brigid")):
+        return "Family portrait moment."
+    if any(token in text for token in ("jr", "dave jr")):
+        return "Family snapshot featuring Dave Jr."
+    if not named_file:
+        return "Family archive snapshot awaiting fuller annotation."
+    return "Family archive snapshot."
+
+
+def sha1_for_file(path: Path) -> str:
+    digest = hashlib.sha1()
+    with path.open("rb") as handle:
+        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
+            digest.update(chunk)
+    return digest.hexdigest()
+
+
+def build_rows(source_dir: Path, url_prefix: str) -> list[MediaRow]:
+    rows: list[MediaRow] = []
+    first_seen_by_hash: dict[str, str] = {}
+
+    for path in sorted(source_dir.iterdir(), key=lambda p: p.name.lower()):
+        if not path.is_file() or path.suffix.lower() not in SUPPORTED_EXTENSIONS:
+            continue
+
+        title = humanize_stem(path.stem)
+        description = infer_description(path.name, title)
+        tags = infer_tags(path.name, title)
+        digest = sha1_for_file(path)
+        duplicate_of = first_seen_by_hash.get(digest)
+        if duplicate_of is None:
+            first_seen_by_hash[digest] = path.name
+
+        rows.append(
+            MediaRow(
+                filename=path.name,
+                relative_url=f"{url_prefix.rstrip('/')}/{path.name}",
+                title=title,
+                description=description,
+                tags=tags,
+                sha1=digest,
+                file_size=path.stat().st_size,
+                duplicate_of=duplicate_of,
+                named_file=is_named_file(path.name, title),
+            )
+        )
+
+    return rows
+
+
+def write_json(rows: Iterable[MediaRow], target: Path) -> None:
+    payload = [asdict(row) for row in rows]
+    target.write_text(json.dumps(payload, indent=2, ensure_ascii=True), encoding="utf-8")
+
+
+def write_csv(rows: Iterable[MediaRow], target: Path) -> None:
+    fieldnames = [
+        "filename",
+        "relative_url",
+        "title",
+        "description",
+        "tags",
+        "sha1",
+        "file_size",
+        "duplicate_of",
+        "named_file",
+    ]
+    with target.open("w", newline="", encoding="utf-8") as handle:
+        writer = csv.DictWriter(handle, fieldnames=fieldnames)
+        writer.writeheader()
+        for row in rows:
+            data = asdict(row)
+            data["tags"] = json.dumps(data["tags"], ensure_ascii=True)
+            writer.writerow(data)
+
+
+def sql_escape(value: str) -> str:
+    return value.replace("\\", "\\\\").replace("'", "''")
+
+
+def choose_cover_url(rows: list[MediaRow]) -> str:
+    preferred_rows = [row for row in rows if row.named_file and not row.duplicate_of]
+    if preferred_rows:
+        ranked = sorted(
+            preferred_rows,
+            key=lambda row: (
+                0 if "dave and dave jr" in row.title.lower() else 1,
+                0 if "dave jr" in row.title.lower() else 1,
+                row.filename.lower(),
+            ),
+        )
+        return ranked[0].relative_url
+    unique_rows = [row for row in rows if not row.duplicate_of]
+    if unique_rows:
+        return unique_rows[0].relative_url
+    return rows[0].relative_url
+
+
+def write_sql(rows: list[MediaRow], target: Path, slug: str, album_title: str, cover_url: str, unique_only: bool) -> None:
+    rows_to_write = [row for row in rows if not row.duplicate_of] if unique_only else rows
+    lines = [
+        "-- Generated by build_facebook_album_manifest.py",
+        f"SET @folder_slug = '{sql_escape(slug)}';",
+        f"SET @cover_url = '{sql_escape(cover_url)}';",
+        "",
+        "INSERT INTO media_folders (slug, name_en, name_nb, description_en, description_nb, cover_image_url, active, sort_order)",
+        "VALUES (",
+        f"  @folder_slug,",
+        f"  '{sql_escape(album_title)}',",
+        f"  '{sql_escape(album_title)}',",
+        "  'Imported Facebook family archive with starter captions generated from filenames. Descriptions should be reviewed and enriched.',",
+        "  'Importert familiearkiv fra Facebook med starttekster generert fra filnavn. Beskrivelsene bor gjennomgaas og forbedres.',",
+        "  @cover_url,",
+        "  1,",
+        "  0",
+        ")",
+        "ON DUPLICATE KEY UPDATE",
+        "  name_en = VALUES(name_en),",
+        "  name_nb = VALUES(name_nb),",
+        "  description_en = VALUES(description_en),",
+        "  description_nb = VALUES(description_nb),",
+        "  cover_image_url = VALUES(cover_image_url);",
+        "",
+        "SET @folder_id = (SELECT id FROM media_folders WHERE slug = @folder_slug LIMIT 1);",
+        "",
+    ]
+
+    for index, row in enumerate(rows_to_write, start=1):
+        tags_json = json.dumps(row.tags, ensure_ascii=True)
+        lines.extend(
+            [
+                "INSERT INTO media (type, filename, url, title, description, category, folder_id, tags, credit, created_at)",
+                "VALUES (",
+                "  'image',",
+                f"  '{sql_escape(row.filename)}',",
+                f"  '{sql_escape(row.relative_url)}',",
+                f"  '{sql_escape(row.title)}',",
+                f"  '{sql_escape(row.description)}',",
+                "  'family',",
+                "  @folder_id,",
+                f"  '{sql_escape(tags_json)}',",
+                "  'Facebook archive import',",
+                "  NOW()",
+                ")",
+                "ON DUPLICATE KEY UPDATE",
+                "  title = VALUES(title),",
+                "  description = VALUES(description),",
+                "  folder_id = VALUES(folder_id),",
+                "  tags = VALUES(tags),",
+                "  credit = VALUES(credit);",
+                "",
+            ]
+        )
+
+    target.write_text("\n".join(lines) + "\n", encoding="utf-8")
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Build a JSON/CSV/SQL manifest for a photo folder.")
+    parser.add_argument("--source", required=True, help="Path to the source image folder.")
+    parser.add_argument("--slug", default="facebook-060426", help="Target media folder slug.")
+    parser.add_argument("--title", default="Facebook Archive / June 4 2026", help="Folder title.")
+    parser.add_argument("--url-prefix", default="/uploads/facebook-060426", help="URL prefix for deployed images.")
+    parser.add_argument("--output-dir", default="C:\\wamp64\\www\\davegilligan-new\\tmp", help="Directory for generated files.")
+    parser.add_argument("--include-duplicates", action="store_true", help="Include duplicate hashes in the SQL seed.")
+    args = parser.parse_args()
+
+    source_dir = Path(args.source)
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    rows = build_rows(source_dir, args.url_prefix)
+    if not rows:
+        raise SystemExit("No supported images found.")
+
+    cover_url = choose_cover_url(rows)
+    stem = slugify(args.slug)
+
+    write_json(rows, output_dir / f"{stem}-manifest.json")
+    write_csv(rows, output_dir / f"{stem}-manifest.csv")
+    write_sql(
+        rows,
+        output_dir / f"{stem}-seed.sql",
+        args.slug,
+        args.title,
+        cover_url,
+        unique_only=not args.include_duplicates,
+    )
+
+    duplicate_count = sum(1 for row in rows if row.duplicate_of)
+    print(f"Generated manifest for {len(rows)} files")
+    print(f"Duplicate files detected: {duplicate_count}")
+    print(f"Unique rows for SQL seed: {len([row for row in rows if not row.duplicate_of]) if not args.include_duplicates else len(rows)}")
+    print(f"Cover image: {cover_url}")
+    print(f"Output directory: {output_dir}")
+
+
+# Run the command-line entry point only when executed as a script.
+if __name__ == "__main__":
+    main()