Commit multilingual editorial frontend work
This commit is contained in:
@@ -0,0 +1,364 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import hashlib
|
||||
import json
|
||||
import re
|
||||
from dataclasses import asdict, dataclass
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
|
||||
# File suffixes accepted as images; build_rows compares the lowercased
# suffix of each directory entry against this set and skips everything else.
SUPPORTED_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".gif"}
|
||||
|
||||
|
||||
# Exact-match title overrides. humanize_stem looks up the cleaned, lowercased
# filename stem here and, on a hit, returns the mapped title verbatim instead
# of capitalising the stem word by word.
# NOTE(review): keys such as "meseum" presumably mirror misspelled filenames on
# disk -- confirm before "correcting" any key.
TOKEN_REPLACEMENTS = {
    "jr": "Dave Jr",
    "nova": "Villanova",
    "meanddave": "Dave And Dave Jr",
    "meanddave1": "Dave And Dave Jr",
    "meandjr": "Dave And Dave Jr",
    "meandjr2": "Dave And Dave Jr",
    "withdavejr": "With Dave Jr",
    "withdavejr1": "With Dave Jr",
    "witherin": "With Erin",
    "withkevin": "With Kevin",
    "withmom": "With Mom",
    "withgrandma": "With Grandma",
    "jr_brigid": "Dave Jr And Brigid",
    "jr_grandma": "Dave Jr And Grandma",
    "jr_nova": "Dave Jr Villanova",
    "jr_trivia": "Dave Jr Trivia",
    "four_jr": "Dave Jr At Four",
    "four_jr2": "Dave Jr At Four",
    "shirt_nova": "Shirt Villanova",
    "me_nova": "Me Villanova",
    "citizen": "Citizen",
    "meseum": "Museum",
    "me_trivia": "Me Trivia",
}
|
||||
|
||||
# Matches a stem that starts with six or more digits followed by an
# underscore; is_named_file treats such stems as not human-named.
KNOWN_FACEBOOK_PREFIX_RE = re.compile(r"^\d{6,}_")
|
||||
|
||||
|
||||
# Ordered (keyword set, description) pairs.
# NOTE(review): nothing in the visible part of this file reads
# DESCRIPTION_RULES -- infer_description uses its own inline token checks.
# Confirm whether this table is still needed or should drive that function.
DESCRIPTION_RULES = [
    ({"villanova", "nova", "jr"}, "Dave Jr in Villanova gear."),
    ({"meanddave", "meandjr", "withdavejr"}, "Dave with Dave Jr."),
    ({"dave", "dad", "me"}, "Family moment with Dave."),
    ({"jr"}, "Family snapshot featuring Dave Jr."),
    ({"grandma", "mom", "kevin", "erin", "noreen"}, "Family portrait moment."),
    ({"gift", "gifts"}, "Gift-opening or celebration moment."),
    ({"school"}, "School-related family snapshot."),
    ({"thanksgiving"}, "Holiday family gathering moment."),
    ({"trivia"}, "Trivia and music related family snapshot."),
    ({"poland"}, "Travel memory from Poland."),
    ({"museum"}, "Museum or outing snapshot."),
    ({"fireman"}, "Dress-up or costume moment."),
]
|
||||
|
||||
|
||||
@dataclass
class MediaRow:
    """One manifest entry describing a single image file found by build_rows."""

    # File name including extension (no directory part).
    filename: str
    # URL prefix (trailing slashes removed) joined to the file name with "/".
    relative_url: str
    # Display title produced by humanize_stem from the file stem.
    title: str
    # Starter caption produced by infer_description.
    description: str
    # Sorted, de-duplicated tag labels produced by infer_tags.
    tags: list[str]
    # Hex SHA-1 digest of the file contents.
    sha1: str
    # Size reported by Path.stat().st_size.
    file_size: int
    # Name of the earlier file with the same digest, or None for the first one seen.
    duplicate_of: str | None = None
    # Result of is_named_file for this file name and title.
    named_file: bool = False
|
||||
|
||||
|
||||
def slugify(text: str) -> str:
    """Return a lowercase, hyphen-separated slug of *text*.

    After lowercasing, every run of characters outside ``a-z`` / ``0-9``
    collapses into a single hyphen; hyphens at either end are trimmed.
    """
    hyphenated = re.sub(r"[^a-z0-9]+", "-", text.lower())
    return hyphenated.strip("-")
|
||||
|
||||
|
||||
def humanize_stem(stem: str) -> str:
    """Turn a raw filename stem into a display title.

    Steps, in order:
    1. Trim whitespace, drop leading digits, and drop everything from the
       first ``_`` that is followed by six or more digits.
    2. Remove parenthesised copy counters such as ``(1)`` or ``(12)``.
    3. If the lowercased remainder is a key of ``TOKEN_REPLACEMENTS``,
       return the mapped title verbatim.
    4. Otherwise treat ``_`` and ``-`` as spaces and capitalise each word,
       expanding ``jr`` / ``jr.`` to "Dave Jr" and ``nova`` / ``villanova``
       to "Villanova".

    Returns "Untitled Photo" when nothing usable is left after cleaning.
    """
    cleaned = stem.strip()
    cleaned = re.sub(r"^\d+", "", cleaned)
    cleaned = re.sub(r"_\d{6,}.*$", "", cleaned)
    # Any "(N)" copy counter is removed, not only (1)-(7) as previously
    # hard-coded, so "photo (12)" no longer yields a title containing "(12)".
    cleaned = re.sub(r"\(\d+\)", "", cleaned)

    lowered = cleaned.lower().strip()
    if lowered in TOKEN_REPLACEMENTS:
        return TOKEN_REPLACEMENTS[lowered]

    lowered = lowered.replace("_", " ").replace("-", " ")
    lowered = re.sub(r"\s+", " ", lowered).strip()
    if not lowered:
        return "Untitled Photo"

    # Words with a fixed expansion; every other word is simply capitalised.
    expansions = {
        "jr": "Dave Jr",
        "jr.": "Dave Jr",
        "nova": "Villanova",
        "villanova": "Villanova",
    }
    return " ".join(
        expansions.get(word, word.capitalize()) for word in lowered.split()
    )
|
||||
|
||||
|
||||
def is_named_file(filename: str, title: str) -> bool:
    """Report whether *filename* looks like it was named by a person.

    A file is considered unnamed when its title is "Untitled Photo", when its
    lowercased stem starts with "unnamed", or when the stem matches
    ``KNOWN_FACEBOOK_PREFIX_RE``.
    """
    stem = Path(filename).stem.lower()
    looks_generated = (
        title == "Untitled Photo"
        or stem.startswith("unnamed")
        or KNOWN_FACEBOOK_PREFIX_RE.match(stem) is not None
    )
    return not looks_generated
|
||||
|
||||
|
||||
def infer_tags(filename: str, title: str) -> list[str]:
    """Derive tag labels from substrings of the filename and title.

    The filename and title are joined with a space and lowercased; each rule
    below contributes its labels when any of its needles occurs anywhere in
    that text (plain substring match, not whole-word). If no rule fires the
    single tag "family" is used. The result is sorted and free of duplicates.
    """
    haystack = f"{filename} {title}".lower()

    # (needles, labels added when any needle is a substring of haystack)
    rules = (
        (("jr", "dave jr", "baby"), ("child", "dave-jr")),
        (("me ", " me", "meand", " meand", "dave and"), ("dave",)),
        (("villanova", "nova", "rangers"), ("villanova", "sports")),
        (("gift", "gifts", "thanksgiving", "christmas", "may17"), ("holiday",)),
        (("grandma", "mom", "kevin", "erin", "noreen", "brigid"), ("family",)),
        (("trivia", "music", "marvin gaye"), ("music",)),
        (("school", "museum", "poland", "citizen"), ("outing",)),
        (("fireman", "dress", "beret", "shirt", "glasses"), ("dress-up",)),
        (("haircut", "slide", "tree", "balls", "eating"), ("playtime",)),
    )

    found: set[str] = set()
    for needles, labels in rules:
        if any(needle in haystack for needle in needles):
            found.update(labels)

    if not found:
        found.add("family")
    return sorted(found)
|
||||
|
||||
|
||||
def infer_description(filename: str, title: str) -> str:
    """Pick a starter caption from substrings of the filename and title.

    The filename and title are joined with a space and lowercased. Rules are
    tried top to bottom and the first one with a needle occurring anywhere in
    that text wins. When none match, the fallback caption depends on whether
    ``is_named_file`` considers the file human-named.
    """
    haystack = f"{filename} {title}".lower()
    human_named = is_named_file(filename, title)

    # Order matters: earlier rules take precedence over later ones.
    rules = (
        (("meanddave", "me and dave", "meandjr", "dave and dave jr"), "Dave with Dave Jr."),
        (("withdavejr", "with dave jr"), "Family snapshot with Dave Jr."),
        (("jr_nova", "villanova", "shirt villanova", "nova dave"), "Villanova-flavored family snapshot."),
        (("gift", "gifts", "thanksgiving", "may17"), "Family celebration moment."),
        (("fireman", "dress", "beret", "glasses"), "Dress-up family snapshot."),
        (("school", "museum", "poland", "citizen"), "Family outing or travel snapshot."),
        (("haircut", "slide", "tree", "balls", "eating"), "Everyday family moment."),
        (("grandma", "mom", "kevin", "erin", "noreen", "brigid"), "Family portrait moment."),
        (("jr", "dave jr"), "Family snapshot featuring Dave Jr."),
    )
    for needles, caption in rules:
        if any(needle in haystack for needle in needles):
            return caption

    if human_named:
        return "Family archive snapshot."
    return "Family archive snapshot awaiting fuller annotation."
|
||||
|
||||
|
||||
def sha1_for_file(path: Path) -> str:
    """Return the hexadecimal SHA-1 digest of the file at *path*.

    The file is read in 1 MiB pieces so large images are never held in
    memory in full.
    """
    piece_size = 1024 * 1024
    hasher = hashlib.sha1()
    with path.open("rb") as stream:
        while True:
            piece = stream.read(piece_size)
            if piece == b"":
                break
            hasher.update(piece)
    return hasher.hexdigest()
|
||||
|
||||
|
||||
def build_rows(source_dir: Path, url_prefix: str) -> list[MediaRow]:
    """Scan *source_dir* and build one ``MediaRow`` per supported image.

    Entries are visited in case-insensitive name order. Anything that is not
    a regular file, or whose lowercased suffix is not in
    ``SUPPORTED_EXTENSIONS``, is skipped. Files with identical SHA-1 digests
    are all kept; every one after the first records the first file's name in
    ``duplicate_of``.
    """
    base_url = url_prefix.rstrip("/")
    first_name_for_digest: dict[str, str] = {}
    manifest: list[MediaRow] = []

    entries = sorted(source_dir.iterdir(), key=lambda entry: entry.name.lower())
    for entry in entries:
        if not entry.is_file():
            continue
        if entry.suffix.lower() not in SUPPORTED_EXTENSIONS:
            continue

        name = entry.name
        title = humanize_stem(entry.stem)
        description = infer_description(name, title)
        tags = infer_tags(name, title)
        digest = sha1_for_file(entry)

        # The first file seen with a given digest is the original.
        earlier_name = first_name_for_digest.get(digest)
        if earlier_name is None:
            first_name_for_digest[digest] = name

        manifest.append(
            MediaRow(
                filename=name,
                relative_url=f"{base_url}/{name}",
                title=title,
                description=description,
                tags=tags,
                sha1=digest,
                file_size=entry.stat().st_size,
                duplicate_of=earlier_name,
                named_file=is_named_file(name, title),
            )
        )

    return manifest
|
||||
|
||||
|
||||
def write_json(rows: Iterable[MediaRow], target: Path) -> None:
    """Write *rows* to *target* as a UTF-8 JSON array of objects.

    Each row is converted with ``dataclasses.asdict``; output is indented by
    two spaces with non-ASCII characters escaped.
    """
    records = list(map(asdict, rows))
    serialised = json.dumps(records, indent=2, ensure_ascii=True)
    target.write_text(serialised, encoding="utf-8")
|
||||
|
||||
|
||||
def write_csv(rows: Iterable[MediaRow], target: Path) -> None:
    """Write *rows* to *target* as a UTF-8 CSV file with a header line.

    Columns follow the ``MediaRow`` field order. The ``tags`` list is stored
    in a single cell as a JSON array string.
    """
    columns = [
        "filename",
        "relative_url",
        "title",
        "description",
        "tags",
        "sha1",
        "file_size",
        "duplicate_of",
        "named_file",
    ]

    def as_record(row):
        # CSV cells are flat, so the tag list is serialised to JSON text.
        record = asdict(row)
        record["tags"] = json.dumps(record["tags"], ensure_ascii=True)
        return record

    with target.open("w", newline="", encoding="utf-8") as stream:
        writer = csv.DictWriter(stream, fieldnames=columns)
        writer.writeheader()
        writer.writerows(as_record(row) for row in rows)
|
||||
|
||||
|
||||
def sql_escape(value: str) -> str:
    """Escape *value* for use inside a single-quoted SQL string literal.

    Each backslash becomes two backslashes and each single quote becomes two
    single quotes; all other characters pass through unchanged.
    """
    substitutions = str.maketrans({"\\": "\\\\", "'": "''"})
    return value.translate(substitutions)
|
||||
|
||||
|
||||
def choose_cover_url(rows: list[MediaRow]) -> str:
    """Pick the URL of the row best suited as the album cover.

    Preference order:
    1. Among human-named, non-duplicate rows: titles containing
       "dave and dave jr" first, then titles containing "dave jr", with ties
       broken by lowercased filename.
    2. The first non-duplicate row.
    3. The first row of any kind (an empty *rows* raises ``IndexError``).
    """
    originals = [row for row in rows if not row.duplicate_of]
    named_originals = [row for row in originals if row.named_file]

    if named_originals:
        def rank(row):
            # False sorts before True, so a matching title ranks first.
            lowered_title = row.title.lower()
            return (
                "dave and dave jr" not in lowered_title,
                "dave jr" not in lowered_title,
                row.filename.lower(),
            )

        return min(named_originals, key=rank).relative_url

    if originals:
        return originals[0].relative_url
    return rows[0].relative_url
|
||||
|
||||
|
||||
def write_sql(rows: list[MediaRow], target: Path, slug: str, album_title: str, cover_url: str, unique_only: bool) -> None:
    """Write a SQL seed script for the album folder and its media rows.

    The script sets ``@folder_slug`` and ``@cover_url``, upserts one
    ``media_folders`` row (the album title is used for both ``name_en`` and
    ``name_nb``), resolves ``@folder_id`` from the slug, and then emits one
    upsert into ``media`` per selected row. Every interpolated value goes
    through ``sql_escape``.

    When *unique_only* is true, rows with a truthy ``duplicate_of`` are left
    out. The statements use ``ON DUPLICATE KEY UPDATE`` (MySQL/MariaDB
    syntax).
    """
    if unique_only:
        selected = [row for row in rows if not row.duplicate_of]
    else:
        selected = rows

    escaped_title = sql_escape(album_title)

    statements = [
        "-- Generated by build_facebook_album_manifest.py",
        f"SET @folder_slug = '{sql_escape(slug)}';",
        f"SET @cover_url = '{sql_escape(cover_url)}';",
        "",
        "INSERT INTO media_folders (slug, name_en, name_nb, description_en, description_nb, cover_image_url, active, sort_order)",
        "VALUES (",
        " @folder_slug,",
        f" '{escaped_title}',",
        f" '{escaped_title}',",
        " 'Imported Facebook family archive with starter captions generated from filenames. Descriptions should be reviewed and enriched.',",
        " 'Importert familiearkiv fra Facebook med starttekster generert fra filnavn. Beskrivelsene bor gjennomgaas og forbedres.',",
        " @cover_url,",
        " 1,",
        " 0",
        ")",
        "ON DUPLICATE KEY UPDATE",
        " name_en = VALUES(name_en),",
        " name_nb = VALUES(name_nb),",
        " description_en = VALUES(description_en),",
        " description_nb = VALUES(description_nb),",
        " cover_image_url = VALUES(cover_image_url);",
        "",
        "SET @folder_id = (SELECT id FROM media_folders WHERE slug = @folder_slug LIMIT 1);",
        "",
    ]

    for row in selected:
        # Tags are stored in one column as a JSON array string.
        encoded_tags = json.dumps(row.tags, ensure_ascii=True)
        statements += [
            "INSERT INTO media (type, filename, url, title, description, category, folder_id, tags, credit, created_at)",
            "VALUES (",
            " 'image',",
            f" '{sql_escape(row.filename)}',",
            f" '{sql_escape(row.relative_url)}',",
            f" '{sql_escape(row.title)}',",
            f" '{sql_escape(row.description)}',",
            " 'family',",
            " @folder_id,",
            f" '{sql_escape(encoded_tags)}',",
            " 'Facebook archive import',",
            " NOW()",
            ")",
            "ON DUPLICATE KEY UPDATE",
            " title = VALUES(title),",
            " description = VALUES(description),",
            " folder_id = VALUES(folder_id),",
            " tags = VALUES(tags),",
            " credit = VALUES(credit);",
            "",
        ]

    target.write_text("\n".join(statements) + "\n", encoding="utf-8")
|
||||
|
||||
|
||||
def main() -> None:
    """Parse command-line options and write the JSON, CSV and SQL manifests.

    Output files are named ``<slugified --slug>-manifest.json``,
    ``...-manifest.csv`` and ``...-seed.sql`` inside ``--output-dir``, which
    is created if missing. A short summary is printed afterwards.

    Raises:
        SystemExit: if ``--source`` is not an existing directory, or if it
            contains no supported images.
    """
    parser = argparse.ArgumentParser(description="Build a JSON/CSV/SQL manifest for a photo folder.")
    parser.add_argument("--source", required=True, help="Path to the source image folder.")
    parser.add_argument("--slug", default="facebook-060426", help="Target media folder slug.")
    parser.add_argument("--title", default="Facebook Archive / June 4 2026", help="Folder title.")
    parser.add_argument("--url-prefix", default="/uploads/facebook-060426", help="URL prefix for deployed images.")
    parser.add_argument("--output-dir", default="C:\\wamp64\\www\\davegilligan-new\\tmp", help="Directory for generated files.")
    parser.add_argument("--include-duplicates", action="store_true", help="Include duplicate hashes in the SQL seed.")
    args = parser.parse_args()

    # Validate the source before touching the filesystem, so a mistyped path
    # gives a clear message instead of a traceback from iterdir() and does
    # not leave a freshly created output directory behind.
    source_dir = Path(args.source)
    if not source_dir.is_dir():
        raise SystemExit(f"Source folder not found or not a directory: {source_dir}")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    rows = build_rows(source_dir, args.url_prefix)
    if not rows:
        raise SystemExit("No supported images found.")

    cover_url = choose_cover_url(rows)
    stem = slugify(args.slug)

    write_json(rows, output_dir / f"{stem}-manifest.json")
    write_csv(rows, output_dir / f"{stem}-manifest.csv")
    write_sql(
        rows,
        output_dir / f"{stem}-seed.sql",
        args.slug,
        args.title,
        cover_url,
        unique_only=not args.include_duplicates,
    )

    duplicate_count = sum(1 for row in rows if row.duplicate_of)
    # The SQL seed holds every row with --include-duplicates, otherwise only
    # the rows that are not duplicates.
    seeded_count = len(rows) if args.include_duplicates else len(rows) - duplicate_count

    print(f"Generated manifest for {len(rows)} files")
    print(f"Duplicate files detected: {duplicate_count}")
    print(f"Unique rows for SQL seed: {seeded_count}")
    print(f"Cover image: {cover_url}")
    print(f"Output directory: {output_dir}")
|
||||
|
||||
|
||||
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user