#!/usr/bin/env python3
# Build a JSON/CSV/SQL manifest for a folder of photos.
#
# This part of the script holds the filename heuristics: turning a file stem
# into a human-readable title, and deriving starter tags and a starter
# description from plain substring matches on the filename and title.
from __future__ import annotations

import argparse
import csv
import hashlib
import json
import re
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Iterable

# File suffixes (compared lowercased) that are treated as images.
SUPPORTED_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".gif"}

# Whole-stem overrides: when the cleaned, lowercased stem equals a key, the
# mapped value is used as the title verbatim.
TOKEN_REPLACEMENTS = {
    "jr": "Dave Jr",
    "nova": "Villanova",
    "meanddave": "Dave And Dave Jr",
    "meanddave1": "Dave And Dave Jr",
    "meandjr": "Dave And Dave Jr",
    "meandjr2": "Dave And Dave Jr",
    "withdavejr": "With Dave Jr",
    "withdavejr1": "With Dave Jr",
    "witherin": "With Erin",
    "withkevin": "With Kevin",
    "withmom": "With Mom",
    "withgrandma": "With Grandma",
    "jr_brigid": "Dave Jr And Brigid",
    "jr_grandma": "Dave Jr And Grandma",
    "jr_nova": "Dave Jr Villanova",
    "jr_trivia": "Dave Jr Trivia",
    "four_jr": "Dave Jr At Four",
    "four_jr2": "Dave Jr At Four",
    "shirt_nova": "Shirt Villanova",
    "me_nova": "Me Villanova",
    "citizen": "Citizen",
    "meseum": "Museum",
    "me_trivia": "Me Trivia",
}

# A stem that starts with six or more digits followed by "_" is treated as an
# auto-generated (unnamed) export name.
KNOWN_FACEBOOK_PREFIX_RE = re.compile(r"^\d{6,}_")

# NOTE(review): nothing in the code visible here reads DESCRIPTION_RULES;
# infer_description() uses its own ordered rules. Kept unchanged in case other
# code depends on it.
DESCRIPTION_RULES = [
    ({"villanova", "nova", "jr"}, "Dave Jr in Villanova gear."),
    ({"meanddave", "meandjr", "withdavejr"}, "Dave with Dave Jr."),
    ({"dave", "dad", "me"}, "Family moment with Dave."),
    ({"jr"}, "Family snapshot featuring Dave Jr."),
    ({"grandma", "mom", "kevin", "erin", "noreen"}, "Family portrait moment."),
    ({"gift", "gifts"}, "Gift-opening or celebration moment."),
    ({"school"}, "School-related family snapshot."),
    ({"thanksgiving"}, "Holiday family gathering moment."),
    ({"trivia"}, "Trivia and music related family snapshot."),
    ({"poland"}, "Travel memory from Poland."),
    ({"museum"}, "Museum or outing snapshot."),
    ({"fireman"}, "Dress-up or costume moment."),
]

# Parenthesised copy counters such as "(1)" or "(12)". Limited to three digits
# so that a four-digit year like "(2019)" stays in the title.
_COPY_SUFFIX_RE = re.compile(r"\(\d{1,3}\)")

# (substrings, tags): every rule with at least one substring present in the
# lowercased "filename title" text contributes its tags.
_TAG_RULES = (
    (("jr", "dave jr", "baby"), ("child", "dave-jr")),
    (("me ", " me", "meand", " meand", "dave and"), ("dave",)),
    (("villanova", "nova", "rangers"), ("villanova", "sports")),
    (("gift", "gifts", "thanksgiving", "christmas", "may17"), ("holiday",)),
    (("grandma", "mom", "kevin", "erin", "noreen", "brigid"), ("family",)),
    (("trivia", "music", "marvin gaye"), ("music",)),
    (("school", "museum", "poland", "citizen"), ("outing",)),
    (("fireman", "dress", "beret", "shirt", "glasses"), ("dress-up",)),
    (("haircut", "slide", "tree", "balls", "eating"), ("playtime",)),
)

# (substrings, description): rules are tried in order and the first rule with
# a matching substring wins, so the order is significant.
_DESCRIPTION_SUBSTRING_RULES = (
    (("meanddave", "me and dave", "meandjr", "dave and dave jr"), "Dave with Dave Jr."),
    (("withdavejr", "with dave jr"), "Family snapshot with Dave Jr."),
    (("jr_nova", "villanova", "shirt villanova", "nova dave"), "Villanova-flavored family snapshot."),
    (("gift", "gifts", "thanksgiving", "may17"), "Family celebration moment."),
    (("fireman", "dress", "beret", "glasses"), "Dress-up family snapshot."),
    (("school", "museum", "poland", "citizen"), "Family outing or travel snapshot."),
    (("haircut", "slide", "tree", "balls", "eating"), "Everyday family moment."),
    (("grandma", "mom", "kevin", "erin", "noreen", "brigid"), "Family portrait moment."),
    (("jr", "dave jr"), "Family snapshot featuring Dave Jr."),
)


@dataclass
class MediaRow:
    """One image file together with its generated metadata."""

    filename: str  # bare file name, including the extension
    relative_url: str  # URL prefix joined with the file name
    title: str  # title derived from the file stem
    description: str  # starter caption derived from filename and title
    tags: list[str]  # sorted, de-duplicated tag list
    sha1: str  # hex SHA-1 digest of the file contents
    file_size: int  # size in bytes
    duplicate_of: str | None = None  # name of an earlier file with the same digest
    named_file: bool = False  # True when the file name looks hand-assigned


def slugify(text: str) -> str:
    """Return *text* lowercased with each non-alphanumeric run replaced by "-".

    Leading and trailing dashes are removed; the result may be empty.
    """
    dashed = re.sub(r"[^a-z0-9]+", "-", text.lower())
    return dashed.strip("-")


def humanize_stem(stem: str) -> str:
    """Turn a file stem into a display title.

    Leading digits, everything from the first "_" followed by six or more
    digits, and parenthesised copy counters such as "(2)" or "(12)" are
    removed. If the remaining lowercased stem is a key of TOKEN_REPLACEMENTS
    the mapped title is returned; otherwise underscores and dashes become
    spaces and each word is capitalised, with "jr" rendered as "Dave Jr" and
    "nova"/"villanova" as "Villanova".

    Returns "Untitled Photo" when nothing is left after cleaning.
    """
    cleaned = stem.strip()
    cleaned = re.sub(r"^\d+", "", cleaned)
    cleaned = re.sub(r"_\d{6,}.*$", "", cleaned)
    # Strip copy counters before the lookup so "withmom (12)" still resolves
    # to the "withmom" override.
    cleaned = _COPY_SUFFIX_RE.sub("", cleaned)

    lowered = cleaned.lower().strip()
    if lowered in TOKEN_REPLACEMENTS:
        return TOKEN_REPLACEMENTS[lowered]

    lowered = lowered.replace("_", " ").replace("-", " ")
    lowered = re.sub(r"\s+", " ", lowered).strip()
    if not lowered:
        return "Untitled Photo"

    words: list[str] = []
    for word in lowered.split():
        if word in {"jr", "jr."}:
            words.append("Dave Jr")
        elif word in {"nova", "villanova"}:
            words.append("Villanova")
        else:
            words.append(word.capitalize())
    return " ".join(words)


def is_named_file(filename: str, title: str) -> bool:
    """Return True when *filename* looks like it was named by a person.

    A file counts as unnamed when its title is "Untitled Photo", when its
    lowercased stem starts with "unnamed", or when the stem matches
    KNOWN_FACEBOOK_PREFIX_RE.
    """
    if title == "Untitled Photo":
        return False
    stem = Path(filename).stem.lower()
    if stem.startswith("unnamed"):
        return False
    return KNOWN_FACEBOOK_PREFIX_RE.match(stem) is None


def infer_tags(filename: str, title: str) -> list[str]:
    """Return the sorted, de-duplicated tags suggested by *filename* and *title*.

    Matching is a plain substring test on the lowercased "filename title"
    text, so a token such as "mom" also matches inside "moment". When no rule
    matches, the single tag "family" is returned.
    """
    text = f"{filename} {title}".lower()
    tags: set[str] = set()
    for tokens, rule_tags in _TAG_RULES:
        if any(token in text for token in tokens):
            tags.update(rule_tags)
    if not tags:
        tags.add("family")
    return sorted(tags)


def infer_description(filename: str, title: str) -> str:
    """Return a starter caption for the image.

    The ordered substring rules are tried against the lowercased
    "filename title" text and the first match wins. When nothing matches, the
    fallback caption depends on whether is_named_file() considers the file
    hand-named.
    """
    text = f"{filename} {title}".lower()
    for tokens, description in _DESCRIPTION_SUBSTRING_RULES:
        if any(token in text for token in tokens):
            return description
    if not is_named_file(filename, title):
        return "Family archive snapshot awaiting fuller annotation."
    return "Family archive snapshot."
def sha1_for_file(path: Path) -> str:
    """Return the hex SHA-1 digest of the file at *path*.

    The file is read in 1 MiB chunks so large images are never loaded whole.
    """
    digest = hashlib.sha1()
    with path.open("rb") as handle:
        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest()


def build_rows(source_dir: Path, url_prefix: str) -> list[MediaRow]:
    """Build one MediaRow per supported image directly inside *source_dir*.

    Entries are visited in case-insensitive name order. The first file seen
    with a given SHA-1 digest is the canonical copy; every later file with the
    same digest gets ``duplicate_of`` set to that first file's name.
    """
    rows: list[MediaRow] = []
    first_seen_by_hash: dict[str, str] = {}
    base_url = url_prefix.rstrip("/")

    for path in sorted(source_dir.iterdir(), key=lambda p: p.name.lower()):
        if not path.is_file() or path.suffix.lower() not in SUPPORTED_EXTENSIONS:
            continue

        title = humanize_stem(path.stem)
        digest = sha1_for_file(path)
        duplicate_of = first_seen_by_hash.get(digest)
        if duplicate_of is None:
            first_seen_by_hash[digest] = path.name

        rows.append(
            MediaRow(
                filename=path.name,
                relative_url=f"{base_url}/{path.name}",
                title=title,
                description=infer_description(path.name, title),
                tags=infer_tags(path.name, title),
                sha1=digest,
                file_size=path.stat().st_size,
                duplicate_of=duplicate_of,
                named_file=is_named_file(path.name, title),
            )
        )
    return rows


def write_json(rows: Iterable[MediaRow], target: Path) -> None:
    """Write *rows* to *target* as an indented, ASCII-only JSON array."""
    payload = [asdict(row) for row in rows]
    target.write_text(json.dumps(payload, indent=2, ensure_ascii=True), encoding="utf-8")


def write_csv(rows: Iterable[MediaRow], target: Path) -> None:
    """Write *rows* to *target* as UTF-8 CSV with a header row.

    The ``tags`` list is stored as a JSON string so it fits in one cell.
    """
    fieldnames = [
        "filename",
        "relative_url",
        "title",
        "description",
        "tags",
        "sha1",
        "file_size",
        "duplicate_of",
        "named_file",
    ]
    # newline="" lets the csv module control line endings itself.
    with target.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        for row in rows:
            data = asdict(row)
            data["tags"] = json.dumps(data["tags"], ensure_ascii=True)
            writer.writerow(data)


def sql_escape(value: str) -> str:
    """Escape *value* for use inside a single-quoted SQL string literal.

    Backslashes are doubled first, then single quotes are doubled.
    """
    return value.replace("\\", "\\\\").replace("'", "''")


def choose_cover_url(rows: list[MediaRow]) -> str:
    """Pick the URL of the image to use as the folder cover.

    Among named, non-duplicate rows, titles containing "dave and dave jr" rank
    first, then titles containing "dave jr", then everything else; ties are
    broken by lowercased filename. Without any such row the first
    non-duplicate row is used, and failing that the first row.

    Raises:
        IndexError: if *rows* is empty.
    """

    def cover_rank(row: MediaRow) -> tuple[int, int, str]:
        title = row.title.lower()
        return (
            0 if "dave and dave jr" in title else 1,
            0 if "dave jr" in title else 1,
            row.filename.lower(),
        )

    preferred_rows = [row for row in rows if row.named_file and not row.duplicate_of]
    if preferred_rows:
        return min(preferred_rows, key=cover_rank).relative_url

    for row in rows:
        if not row.duplicate_of:
            return row.relative_url
    return rows[0].relative_url


def write_sql(rows: list[MediaRow], target: Path, slug: str, album_title: str, cover_url: str, unique_only: bool) -> None:
    """Write a SQL seed script for the folder and its images to *target*.

    The script upserts one ``media_folders`` row and one ``media`` row per
    image using ``INSERT ... ON DUPLICATE KEY UPDATE`` and ``@`` user
    variables (presumably targeting MySQL/MariaDB; confirm against the
    deployment database). With *unique_only* set, rows flagged as duplicates
    are left out.
    """
    rows_to_write = [row for row in rows if not row.duplicate_of] if unique_only else rows

    lines = [
        "-- Generated by build_facebook_album_manifest.py",
        f"SET @folder_slug = '{sql_escape(slug)}';",
        f"SET @cover_url = '{sql_escape(cover_url)}';",
        "",
        "INSERT INTO media_folders (slug, name_en, name_nb, description_en, description_nb, cover_image_url, active, sort_order)",
        "VALUES (",
        " @folder_slug,",
        f" '{sql_escape(album_title)}',",
        f" '{sql_escape(album_title)}',",
        " 'Imported Facebook family archive with starter captions generated from filenames. Descriptions should be reviewed and enriched.',",
        " 'Importert familiearkiv fra Facebook med starttekster generert fra filnavn. Beskrivelsene bor gjennomgaas og forbedres.',",
        " @cover_url,",
        " 1,",
        " 0",
        ")",
        "ON DUPLICATE KEY UPDATE",
        " name_en = VALUES(name_en),",
        " name_nb = VALUES(name_nb),",
        " description_en = VALUES(description_en),",
        " description_nb = VALUES(description_nb),",
        " cover_image_url = VALUES(cover_image_url);",
        "",
        "SET @folder_id = (SELECT id FROM media_folders WHERE slug = @folder_slug LIMIT 1);",
        "",
    ]

    for row in rows_to_write:
        tags_json = json.dumps(row.tags, ensure_ascii=True)
        lines.extend(
            [
                "INSERT INTO media (type, filename, url, title, description, category, folder_id, tags, credit, created_at)",
                "VALUES (",
                " 'image',",
                f" '{sql_escape(row.filename)}',",
                f" '{sql_escape(row.relative_url)}',",
                f" '{sql_escape(row.title)}',",
                f" '{sql_escape(row.description)}',",
                " 'family',",
                " @folder_id,",
                f" '{sql_escape(tags_json)}',",
                " 'Facebook archive import',",
                " NOW()",
                ")",
                "ON DUPLICATE KEY UPDATE",
                " title = VALUES(title),",
                " description = VALUES(description),",
                " folder_id = VALUES(folder_id),",
                " tags = VALUES(tags),",
                " credit = VALUES(credit);",
                "",
            ]
        )

    target.write_text("\n".join(lines) + "\n", encoding="utf-8")


def main() -> None:
    """Parse the command line and write the JSON, CSV and SQL manifests.

    Raises:
        SystemExit: if ``--source`` is not an existing directory, or if it
            contains no supported images.
    """
    parser = argparse.ArgumentParser(description="Build a JSON/CSV/SQL manifest for a photo folder.")
    parser.add_argument("--source", required=True, help="Path to the source image folder.")
    parser.add_argument("--slug", default="facebook-060426", help="Target media folder slug.")
    parser.add_argument("--title", default="Facebook Archive / June 4 2026", help="Folder title.")
    parser.add_argument("--url-prefix", default="/uploads/facebook-060426", help="URL prefix for deployed images.")
    parser.add_argument("--output-dir", default="C:\\wamp64\\www\\davegilligan-new\\tmp", help="Directory for generated files.")
    parser.add_argument("--include-duplicates", action="store_true", help="Include duplicate hashes in the SQL seed.")
    args = parser.parse_args()

    # Validate the input before creating anything on disk, so a mistyped path
    # produces a clear message instead of a traceback and a stray directory.
    source_dir = Path(args.source)
    if not source_dir.is_dir():
        raise SystemExit(f"Source folder not found or not a directory: {source_dir}")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    rows = build_rows(source_dir, args.url_prefix)
    if not rows:
        raise SystemExit("No supported images found.")

    cover_url = choose_cover_url(rows)
    stem = slugify(args.slug)
    write_json(rows, output_dir / f"{stem}-manifest.json")
    write_csv(rows, output_dir / f"{stem}-manifest.csv")
    write_sql(
        rows,
        output_dir / f"{stem}-seed.sql",
        args.slug,
        args.title,
        cover_url,
        unique_only=not args.include_duplicates,
    )

    duplicate_count = sum(1 for row in rows if row.duplicate_of)
    seed_row_count = len(rows) if args.include_duplicates else len(rows) - duplicate_count
    print(f"Generated manifest for {len(rows)} files")
    print(f"Duplicate files detected: {duplicate_count}")
    print(f"Unique rows for SQL seed: {seed_row_count}")
    print(f"Cover image: {cover_url}")
    print(f"Output directory: {output_dir}")


if __name__ == "__main__":
    main()