#!/usr/bin/env python3
from __future__ import annotations

import argparse
import csv
import hashlib
import json
import re
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Iterable


# Lowercase file suffixes (leading dot included) accepted as images.
# build_rows lowercases each path's suffix before testing membership here,
# so matching is case-insensitive with respect to the file on disk.
SUPPORTED_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".gif"}


# Whole-stem title overrides used by humanize_stem.
# The lookup key is the cleaned, lowercased stem (leading digits, long numeric
# tails and "(n)" copy markers already removed); when the entire stem equals a
# key, the mapped value is returned verbatim as the title.
# Only leading digits are stripped before the lookup, so stems that end in a
# digit (e.g. "meanddave1") need their own entries.
# NOTE(review): "meseum" looks like a misspelled source filename mapped to the
# corrected title "Museum" - confirm against the actual files.
TOKEN_REPLACEMENTS = {
    "jr": "Dave Jr",
    "nova": "Villanova",
    "meanddave": "Dave And Dave Jr",
    "meanddave1": "Dave And Dave Jr",
    "meandjr": "Dave And Dave Jr",
    "meandjr2": "Dave And Dave Jr",
    "withdavejr": "With Dave Jr",
    "withdavejr1": "With Dave Jr",
    "witherin": "With Erin",
    "withkevin": "With Kevin",
    "withmom": "With Mom",
    "withgrandma": "With Grandma",
    "jr_brigid": "Dave Jr And Brigid",
    "jr_grandma": "Dave Jr And Grandma",
    "jr_nova": "Dave Jr Villanova",
    "jr_trivia": "Dave Jr Trivia",
    "four_jr": "Dave Jr At Four",
    "four_jr2": "Dave Jr At Four",
    "shirt_nova": "Shirt Villanova",
    "me_nova": "Me Villanova",
    "citizen": "Citizen",
    "meseum": "Museum",
    "me_trivia": "Me Trivia",
}

# Matches a stem that starts with six or more digits followed by an underscore.
# is_named_file treats such stems as machine-generated rather than hand-named
# (presumably the naming used by Facebook photo exports - confirm).
KNOWN_FACEBOOK_PREFIX_RE = re.compile(r"^\d{6,}_")


# Pairs of (set of keyword tokens, description sentence).
# NOTE(review): no function visible in this file reads this table;
# infer_description carries its own inline token checks and returns different
# wording. Confirm whether this is used elsewhere or is leftover data before
# relying on or removing it.
DESCRIPTION_RULES = [
    ({"villanova", "nova", "jr"}, "Dave Jr in Villanova gear."),
    ({"meanddave", "meandjr", "withdavejr"}, "Dave with Dave Jr."),
    ({"dave", "dad", "me"}, "Family moment with Dave."),
    ({"jr"}, "Family snapshot featuring Dave Jr."),
    ({"grandma", "mom", "kevin", "erin", "noreen"}, "Family portrait moment."),
    ({"gift", "gifts"}, "Gift-opening or celebration moment."),
    ({"school"}, "School-related family snapshot."),
    ({"thanksgiving"}, "Holiday family gathering moment."),
    ({"trivia"}, "Trivia and music related family snapshot."),
    ({"poland"}, "Travel memory from Poland."),
    ({"museum"}, "Museum or outing snapshot."),
    ({"fireman"}, "Dress-up or costume moment."),
]


@dataclass
class MediaRow:
    """One manifest entry describing a single image file.

    build_rows creates one instance per supported image; write_json,
    write_csv and write_sql serialise these fields.
    """

    # File name including its extension, without any directory component.
    filename: str
    # URL prefix (trailing slashes removed) joined to the file name with "/".
    relative_url: str
    # Display title produced by humanize_stem from the file stem.
    title: str
    # Caption sentence produced by infer_description.
    description: str
    # Sorted, de-duplicated tag names produced by infer_tags.
    tags: list[str]
    # Hexadecimal SHA-1 digest of the file contents.
    sha1: str
    # Size reported by Path.stat().st_size.
    file_size: int
    # File name of the first row seen with the same digest, or None when this
    # row is the first occurrence of its content.
    duplicate_of: str | None = None
    # Result of is_named_file for this file name and title.
    named_file: bool = False


def slugify(text: str) -> str:
    """Return *text* as a lowercase, hyphen-separated slug.

    Every run of characters outside ``a-z0-9`` (after lowercasing) acts as a
    single separator, and the result never starts or ends with a hyphen.
    An input with no ASCII letters or digits yields the empty string.
    """
    # Splitting on the separator runs leaves empty pieces only at the edges;
    # dropping them is what removes leading and trailing hyphens.
    pieces = re.split(r"[^a-z0-9]+", text.lower())
    return "-".join(piece for piece in pieces if piece)


def humanize_stem(stem: str) -> str:
    """Turn a raw file stem into a human-readable display title.

    Cleaning steps, in order:

    1. Strip surrounding whitespace and any leading run of digits.
    2. Drop everything from the first ``_`` followed by six or more digits.
    3. Remove parenthesised copy counters such as ``(1)`` or ``(12)``.
    4. If the whole lowercased remainder is a key of ``TOKEN_REPLACEMENTS``,
       return that override unchanged.
    5. Otherwise treat ``_`` and ``-`` as spaces and title each word, mapping
       ``jr``/``jr.`` to ``Dave Jr`` and ``nova``/``villanova`` to
       ``Villanova``.

    Args:
        stem: File name without its extension.

    Returns:
        The display title, or ``"Untitled Photo"`` when nothing usable is
        left after cleaning.
    """
    cleaned = stem.strip()
    cleaned = re.sub(r"^\d+", "", cleaned)
    cleaned = re.sub(r"_\d{6,}.*$", "", cleaned)
    # Copy counters are matched generically instead of listing (1)..(7) by
    # hand; the 1-3 digit limit keeps four-digit years like "(2019)" intact.
    cleaned = re.sub(r"\(\d{1,3}\)", "", cleaned)

    lowered = cleaned.lower().strip()
    if lowered in TOKEN_REPLACEMENTS:
        return TOKEN_REPLACEMENTS[lowered]

    # str.split() with no argument already collapses whitespace runs and
    # ignores leading/trailing whitespace.
    raw_words = lowered.replace("_", " ").replace("-", " ").split()
    if not raw_words:
        return "Untitled Photo"

    special_words = {
        "jr": "Dave Jr",
        "jr.": "Dave Jr",
        "nova": "Villanova",
        "villanova": "Villanova",
    }
    return " ".join(special_words.get(word, word.capitalize()) for word in raw_words)


def is_named_file(filename: str, title: str) -> bool:
    """Report whether a file appears to carry a hand-chosen name.

    A file is considered *not* named when any of the following holds:

    * its title is the ``"Untitled Photo"`` placeholder,
    * its lowercased stem starts with ``"unnamed"``,
    * its lowercased stem matches ``KNOWN_FACEBOOK_PREFIX_RE``.

    Args:
        filename: File name (extension included).
        title: Display title already derived for the file.

    Returns:
        ``True`` when none of the conditions above apply.
    """
    stem_lower = Path(filename).stem.lower()
    # The checks short-circuit in the listed order, cheapest first.
    looks_auto_named = (
        title == "Untitled Photo"
        or stem_lower.startswith("unnamed")
        or KNOWN_FACEBOOK_PREFIX_RE.match(stem_lower) is not None
    )
    return not looks_auto_named


def infer_tags(filename: str, title: str) -> list[str]:
    """Derive tag names from keywords found in the file name and title.

    The file name and title are joined with a space and lowercased; each rule
    fires when any of its keywords occurs as a plain substring of that text
    (so a keyword also matches inside a longer word). When no rule fires the
    single tag ``"family"`` is used.

    Args:
        filename: File name (extension included).
        title: Display title derived for the file.

    Returns:
        The distinct tag names in sorted order.
    """
    haystack = f"{filename} {title}".lower()

    # (keywords, tags contributed when any keyword is present)
    keyword_rules = (
        (("jr", "dave jr", "baby"), ("child", "dave-jr")),
        (("me ", " me", "meand", " meand", "dave and"), ("dave",)),
        (("villanova", "nova", "rangers"), ("villanova", "sports")),
        (("gift", "gifts", "thanksgiving", "christmas", "may17"), ("holiday",)),
        (("grandma", "mom", "kevin", "erin", "noreen", "brigid"), ("family",)),
        (("trivia", "music", "marvin gaye"), ("music",)),
        (("school", "museum", "poland", "citizen"), ("outing",)),
        (("fireman", "dress", "beret", "shirt", "glasses"), ("dress-up",)),
        (("haircut", "slide", "tree", "balls", "eating"), ("playtime",)),
    )

    found: set[str] = set()
    for keywords, labels in keyword_rules:
        if any(keyword in haystack for keyword in keywords):
            found.update(labels)

    if not found:
        found.add("family")
    return sorted(found)


def infer_description(filename: str, title: str) -> str:
    """Pick a starter caption from keywords in the file name and title.

    The file name and title are joined with a space and lowercased. Rules are
    tried from top to bottom and the first rule with any keyword occurring as
    a substring of that text decides the caption. When no rule matches, the
    caption depends on whether ``is_named_file`` considers the file named.

    Args:
        filename: File name (extension included).
        title: Display title derived for the file.

    Returns:
        A one-sentence caption.
    """
    haystack = f"{filename} {title}".lower()
    hand_named = is_named_file(filename, title)

    # Order matters: earlier rules win over later, more general ones.
    ordered_rules = (
        (("meanddave", "me and dave", "meandjr", "dave and dave jr"), "Dave with Dave Jr."),
        (("withdavejr", "with dave jr"), "Family snapshot with Dave Jr."),
        (("jr_nova", "villanova", "shirt villanova", "nova dave"), "Villanova-flavored family snapshot."),
        (("gift", "gifts", "thanksgiving", "may17"), "Family celebration moment."),
        (("fireman", "dress", "beret", "glasses"), "Dress-up family snapshot."),
        (("school", "museum", "poland", "citizen"), "Family outing or travel snapshot."),
        (("haircut", "slide", "tree", "balls", "eating"), "Everyday family moment."),
        (("grandma", "mom", "kevin", "erin", "noreen", "brigid"), "Family portrait moment."),
        (("jr", "dave jr"), "Family snapshot featuring Dave Jr."),
    )
    for keywords, caption in ordered_rules:
        if any(keyword in haystack for keyword in keywords):
            return caption

    if hand_named:
        return "Family archive snapshot."
    return "Family archive snapshot awaiting fuller annotation."


def sha1_for_file(path: Path) -> str:
    """Return the hexadecimal SHA-1 digest of the file at *path*.

    The file is read in binary mode in 1 MiB pieces so that it is never held
    in memory as a whole.
    """
    hasher = hashlib.sha1()
    piece_size = 1024 * 1024
    with path.open("rb") as stream:
        while True:
            piece = stream.read(piece_size)
            if not piece:
                # An empty read marks end of file.
                break
            hasher.update(piece)
    return hasher.hexdigest()


def build_rows(source_dir: Path, url_prefix: str) -> list[MediaRow]:
    """Scan *source_dir* and build one ``MediaRow`` per supported image.

    Direct children of the folder are visited in case-insensitive name order;
    entries that are not regular files or whose lowercased suffix is not in
    ``SUPPORTED_EXTENSIONS`` are skipped. A file whose SHA-1 digest was
    already seen gets ``duplicate_of`` set to the name of the first file with
    that digest.

    Args:
        source_dir: Folder to scan (not recursive).
        url_prefix: Prefix for ``relative_url``; trailing slashes are removed
            before the file name is appended.

    Returns:
        The rows in visiting order; empty when no supported image is found.
    """
    collected: list[MediaRow] = []
    original_name_by_digest: dict[str, str] = {}

    entries = sorted(source_dir.iterdir(), key=lambda entry: entry.name.lower())
    for entry in entries:
        is_supported_image = entry.is_file() and entry.suffix.lower() in SUPPORTED_EXTENSIONS
        if not is_supported_image:
            continue

        display_title = humanize_stem(entry.stem)
        caption = infer_description(entry.name, display_title)
        labels = infer_tags(entry.name, display_title)
        checksum = sha1_for_file(entry)

        # Only the first file with a given digest is recorded as the original.
        earlier_copy = original_name_by_digest.get(checksum)
        if earlier_copy is None:
            original_name_by_digest[checksum] = entry.name

        row = MediaRow(
            filename=entry.name,
            relative_url=f"{url_prefix.rstrip('/')}/{entry.name}",
            title=display_title,
            description=caption,
            tags=labels,
            sha1=checksum,
            file_size=entry.stat().st_size,
            duplicate_of=earlier_copy,
            named_file=is_named_file(entry.name, display_title),
        )
        collected.append(row)

    return collected


def write_json(rows: Iterable[MediaRow], target: Path) -> None:
    """Write *rows* to *target* as a JSON array of objects.

    Each row is converted with ``dataclasses.asdict``. The output is indented
    by two spaces, restricted to ASCII (non-ASCII characters are escaped) and
    written as UTF-8, replacing any existing file.
    """
    records = list(map(asdict, rows))
    serialized = json.dumps(records, indent=2, ensure_ascii=True)
    target.write_text(serialized, encoding="utf-8")


def write_csv(rows: Iterable[MediaRow], target: Path) -> None:
    """Write *rows* to *target* as a UTF-8 CSV file with a header line.

    Columns follow the ``MediaRow`` field order. The ``tags`` list is stored
    in a single cell as an ASCII-only JSON array string.
    """
    columns = [
        "filename",
        "relative_url",
        "title",
        "description",
        "tags",
        "sha1",
        "file_size",
        "duplicate_of",
        "named_file",
    ]

    def as_record(row: MediaRow) -> dict:
        # A list cannot go into a CSV cell directly, so encode it as JSON.
        record = asdict(row)
        record["tags"] = json.dumps(record["tags"], ensure_ascii=True)
        return record

    # newline="" lets the csv module control line endings itself.
    with target.open("w", newline="", encoding="utf-8") as sink:
        sheet = csv.DictWriter(sink, fieldnames=columns)
        sheet.writeheader()
        sheet.writerows(as_record(row) for row in rows)


def sql_escape(value: str) -> str:
    """Escape *value* for use inside a single-quoted SQL string literal.

    Each backslash is doubled and each single quote is doubled; all other
    characters pass through unchanged. The caller supplies the surrounding
    quotes.
    """
    # One pass over the text; neither replacement produces the other's
    # trigger character, so this equals doing the two substitutions in turn.
    escapes = str.maketrans({"\\": "\\\\", "'": "''"})
    return value.translate(escapes)


def choose_cover_url(rows: list[MediaRow]) -> str:
    """Select the URL of the image to use as the album cover.

    Preference order:

    1. Among rows that are named and not duplicates: titles containing
       "dave and dave jr" first, then titles containing "dave jr", then the
       rest; ties are broken by lowercased file name.
    2. Otherwise the first row that is not a duplicate.
    3. Otherwise the first row.

    Args:
        rows: Candidate rows; must not be empty.

    Returns:
        The ``relative_url`` of the chosen row.
    """
    originals = [row for row in rows if not row.duplicate_of]
    named_originals = [row for row in originals if row.named_file]

    if named_originals:

        def cover_rank(row: MediaRow) -> tuple:
            lowered_title = row.title.lower()
            # False sorts before True, so a present phrase ranks first.
            return (
                "dave and dave jr" not in lowered_title,
                "dave jr" not in lowered_title,
                row.filename.lower(),
            )

        return min(named_originals, key=cover_rank).relative_url

    if originals:
        return originals[0].relative_url
    return rows[0].relative_url


def _folder_upsert_lines(slug: str, album_title: str, cover_url: str) -> list[str]:
    """Return the SQL lines that upsert the media folder and capture its id."""
    return [
        "-- Generated by build_facebook_album_manifest.py",
        f"SET @folder_slug = '{sql_escape(slug)}';",
        f"SET @cover_url = '{sql_escape(cover_url)}';",
        "",
        "INSERT INTO media_folders (slug, name_en, name_nb, description_en, description_nb, cover_image_url, active, sort_order)",
        "VALUES (",
        "  @folder_slug,",
        f"  '{sql_escape(album_title)}',",
        f"  '{sql_escape(album_title)}',",
        "  'Imported Facebook family archive with starter captions generated from filenames. Descriptions should be reviewed and enriched.',",
        "  'Importert familiearkiv fra Facebook med starttekster generert fra filnavn. Beskrivelsene bor gjennomgaas og forbedres.',",
        "  @cover_url,",
        "  1,",
        "  0",
        ")",
        "ON DUPLICATE KEY UPDATE",
        "  name_en = VALUES(name_en),",
        "  name_nb = VALUES(name_nb),",
        "  description_en = VALUES(description_en),",
        "  description_nb = VALUES(description_nb),",
        "  cover_image_url = VALUES(cover_image_url);",
        "",
        "SET @folder_id = (SELECT id FROM media_folders WHERE slug = @folder_slug LIMIT 1);",
        "",
    ]


def _media_upsert_lines(row: MediaRow) -> list[str]:
    """Return the SQL lines that upsert one media row, ending with a blank line."""
    # Tags are stored as an ASCII-only JSON array inside a SQL string literal.
    tags_json = json.dumps(row.tags, ensure_ascii=True)
    return [
        "INSERT INTO media (type, filename, url, title, description, category, folder_id, tags, credit, created_at)",
        "VALUES (",
        "  'image',",
        f"  '{sql_escape(row.filename)}',",
        f"  '{sql_escape(row.relative_url)}',",
        f"  '{sql_escape(row.title)}',",
        f"  '{sql_escape(row.description)}',",
        "  'family',",
        "  @folder_id,",
        f"  '{sql_escape(tags_json)}',",
        "  'Facebook archive import',",
        "  NOW()",
        ")",
        "ON DUPLICATE KEY UPDATE",
        "  title = VALUES(title),",
        "  description = VALUES(description),",
        "  folder_id = VALUES(folder_id),",
        "  tags = VALUES(tags),",
        "  credit = VALUES(credit);",
        "",
    ]


def write_sql(rows: list[MediaRow], target: Path, slug: str, album_title: str, cover_url: str, unique_only: bool) -> None:
    """Write a SQL seed script for the album folder and its images.

    The script first upserts one ``media_folders`` record identified by
    *slug*, stores its id in ``@folder_id``, then upserts one ``media``
    record per selected row. All interpolated values pass through
    ``sql_escape``. The syntax used (``ON DUPLICATE KEY UPDATE``, ``@``
    user variables) is MySQL-style.

    Args:
        rows: All manifest rows.
        target: File to write (UTF-8, overwritten if present).
        slug: Folder slug used as the lookup key.
        album_title: Folder title, written to both name columns.
        cover_url: URL stored as the folder's cover image.
        unique_only: When true, rows with ``duplicate_of`` set are left out.
    """
    selected = rows
    if unique_only:
        selected = [row for row in rows if not row.duplicate_of]

    script_lines = _folder_upsert_lines(slug, album_title, cover_url)
    for row in selected:
        script_lines += _media_upsert_lines(row)

    target.write_text("\n".join(script_lines) + "\n", encoding="utf-8")


def main(argv: list[str] | None = None) -> None:
    """Command-line entry point: build JSON, CSV and SQL manifests for a folder.

    Parses the options, scans the source folder, picks a cover image, writes
    ``<slug>-manifest.json``, ``<slug>-manifest.csv`` and ``<slug>-seed.sql``
    into the output directory, and prints a short summary.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behaviour.

    Raises:
        SystemExit: If the source path is not an existing directory or it
            contains no supported images (and on argument errors, via
            argparse).
    """
    parser = argparse.ArgumentParser(description="Build a JSON/CSV/SQL manifest for a photo folder.")
    parser.add_argument("--source", required=True, help="Path to the source image folder.")
    parser.add_argument("--slug", default="facebook-060426", help="Target media folder slug.")
    parser.add_argument("--title", default="Facebook Archive / June 4 2026", help="Folder title.")
    parser.add_argument("--url-prefix", default="/uploads/facebook-060426", help="URL prefix for deployed images.")
    # NOTE(review): this default is a machine-specific Windows path; on other
    # systems pass --output-dir explicitly.
    parser.add_argument("--output-dir", default="C:\\wamp64\\www\\davegilligan-new\\tmp", help="Directory for generated files.")
    parser.add_argument("--include-duplicates", action="store_true", help="Include duplicate hashes in the SQL seed.")
    args = parser.parse_args(argv)

    # Validate the input before touching the filesystem, so a mistyped path
    # gives a clear message instead of a traceback and leaves no stray
    # output directory behind.
    source_dir = Path(args.source)
    if not source_dir.is_dir():
        raise SystemExit(f"Source folder not found or not a directory: {source_dir}")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    rows = build_rows(source_dir, args.url_prefix)
    if not rows:
        raise SystemExit("No supported images found.")

    cover_url = choose_cover_url(rows)
    # File names use the slugified form; the SQL keeps the slug as given.
    stem = slugify(args.slug)

    write_json(rows, output_dir / f"{stem}-manifest.json")
    write_csv(rows, output_dir / f"{stem}-manifest.csv")
    write_sql(
        rows,
        output_dir / f"{stem}-seed.sql",
        args.slug,
        args.title,
        cover_url,
        unique_only=not args.include_duplicates,
    )

    duplicate_count = sum(1 for row in rows if row.duplicate_of)
    # Duplicates are dropped from the SQL seed unless explicitly requested.
    sql_row_count = len(rows) if args.include_duplicates else len(rows) - duplicate_count
    print(f"Generated manifest for {len(rows)} files")
    print(f"Duplicate files detected: {duplicate_count}")
    print(f"Unique rows for SQL seed: {sql_row_count}")
    print(f"Cover image: {cover_url}")
    print(f"Output directory: {output_dir}")


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()