prepare( 'SELECT COUNT(*) FROM information_schema.COLUMNS WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = ? AND COLUMN_NAME = ?' ); $stmt->execute([$table, $column]); return (int)$stmt->fetchColumn() > 0; }

/**
 * Encodes an array as JSON without escaping slashes or non-ASCII characters.
 *
 * @param array $value Data to serialise.
 *
 * @return string The encoded JSON document.
 *
 * @throws JsonException When the value cannot be encoded, instead of letting json_encode()'s
 *                       `false` leak through the `string` return type.
 */
function ownershipJson(array $value): string
{
    return json_encode($value, JSON_THROW_ON_ERROR | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE);
}

/**
 * Normalises a schedule value to one of 'daily', 'weekly' or 'manual'.
 *
 * @param string|null $schedule Schedule value to normalise.
 *
 * @return string 'daily' or 'weekly' when given exactly that value; 'manual' for anything else,
 *                including null.
 */
function ownershipMapSchedule(?string $schedule): string
{
    return match ($schedule) {
        'daily', 'weekly' => $schedule,
        default => 'manual',
    };
}

/**
 * Builds the comma-separated tag list stored on a mirrored client scraper.
 *
 * The source's own tags are split on commas, trimmed and de-duplicated, then three marker tags
 * are appended: `corpus-source:<id>`, `package:<slug>` and `owned-corpus:<slug>`.
 *
 * @param array  $source      Source row; reads the `tags` and `id` keys.
 * @param string $packageSlug Slug used for the `package:` marker tag.
 * @param string $corpusSlug  Slug used for the `owned-corpus:` marker tag.
 *
 * @return string Tags joined with ', '.
 */
function ownershipScraperTags(array $source, string $packageSlug, string $corpusSlug): string
{
    // Tags are kept as array keys so repeated tags collapse while first-seen order is preserved.
    $tags = [];
    foreach (explode(',', (string)($source['tags'] ?? '')) as $tag) {
        $tag = trim($tag);
        if ($tag !== '') {
            $tags[$tag] = true;
        }
    }

    $tags['corpus-source:' . (int)$source['id']] = true;
    $tags['package:' . $packageSlug] = true;
    $tags['owned-corpus:' . $corpusSlug] = true;

    return implode(', ', array_keys($tags));
}

/**
 * Builds the description stored on a mirrored client scraper.
 *
 * @param array  $source      Source row; reads the `description` and `id` keys.
 * @param string $packageSlug Package slug named in the provenance sentence.
 * @param string $corpusName  Corpus name named in the provenance sentence.
 *
 * @return string The trimmed source description followed by a blank line and the provenance
 *                sentence, or just the provenance sentence when the description is empty.
 */
function ownershipScraperDescription(array $source, string $packageSlug, string $corpusName): string
{
    $description = trim((string)($source['description'] ?? ''));
    $suffix = sprintf(
        'Mirrored from owned Caveau package %s / corpus %s / corpus_sources.id=%d.',
        $packageSlug,
        $corpusName,
        (int)$source['id']
    );

    return trim($description === '' ? $suffix : $description . "\n\n" . $suffix);
}

/**
 * Reports whether a comma-separated tag list contains exactly the given tag.
 *
 * Entries are split on commas and trimmed, the same way ownershipScraperTags() parses its input,
 * so 'corpus-source:1' does not match a list that only holds 'corpus-source:10'.
 *
 * @param string $tagList Comma-separated tags.
 * @param string $tag     Tag to look for (compared case-sensitively after trimming each entry).
 *
 * @return bool True when one entry equals $tag.
 */
function ownershipTagListContains(string $tagList, string $tag): bool
{
    foreach (explode(',', $tagList) as $candidate) {
        if (trim($candidate) === $tag) {
            return true;
        }
    }

    return false;
}

// ---------------------------------------------------------------------------------------------
// Ownership setup script.
//
// Inside one transaction this: ensures the client has a subscription row for the package, stamps
// ownership columns on the package corpus (only those columns that exist), creates or refreshes
// the client's default 'do-better-norge' corpus, and mirrors the package's website/pdf/rss sources
// into client_scraper_sources. Any failure rolls the transaction back and exits with status 1.
// ---------------------------------------------------------------------------------------------

$packageSlug = dbnToolsEnv('DBN_CAVEAU_OWNER_PACKAGE_SLUG') ?: dbnToolsRequiredPackageSlug();

try {
    $db = dbnToolsDb();
    $db->beginTransaction();

    $client = dbnToolsFetchClient($db);
    if (!$client || empty($client['is_active'])) {
        throw new RuntimeException('Do Better Norge client tenant is not active or was not found.');
    }
    $clientId = (int)$client['id'];

    $package = dbnToolsFetchPackage($packageSlug, $db);
    if (!$package || empty($package['is_active']) || empty($package['corpus_id'])) {
        throw new RuntimeException("Active package {$packageSlug} with a corpus_id was not found.");
    }
    $packageId = (int)$package['id'];
    $corpusId = (int)$package['corpus_id'];

    // Create the subscription, or reactivate it when the row already exists.
    if (!dbnToolsHasActiveSubscription($clientId, $packageId, $db)) {
        $stmt = $db->prepare(
            'INSERT INTO client_corpus_subscriptions (client_id, package_id, is_active, source)
             VALUES (?, ?, 1, ?)
             ON DUPLICATE KEY UPDATE
                is_active = VALUES(is_active),
                source = VALUES(source),
                cancelled_at = NULL'
        );
        $stmt->execute([$clientId, $packageId, 'manual']);
    }

    $stmt = $db->prepare('SELECT id, slug, name FROM corpuses WHERE id = ? LIMIT 1');
    $stmt->execute([$corpusId]);
    $corpus = $stmt->fetch(PDO::FETCH_ASSOC);
    if (!$corpus) {
        throw new RuntimeException("Corpus #{$corpusId} was not found.");
    }

    // Ownership columns are optional in the schema: only the ones that exist are written.
    $ownershipCandidates = [
        'owner_type' => 'client',
        'owner_client_id' => $clientId,
        'owner_notes' => sprintf(
            'Owned and controlled by Do Better Norge (%s) via %s as of %s. Package: %s.',
            (string)$client['slug'],
            (string)($client['contact_email'] ?? 'daveadmin@dobetternorge.no'),
            date('Y-m-d'),
            $packageSlug
        ),
    ];
    $ownershipUpdates = [];
    $ownershipValues = [];
    foreach ($ownershipCandidates as $column => $value) {
        if (ownershipColumnExists($db, 'corpuses', $column)) {
            // $column comes from the hard-coded keys above, never from input.
            $ownershipUpdates[] = $column . ' = ?';
            $ownershipValues[] = $value;
        }
    }
    if ($ownershipUpdates !== []) {
        $stmt = $db->prepare('UPDATE corpuses SET ' . implode(', ', $ownershipUpdates) . ' WHERE id = ?');
        $stmt->execute([...$ownershipValues, $corpusId]);
    }

    // Create or refresh the client's default working corpus.
    $defaultCorpusName = 'Do Better Norge';
    $defaultCorpusSlug = 'do-better-norge';
    $defaultCorpusDescription =
        'Client-owned Do Better Norge working corpus for documents, source review, and corpus governance.';

    $stmt = $db->prepare('SELECT id FROM client_corpora WHERE client_id = ? AND slug = ? LIMIT 1');
    $stmt->execute([$clientId, $defaultCorpusSlug]);
    $defaultCorpusId = (int)($stmt->fetchColumn() ?: 0);

    if ($defaultCorpusId > 0) {
        $db->prepare(
            'UPDATE client_corpora
             SET name = ?, description = ?, is_default = 1
             WHERE id = ? AND client_id = ?'
        )->execute([$defaultCorpusName, $defaultCorpusDescription, $defaultCorpusId, $clientId]);
    } else {
        $stmt = $db->prepare(
            'INSERT INTO client_corpora (client_id, name, slug, description, is_default)
             VALUES (?, ?, ?, ?, 1)'
        );
        $stmt->execute([$clientId, $defaultCorpusName, $defaultCorpusSlug, $defaultCorpusDescription]);
        $defaultCorpusId = (int)$db->lastInsertId();
    }

    // Exactly one default per client: clear the flag on every other corpus.
    $db->prepare('UPDATE client_corpora SET is_default = 0 WHERE client_id = ? AND id <> ?')
        ->execute([$clientId, $defaultCorpusId]);

    // Scraper types that get mirrored; every other type is only counted as "system-only" below.
    // Bound as parameters so the SQL carries no double-quoted literals, which MySQL parses as
    // identifiers when the ANSI_QUOTES SQL mode is enabled.
    $mirrorableTypes = ['website', 'pdf', 'rss'];
    $typePlaceholders = implode(', ', array_fill(0, count($mirrorableTypes), '?'));

    $stmt = $db->prepare(
        'SELECT id, name, description, url, scraper_type, schedule, cron_day, cron_hour, depth,
                pdf_only, is_active, category, language, tags, total_docs_ingested
         FROM corpus_sources
         WHERE corpus_id = ?
           AND scraper_type IN (' . $typePlaceholders . ')
         ORDER BY is_active DESC, name ASC'
    );
    $stmt->execute([$corpusId, ...$mirrorableTypes]);
    $sources = $stmt->fetchAll(PDO::FETCH_ASSOC);

    // Statements are loop-invariant, so they are prepared once and re-executed per source.
    $lookup = $db->prepare(
        'SELECT id, tags
         FROM client_scraper_sources
         WHERE client_id = ?
           AND tags LIKE ?
         ORDER BY id ASC'
    );
    $updateScraper = $db->prepare(
        'UPDATE client_scraper_sources
         SET name = ?, url = ?, description = ?, scraper_type = ?, schedule = ?, cron_day = ?,
             cron_hour = ?, max_depth = ?, max_pages = ?, pdf_only = ?, category = ?, language = ?,
             tags = ?, status = ?, approval_notes = ?, is_active = ?, total_docs_ingested = ?
         WHERE id = ? AND client_id = ?'
    );
    $insertScraper = $db->prepare(
        'INSERT INTO client_scraper_sources
            (client_id, name, url, description, scraper_type, schedule, cron_day, cron_hour,
             max_depth, max_pages, pdf_only, category, language, tags, status, approval_notes,
             approved_at, is_active, total_docs_ingested)
         VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW(), ?, ?)'
    );

    $inserted = 0;
    $updated = 0;

    foreach ($sources as $source) {
        $sourceId = (int)$source['id'];
        $markerTag = 'corpus-source:' . $sourceId;

        // LIKE is only a coarse pre-filter: '%corpus-source:1%' also matches 'corpus-source:10'.
        // The exact tag is confirmed in PHP so one source can never claim another source's mirror.
        $lookup->execute([$clientId, '%' . $markerTag . '%']);
        $clientSourceId = 0;
        foreach ($lookup->fetchAll(PDO::FETCH_ASSOC) as $candidate) {
            if (ownershipTagListContains((string)$candidate['tags'], $markerTag)) {
                $clientSourceId = (int)$candidate['id'];
                break;
            }
        }

        $isActive = (int)!empty($source['is_active']);
        $status = $isActive === 1 ? 'approved' : 'paused';
        $schedule = ownershipMapSchedule((string)($source['schedule'] ?? 'manual'));
        $tags = ownershipScraperTags($source, $packageSlug, (string)$corpus['slug']);
        $description = ownershipScraperDescription($source, $packageSlug, (string)$corpus['name']);

        // PDF sources (by type or by the pdf_only flag) get a page budget of 100, others 50.
        $isPdfSource = (string)($source['scraper_type'] ?? '') === 'pdf'
            || (int)($source['pdf_only'] ?? 0) === 1;
        $maxPages = $isPdfSource ? 100 : 50;

        // Order must match the column order of both the UPDATE and the INSERT statements.
        $values = [
            (string)$source['name'],
            (string)$source['url'],
            $description,
            (string)$source['scraper_type'],
            $schedule,
            (int)($source['cron_day'] ?? 0),
            (int)($source['cron_hour'] ?? 3),
            max(1, min(5, (int)($source['depth'] ?? 1))), // depth clamped to 1..5
            $maxPages,
            (int)($source['pdf_only'] ?? 0),
            (string)($source['category'] ?? 'legal'),
            (string)($source['language'] ?? 'no'),
            $tags,
            $status,
            'Approved as an owned Do Better Norge source mirrored from the package corpus.',
            $isActive,
            (int)($source['total_docs_ingested'] ?? 0),
        ];

        if ($clientSourceId > 0) {
            $updateScraper->execute([...$values, $clientSourceId, $clientId]);
            $updated++;
            continue;
        }

        $insertScraper->execute([$clientId, ...$values]);
        $inserted++;
    }

    $stmt = $db->prepare(
        'SELECT COUNT(*)
         FROM corpus_sources
         WHERE corpus_id = ?
           AND scraper_type NOT IN (' . $typePlaceholders . ')'
    );
    $stmt->execute([$corpusId, ...$mirrorableTypes]);
    $systemOnlySources = (int)$stmt->fetchColumn();

    $db->commit();

    echo "Caveau corpus ownership configured.\n";
    echo "Client: {$client['slug']} (#{$clientId})\n";
    echo "Package: {$packageSlug} (#{$packageId})\n";
    echo "Owned corpus: {$corpus['slug']} (#{$corpusId})\n";
    echo "Default client corpus: do-better-norge (#{$defaultCorpusId})\n";
    echo "Client scrapers mirrored: {$inserted} inserted, {$updated} updated\n";
    echo "System-only package sources skipped: {$systemOnlySources}\n";
} catch (Throwable $e) {
    // Undo partial work before reporting; $db may be unset if the connection itself failed.
    if (isset($db) && $db instanceof PDO && $db->inTransaction()) {
        $db->rollBack();
    }
    fwrite(STDERR, "Ownership setup failed: {$e->getMessage()}\n");
    exit(1);
}