diff --git a/scripts/setup-caveau-corpus-ownership.php b/scripts/setup-caveau-corpus-ownership.php
new file mode 100644
index 0000000..0b8c23f
--- /dev/null
+++ b/scripts/setup-caveau-corpus-ownership.php
@@ -0,0 +1,258 @@
+prepare(
+        'SELECT COUNT(*)
+         FROM information_schema.COLUMNS
+         WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = ? AND COLUMN_NAME = ?'
+    );
+    $stmt->execute([$table, $column]);
+    return (int)$stmt->fetchColumn() > 0;
+}
+
+/**
+ * Encode an array as JSON without escaping slashes or non-ASCII characters.
+ *
+ * @throws JsonException when the value cannot be encoded, instead of letting
+ *                       json_encode()'s `false` leak through a `string` return type.
+ */
+function ownershipJson(array $value): string
+{
+    return json_encode($value, JSON_THROW_ON_ERROR | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE);
+}
+
+/**
+ * Normalise a corpus source schedule to one of 'daily', 'weekly' or 'manual'.
+ *
+ * Any value other than 'daily' or 'weekly' (including null) becomes 'manual'.
+ */
+function ownershipMapSchedule(?string $schedule): string
+{
+    return match ($schedule) {
+        'daily' => 'daily',
+        'weekly' => 'weekly',
+        default => 'manual',
+    };
+}
+
+/**
+ * Build the comma-separated tag string stored on a mirrored client scraper.
+ *
+ * The source's own comma-separated tags are trimmed, emptied entries are dropped and
+ * duplicates collapse (tags are used as array keys). Three marker tags are then added:
+ * `corpus-source:<id>`, `package:<slug>` and `owned-corpus:<slug>`.
+ *
+ * @param array  $source      Row read from corpus_sources; `id` and `tags` are used.
+ * @param string $packageSlug Slug of the owning package.
+ * @param string $corpusSlug  Slug of the owned corpus.
+ */
+function ownershipScraperTags(array $source, string $packageSlug, string $corpusSlug): string
+{
+    $tags = [];
+    foreach (explode(',', (string)($source['tags'] ?? '')) as $tag) {
+        $tag = trim($tag);
+        if ($tag !== '') {
+            $tags[$tag] = true;
+        }
+    }
+
+    $tags['corpus-source:' . (int)$source['id']] = true;
+    $tags['package:' . $packageSlug] = true;
+    $tags['owned-corpus:' . $corpusSlug] = true;
+
+    return implode(', ', array_keys($tags));
+}
+
+/**
+ * Build the description stored on a mirrored client scraper.
+ *
+ * Returns the source's trimmed description followed by a blank line and a provenance
+ * sentence, or just the provenance sentence when the source has no description.
+ *
+ * @param array  $source      Row read from corpus_sources; `id` and `description` are used.
+ * @param string $packageSlug Slug of the owning package.
+ * @param string $corpusName  Display name of the owned corpus.
+ */
+function ownershipScraperDescription(array $source, string $packageSlug, string $corpusName): string
+{
+    $description = trim((string)($source['description'] ?? ''));
+    $suffix = sprintf(
+        'Mirrored from owned Caveau package %s / corpus %s / corpus_sources.id=%d.',
+        $packageSlug,
+        $corpusName,
+        (int)$source['id']
+    );
+
+    return trim($description === '' ? $suffix : $description . "\n\n" . $suffix);
+}
+
+/**
+ * Find the client scraper row that mirrors the given corpus source.
+ *
+ * The LIKE pattern is only a pre-filter: `%corpus-source:1%` also matches rows tagged
+ * `corpus-source:10`, `corpus-source:12`, ... so each candidate's tag list is split and
+ * compared exactly before a row is accepted.
+ *
+ * @param PDOStatement $lookup   Prepared statement selecting `id, tags` with two
+ *                               placeholders: client id, then the LIKE pattern.
+ * @param int          $clientId Owning client id.
+ * @param int          $sourceId corpus_sources.id being mirrored.
+ *
+ * @return int The matching client_scraper_sources id, or 0 when there is none.
+ */
+function ownershipFindMirroredScraperId(PDOStatement $lookup, int $clientId, int $sourceId): int
+{
+    $marker = 'corpus-source:' . $sourceId;
+    $lookup->execute([$clientId, '%' . $marker . '%']);
+
+    foreach ($lookup->fetchAll(PDO::FETCH_ASSOC) as $candidate) {
+        $tags = array_map('trim', explode(',', (string)$candidate['tags']));
+        if (in_array($marker, $tags, true)) {
+            return (int)$candidate['id'];
+        }
+    }
+
+    return 0;
+}
+
+// The package slug comes from the environment override when set, otherwise from the
+// dbnTools helper (defined outside this file).
+$packageSlug = dbnToolsEnv('DBN_CAVEAU_OWNER_PACKAGE_SLUG') ?: dbnToolsRequiredPackageSlug();
+
+try {
+    $db = dbnToolsDb();
+    // Everything below runs in one transaction; the catch block rolls it back on failure.
+    $db->beginTransaction();
+
+    $client = dbnToolsFetchClient($db);
+    if (!$client || empty($client['is_active'])) {
+        throw new RuntimeException('Do Better Norge client tenant is not active or was not found.');
+    }
+    $clientId = (int)$client['id'];
+
+    $package = dbnToolsFetchPackage($packageSlug, $db);
+    if (!$package || empty($package['is_active']) || empty($package['corpus_id'])) {
+        throw new RuntimeException("Active package {$packageSlug} with a corpus_id was not found.");
+    }
+    $packageId = (int)$package['id'];
+    $corpusId = (int)$package['corpus_id'];
+
+    // Make sure the client holds an active subscription to the package.
+    if (!dbnToolsHasActiveSubscription($clientId, $packageId, $db)) {
+        $stmt = $db->prepare(
+            'INSERT INTO client_corpus_subscriptions (client_id, package_id, is_active, source)
+             VALUES (?, ?, 1, ?)
+             ON DUPLICATE KEY UPDATE is_active = VALUES(is_active), source = VALUES(source), cancelled_at = NULL'
+        );
+        $stmt->execute([$clientId, $packageId, 'manual']);
+    }
+
+    $stmt = $db->prepare('SELECT id, slug, name FROM corpuses WHERE id = ? LIMIT 1');
+    $stmt->execute([$corpusId]);
+    $corpus = $stmt->fetch(PDO::FETCH_ASSOC);
+    if (!$corpus) {
+        throw new RuntimeException("Corpus #{$corpusId} was not found.");
+    }
+
+    // Stamp ownership on the corpus, touching only the owner_* columns that exist in
+    // this database's schema.
+    $ownershipUpdates = [];
+    $ownershipValues = [];
+    if (ownershipColumnExists($db, 'corpuses', 'owner_type')) {
+        $ownershipUpdates[] = 'owner_type = ?';
+        $ownershipValues[] = 'client';
+    }
+    if (ownershipColumnExists($db, 'corpuses', 'owner_client_id')) {
+        $ownershipUpdates[] = 'owner_client_id = ?';
+        $ownershipValues[] = $clientId;
+    }
+    if (ownershipColumnExists($db, 'corpuses', 'owner_notes')) {
+        $ownershipUpdates[] = 'owner_notes = ?';
+        $ownershipValues[] = sprintf(
+            'Owned and controlled by Do Better Norge (%s) via %s as of %s. Package: %s.',
+            (string)$client['slug'],
+            (string)($client['contact_email'] ?? 'daveadmin@dobetternorge.no'),
+            date('Y-m-d'),
+            $packageSlug
+        );
+    }
+
+    if ($ownershipUpdates !== []) {
+        // Only fixed column fragments from the checks above are concatenated; values are bound.
+        $stmt = $db->prepare('UPDATE corpuses SET ' . implode(', ', $ownershipUpdates) . ' WHERE id = ?');
+        $stmt->execute([...$ownershipValues, $corpusId]);
+    }
+
+    // Create or refresh the client's default working corpus.
+    $defaultCorpusName = 'Do Better Norge';
+    $defaultCorpusSlug = 'do-better-norge';
+    $defaultCorpusDescription =
+        'Client-owned Do Better Norge working corpus for documents, source review, and corpus governance.';
+
+    $stmt = $db->prepare('SELECT id FROM client_corpora WHERE client_id = ? AND slug = ? LIMIT 1');
+    $stmt->execute([$clientId, $defaultCorpusSlug]);
+    $defaultCorpusId = (int)($stmt->fetchColumn() ?: 0);
+
+    if ($defaultCorpusId > 0) {
+        $db->prepare(
+            'UPDATE client_corpora
+             SET name = ?, description = ?, is_default = 1
+             WHERE id = ? AND client_id = ?'
+        )->execute([
+            $defaultCorpusName,
+            $defaultCorpusDescription,
+            $defaultCorpusId,
+            $clientId,
+        ]);
+    } else {
+        $stmt = $db->prepare(
+            'INSERT INTO client_corpora (client_id, name, slug, description, is_default)
+             VALUES (?, ?, ?, ?, 1)'
+        );
+        $stmt->execute([
+            $clientId,
+            $defaultCorpusName,
+            $defaultCorpusSlug,
+            $defaultCorpusDescription,
+        ]);
+        $defaultCorpusId = (int)$db->lastInsertId();
+    }
+
+    // Exactly one default per client: clear the flag on every other corpus.
+    $db->prepare('UPDATE client_corpora SET is_default = 0 WHERE client_id = ? AND id <> ?')
+        ->execute([$clientId, $defaultCorpusId]);
+
+    // Scraper types that are mirrored into client_scraper_sources. The same list drives
+    // both the mirror query and the "skipped" count below so the two cannot drift apart,
+    // and binding it avoids double-quoted SQL string literals (which are identifiers
+    // when the server runs with ANSI_QUOTES).
+    $mirroredTypes = ['website', 'pdf', 'rss'];
+    $typePlaceholders = implode(', ', array_fill(0, count($mirroredTypes), '?'));
+
+    $stmt = $db->prepare(
+        'SELECT id, name, description, url, scraper_type, schedule, cron_day, cron_hour,
+                depth, pdf_only, is_active, category, language, tags, total_docs_ingested
+         FROM corpus_sources
+         WHERE corpus_id = ?
+           AND scraper_type IN (' . $typePlaceholders . ')
+         ORDER BY is_active DESC, name ASC'
+    );
+    $stmt->execute([$corpusId, ...$mirroredTypes]);
+    $sources = $stmt->fetchAll(PDO::FETCH_ASSOC);
+
+    // Statements reused for every source are prepared once, outside the loop.
+    $lookup = $db->prepare(
+        'SELECT id, tags FROM client_scraper_sources
+         WHERE client_id = ? AND tags LIKE ?
+         ORDER BY id ASC'
+    );
+    $updateScraper = $db->prepare(
+        'UPDATE client_scraper_sources
+         SET name = ?, url = ?, description = ?, scraper_type = ?, schedule = ?,
+             cron_day = ?, cron_hour = ?, max_depth = ?, max_pages = ?, pdf_only = ?,
+             category = ?, language = ?, tags = ?, status = ?, approval_notes = ?,
+             is_active = ?, total_docs_ingested = ?
+         WHERE id = ? AND client_id = ?'
+    );
+    $insertScraper = $db->prepare(
+        'INSERT INTO client_scraper_sources
+            (client_id, name, url, description, scraper_type, schedule, cron_day, cron_hour,
+             max_depth, max_pages, pdf_only, category, language, tags, status, approval_notes,
+             approved_at, is_active, total_docs_ingested)
+         VALUES
+            (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW(), ?, ?)'
+    );
+
+    $inserted = 0;
+    $updated = 0;
+    foreach ($sources as $source) {
+        $sourceId = (int)$source['id'];
+        // Exact-tag lookup: a plain LIKE would let source 1 hijack the mirror of source 10.
+        $clientSourceId = ownershipFindMirroredScraperId($lookup, $clientId, $sourceId);
+
+        $isActive = (int)!empty($source['is_active']);
+        $status = $isActive === 1 ? 'approved' : 'paused';
+        $schedule = ownershipMapSchedule((string)($source['schedule'] ?? 'manual'));
+        $tags = ownershipScraperTags($source, $packageSlug, (string)$corpus['slug']);
+        $description = ownershipScraperDescription($source, $packageSlug, (string)$corpus['name']);
+        // PDF-oriented sources get a page budget of 100, everything else 50.
+        $isPdfSource = (string)($source['scraper_type'] ?? '') === 'pdf' || (int)($source['pdf_only'] ?? 0) === 1;
+        $maxPages = $isPdfSource ? 100 : 50;
+
+        // Order matches the column order shared by the UPDATE and INSERT statements above.
+        $values = [
+            (string)$source['name'],
+            (string)$source['url'],
+            $description,
+            (string)$source['scraper_type'],
+            $schedule,
+            (int)($source['cron_day'] ?? 0),
+            (int)($source['cron_hour'] ?? 3),
+            max(1, min(5, (int)($source['depth'] ?? 1))), // clamp depth to 1..5
+            $maxPages,
+            (int)($source['pdf_only'] ?? 0),
+            (string)($source['category'] ?? 'legal'),
+            (string)($source['language'] ?? 'no'),
+            $tags,
+            $status,
+            'Approved as an owned Do Better Norge source mirrored from the package corpus.',
+            $isActive,
+            (int)($source['total_docs_ingested'] ?? 0),
+        ];
+
+        if ($clientSourceId > 0) {
+            $updateScraper->execute([...$values, $clientSourceId, $clientId]);
+            $updated++;
+            continue;
+        }
+
+        $insertScraper->execute([$clientId, ...$values]);
+        $inserted++;
+    }
+
+    // Sources of any other scraper type are not mirrored; report how many were skipped.
+    $stmt = $db->prepare(
+        'SELECT COUNT(*) FROM corpus_sources
+         WHERE corpus_id = ? AND scraper_type NOT IN (' . $typePlaceholders . ')'
+    );
+    $stmt->execute([$corpusId, ...$mirroredTypes]);
+    $systemOnlySources = (int)$stmt->fetchColumn();
+
+    $db->commit();
+
+    echo "Caveau corpus ownership configured.\n";
+    echo "Client: {$client['slug']} (#{$clientId})\n";
+    echo "Package: {$packageSlug} (#{$packageId})\n";
+    echo "Owned corpus: {$corpus['slug']} (#{$corpusId})\n";
+    echo "Default client corpus: {$defaultCorpusSlug} (#{$defaultCorpusId})\n";
+    echo "Client scrapers mirrored: {$inserted} inserted, {$updated} updated\n";
+    echo "System-only package sources skipped: {$systemOnlySources}\n";
+} catch (Throwable $e) {
+    // Undo partial work before reporting; the guards cover failures raised before the
+    // connection or transaction existed.
+    if (isset($db) && $db instanceof PDO && $db->inTransaction()) {
+        $db->rollBack();
+    }
+    fwrite(STDERR, "Ownership setup failed: {$e->getMessage()}\n");
+    exit(1);
+}