Files
dobetternorge-tools/scripts/setup-caveau-corpus-ownership.php

259 lines
9.5 KiB
PHP

<?php
declare(strict_types=1);
require_once __DIR__ . '/../includes/bootstrap.php';
// Guard: this setup script is only meant to be run from the command line.
// Under any other SAPI, report the problem on STDERR and stop with exit status 1
// before any of the setup logic below executes.
if (PHP_SAPI !== 'cli') {
fwrite(STDERR, "This setup script must be run from the command line.\n");
exit(1);
}
/**
 * Report whether a column is present on a table of the currently selected database.
 *
 * Counts matching rows in information_schema.COLUMNS, scoped to DATABASE().
 *
 * @param PDO    $db     Connection used to run the lookup.
 * @param string $table  Table name to inspect.
 * @param string $column Column name to look for.
 *
 * @return bool True when at least one matching column row is found.
 */
function ownershipColumnExists(PDO $db, string $table, string $column): bool
{
    $sql = <<<'SQL'
        SELECT COUNT(*)
        FROM information_schema.COLUMNS
        WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = ? AND COLUMN_NAME = ?
        SQL;

    $query = $db->prepare($sql);
    $query->execute([$table, $column]);
    $matches = (int)$query->fetchColumn();

    return $matches > 0;
}
/**
 * Encode an array as JSON, keeping slashes and non-ASCII characters unescaped.
 *
 * @param array $value Data to serialise.
 *
 * @return string The JSON document.
 *
 * @throws JsonException When the value cannot be encoded (for example malformed
 *                       UTF-8 or excessive nesting). Without JSON_THROW_ON_ERROR
 *                       json_encode() would return false, which does not satisfy
 *                       the declared string return type.
 */
function ownershipJson(array $value): string
{
    return json_encode(
        $value,
        JSON_THROW_ON_ERROR | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE
    );
}
/**
 * Normalise a source schedule to one of the values this script writes.
 *
 * 'daily' and 'weekly' pass through unchanged; anything else, including null,
 * is mapped to 'manual'.
 *
 * @param string|null $schedule Schedule value read from the source row.
 *
 * @return string One of 'daily', 'weekly' or 'manual'.
 */
function ownershipMapSchedule(?string $schedule): string
{
    $passThrough = ['daily', 'weekly'];

    if (in_array($schedule, $passThrough, true)) {
        return $schedule;
    }

    return 'manual';
}
/**
 * Build the comma-separated tag list for a mirrored scraper source.
 *
 * The source's own tags are split on commas, trimmed, stripped of empties and
 * de-duplicated (first occurrence wins). Three marker tags are then appended
 * unless already present: "corpus-source:<id>", "package:<slug>" and
 * "owned-corpus:<slug>".
 *
 * @param array  $source      Source row; reads the 'tags' and 'id' keys.
 * @param string $packageSlug Slug used for the "package:" marker.
 * @param string $corpusSlug  Slug used for the "owned-corpus:" marker.
 *
 * @return string Tags joined with ", ".
 */
function ownershipScraperTags(array $source, string $packageSlug, string $corpusSlug): string
{
    $rawLabels = explode(',', (string)($source['tags'] ?? ''));

    $ownLabels = array_filter(
        array_map(static fn (string $raw): string => trim($raw), $rawLabels),
        static fn (string $label): bool => $label !== ''
    );

    $markers = [
        'corpus-source:' . (int)$source['id'],
        'package:' . $packageSlug,
        'owned-corpus:' . $corpusSlug,
    ];

    // array_unique() keeps the first occurrence, so a marker that is already
    // among the source's own tags stays at its original position.
    return implode(', ', array_unique([...$ownLabels, ...$markers]));
}
/**
 * Compose the description stored on a mirrored scraper source.
 *
 * A provenance sentence naming the package, the corpus and the originating
 * corpus_sources id is always included. When the source has a non-blank
 * description of its own, that text comes first, followed by a blank line and
 * the provenance sentence.
 *
 * @param array  $source      Source row; reads the 'description' and 'id' keys.
 * @param string $packageSlug Package slug named in the provenance sentence.
 * @param string $corpusName  Corpus name named in the provenance sentence.
 *
 * @return string The trimmed description text.
 */
function ownershipScraperDescription(array $source, string $packageSlug, string $corpusName): string
{
    $ownText = trim((string)($source['description'] ?? ''));

    $provenance = sprintf(
        'Mirrored from owned Caveau package %s / corpus %s / corpus_sources.id=%d.',
        $packageSlug,
        $corpusName,
        (int)$source['id']
    );

    if ($ownText === '') {
        return trim($provenance);
    }

    return trim($ownText . "\n\n" . $provenance);
}
/*
 * Script body.
 *
 * Inside a single transaction this:
 *   1. resolves the client tenant and the package, and requires both to be active;
 *   2. makes sure the client has an active subscription row for the package;
 *   3. stamps ownership columns on the package corpus (only the columns that exist);
 *   4. creates or refreshes the client's default "do-better-norge" client corpus;
 *   5. mirrors website/pdf/rss rows of corpus_sources into client_scraper_sources,
 *      matching existing mirrors through their "corpus-source:<id>" tag;
 *   6. commits and prints a summary.
 * Any failure rolls the transaction back, reports on STDERR and exits with status 1.
 */

// The dbnToolsEnv() value takes precedence when truthy; otherwise the required
// package slug from the project helper is used.
$packageSlug = dbnToolsEnv('DBN_CAVEAU_OWNER_PACKAGE_SLUG') ?: dbnToolsRequiredPackageSlug();

try {
    $db = dbnToolsDb();
    $db->beginTransaction();

    // --- 1. Client tenant and package ------------------------------------
    $client = dbnToolsFetchClient($db);
    if (!$client || empty($client['is_active'])) {
        throw new RuntimeException('Do Better Norge client tenant is not active or was not found.');
    }
    $clientId = (int)$client['id'];

    $package = dbnToolsFetchPackage($packageSlug, $db);
    if (!$package || empty($package['is_active']) || empty($package['corpus_id'])) {
        throw new RuntimeException("Active package {$packageSlug} with a corpus_id was not found.");
    }
    $packageId = (int)$package['id'];
    $corpusId = (int)$package['corpus_id'];

    // --- 2. Subscription -------------------------------------------------
    // Insert a subscription, or reactivate an existing row on key collision.
    if (!dbnToolsHasActiveSubscription($clientId, $packageId, $db)) {
        $stmt = $db->prepare(
            <<<'SQL'
            INSERT INTO client_corpus_subscriptions (client_id, package_id, is_active, source)
            VALUES (?, ?, 1, ?)
            ON DUPLICATE KEY UPDATE is_active = VALUES(is_active), source = VALUES(source), cancelled_at = NULL
            SQL
        );
        $stmt->execute([$clientId, $packageId, 'manual']);
    }

    // --- 3. Ownership columns on the package corpus ----------------------
    $stmt = $db->prepare('SELECT id, slug, name FROM corpuses WHERE id = ? LIMIT 1');
    $stmt->execute([$corpusId]);
    $corpus = $stmt->fetch(PDO::FETCH_ASSOC);
    if (!$corpus) {
        throw new RuntimeException("Corpus #{$corpusId} was not found.");
    }

    // Each ownership column is optional: only columns present in the schema
    // are added to the UPDATE, so the script also works on older schemas.
    $ownershipUpdates = [];
    $ownershipValues = [];
    if (ownershipColumnExists($db, 'corpuses', 'owner_type')) {
        $ownershipUpdates[] = 'owner_type = ?';
        $ownershipValues[] = 'client';
    }
    if (ownershipColumnExists($db, 'corpuses', 'owner_client_id')) {
        $ownershipUpdates[] = 'owner_client_id = ?';
        $ownershipValues[] = $clientId;
    }
    if (ownershipColumnExists($db, 'corpuses', 'owner_notes')) {
        $ownershipUpdates[] = 'owner_notes = ?';
        $ownershipValues[] = sprintf(
            'Owned and controlled by Do Better Norge (%s) via %s as of %s. Package: %s.',
            (string)$client['slug'],
            (string)($client['contact_email'] ?? 'daveadmin@dobetternorge.no'),
            date('Y-m-d'),
            $packageSlug
        );
    }
    if ($ownershipUpdates !== []) {
        $stmt = $db->prepare('UPDATE corpuses SET ' . implode(', ', $ownershipUpdates) . ' WHERE id = ?');
        $stmt->execute([...$ownershipValues, $corpusId]);
    }

    // --- 4. Default client corpus ----------------------------------------
    $stmt = $db->prepare('SELECT id FROM client_corpora WHERE client_id = ? AND slug = ? LIMIT 1');
    $stmt->execute([$clientId, 'do-better-norge']);
    $defaultCorpusId = (int)($stmt->fetchColumn() ?: 0);

    if ($defaultCorpusId > 0) {
        $db->prepare(
            <<<'SQL'
            UPDATE client_corpora
            SET name = ?, description = ?, is_default = 1
            WHERE id = ? AND client_id = ?
            SQL
        )->execute([
            'Do Better Norge',
            'Client-owned Do Better Norge working corpus for documents, source review, and corpus governance.',
            $defaultCorpusId,
            $clientId,
        ]);
    } else {
        $stmt = $db->prepare(
            <<<'SQL'
            INSERT INTO client_corpora (client_id, name, slug, description, is_default)
            VALUES (?, ?, ?, ?, 1)
            SQL
        );
        $stmt->execute([
            $clientId,
            'Do Better Norge',
            'do-better-norge',
            'Client-owned Do Better Norge working corpus for documents, source review, and corpus governance.',
        ]);
        $defaultCorpusId = (int)$db->lastInsertId();
    }

    // Exactly one default per client: clear the flag on every other corpus.
    $db->prepare('UPDATE client_corpora SET is_default = 0 WHERE client_id = ? AND id <> ?')
        ->execute([$clientId, $defaultCorpusId]);

    // --- 5. Mirror package sources into client scraper sources -----------
    // String literals use standard single quotes so the query also parses when
    // the server runs with ANSI_QUOTES (where "website" would be an identifier).
    $stmt = $db->prepare(
        <<<'SQL'
        SELECT id, name, description, url, scraper_type, schedule, cron_day, cron_hour,
        depth, pdf_only, is_active, category, language, tags, total_docs_ingested
        FROM corpus_sources
        WHERE corpus_id = ?
        AND scraper_type IN ('website', 'pdf', 'rss')
        ORDER BY is_active DESC, name ASC
        SQL
    );
    $stmt->execute([$corpusId]);
    $sources = $stmt->fetchAll(PDO::FETCH_ASSOC);

    // Statements are prepared once and re-executed per source.
    //
    // The lookup matches the "corpus-source:<id>" tag exactly. A substring
    // match (LIKE '%corpus-source:1%') would also hit corpus-source:10, :123,
    // and so on, and could update the wrong mirror row. Spaces are stripped so
    // both "a, b" and "a,b" tag lists are handled by FIND_IN_SET.
    $lookup = $db->prepare(
        <<<'SQL'
        SELECT id FROM client_scraper_sources
        WHERE client_id = ? AND FIND_IN_SET(?, REPLACE(tags, ' ', '')) > 0
        LIMIT 1
        SQL
    );
    $updateMirror = $db->prepare(
        <<<'SQL'
        UPDATE client_scraper_sources
        SET name = ?, url = ?, description = ?, scraper_type = ?, schedule = ?,
        cron_day = ?, cron_hour = ?, max_depth = ?, max_pages = ?, pdf_only = ?,
        category = ?, language = ?, tags = ?, status = ?, approval_notes = ?,
        is_active = ?, total_docs_ingested = ?
        WHERE id = ? AND client_id = ?
        SQL
    );
    $insertMirror = $db->prepare(
        <<<'SQL'
        INSERT INTO client_scraper_sources
        (client_id, name, url, description, scraper_type, schedule, cron_day, cron_hour,
        max_depth, max_pages, pdf_only, category, language, tags, status, approval_notes,
        approved_at, is_active, total_docs_ingested)
        VALUES
        (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW(), ?, ?)
        SQL
    );

    $inserted = 0;
    $updated = 0;
    foreach ($sources as $source) {
        $sourceId = (int)$source['id'];

        $lookup->execute([$clientId, 'corpus-source:' . $sourceId]);
        $clientSourceId = (int)($lookup->fetchColumn() ?: 0);

        // Active package sources are mirrored as approved, inactive ones as paused.
        $isActive = (int)!empty($source['is_active']);
        $status = $isActive === 1 ? 'approved' : 'paused';
        $schedule = ownershipMapSchedule((string)($source['schedule'] ?? 'manual'));
        $tags = ownershipScraperTags($source, $packageSlug, (string)$corpus['slug']);
        $description = ownershipScraperDescription($source, $packageSlug, (string)$corpus['name']);

        // PDF scrapers and pdf_only sources get a page cap of 100; others get 50.
        $isPdfSource = (string)($source['scraper_type'] ?? '') === 'pdf'
            || (int)($source['pdf_only'] ?? 0) === 1;
        $maxPages = $isPdfSource ? 100 : 50;

        // Order matches the column order shared by the UPDATE and INSERT above.
        $values = [
            (string)$source['name'],
            (string)$source['url'],
            $description,
            (string)$source['scraper_type'],
            $schedule,
            (int)($source['cron_day'] ?? 0),
            (int)($source['cron_hour'] ?? 3),
            max(1, min(5, (int)($source['depth'] ?? 1))), // depth clamped to 1..5
            $maxPages,
            (int)($source['pdf_only'] ?? 0),
            (string)($source['category'] ?? 'legal'),
            (string)($source['language'] ?? 'no'),
            $tags,
            $status,
            'Approved as an owned Do Better Norge source mirrored from the package corpus.',
            $isActive,
            (int)($source['total_docs_ingested'] ?? 0),
        ];

        if ($clientSourceId > 0) {
            $updateMirror->execute([...$values, $clientSourceId, $clientId]);
            $updated++;
            continue;
        }

        $insertMirror->execute([$clientId, ...$values]);
        $inserted++;
    }

    // --- 6. Summary ------------------------------------------------------
    // Sources of any other scraper type are not mirrored; count them for the report.
    $stmt = $db->prepare(
        <<<'SQL'
        SELECT COUNT(*) FROM corpus_sources
        WHERE corpus_id = ? AND scraper_type NOT IN ('website', 'pdf', 'rss')
        SQL
    );
    $stmt->execute([$corpusId]);
    $systemOnlySources = (int)$stmt->fetchColumn();

    $db->commit();

    echo "Caveau corpus ownership configured.\n";
    echo "Client: {$client['slug']} (#{$clientId})\n";
    echo "Package: {$packageSlug} (#{$packageId})\n";
    echo "Owned corpus: {$corpus['slug']} (#{$corpusId})\n";
    echo "Default client corpus: do-better-norge (#{$defaultCorpusId})\n";
    echo "Client scrapers mirrored: {$inserted} inserted, {$updated} updated\n";
    echo "System-only package sources skipped: {$systemOnlySources}\n";
} catch (Throwable $e) {
    // $db may be unset if dbnToolsDb() itself failed, hence the isset() guard.
    if (isset($db) && $db instanceof PDO && $db->inTransaction()) {
        $db->rollBack();
    }
    fwrite(STDERR, "Ownership setup failed: {$e->getMessage()}\n");
    exit(1);
}