Add Caveau corpus ownership setup
This commit is contained in:
@@ -0,0 +1,258 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
require_once __DIR__ . '/../includes/bootstrap.php';
|
||||
|
||||
// Refuse to run anywhere except the command line.
// The STDERR constant is only defined by the CLI SAPI, so it cannot be used in
// this branch (it would raise an "undefined constant" Error). Report the
// refusal through the response body and the PHP error log instead.
if (PHP_SAPI !== 'cli') {
    if (!headers_sent()) {
        http_response_code(403);
        header('Content-Type: text/plain; charset=utf-8');
    }
    error_log('Caveau ownership setup was invoked from a non-CLI SAPI: ' . PHP_SAPI);
    echo "This setup script must be run from the command line.\n";
    exit(1);
}
|
||||
|
||||
/**
 * Report whether a column is present on a table of the currently selected database.
 *
 * The check queries information_schema.COLUMNS, scoped to DATABASE(), so it
 * only sees the schema the connection is currently using.
 *
 * @param PDO    $db     Open database connection.
 * @param string $table  Table name to inspect.
 * @param string $column Column name to look for.
 *
 * @return bool True when at least one matching column row exists.
 */
function ownershipColumnExists(PDO $db, string $table, string $column): bool
{
    $sql = 'SELECT COUNT(*)
         FROM information_schema.COLUMNS
         WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = ? AND COLUMN_NAME = ?';

    $query = $db->prepare($sql);
    $query->execute([$table, $column]);

    $matchCount = (int)$query->fetchColumn();

    return $matchCount > 0;
}
|
||||
|
||||
/**
 * Encode an array as JSON with slashes and Unicode characters left unescaped.
 *
 * json_encode() returns false on failure (for example on malformed UTF-8),
 * which would violate the declared string return type. JSON_THROW_ON_ERROR
 * turns that into an explicit, descriptive exception instead.
 *
 * @param array $value Data to encode.
 *
 * @return string JSON document.
 *
 * @throws JsonException When the value cannot be encoded.
 */
function ownershipJson(array $value): string
{
    return json_encode(
        $value,
        JSON_THROW_ON_ERROR | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE
    );
}
|
||||
|
||||
/**
 * Normalise a schedule value to one of 'daily', 'weekly' or 'manual'.
 *
 * Only the exact strings 'daily' and 'weekly' pass through unchanged; every
 * other value, including null and differently-cased strings, maps to 'manual'.
 *
 * @param string|null $schedule Schedule value to normalise.
 *
 * @return string 'daily', 'weekly' or 'manual'.
 */
function ownershipMapSchedule(?string $schedule): string
{
    if ($schedule === 'daily' || $schedule === 'weekly') {
        return $schedule;
    }

    return 'manual';
}
|
||||
|
||||
/**
 * Build the comma-separated tag string for a mirrored client scraper.
 *
 * The source's own comma-separated tags are trimmed, emptied of blanks and
 * de-duplicated (first occurrence wins), then three provenance tags are added:
 * "corpus-source:<id>", "package:<slug>" and "owned-corpus:<slug>". A provenance
 * tag already present among the source tags keeps its original position.
 *
 * @param array  $source      Source row; reads the 'tags' and 'id' keys.
 * @param string $packageSlug Package slug used in the "package:" tag.
 * @param string $corpusSlug  Corpus slug used in the "owned-corpus:" tag.
 *
 * @return string Tags joined with ", ".
 */
function ownershipScraperTags(array $source, string $packageSlug, string $corpusSlug): string
{
    $rawTags = explode(',', (string)($source['tags'] ?? ''));

    $labels = array_filter(
        array_map(static fn (string $label): string => trim($label), $rawTags),
        static fn (string $label): bool => $label !== ''
    );

    $labels[] = 'corpus-source:' . (int)$source['id'];
    $labels[] = 'package:' . $packageSlug;
    $labels[] = 'owned-corpus:' . $corpusSlug;

    // array_unique keeps the first occurrence of each tag, preserving order.
    return implode(', ', array_unique($labels));
}
|
||||
|
||||
/**
 * Compose the description for a mirrored client scraper.
 *
 * A provenance sentence naming the package, corpus and corpus_sources id is
 * always included. When the source has a non-blank description, that text comes
 * first, followed by a blank line and the provenance sentence.
 *
 * @param array  $source      Source row; reads the 'description' and 'id' keys.
 * @param string $packageSlug Package slug named in the provenance sentence.
 * @param string $corpusName  Corpus name named in the provenance sentence.
 *
 * @return string Trimmed description text.
 */
function ownershipScraperDescription(array $source, string $packageSlug, string $corpusName): string
{
    $existing = trim((string)($source['description'] ?? ''));
    $provenance = sprintf(
        'Mirrored from owned Caveau package %s / corpus %s / corpus_sources.id=%d.',
        $packageSlug,
        $corpusName,
        (int)$source['id']
    );

    if ($existing === '') {
        return trim($provenance);
    }

    return trim($existing . "\n\n" . $provenance);
}
|
||||
|
||||
// Package whose corpus is being claimed. The environment variable wins when it
// holds a truthy value; otherwise dbnToolsRequiredPackageSlug() supplies the slug.
$packageSlug = dbnToolsEnv('DBN_CAVEAU_OWNER_PACKAGE_SLUG') ?: dbnToolsRequiredPackageSlug();

try {
    // All writes below share one transaction; any failure is rolled back in
    // the catch block at the bottom.
    $db = dbnToolsDb();
    $db->beginTransaction();

    // 1. Resolve the client tenant and require it to be active.
    $client = dbnToolsFetchClient($db);
    if (!$client || empty($client['is_active'])) {
        throw new RuntimeException('Do Better Norge client tenant is not active or was not found.');
    }
    $clientId = (int)$client['id'];

    // 2. Resolve the package and require it to be active and bound to a corpus.
    $package = dbnToolsFetchPackage($packageSlug, $db);
    if (!$package || empty($package['is_active']) || empty($package['corpus_id'])) {
        throw new RuntimeException("Active package {$packageSlug} with a corpus_id was not found.");
    }
    $packageId = (int)$package['id'];
    $corpusId = (int)$package['corpus_id'];

    // 3. Ensure the client holds an active subscription to the package. The
    //    upsert also reactivates an existing row and clears cancelled_at.
    if (!dbnToolsHasActiveSubscription($clientId, $packageId, $db)) {
        $stmt = $db->prepare(
            'INSERT INTO client_corpus_subscriptions (client_id, package_id, is_active, source)
             VALUES (?, ?, 1, ?)
             ON DUPLICATE KEY UPDATE is_active = VALUES(is_active), source = VALUES(source), cancelled_at = NULL'
        );
        $stmt->execute([$clientId, $packageId, 'manual']);
    }

    // 4. Load the package corpus.
    $stmt = $db->prepare('SELECT id, slug, name FROM corpuses WHERE id = ? LIMIT 1');
    $stmt->execute([$corpusId]);
    $corpus = $stmt->fetch(PDO::FETCH_ASSOC);
    if (!$corpus) {
        throw new RuntimeException("Corpus #{$corpusId} was not found.");
    }

    // 5. Stamp ownership on the corpus. Each owner_* column is only written
    //    when it exists, so the script works on schemas missing some of them.
    $ownershipUpdates = [];
    $ownershipValues = [];
    if (ownershipColumnExists($db, 'corpuses', 'owner_type')) {
        $ownershipUpdates[] = 'owner_type = ?';
        $ownershipValues[] = 'client';
    }
    if (ownershipColumnExists($db, 'corpuses', 'owner_client_id')) {
        $ownershipUpdates[] = 'owner_client_id = ?';
        $ownershipValues[] = $clientId;
    }
    if (ownershipColumnExists($db, 'corpuses', 'owner_notes')) {
        $ownershipUpdates[] = 'owner_notes = ?';
        $ownershipValues[] = sprintf(
            'Owned and controlled by Do Better Norge (%s) via %s as of %s. Package: %s.',
            (string)$client['slug'],
            (string)($client['contact_email'] ?? 'daveadmin@dobetternorge.no'),
            date('Y-m-d'),
            $packageSlug
        );
    }

    if ($ownershipUpdates !== []) {
        // Only fixed column fragments are concatenated; all values are bound.
        $stmt = $db->prepare('UPDATE corpuses SET ' . implode(', ', $ownershipUpdates) . ' WHERE id = ?');
        $stmt->execute([...$ownershipValues, $corpusId]);
    }

    // 6. Create or refresh the client's default working corpus, then clear the
    //    default flag on every other corpus of the same client.
    $defaultCorpusDescription =
        'Client-owned Do Better Norge working corpus for documents, source review, and corpus governance.';

    $stmt = $db->prepare('SELECT id FROM client_corpora WHERE client_id = ? AND slug = ? LIMIT 1');
    $stmt->execute([$clientId, 'do-better-norge']);
    $defaultCorpusId = (int)($stmt->fetchColumn() ?: 0);

    if ($defaultCorpusId > 0) {
        $db->prepare(
            'UPDATE client_corpora
             SET name = ?, description = ?, is_default = 1
             WHERE id = ? AND client_id = ?'
        )->execute([
            'Do Better Norge',
            $defaultCorpusDescription,
            $defaultCorpusId,
            $clientId,
        ]);
    } else {
        $stmt = $db->prepare(
            'INSERT INTO client_corpora (client_id, name, slug, description, is_default)
             VALUES (?, ?, ?, ?, 1)'
        );
        $stmt->execute([
            $clientId,
            'Do Better Norge',
            'do-better-norge',
            $defaultCorpusDescription,
        ]);
        $defaultCorpusId = (int)$db->lastInsertId();
    }

    $db->prepare('UPDATE client_corpora SET is_default = 0 WHERE client_id = ? AND id <> ?')
        ->execute([$clientId, $defaultCorpusId]);

    // 7. Mirror the corpus's website/pdf/rss sources into client_scraper_sources.
    $stmt = $db->prepare(
        'SELECT id, name, description, url, scraper_type, schedule, cron_day, cron_hour,
                depth, pdf_only, is_active, category, language, tags, total_docs_ingested
         FROM corpus_sources
         WHERE corpus_id = ?
           AND scraper_type IN ("website", "pdf", "rss")
         ORDER BY is_active DESC, name ASC'
    );
    $stmt->execute([$corpusId]);
    $sources = $stmt->fetchAll(PDO::FETCH_ASSOC);

    // The three statements are loop-invariant, so prepare them once.
    // The LIKE is only a coarse prefilter: "%corpus-source:1%" also matches
    // "corpus-source:10", so each candidate's tags are checked exactly in PHP.
    $lookup = $db->prepare(
        'SELECT id, tags FROM client_scraper_sources
         WHERE client_id = ? AND tags LIKE ?
         ORDER BY id ASC'
    );
    $updateSource = $db->prepare(
        'UPDATE client_scraper_sources
         SET name = ?, url = ?, description = ?, scraper_type = ?, schedule = ?,
             cron_day = ?, cron_hour = ?, max_depth = ?, max_pages = ?, pdf_only = ?,
             category = ?, language = ?, tags = ?, status = ?, approval_notes = ?,
             is_active = ?, total_docs_ingested = ?
         WHERE id = ? AND client_id = ?'
    );
    $insertSource = $db->prepare(
        'INSERT INTO client_scraper_sources
            (client_id, name, url, description, scraper_type, schedule, cron_day, cron_hour,
             max_depth, max_pages, pdf_only, category, language, tags, status, approval_notes,
             approved_at, is_active, total_docs_ingested)
         VALUES
            (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW(), ?, ?)'
    );

    $inserted = 0;
    $updated = 0;
    foreach ($sources as $source) {
        $sourceId = (int)$source['id'];

        // Find the existing mirror row by its exact "corpus-source:<id>" tag.
        $marker = 'corpus-source:' . $sourceId;
        $lookup->execute([$clientId, '%' . $marker . '%']);
        $clientSourceId = 0;
        foreach ($lookup->fetchAll(PDO::FETCH_ASSOC) as $candidate) {
            $candidateTags = array_map(
                static fn (string $tag): string => trim($tag),
                explode(',', (string)($candidate['tags'] ?? ''))
            );
            if (in_array($marker, $candidateTags, true)) {
                $clientSourceId = (int)$candidate['id'];
                break;
            }
        }

        $isActive = (int)!empty($source['is_active']);
        $status = $isActive === 1 ? 'approved' : 'paused';
        $schedule = ownershipMapSchedule((string)($source['schedule'] ?? 'manual'));
        $tags = ownershipScraperTags($source, $packageSlug, (string)$corpus['slug']);
        $description = ownershipScraperDescription($source, $packageSlug, (string)$corpus['name']);
        // PDF-type or pdf_only sources get a page budget of 100, all others 50.
        $maxPages = ((string)($source['scraper_type'] ?? '') === 'pdf' || (int)($source['pdf_only'] ?? 0) === 1)
            ? 100
            : 50;

        // Order matches the column lists of both $updateSource and $insertSource.
        $values = [
            (string)$source['name'],
            (string)$source['url'],
            $description,
            (string)$source['scraper_type'],
            $schedule,
            (int)($source['cron_day'] ?? 0),
            (int)($source['cron_hour'] ?? 3),
            max(1, min(5, (int)($source['depth'] ?? 1))), // clamp depth to 1..5
            $maxPages,
            (int)($source['pdf_only'] ?? 0),
            (string)($source['category'] ?? 'legal'),
            (string)($source['language'] ?? 'no'),
            $tags,
            $status,
            'Approved as an owned Do Better Norge source mirrored from the package corpus.',
            $isActive,
            (int)($source['total_docs_ingested'] ?? 0),
        ];

        if ($clientSourceId > 0) {
            $updateSource->execute([...$values, $clientSourceId, $clientId]);
            $updated++;
            continue;
        }

        $insertSource->execute([$clientId, ...$values]);
        $inserted++;
    }

    // 8. Count the sources of other scraper types; they are reported, not mirrored.
    $stmt = $db->prepare(
        'SELECT COUNT(*) FROM corpus_sources
         WHERE corpus_id = ? AND scraper_type NOT IN ("website", "pdf", "rss")'
    );
    $stmt->execute([$corpusId]);
    $systemOnlySources = (int)$stmt->fetchColumn();

    $db->commit();

    echo "Caveau corpus ownership configured.\n";
    echo "Client: {$client['slug']} (#{$clientId})\n";
    echo "Package: {$packageSlug} (#{$packageId})\n";
    echo "Owned corpus: {$corpus['slug']} (#{$corpusId})\n";
    echo "Default client corpus: do-better-norge (#{$defaultCorpusId})\n";
    echo "Client scrapers mirrored: {$inserted} inserted, {$updated} updated\n";
    echo "System-only package sources skipped: {$systemOnlySources}\n";
} catch (Throwable $e) {
    // Undo any partial work, then report the failure with a non-zero exit code.
    if (isset($db) && $db instanceof PDO && $db->inTransaction()) {
        $db->rollBack();
    }
    fwrite(STDERR, "Ownership setup failed: {$e->getMessage()}\n");
    exit(1);
}
|
||||
Reference in New Issue
Block a user