259 lines
9.5 KiB
PHP
259 lines
9.5 KiB
PHP
<?php
|
|
declare(strict_types=1);
|
|
|
|
require_once __DIR__ . '/../includes/bootstrap.php';
|
|
|
|
// Refuse to run outside the command line. The STDIN/STDOUT/STDERR constants
// are only defined by the CLI SAPI, so this branch (which by definition runs
// under some other SAPI) must open the stderr stream explicitly instead of
// referencing STDERR, which would raise an "Undefined constant" Error.
if (PHP_SAPI !== 'cli') {
    if (!headers_sent()) {
        // Make the refusal visible to an HTTP caller rather than answering 200.
        http_response_code(403);
    }
    file_put_contents('php://stderr', "This setup script must be run from the command line.\n");
    exit(1);
}
|
|
|
|
/**
 * Reports whether a column is present on a table of the currently selected database.
 *
 * The lookup counts matching rows in information_schema.COLUMNS, restricted to
 * the schema returned by DATABASE().
 *
 * @param PDO    $db     Connection used to run the lookup.
 * @param string $table  Table name to inspect.
 * @param string $column Column name to look for.
 *
 * @return bool True when at least one matching column row is found.
 */
function ownershipColumnExists(PDO $db, string $table, string $column): bool
{
    $columnLookup = $db->prepare(
        'SELECT COUNT(*)
         FROM information_schema.COLUMNS
         WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = ? AND COLUMN_NAME = ?'
    );
    $columnLookup->execute([$table, $column]);

    $matchCount = (int)$columnLookup->fetchColumn();

    return $matchCount > 0;
}
|
|
|
|
/**
 * Encodes an array as JSON, leaving slashes and non-ASCII characters unescaped.
 *
 * JSON_THROW_ON_ERROR is set so that an un-encodable value (for example a
 * string with malformed UTF-8) raises a JsonException. Without it
 * json_encode() returns false, which does not satisfy the declared string
 * return type.
 *
 * @param array $value Data to serialise.
 *
 * @return string The JSON document.
 *
 * @throws JsonException When the value cannot be encoded.
 */
function ownershipJson(array $value): string
{
    return json_encode(
        $value,
        JSON_THROW_ON_ERROR | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE
    );
}
|
|
|
|
/**
 * Normalises a schedule value to one of 'daily', 'weekly' or 'manual'.
 *
 * @param string|null $schedule Schedule value to normalise.
 *
 * @return string 'daily' or 'weekly' when given exactly that value; 'manual'
 *                for anything else, including null.
 */
function ownershipMapSchedule(?string $schedule): string
{
    $recurringSchedules = ['daily', 'weekly'];

    // Strict comparison keeps the exact, case-sensitive matching of the value.
    if (in_array($schedule, $recurringSchedules, true)) {
        return $schedule;
    }

    return 'manual';
}
|
|
|
|
/**
 * Builds the comma-separated tag string for a mirrored scraper row.
 *
 * The source's own comma-separated 'tags' value is split, trimmed and stripped
 * of empty entries, then three ownership markers are appended:
 * "corpus-source:<id>", "package:<slug>" and "owned-corpus:<slug>".
 * Duplicates are dropped, keeping the first occurrence, and the result is
 * joined with ", ".
 *
 * @param array  $source      Source row; reads 'tags' (optional) and 'id'.
 * @param string $packageSlug Slug written into the "package:" marker.
 * @param string $corpusSlug  Slug written into the "owned-corpus:" marker.
 *
 * @return string The de-duplicated tag list joined with ", ".
 */
function ownershipScraperTags(array $source, string $packageSlug, string $corpusSlug): string
{
    $rawTags = explode(',', (string)($source['tags'] ?? ''));

    $existingTags = array_filter(
        array_map(static fn (string $tag): string => trim($tag), $rawTags),
        static fn (string $tag): bool => $tag !== ''
    );

    $ownershipMarkers = [
        'corpus-source:' . (int)$source['id'],
        'package:' . $packageSlug,
        'owned-corpus:' . $corpusSlug,
    ];

    // array_unique() keeps the first occurrence, so a marker that is already
    // present among the source's own tags stays at its original position.
    return implode(', ', array_unique([...$existingTags, ...$ownershipMarkers]));
}
|
|
|
|
/**
 * Builds the description for a mirrored scraper row.
 *
 * The source's trimmed 'description' (when non-empty) is followed by a blank
 * line and a provenance sentence naming the package, the corpus and the
 * corpus_sources id. With no description, the provenance sentence is returned
 * on its own.
 *
 * @param array  $source      Source row; reads 'description' (optional) and 'id'.
 * @param string $packageSlug Package slug named in the provenance sentence.
 * @param string $corpusName  Corpus name shown in the provenance sentence.
 *
 * @return string The combined description text.
 */
function ownershipScraperDescription(array $source, string $packageSlug, string $corpusName): string
{
    $ownDescription = trim((string)($source['description'] ?? ''));

    $provenance = sprintf(
        'Mirrored from owned Caveau package %s / corpus %s / corpus_sources.id=%d.',
        $packageSlug,
        $corpusName,
        (int)$source['id']
    );

    if ($ownDescription === '') {
        return trim($provenance);
    }

    return trim($ownDescription . "\n\n" . $provenance);
}
|
|
|
|
// Package whose corpus is being claimed: the DBN_CAVEAU_OWNER_PACKAGE_SLUG
// value from dbnToolsEnv() when it is truthy, otherwise whatever
// dbnToolsRequiredPackageSlug() returns.
$packageSlug = dbnToolsEnv('DBN_CAVEAU_OWNER_PACKAGE_SLUG') ?: dbnToolsRequiredPackageSlug();

try {
    $db = dbnToolsDb();
    // All writes below share one transaction; the catch block rolls it back
    // if any step throws.
    $db->beginTransaction();

    // --- Resolve the client tenant and the package/corpus being claimed ---
    $client = dbnToolsFetchClient($db);
    if (!$client || empty($client['is_active'])) {
        throw new RuntimeException('Do Better Norge client tenant is not active or was not found.');
    }
    $clientId = (int)$client['id'];

    $package = dbnToolsFetchPackage($packageSlug, $db);
    if (!$package || empty($package['is_active']) || empty($package['corpus_id'])) {
        throw new RuntimeException("Active package {$packageSlug} with a corpus_id was not found.");
    }
    $packageId = (int)$package['id'];
    $corpusId = (int)$package['corpus_id'];

    // --- Make sure the client holds an active subscription to the package ---
    // The upsert re-activates an existing row and clears its cancelled_at.
    if (!dbnToolsHasActiveSubscription($clientId, $packageId, $db)) {
        $stmt = $db->prepare(
            'INSERT INTO client_corpus_subscriptions (client_id, package_id, is_active, source)
             VALUES (?, ?, 1, ?)
             ON DUPLICATE KEY UPDATE is_active = VALUES(is_active), source = VALUES(source), cancelled_at = NULL'
        );
        $stmt->execute([$clientId, $packageId, 'manual']);
    }

    $stmt = $db->prepare('SELECT id, slug, name FROM corpuses WHERE id = ? LIMIT 1');
    $stmt->execute([$corpusId]);
    $corpus = $stmt->fetch(PDO::FETCH_ASSOC);
    if (!$corpus) {
        throw new RuntimeException("Corpus #{$corpusId} was not found.");
    }

    // --- Stamp ownership on the corpus ---
    // Each owner_* column is only written when it exists in the schema, so the
    // script also works against a database that lacks some of these columns.
    $ownershipUpdates = [];
    $ownershipValues = [];
    if (ownershipColumnExists($db, 'corpuses', 'owner_type')) {
        $ownershipUpdates[] = 'owner_type = ?';
        $ownershipValues[] = 'client';
    }
    if (ownershipColumnExists($db, 'corpuses', 'owner_client_id')) {
        $ownershipUpdates[] = 'owner_client_id = ?';
        $ownershipValues[] = $clientId;
    }
    if (ownershipColumnExists($db, 'corpuses', 'owner_notes')) {
        $ownershipUpdates[] = 'owner_notes = ?';
        $ownershipValues[] = sprintf(
            'Owned and controlled by Do Better Norge (%s) via %s as of %s. Package: %s.',
            (string)$client['slug'],
            (string)($client['contact_email'] ?? 'daveadmin@dobetternorge.no'),
            date('Y-m-d'),
            $packageSlug
        );
    }

    if ($ownershipUpdates !== []) {
        $stmt = $db->prepare('UPDATE corpuses SET ' . implode(', ', $ownershipUpdates) . ' WHERE id = ?');
        $stmt->execute([...$ownershipValues, $corpusId]);
    }

    // --- Ensure the client's default working corpus exists and is the only default ---
    $stmt = $db->prepare('SELECT id FROM client_corpora WHERE client_id = ? AND slug = ? LIMIT 1');
    $stmt->execute([$clientId, 'do-better-norge']);
    $defaultCorpusId = (int)($stmt->fetchColumn() ?: 0);

    if ($defaultCorpusId > 0) {
        $db->prepare(
            'UPDATE client_corpora
             SET name = ?, description = ?, is_default = 1
             WHERE id = ? AND client_id = ?'
        )->execute([
            'Do Better Norge',
            'Client-owned Do Better Norge working corpus for documents, source review, and corpus governance.',
            $defaultCorpusId,
            $clientId,
        ]);
    } else {
        $stmt = $db->prepare(
            'INSERT INTO client_corpora (client_id, name, slug, description, is_default)
             VALUES (?, ?, ?, ?, 1)'
        );
        $stmt->execute([
            $clientId,
            'Do Better Norge',
            'do-better-norge',
            'Client-owned Do Better Norge working corpus for documents, source review, and corpus governance.',
        ]);
        $defaultCorpusId = (int)$db->lastInsertId();
    }

    // Clear the default flag on every other corpus of this client.
    $db->prepare('UPDATE client_corpora SET is_default = 0 WHERE client_id = ? AND id <> ?')
        ->execute([$clientId, $defaultCorpusId]);

    // --- Mirror the package corpus sources into client-owned scraper rows ---
    // Only website/pdf/rss sources are mirrored; the remaining types are
    // counted further down and reported as skipped.
    $stmt = $db->prepare(
        'SELECT id, name, description, url, scraper_type, schedule, cron_day, cron_hour,
                depth, pdf_only, is_active, category, language, tags, total_docs_ingested
         FROM corpus_sources
         WHERE corpus_id = ?
           AND scraper_type IN ("website", "pdf", "rss")
         ORDER BY is_active DESC, name ASC'
    );
    $stmt->execute([$corpusId]);
    $sources = $stmt->fetchAll(PDO::FETCH_ASSOC);

    // The three per-row statements are prepared once and re-executed per source.
    //
    // The lookup must match the "corpus-source:<id>" marker as a whole tag.
    // A substring LIKE match would let id 5 also hit "corpus-source:50" and
    // update the wrong mirrored row. Tags are stored joined with ", ", so
    // spaces are stripped before FIND_IN_SET compares comma-separated elements
    // (the marker itself contains neither spaces nor commas).
    $lookupStmt = $db->prepare(
        "SELECT id FROM client_scraper_sources
         WHERE client_id = ? AND FIND_IN_SET(?, REPLACE(tags, ' ', '')) > 0
         LIMIT 1"
    );
    $updateStmt = $db->prepare(
        'UPDATE client_scraper_sources
         SET name = ?, url = ?, description = ?, scraper_type = ?, schedule = ?,
             cron_day = ?, cron_hour = ?, max_depth = ?, max_pages = ?, pdf_only = ?,
             category = ?, language = ?, tags = ?, status = ?, approval_notes = ?,
             is_active = ?, total_docs_ingested = ?
         WHERE id = ? AND client_id = ?'
    );
    $insertStmt = $db->prepare(
        'INSERT INTO client_scraper_sources
            (client_id, name, url, description, scraper_type, schedule, cron_day, cron_hour,
             max_depth, max_pages, pdf_only, category, language, tags, status, approval_notes,
             approved_at, is_active, total_docs_ingested)
         VALUES
            (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW(), ?, ?)'
    );

    $inserted = 0;
    $updated = 0;
    foreach ($sources as $source) {
        $sourceId = (int)$source['id'];

        $lookupStmt->execute([$clientId, 'corpus-source:' . $sourceId]);
        $clientSourceId = (int)($lookupStmt->fetchColumn() ?: 0);
        // Release the result set so the statement can be executed again.
        $lookupStmt->closeCursor();

        $isActive = (int)!empty($source['is_active']);
        $status = $isActive === 1 ? 'approved' : 'paused';
        $schedule = ownershipMapSchedule((string)($source['schedule'] ?? 'manual'));
        $tags = ownershipScraperTags($source, $packageSlug, (string)$corpus['slug']);
        $description = ownershipScraperDescription($source, $packageSlug, (string)$corpus['name']);
        // PDF-type or pdf_only sources get a page cap of 100, everything else 50.
        $maxPages = ((string)($source['scraper_type'] ?? '') === 'pdf' || (int)($source['pdf_only'] ?? 0) === 1) ? 100 : 50;

        // Order matters: it must match both the UPDATE SET list and the INSERT
        // column list (minus client_id and approved_at).
        $values = [
            (string)$source['name'],
            (string)$source['url'],
            $description,
            (string)$source['scraper_type'],
            $schedule,
            (int)($source['cron_day'] ?? 0),
            (int)($source['cron_hour'] ?? 3),
            max(1, min(5, (int)($source['depth'] ?? 1))), // clamp depth to 1..5
            $maxPages,
            (int)($source['pdf_only'] ?? 0),
            (string)($source['category'] ?? 'legal'),
            (string)($source['language'] ?? 'no'),
            $tags,
            $status,
            'Approved as an owned Do Better Norge source mirrored from the package corpus.',
            $isActive,
            (int)($source['total_docs_ingested'] ?? 0),
        ];

        if ($clientSourceId > 0) {
            $updateStmt->execute([...$values, $clientSourceId, $clientId]);
            $updated++;
            continue;
        }

        $insertStmt->execute([$clientId, ...$values]);
        $inserted++;
    }

    // Count the sources that were not mirrored because of their scraper type.
    $stmt = $db->prepare(
        'SELECT COUNT(*) FROM corpus_sources
         WHERE corpus_id = ? AND scraper_type NOT IN ("website", "pdf", "rss")'
    );
    $stmt->execute([$corpusId]);
    $systemOnlySources = (int)$stmt->fetchColumn();

    $db->commit();

    echo "Caveau corpus ownership configured.\n";
    echo "Client: {$client['slug']} (#{$clientId})\n";
    echo "Package: {$packageSlug} (#{$packageId})\n";
    echo "Owned corpus: {$corpus['slug']} (#{$corpusId})\n";
    echo "Default client corpus: do-better-norge (#{$defaultCorpusId})\n";
    echo "Client scrapers mirrored: {$inserted} inserted, {$updated} updated\n";
    echo "System-only package sources skipped: {$systemOnlySources}\n";
} catch (Throwable $e) {
    // Undo any partial writes before reporting the failure.
    if (isset($db) && $db instanceof PDO && $db->inTransaction()) {
        $db->rollBack();
    }
    fwrite(STDERR, "Ownership setup failed: {$e->getMessage()}\n");
    exit(1);
}
|