06d01a3bce
Full private corpus dashboard for tools.dobetternorge.no users — each SSO
account gets an auto-provisioned CaveauAI tenant (clients row, corpus) on
first visit. Includes upload (file/paste/URL), RAG chat with SSE streaming
and citation chips, document CRUD, FalkorDB graph relations tab, and
improved save-from-tool flow with tag/preview support.
- dashboard/{index,documents,document,upload,chat,settings}.php
- api/dashboard/{corpus-init,documents,upload,ingest-status,chat-stream,
save-from-tool,graph}.php
- includes/{CorpusProvision,layout_dashboard,layout_dashboard_footer}.php
- assets/css/dashboard.css assets/js/corpus-save.js (routing upgrade)
- includes/{bootstrap,layout}.php extended for dashboard provisioning
Migration 141 (clients.dbn_sso_uid + import_method enum) applied on chloe.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
242 lines
8.9 KiB
PHP
242 lines
8.9 KiB
PHP
<?php
|
|
/**
|
|
* POST /api/dashboard/upload.php
|
|
*
|
|
* Three input modes:
|
|
* - multipart/form-data with `file` field (PDF/DOCX/TXT, <= 8 MB)
|
|
* - JSON body { "kind":"text", "title":..., "content":..., "category"?, "tags"?, "author"?, "language"? }
|
|
* - JSON body { "kind":"url", "title":..., "url":... } (fetched via ClientUniversalScraper; queued)
|
|
*
|
|
* For file + text: writes pending row, runs ClientRagPipeline::ingestDocument() synchronously,
|
|
* returns { ok, document_id, chunks, status }
|
|
* For url: writes pending row, returns immediately with status:'pending' — a separate cron job
|
|
* (run_client_one.php on the ai-portal) does the ingest.
|
|
*
|
|
* If PDF text extraction yields fewer than 200 chars, attempts OCR via the `tesseract` shell util (PDF uploads only).
|
|
*/
|
|
|
|
declare(strict_types=1);
|
|
|
|
require_once dirname(__DIR__, 2) . '/includes/bootstrap.php';
|
|
|
|
// Request gate: both helpers come from includes/bootstrap.php and are
// expected to stop the request for non-POST or unauthenticated callers.
dbnToolsRequireMethod('POST');
dbnToolsRequireAuth();

// Obtain the tenant record; an HTTP-level failure is reported with the
// exception's own status and error code.
try {
    $tenant = dbnToolsEnsureDashboardTenant();
} catch (DbnToolsHttpException $tenantError) {
    dbnToolsError($tenantError->getMessage(), $tenantError->status, $tenantError->errorCode);
}
$clientId = (int)$tenant['client_id'];
$corpusId = (int)$tenant['corpus_id'];

dbnToolsBootCaveau();
$db = getDb();

// A Content-Type that starts with "multipart/form-data" (case-insensitive)
// selects the file-upload path; everything else is parsed as a JSON body.
$multipartPrefix = 'multipart/form-data';
$contentType = (string)($_SERVER['CONTENT_TYPE'] ?? '');
$isMultipart = strncasecmp($contentType, $multipartPrefix, strlen($multipartPrefix)) === 0;

try {
    if (!$isMultipart) {
        // JSON mode: "kind" picks the handler and defaults to "text".
        $input = dbnToolsJsonInput(2_500_000);
        $kind = (string)($input['kind'] ?? 'text');
        $result = match ($kind) {
            'text' => handleTextPaste($db, $clientId, $corpusId, $input),
            'url' => handleUrlImport($db, $clientId, $corpusId, $input),
            default => dbnToolsError('Unknown kind: ' . $kind, 400, 'unknown_kind'),
        };
    } else {
        $result = handleFileUpload($db, $clientId, $corpusId);
    }
} catch (DbnToolsHttpException $httpError) {
    dbnToolsError($httpError->getMessage(), $httpError->status, $httpError->errorCode, $httpError->extra);
} catch (Throwable $unexpected) {
    // Anything that is not an HTTP-level exception is reported as a 500.
    dbnToolsError('Upload failed: ' . $unexpected->getMessage(), 500, 'upload_failed');
}

dbnToolsRespond($result, 201);
|
|
|
|
|
|
/**
 * Handles the multipart upload mode.
 *
 * Extracts text from `$_FILES['file']` via dbnToolsExtractUploadedFile(),
 * falls back to OCR for PDFs whose extracted text is shorter than 200
 * characters, reads the optional metadata fields from `$_POST`, and hands
 * the document to persistAndIngest().
 *
 * @param PDO $db       Database connection used for the insert.
 * @param int $clientId Owning client id.
 * @param int $corpusId Target corpus id.
 *
 * @return array Whatever persistAndIngest() returns.
 */
function handleFileUpload(PDO $db, int $clientId, int $corpusId): array
{
    if (empty($_FILES['file'])) {
        dbnToolsError('No file uploaded.', 400, 'missing_file');
    }

    $extracted = dbnToolsExtractUploadedFile($_FILES['file']);
    $body = (string)$extracted['text'];
    $originalName = (string)$extracted['filename'];
    $extension = strtolower(pathinfo($originalName, PATHINFO_EXTENSION));

    // Only pdf and docx get their own source type; everything else is "text".
    if ($extension === 'pdf') {
        $sourceType = 'pdf';
    } elseif ($extension === 'docx') {
        $sourceType = 'docx';
    } else {
        $sourceType = 'text';
    }

    // Default import method; switched to "ocr_scan" only when OCR output wins.
    $importMethod = 'dbn_upload';
    if ($extension === 'pdf' && mb_strlen($body, 'UTF-8') < 200) {
        $scanned = tryOcrPdf((string)($_FILES['file']['tmp_name'] ?? ''));
        // Keep the OCR result only if it is strictly longer than the extracted text.
        if ($scanned !== null && mb_strlen($scanned, 'UTF-8') > mb_strlen($body, 'UTF-8')) {
            $body = $scanned;
            $importMethod = 'ocr_scan';
        }
    }

    // Title falls back to the file name without its extension.
    $title = trim((string)($_POST['title'] ?? ''));
    if (!$title) {
        $title = pathinfo($originalName, PATHINFO_FILENAME);
    }

    $author = trim((string)($_POST['author'] ?? ''));
    if (!$author) {
        $author = null;
    }

    $language = trim((string)($_POST['language'] ?? 'no'));
    if (!$language) {
        $language = 'no';
    }

    $document = [
        'title' => $title,
        'source_type' => $sourceType,
        'content' => $body,
        'category' => sanitizeCategory((string)($_POST['category'] ?? 'uncategorized')),
        'tags' => sanitizeTagsCsv((string)($_POST['tags'] ?? '')),
        'author' => $author,
        'language' => $language,
        'import_method' => $importMethod,
        'original_filename' => $originalName,
        'file_size_bytes' => (int)($_FILES['file']['size'] ?? 0),
        'source_tool' => 'dashboard-upload',
    ];

    return persistAndIngest($db, $clientId, $corpusId, $document);
}
|
|
|
|
/**
 * Handles the JSON "text" mode (pasted content).
 *
 * Requires a non-empty `title` and a `content` of 30 to 2,000,000 characters
 * (measured after trimming), then hands the document to persistAndIngest()
 * with import method "manual".
 *
 * @param PDO   $db       Database connection used for the insert.
 * @param int   $clientId Owning client id.
 * @param int   $corpusId Target corpus id.
 * @param array $input    Decoded JSON request body.
 *
 * @return array Whatever persistAndIngest() returns.
 */
function handleTextPaste(PDO $db, int $clientId, int $corpusId, array $input): array
{
    $title = trim((string)($input['title'] ?? ''));
    $content = trim((string)($input['content'] ?? ''));
    $contentLength = mb_strlen($content, 'UTF-8');

    if ($title === '') {
        dbnToolsError('title is required.', 400, 'missing_title');
    }
    if ($contentLength < 30) {
        dbnToolsError('content too short (min 30 chars).', 400, 'content_too_short');
    }
    if ($contentLength > 2_000_000) {
        dbnToolsError('content exceeds 2 MB.', 400, 'content_too_large');
    }

    $author = trim((string)($input['author'] ?? ''));
    $language = trim((string)($input['language'] ?? 'no'));

    $document = [
        'title' => $title,
        'source_type' => 'text',
        'content' => $content,
        'category' => sanitizeCategory((string)($input['category'] ?? 'uncategorized')),
        'tags' => sanitizeTagsCsv((string)($input['tags'] ?? '')),
        // Blank author is stored as null; blank language falls back to "no".
        'author' => $author ? $author : null,
        'language' => $language ? $language : 'no',
        'import_method' => 'manual',
        'source_tool' => 'dashboard-paste',
    ];

    return persistAndIngest($db, $clientId, $corpusId, $document);
}
|
|
|
|
/**
 * Handles the JSON "url" mode: queues a URL for later ingestion.
 *
 * Validates that `url` is a well-formed http/https URL and inserts a
 * `client_documents` row with empty content and status 'pending'. Nothing is
 * fetched in this function; the row is only recorded.
 *
 * NOTE(review): the URL is presumably fetched server-side by a background
 * job later — confirm that fetcher refuses private/internal addresses, since
 * no host filtering happens here.
 *
 * @param PDO   $db       Database connection used for the insert.
 * @param int   $clientId Owning client id.
 * @param int   $corpusId Target corpus id.
 * @param array $input    Decoded JSON request body (`url`, optional `title`,
 *                        `category`, `tags`, `language`).
 *
 * @return array{ok: bool, document_id: int, status: string, chunks: int, note: string}
 */
function handleUrlImport(PDO $db, int $clientId, int $corpusId, array $input): array
{
    $url = trim((string)($input['url'] ?? ''));
    $title = trim((string)($input['title'] ?? ''));

    // FILTER_VALIDATE_URL always requires a scheme and host. The former
    // FILTER_FLAG_SCHEME_REQUIRED constant was removed in PHP 8.0 and
    // referencing it throws "Undefined constant", so no flag is passed.
    if ($url === '' || filter_var($url, FILTER_VALIDATE_URL) === false) {
        dbnToolsError('Valid URL is required.', 400, 'invalid_url');
    }

    // Only web URLs are accepted (rejects ftp:, file:, mailto:, ...).
    $scheme = strtolower((string)parse_url($url, PHP_URL_SCHEME));
    if (!in_array($scheme, ['http', 'https'], true)) {
        dbnToolsError('URL must use http or https.', 400, 'invalid_scheme');
    }

    // An untitled import is labelled with its URL.
    if ($title === '') {
        $title = $url;
    }

    $stmt = $db->prepare("
        INSERT INTO client_documents
            (client_id, corpus_id, title, source_type, source_url, content,
             category, tags, language, import_method, source_tool, status)
        VALUES (?, ?, ?, 'url', ?, '', ?, ?, ?, 'url', 'dashboard-url', 'pending')
    ");
    $stmt->execute([
        $clientId, $corpusId, $title, $url,
        sanitizeCategory((string)($input['category'] ?? 'uncategorized')),
        sanitizeTagsCsv((string)($input['tags'] ?? '')),
        trim((string)($input['language'] ?? 'no')) ?: 'no',
    ]);

    return [
        'ok' => true,
        'document_id' => (int)$db->lastInsertId(),
        'status' => 'pending',
        'chunks' => 0,
        'note' => 'URL queued for background ingest.',
    ];
}
|
|
|
|
/**
 * Stores a document row as 'pending' and indexes it synchronously.
 *
 * Inserts into `client_documents`, then runs
 * ClientRagPipeline::ingestDocument() on the new row. If indexing throws,
 * the row is marked status='error' with the (truncated) exception message
 * and an `ok: false` result is returned instead of propagating the error.
 *
 * @param PDO   $db       Database connection.
 * @param int   $clientId Owning client id.
 * @param int   $corpusId Target corpus id.
 * @param array $doc      Document fields: title, source_type, content,
 *                        category, tags, language, import_method,
 *                        source_tool; optional original_filename,
 *                        file_size_bytes, author.
 *
 * @return array On success: ok, document_id, chunks, status='ready',
 *               word_count. On indexing failure: ok=false, document_id,
 *               status='error', error{code, message}.
 */
function persistAndIngest(PDO $db, int $clientId, int $corpusId, array $doc): array
{
    $content = $doc['content'];

    // str_word_count() is byte/locale based and splits words at multi-byte
    // letters (æ, ø, å, ...), inflating the count for UTF-8 text. Count runs
    // of Unicode letters instead, allowing inner apostrophes and hyphens.
    $wordCount = preg_match_all('/\p{L}[\p{L}\p{M}\'\-]*/u', $content);
    if ($wordCount === false) {
        // Invalid UTF-8 makes the /u pattern fail; fall back to the byte-wise count.
        $wordCount = str_word_count($content);
    }

    $stmt = $db->prepare("
        INSERT INTO client_documents
            (client_id, corpus_id, title, source_type, original_filename, file_size_bytes,
             content, category, tags, author, language,
             import_method, source_tool, word_count, status)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'pending')
    ");
    $stmt->execute([
        $clientId,
        $corpusId,
        $doc['title'],
        $doc['source_type'],
        $doc['original_filename'] ?? null,
        $doc['file_size_bytes'] ?? 0,
        $content,
        $doc['category'],
        $doc['tags'],
        $doc['author'] ?? null,
        $doc['language'],
        $doc['import_method'],
        $doc['source_tool'],
        $wordCount,
    ]);
    $docId = (int)$db->lastInsertId();

    try {
        $rag = new ClientRagPipeline($clientId);
        $chunks = $rag->ingestDocument($docId);

        return [
            'ok' => true,
            'document_id' => $docId,
            'chunks' => (int)$chunks,
            'status' => 'ready',
            'word_count' => $wordCount,
        ];
    } catch (Throwable $e) {
        // mb_strcut() keeps the message within 1000 bytes without splitting a
        // multi-byte character, which a plain substr() could do.
        $db->prepare("UPDATE client_documents SET status='error', error_message=? WHERE id=?")
            ->execute([mb_strcut($e->getMessage(), 0, 1000, 'UTF-8'), $docId]);

        return [
            'ok' => false,
            'document_id' => $docId,
            'status' => 'error',
            'error' => ['code' => 'index_failed', 'message' => 'Saved, but indexing failed: ' . $e->getMessage()],
        ];
    }
}
|
|
|
|
/**
 * Normalizes a category name into a short slug.
 *
 * Lower-cases and trims the input, removes every character other than
 * a-z, 0-9, '-' and '_', and caps the result at 50 characters. Returns
 * 'uncategorized' when nothing is left (or the regex fails).
 *
 * @param string $cat Raw category from the request.
 *
 * @return string Slug of at most 50 ASCII characters.
 */
function sanitizeCategory(string $cat): string
{
    $slug = preg_replace('/[^a-z0-9\-_]/', '', strtolower(trim($cat)));

    // Compare explicitly: a truthiness test would also discard the valid slug "0".
    if ($slug === null || $slug === '') {
        return 'uncategorized';
    }

    // Byte-wise substr() is safe here because the slug is ASCII-only.
    return substr($slug, 0, 50);
}
|
|
|
|
/**
 * Normalizes a comma-separated tag list.
 *
 * Splits on commas, trims each tag, drops empty entries, truncates each tag
 * to 32 characters and keeps at most the first 20 tags.
 *
 * @param string $raw Raw comma-separated tags from the request.
 *
 * @return string Cleaned tags joined with ',' ('' when none remain).
 */
function sanitizeTagsCsv(string $raw): string
{
    $tags = [];
    foreach (explode(',', $raw) as $piece) {
        $tag = trim($piece);
        // Only truly empty entries are dropped, so a tag such as "0" survives.
        if ($tag === '') {
            continue;
        }

        // Character-aware truncation: byte-wise substr() could cut a
        // multi-byte character (æ, ø, å, ...) in half and emit invalid UTF-8.
        $tags[] = mb_substr($tag, 0, 32, 'UTF-8');

        if (count($tags) === 20) {
            break;
        }
    }

    return implode(',', $tags);
}
|
|
|
|
/**
 * Best-effort OCR of a PDF through the `pdftoppm | tesseract` shell pipeline.
 *
 * Gives up (returns null) when the path is empty or unreadable, when
 * shell_exec() is unavailable, when `tesseract` is not found on the PATH, or
 * when the pipeline prints nothing.
 *
 * @param string $tmpPath Filesystem path of the PDF to scan.
 *
 * @return string|null Trimmed OCR output, or null when no text was obtained.
 */
function tryOcrPdf(string $tmpPath): ?string
{
    $canRead = $tmpPath !== '' && is_readable($tmpPath);
    if (!$canRead || !function_exists('shell_exec')) {
        return null;
    }

    // Probe for the tesseract binary before running the heavier pipeline.
    $tesseractPath = @shell_exec('command -v tesseract 2>/dev/null');
    if (!$tesseractPath) {
        return null;
    }

    // Render the PDF to PNG at 200 dpi on stdout and pipe it into tesseract
    // (Norwegian + English language data); stderr of both tools is discarded.
    $command = 'pdftoppm -r 200 ' . escapeshellarg($tmpPath) . ' - -png 2>/dev/null | '
        . 'tesseract -l nor+eng stdin stdout 2>/dev/null';
    $recognized = trim((string)@shell_exec($command));

    if ($recognized === '') {
        return null;
    }

    return $recognized;
}
|