feat(dashboard): add corpus dashboard at /dashboard/

Full private corpus dashboard for tools.dobetternorge.no users — each SSO
account gets an auto-provisioned CaveauAI tenant (clients row, corpus) on
first visit. Includes upload (file/paste/URL), RAG chat with SSE streaming
and citation chips, document CRUD, FalkorDB graph relations tab, and
improved save-from-tool flow with tag/preview support.

- dashboard/{index,documents,document,upload,chat,settings}.php
- api/dashboard/{corpus-init,documents,upload,ingest-status,chat-stream,
  save-from-tool,graph}.php
- includes/{CorpusProvision,layout_dashboard,layout_dashboard_footer}.php
- assets/css/dashboard.css  assets/js/corpus-save.js (routing upgrade)
- includes/{bootstrap,layout}.php extended for dashboard provisioning

Migration 141 (clients.dbn_sso_uid + import_method enum) applied on chloe.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-23 17:15:40 +02:00
parent 83fc71414f
commit 06d01a3bce
20 changed files with 2632 additions and 28 deletions
+110
View File
@@ -0,0 +1,110 @@
<?php
/**
* POST /api/dashboard/chat-stream.php (SSE)
*
* Streams a RAG chat answer using the user's private corpus + the dobetter
* legal package. Each output token is delivered as an SSE event named "token".
* On completion, sources, chunks_used, model, and response_time_ms are sent as a
* "done" event. Errors are sent as a "fail" event.
*
* Request body (JSON):
* {
* "question": "Hva sier barnevernloven § 4-12?",
* "history": [{role:"user"|"assistant", content:"..."}], // optional, capped at 8
* "category": "barnevern" (optional),
* "language": "no" | "en" (optional, default no)
* }
*/
declare(strict_types=1);
require_once dirname(__DIR__, 2) . '/includes/bootstrap.php';

// Only authenticated POST requests may open a chat stream.
dbnToolsRequireMethod('POST');
dbnToolsRequireAuth();

// Resolve the caller's dashboard tenant. The exception's extra payload is
// forwarded too, matching what corpus-init.php does for the same failure.
try {
    $tenant = dbnToolsEnsureDashboardTenant();
} catch (DbnToolsHttpException $e) {
    dbnToolsError($e->getMessage(), $e->status, $e->errorCode, $e->extra);
}
$clientId = (int)$tenant['client_id'];

// Request body is limited to 80 000 (argument of dbnToolsJsonInput).
$input = dbnToolsJsonInput(80_000);

$question = trim((string)($input['question'] ?? ''));
if ($question === '') {
    dbnToolsError('question is required.', 400, 'missing_question');
}
if (mb_strlen($question, 'UTF-8') > 4000) {
    dbnToolsError('question is too long (max 4000 chars).', 422, 'question_too_long');
}

// Conversation history: drop malformed turns FIRST, then keep the 8 most
// recent valid ones. Capping before validation would let invalid entries use
// up the budget and silently shorten the usable context.
$history = is_array($input['history'] ?? null) ? $input['history'] : [];
$history = array_values(array_filter(
    $history,
    static fn ($m): bool => is_array($m)
        && in_array($m['role'] ?? '', ['user', 'assistant'], true)
        && is_string($m['content'] ?? null)
));
$history = array_slice($history, -8);

// Empty category means "no category filter"; language falls back to 'no'.
$category = trim((string)($input['category'] ?? '')) ?: null;
$language = in_array($input['language'] ?? 'no', ['no', 'en'], true) ? $input['language'] : 'no';

// SSE setup: disable proxy buffering, PHP output buffering and compression so
// each event reaches the client as soon as it is echoed.
// NOTE(review): if bootstrap.php keeps a PHP session open, the session lock is
// held for the whole stream — confirm whether session_write_close() is needed.
header('Content-Type: text/event-stream');
header('Cache-Control: no-cache, no-transform');
header('X-Accel-Buffering: no');
@ini_set('output_buffering', 'off');
@ini_set('zlib.output_compression', '0');
while (ob_get_level() > 0) {
    ob_end_flush();
}
ob_implicit_flush(true);
/**
 * Write one Server-Sent Event to the response and flush it.
 *
 * The payload is JSON-encoded onto a single "data:" line. Invalid UTF-8 in the
 * payload (for example a streamed chunk that ends in the middle of a
 * multi-byte character) is replaced with U+FFFD instead of making json_encode()
 * fail, which previously produced an empty "data:" line.
 *
 * @param string $event SSE event name; CR/LF characters are removed.
 * @param array  $data  Payload to JSON-encode.
 */
function sseEmit(string $event, array $data): void
{
    // A line break inside the event name would end the field early and let the
    // remainder be parsed as additional SSE fields.
    $name = str_replace(["\r", "\n"], '', $event);

    $json = json_encode(
        $data,
        JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_SUBSTITUTE
    );
    if ($json === false) {
        // Still emit a well-formed frame if encoding fails for another reason.
        $json = '{"ok":false,"message":"Failed to encode event payload."}';
    }

    echo "event: {$name}\n";
    echo 'data: ' . $json . "\n\n";
    if (function_exists('flush')) {
        @flush();
    }
}
// Boot the CaveauAI runtime before using ClientRagPipeline.
// NOTE(review): presumably this call is what loads the pipeline class — confirm in bootstrap.php.
dbnToolsBootCaveau();
try {
    $pipeline = new ClientRagPipeline($clientId);
    $pipelineOptions = [
        'conversation_history' => $history,
        'language'             => $language,
        'user_id'              => (int)($tenant['client_user_id'] ?? 0),
        'user_role'            => 'owner',
    ];

    // Every non-empty piece of the answer is forwarded as a "token" event.
    $forwardToken = function (string $piece): void {
        if ($piece === '') {
            return;
        }
        sseEmit('token', ['t' => $piece]);
    };

    $answer = $pipeline->askStreaming(
        $question,
        null, // model: let pipeline choose default
        $category,
        $pipelineOptions,
        $forwardToken
    );

    // Reduce the retrieved chunks to the citation fields the client receives;
    // entries that are not arrays are skipped.
    $citations = [];
    foreach (($answer['fullChunks'] ?? $answer['chunks'] ?? []) as $hit) {
        if (is_array($hit)) {
            $score = isset($hit['score']) ? (float)$hit['score'] : null;
            $citations[] = [
                'document_id' => (int)($hit['document_id'] ?? 0),
                'title'       => (string)($hit['title'] ?? ''),
                'section'     => (string)($hit['section_title'] ?? $hit['section'] ?? ''),
                'source_url'  => (string)($hit['source_url'] ?? ''),
                'score'       => $score,
            ];
        }
    }

    // Final "done" event with the answer metadata and the citation list.
    $summary = [
        'ok'               => true,
        'chunks_used'      => (int)($answer['chunks_used'] ?? count($citations)),
        'model'            => (string)($answer['model'] ?? ''),
        'response_time_ms' => (int)($answer['response_time_ms'] ?? 0),
        'sources'          => $citations,
    ];
    sseEmit('done', $summary);
} catch (Throwable $failure) {
    // Headers are already sent, so failures are reported in-band as a "fail" event.
    sseEmit('fail', ['ok' => false, 'message' => $failure->getMessage()]);
}
exit;
+38
View File
@@ -0,0 +1,38 @@
<?php
/**
* GET /api/dashboard/corpus-init.php
*
* Idempotent: ensures the current session has a CaveauAI client tenant +
* default corpus, lazy-creating both on first hit. Safe to call on every
* dashboard page load (results are session-cached).
*
* Response:
* {
* "ok": true,
* "client_id": 102,
* "client_user_id": 257,
* "corpus_id": 18,
* "created": false
* }
*/
declare(strict_types=1);
require_once dirname(__DIR__, 2) . '/includes/bootstrap.php';

// Authenticated GET only.
dbnToolsRequireMethod('GET');
dbnToolsRequireAuth();

// Resolve the tenant for this session; HTTP-level failures are relayed to the
// client together with their extra payload.
try {
    $tenant = dbnToolsEnsureDashboardTenant();
} catch (DbnToolsHttpException $httpError) {
    dbnToolsError(
        $httpError->getMessage(),
        $httpError->status,
        $httpError->errorCode,
        $httpError->extra
    );
}

// Build the response: the three identifiers as integers, then the creation flag.
$payload = ['ok' => true];
foreach (['client_id', 'client_user_id', 'corpus_id'] as $idKey) {
    $payload[$idKey] = (int)$tenant[$idKey];
}
$payload['created'] = (bool)($tenant['created'] ?? false);

dbnToolsRespond($payload);
+249
View File
@@ -0,0 +1,249 @@
<?php
/**
* /api/dashboard/documents.php — CRUD for the current user's CaveauAI documents.
*
* GET ?action=list&offset=0&limit=20&q=&status=&category=
* → { ok, total, offset, limit, documents: [...] }
* GET ?action=get&id=123
* → { ok, document: {...}, chunks: [...] }
* POST ?action=update body: { id, title?, category?, tags?, language?, author? }
* → { ok, document: {...} }
* POST ?action=delete body: { ids: [1,2,3] }
* → { ok, deleted: N }
*
* All filtered by client_id from the dashboard session — no cross-tenant access possible.
*/
declare(strict_types=1);
require_once dirname(__DIR__, 2) . '/includes/bootstrap.php';
dbnToolsRequireAuth();

// Resolve the caller's tenant. The exception's extra payload is forwarded as
// well, matching corpus-init.php, so clients get the same error details here.
try {
    $tenant = dbnToolsEnsureDashboardTenant();
} catch (DbnToolsHttpException $e) {
    dbnToolsError($e->getMessage(), $e->status, $e->errorCode, $e->extra);
}
$clientId = (int)$tenant['client_id'];

// The action comes from the query string. A GET without one defaults to
// "list"; a POST without one falls through to the "Unknown action" error.
$method = strtoupper((string)($_SERVER['REQUEST_METHOD'] ?? 'GET'));
$action = (string)($_GET['action'] ?? ($method === 'POST' ? '' : 'list'));
$db = dbnToolsDb();

// Each action enforces its own HTTP method before its handler runs.
switch ($action) {
    case 'list':
        dbnToolsRequireMethod('GET');
        respondList($db, $clientId);
        break;
    case 'get':
        dbnToolsRequireMethod('GET');
        respondGet($db, $clientId);
        break;
    case 'update':
        dbnToolsRequireMethod('POST');
        respondUpdate($db, $clientId);
        break;
    case 'delete':
        dbnToolsRequireMethod('POST');
        respondDelete($db, $clientId);
        break;
    default:
        dbnToolsError('Unknown action.', 400, 'unknown_action');
}
/**
 * Respond with one page of the tenant's documents.
 *
 * Reads offset, limit, q, status and category from the query string. Every
 * query is restricted to $clientId. Results are ordered by id descending.
 *
 * @param PDO $db       Database connection.
 * @param int $clientId Tenant whose documents are listed.
 */
function respondList(PDO $db, int $clientId): void
{
    // Read a query parameter as a trimmed string; array-valued parameters
    // (e.g. ?q[]=x) count as absent instead of being cast to "Array".
    $param = static function (string $key): string {
        $value = $_GET[$key] ?? '';
        return is_scalar($value) ? trim((string)$value) : '';
    };

    $offset   = max(0, (int)$param('offset'));
    $limitRaw = $param('limit');
    $limit    = max(1, min(100, $limitRaw === '' ? 20 : (int)$limitRaw));
    $q        = $param('q');
    $status   = $param('status');
    $category = $param('category');

    $where  = ['client_id = ?'];
    $params = [$clientId];

    if ($q !== '') {
        $where[] = '(title LIKE ? OR tags LIKE ?)';
        // Escape the backslash as well as % and _. Without it a search term
        // ending in "\" would escape the closing wildcard of the pattern.
        $like = '%' . addcslashes($q, '\\%_') . '%';
        $params[] = $like;
        $params[] = $like;
    }

    // Unknown status values are ignored rather than rejected.
    $allowedStatus = ['pending', 'processing', 'ready', 'error'];
    if ($status !== '' && in_array($status, $allowedStatus, true)) {
        $where[] = 'status = ?';
        $params[] = $status;
    }

    if ($category !== '') {
        $where[] = 'category = ?';
        $params[] = $category;
    }

    $whereSql = 'WHERE ' . implode(' AND ', $where);

    $countStmt = $db->prepare("SELECT COUNT(*) FROM client_documents {$whereSql}");
    $countStmt->execute($params);
    $total = (int)$countStmt->fetchColumn();

    // $limit and $offset are clamped integers, so interpolating them is safe.
    $sql = "SELECT id, title, source_type, language, category, tags, author,
                   source_tool, import_method, status, word_count, chunk_count,
                   file_size_bytes, source_url, error_message,
                   created_at, updated_at
            FROM client_documents
            {$whereSql}
            ORDER BY id DESC
            LIMIT {$limit} OFFSET {$offset}";
    $stmt = $db->prepare($sql);
    $stmt->execute($params);
    $rows = $stmt->fetchAll(PDO::FETCH_ASSOC);

    dbnToolsRespond([
        'ok'        => true,
        'total'     => $total,
        'offset'    => $offset,
        'limit'     => $limit,
        'documents' => array_map('shapeDoc', $rows),
    ]);
}
/**
 * Respond with a single document (including its content) and up to 200 chunks.
 *
 * Sends 400 when id is missing and 404 when the document does not exist for
 * this tenant. The chunk lookup is best-effort: if it fails, the document is
 * still returned with an empty chunk list.
 *
 * @param PDO $db       Database connection.
 * @param int $clientId Tenant that must own the document.
 */
function respondGet(PDO $db, int $clientId): void
{
    $id = (int)($_GET['id'] ?? 0);
    if ($id <= 0) {
        dbnToolsError('id is required.', 400, 'missing_id');
    }

    $stmt = $db->prepare(
        'SELECT * FROM client_documents WHERE id = ? AND client_id = ? LIMIT 1'
    );
    $stmt->execute([$id, $clientId]);
    $doc = $stmt->fetch(PDO::FETCH_ASSOC);
    if (!$doc) {
        dbnToolsError('Document not found.', 404, 'not_found');
    }

    // prepare() is inside the try as well: with native prepared statements a
    // missing table or column already fails at prepare time, and that must not
    // break the best-effort contract of this lookup.
    $chunkRows = [];
    try {
        $chunkStmt = $db->prepare(
            'SELECT id, content, section_title
             FROM client_chunks
             WHERE client_id = ? AND document_id = ?
             ORDER BY id ASC
             LIMIT 200'
        );
        $chunkStmt->execute([$clientId, $id]);
        $chunkRows = $chunkStmt->fetchAll(PDO::FETCH_ASSOC);
    } catch (Throwable $e) {
        // Non-fatal, but leave a trace so a broken chunk table is noticed.
        error_log('documents.php get: chunk lookup failed for document ' . $id . ': ' . $e->getMessage());
        $chunkRows = [];
    }

    dbnToolsRespond([
        'ok'       => true,
        'document' => shapeDoc($doc) + ['content' => (string)($doc['content'] ?? '')],
        'chunks'   => $chunkRows,
    ]);
}
/**
 * Update the editable metadata of one document and respond with the new row.
 *
 * Editable fields: title, category, tags, language, author. Only fields present
 * in the JSON body are changed; an empty value is stored as NULL. Responds 404
 * when the document does not exist for this tenant (previously this returned
 * ok:true with a blank document).
 *
 * @param PDO $db       Database connection.
 * @param int $clientId Tenant that must own the document.
 */
function respondUpdate(PDO $db, int $clientId): void
{
    $input = dbnToolsJsonInput(20_000);
    $id = is_scalar($input['id'] ?? null) ? (int)$input['id'] : 0;
    if ($id <= 0) {
        dbnToolsError('id is required.', 400, 'missing_id');
    }

    // Editable column => maximum length in characters.
    $limits = [
        'title'    => 500,
        'category' => 50,
        'tags'     => 500,
        'language' => 10,
        'author'   => 200,
    ];

    $assignments = [];
    $params = [];
    foreach ($limits as $col => $max) {
        if (!array_key_exists($col, $input)) {
            continue;
        }
        $raw = $input[$col];
        // Arrays/objects would otherwise be cast to the literal string "Array".
        if ($raw !== null && !is_scalar($raw)) {
            dbnToolsError("Field {$col} must be a string.", 422, 'invalid_field');
        }
        $val = trim((string)$raw);
        if (mb_strlen($val, 'UTF-8') > $max) {
            dbnToolsError("Field {$col} exceeds {$max} chars.", 422, 'field_too_long');
        }
        $assignments[] = "{$col} = ?";
        // NOTE(review): an empty title is stored as NULL here, although every
        // create path requires a title — confirm the column allows NULL.
        $params[] = $val !== '' ? $val : null;
    }
    if (!$assignments) {
        dbnToolsError('No editable fields supplied.', 400, 'no_fields');
    }

    $params[] = $id;
    $params[] = $clientId;
    $stmt = $db->prepare(
        'UPDATE client_documents SET ' . implode(', ', $assignments)
        . ', updated_at = NOW() WHERE id = ? AND client_id = ?'
    );
    $stmt->execute($params);

    // Re-read the row. This also decides existence: the affected-row count is
    // not used because an UPDATE that changes nothing can legitimately report 0.
    $stmt = $db->prepare('SELECT * FROM client_documents WHERE id = ? AND client_id = ? LIMIT 1');
    $stmt->execute([$id, $clientId]);
    $doc = $stmt->fetch(PDO::FETCH_ASSOC);
    if (!$doc) {
        dbnToolsError('Document not found.', 404, 'not_found');
    }

    dbnToolsRespond(['ok' => true, 'document' => shapeDoc($doc)]);
}
/**
 * Delete up to 200 of the tenant's documents and their chunks.
 *
 * Body: { ids: [1,2,3] }. Only integers and digit-only strings greater than
 * zero are accepted as ids; anything else is ignored. (intval() on a nested
 * array yields 1, which previously could turn malformed input into a request
 * to delete document 1.) Responds with the number of deleted document rows.
 *
 * @param PDO $db       Database connection.
 * @param int $clientId Tenant whose documents may be deleted.
 */
function respondDelete(PDO $db, int $clientId): void
{
    $input = dbnToolsJsonInput(50_000);
    $rawIds = $input['ids'] ?? [];
    if (!is_array($rawIds) || !$rawIds) {
        dbnToolsError('ids array is required.', 400, 'missing_ids');
    }

    // Validate and de-duplicate (keyed by id), then reindex to a plain list.
    $ids = [];
    foreach ($rawIds as $raw) {
        if (!(is_int($raw) || is_string($raw))) {
            continue;
        }
        $id = filter_var($raw, FILTER_VALIDATE_INT, ['options' => ['min_range' => 1]]);
        if ($id !== false) {
            $ids[$id] = $id;
        }
    }
    $ids = array_values($ids);

    if (!$ids) {
        dbnToolsError('No valid ids.', 400, 'invalid_ids');
    }
    if (count($ids) > 200) {
        dbnToolsError('Cannot delete more than 200 documents at once.', 422, 'too_many');
    }

    $placeholders = implode(',', array_fill(0, count($ids), '?'));
    $bindings = array_merge([$clientId], $ids);

    $stmt = $db->prepare(
        "DELETE FROM client_documents
         WHERE client_id = ? AND id IN ({$placeholders})"
    );
    $stmt->execute($bindings);
    $deleted = $stmt->rowCount();

    // Chunk cleanup stays best-effort, but a failure is logged: leftover chunks
    // of a deleted document should not go unnoticed.
    try {
        $chunks = $db->prepare(
            "DELETE FROM client_chunks WHERE client_id = ? AND document_id IN ({$placeholders})"
        );
        $chunks->execute($bindings);
    } catch (Throwable $e) {
        error_log('documents.php delete: chunk cleanup failed for client ' . $clientId . ': ' . $e->getMessage());
    }

    dbnToolsRespond(['ok' => true, 'deleted' => $deleted]);
}
/**
 * Convert a client_documents row into the document shape used in API responses.
 *
 * Integer fields are cast to int (missing => 0), string fields to string
 * (missing => ''), and pass-through fields are returned as stored (missing =>
 * null). Keys not listed here are dropped; the output key order is fixed.
 *
 * @param array $row Database row (may be empty).
 * @return array Shaped document.
 */
function shapeDoc(array $row): array
{
    // Output field => conversion, in response order.
    $layout = [
        'id'              => 'int',
        'title'           => 'string',
        'source_type'     => 'string',
        'language'        => 'string',
        'category'        => 'string',
        'tags'            => 'string',
        'author'          => 'raw',
        'source_url'      => 'raw',
        'source_tool'     => 'raw',
        'import_method'   => 'string',
        'status'          => 'string',
        'word_count'      => 'int',
        'chunk_count'     => 'int',
        'file_size_bytes' => 'int',
        'error_message'   => 'raw',
        'created_at'      => 'string',
        'updated_at'      => 'string',
    ];

    $shaped = [];
    foreach ($layout as $field => $conversion) {
        $stored = $row[$field] ?? null;
        if ($conversion === 'int') {
            $shaped[$field] = (int)($stored ?? 0);
        } elseif ($conversion === 'string') {
            $shaped[$field] = (string)($stored ?? '');
        } else {
            $shaped[$field] = $stored;
        }
    }
    return $shaped;
}
+81
View File
@@ -0,0 +1,81 @@
<?php
/**
* GET /api/dashboard/graph.php?action=cites|cited_by|implements|chain&doc_id=N&limit=20&depth=2
*
* Wraps ai-portal/lib/ai/LegalGraphAgent for the dashboard. Reads the FalkorDB
* `bnl_legal` graph on Colin (10.0.2.10:6379). Public graph metadata — no
* sensitive content — but we still gate on dashboard auth to avoid being a
* generic open proxy.
*
* Response shape mirrors ai-portal/api/graph-search.php:
* { ok, action, doc_id, count, results: [ {rel_type, doc_id, title, ...}, ...] }
*/
declare(strict_types=1);
require_once dirname(__DIR__, 2) . '/includes/bootstrap.php';

dbnToolsRequireMethod('GET');
dbnToolsRequireAuth();
// Don't require dashboard provisioning here — graph is public metadata.

// Query parameters: limit is clamped to 1..100, depth to 1..3.
$action = trim((string)($_GET['action'] ?? ''));
$docId  = (int)($_GET['doc_id'] ?? 0);
$limit  = max(1, min(100, (int)($_GET['limit'] ?? 20)));
$depth  = max(1, min(3, (int)($_GET['depth'] ?? 2)));

$validActions = ['cites', 'cited_by', 'implements', 'chain'];
$actionIsKnown = in_array($action, $validActions, true);
if (!$actionIsKnown) {
    dbnToolsError(
        'action must be one of: ' . implode(', ', $validActions),
        400,
        'invalid_action',
        ['actions' => $validActions]
    );
}
if ($docId <= 0) {
    dbnToolsError('doc_id must be a positive integer.', 400, 'missing_doc_id');
}

// The graph classes live in the ai-portal tree; both files must be present.
$root = dbnToolsAiPortalRoot();
$graphFile = $root . '/lib/ai/GraphClient.php';
$agentFile = $root . '/lib/ai/LegalGraphAgent.php';
$backendInstalled = is_file($graphFile) && is_file($agentFile);
if (!$backendInstalled) {
    dbnToolsError('Graph backend not installed.', 503, 'graph_unavailable');
}
require_once $graphFile;
require_once $agentFile;

// Fields shared by the success response and the degraded response.
$envelope = [
    'ok'     => true,
    'action' => $action,
    'doc_id' => $docId,
];

try {
    // Connection settings: /etc/bnl/config.php wins, then environment, then defaults.
    $config = file_exists('/etc/bnl/config.php') ? include '/etc/bnl/config.php' : [];
    $host = (string)($config['falkordb']['host'] ?? dbnToolsEnv('DBN_FALKORDB_HOST', '10.0.2.10'));
    $port = (int)($config['falkordb']['port'] ?? (int)dbnToolsEnv('DBN_FALKORDB_PORT', '6379'));
    $pass = (string)($config['falkordb']['password'] ?? dbnToolsEnv('DBN_FALKORDB_PASSWORD', ''));

    $client = new GraphClient($host, $port, $pass);
    $agent  = new LegalGraphAgent($client);

    // One lazy query per action; only the requested one is executed.
    // "chain" takes the depth, every other action takes the limit.
    $queries = [
        'cites'      => fn () => $agent->cites($docId, $limit),
        'cited_by'   => fn () => $agent->citedBy($docId, $limit),
        'implements' => fn () => $agent->implements($docId, $limit),
        'chain'      => fn () => $agent->chain($docId, $depth),
    ];
    $results = $queries[$action]();
} catch (Throwable $e) {
    // A graph outage is reported as an empty result with a warning, not as an error.
    dbnToolsRespond($envelope + [
        'count'   => 0,
        'results' => [],
        'warning' => 'Graph backend unavailable: ' . $e->getMessage(),
    ]);
}

dbnToolsRespond($envelope + [
    'count'   => count($results),
    'results' => $results,
]);
+53
View File
@@ -0,0 +1,53 @@
<?php
/**
* GET /api/dashboard/ingest-status.php?ids=1,2,3
*
* Returns per-doc status for polling during URL ingest (background) or to
* surface error messages after a failed sync upload.
*
* Response:
* { ok, statuses: [ {id, status, chunk_count, error_message}, ... ] }
*/
declare(strict_types=1);
require_once dirname(__DIR__, 2) . '/includes/bootstrap.php';
dbnToolsRequireMethod('GET');
dbnToolsRequireAuth();

// Resolve the caller's tenant. The exception's extra payload is forwarded as
// well, matching corpus-init.php.
try {
    $tenant = dbnToolsEnsureDashboardTenant();
} catch (DbnToolsHttpException $e) {
    dbnToolsError($e->getMessage(), $e->status, $e->errorCode, $e->extra);
}
$clientId = (int)$tenant['client_id'];

// Accept "?ids=1,2,3" and also "?ids[]=1&ids[]=2". Ids are de-duplicated
// before the cap of 100 is applied so repeated ids cannot crowd out others.
$raw = $_GET['ids'] ?? '';
$parts = is_array($raw) ? $raw : explode(',', (string)$raw);
$ids = [];
foreach ($parts as $part) {
    if (!is_scalar($part)) {
        continue;
    }
    $id = (int)$part;
    if ($id > 0) {
        $ids[$id] = $id;
    }
}
$ids = array_slice(array_values($ids), 0, 100);

if (!$ids) {
    dbnToolsRespond(['ok' => true, 'statuses' => []]);
}

// Ids belonging to other tenants simply produce no row.
$db = dbnToolsDb();
$placeholders = implode(',', array_fill(0, count($ids), '?'));
$sql = "SELECT id, status, chunk_count, error_message
        FROM client_documents
        WHERE client_id = ? AND id IN ({$placeholders})";
$stmt = $db->prepare($sql);
$stmt->execute(array_merge([$clientId], $ids));
$rows = $stmt->fetchAll(PDO::FETCH_ASSOC);

dbnToolsRespond([
    'ok'       => true,
    'statuses' => array_map(static fn (array $r): array => [
        'id'            => (int)$r['id'],
        'status'        => (string)$r['status'],
        'chunk_count'   => (int)$r['chunk_count'],
        'error_message' => $r['error_message'] ?? null,
    ], $rows),
]);
+136
View File
@@ -0,0 +1,136 @@
<?php
/**
* POST /api/dashboard/save-from-tool.php
*
* Improved successor to /api/save-to-corpus.php — adds:
* - tags as either CSV string or array
* - source_tool slug recorded as import provenance
* - chat-answer kind (records import_method='chat_answer')
* - preview flag: if true, returns the proposed chunks WITHOUT persisting (dry-run)
*
* Request body (JSON, max 2 MB):
* title: string (required)
* content: string (required, min 30 chars)
* source_tool: string (optional slug; default 'dashboard-save')
* tags: string[] | string CSV (optional, max 20 tags, 32 chars each)
* category: string (optional; default 'tool-output')
* language: string (optional; default 'no')
* author: string (optional)
* kind: 'tool_output'|'chat_answer'|'manual' (default 'tool_output')
* preview: bool (optional; if true, return chunk preview without saving)
*
* Response (saved):
* { ok, document_id, chunks, status }
* Response (preview):
* { ok, preview:true, word_count, chunks: [...first 8 chunks...], total_chunks }
*/
declare(strict_types=1);
require_once dirname(__DIR__, 2) . '/includes/bootstrap.php';
dbnToolsRequireMethod('POST');
dbnToolsRequireAuth();

// Resolve the caller's tenant. The exception's extra payload is forwarded as
// well, matching corpus-init.php.
try {
    $tenant = dbnToolsEnsureDashboardTenant();
} catch (DbnToolsHttpException $e) {
    dbnToolsError($e->getMessage(), $e->status, $e->errorCode, $e->extra);
}
$clientId = (int)$tenant['client_id'];
$corpusId = (int)$tenant['corpus_id'];

$input = dbnToolsJsonInput(2_000_000);

// --- Required fields -------------------------------------------------------
$title = trim((string)($input['title'] ?? ''));
if ($title === '') {
    dbnToolsError('title is required.', 400, 'missing_title');
}
if (mb_strlen($title, 'UTF-8') > 500) {
    dbnToolsError('title too long (max 500).', 422, 'title_too_long');
}

$content = trim((string)($input['content'] ?? ''));
if (mb_strlen($content, 'UTF-8') < 30) {
    dbnToolsError('content too short (min 30 chars).', 400, 'content_too_short');
}
if (mb_strlen($content, 'UTF-8') > 1_900_000) {
    dbnToolsError('content exceeds 2 MB.', 422, 'content_too_large');
}

// --- Optional metadata -----------------------------------------------------
// source_tool is reduced to a lowercase slug of at most 64 characters.
$sourceTool = trim((string)($input['source_tool'] ?? 'dashboard-save'));
$sourceTool = substr(preg_replace('/[^a-z0-9\-_]/', '', strtolower($sourceTool)) ?: 'dashboard-save', 0, 64);

// Tags arrive as an array or a CSV string. Non-scalar entries are skipped
// (they used to become the literal tag "Array"). Tags are cut per character,
// not per byte, so multi-byte letters such as æ/ø/å are never split.
$rawTags = $input['tags'] ?? '';
$tagCandidates = is_array($rawTags) ? $rawTags : explode(',', (string)$rawTags);
$tagList = [];
foreach ($tagCandidates as $candidate) {
    if (!is_scalar($candidate)) {
        continue;
    }
    // A comma inside an array-form tag would split it once the list is stored as CSV.
    $tag = str_replace(',', ' ', (string)$candidate);
    $tag = trim(mb_substr(trim($tag), 0, 32, 'UTF-8'));
    if ($tag !== '') {
        $tagList[] = $tag;
    }
}
$tagList = array_slice($tagList, 0, 20);
$tagsCsv = implode(',', $tagList);

$category = strtolower(trim((string)($input['category'] ?? 'tool-output')));
$category = substr(preg_replace('/[^a-z0-9\-_]/', '', $category) ?: 'tool-output', 0, 50);
$language = trim((string)($input['language'] ?? 'no')) ?: 'no';
$author   = trim((string)($input['author'] ?? '')) ?: null;

// Unknown kinds fall back to 'tool_output'.
$kind = (string)($input['kind'] ?? 'tool_output');
$importMethod = match ($kind) {
    'chat_answer' => 'chat_answer',
    'manual'      => 'manual',
    default       => 'tool_output',
};
$preview = !empty($input['preview']);

// Unicode-aware word count. str_word_count() does not support multi-byte text
// and splits words at every non-ASCII letter. Falls back to it only when the
// text is not valid UTF-8 (preg_match_all() then returns false).
$countWords = static function (string $text): int {
    $found = preg_match_all('/[\p{L}\p{N}]+(?:[\'\-][\p{L}\p{N}]+)*/u', $text);
    return $found === false ? str_word_count($text) : $found;
};
$wordCount = $countWords($content);

dbnToolsBootCaveau();
try {
    if ($preview) {
        // Dry run: chunk the text and return the first 8 chunks; nothing is stored.
        require_once dbnToolsAiPortalRoot() . '/lib/ai/TextChunker.php';
        $chunker = new TextChunker();
        $chunks  = $chunker->chunk($content);
        $sample  = array_slice($chunks, 0, 8);
        dbnToolsRespond([
            'ok'           => true,
            'preview'      => true,
            'word_count'   => $wordCount,
            'chunks'       => array_map(static fn ($c): array => [
                'section_title' => (string)($c['section_title'] ?? ''),
                'word_count'    => $countWords((string)($c['content'] ?? '')),
                'snippet'       => mb_substr((string)($c['content'] ?? ''), 0, 240, 'UTF-8'),
            ], $sample),
            'total_chunks' => count($chunks),
        ]);
    }

    // Store the document as 'pending', then index it synchronously.
    $db = getDb();
    $ins = $db->prepare("
        INSERT INTO client_documents
            (client_id, corpus_id, title, source_type, content, category, language,
             tags, author, import_method, source_tool, word_count, status)
        VALUES (?, ?, ?, 'text', ?, ?, ?, ?, ?, ?, ?, ?, 'pending')
    ");
    $ins->execute([
        $clientId, $corpusId, $title, $content, $category, $language,
        $tagsCsv, $author, $importMethod, $sourceTool, $wordCount,
    ]);
    $docId = (int)$db->lastInsertId();

    $rag = new ClientRagPipeline($clientId);
    $chunks = $rag->ingestDocument($docId);

    dbnToolsRespond([
        'ok'          => true,
        'document_id' => $docId,
        'chunks'      => (int)$chunks,
        'status'      => 'ready',
    ], 201);
} catch (Throwable $e) {
    if (isset($docId)) {
        // The row exists but indexing failed: record the error on the document
        // and return its id so the client can still refer to it.
        try {
            // mb_strcut keeps the 1000-byte limit without cutting a character in half.
            $db->prepare("UPDATE client_documents SET status='error', error_message=? WHERE id=?")
                ->execute([mb_strcut($e->getMessage(), 0, 1000, 'UTF-8'), $docId]);
        } catch (Throwable $ignored) {
            /* non-fatal */
        }
        dbnToolsError(
            'Saved to corpus but indexing failed: ' . $e->getMessage(),
            500,
            'index_failed',
            ['document_id' => $docId]
        );
    }
    dbnToolsError('Save failed: ' . $e->getMessage(), 500, 'save_failed');
}
+241
View File
@@ -0,0 +1,241 @@
<?php
/**
* POST /api/dashboard/upload.php
*
* Three input modes:
* - multipart/form-data with `file` field (PDF/DOCX/TXT, <= 8 MB)
* - JSON body { "kind":"text", "title":..., "content":..., "category"?, "tags"?, "author"?, "language"? }
* - JSON body { "kind":"url", "title":..., "url":... } (fetched via ClientUniversalScraper; queued)
*
* For file + text: writes pending row, runs ClientRagPipeline::ingestDocument() synchronously,
* returns { ok, document_id, chunks, status }
* For url: writes pending row, returns immediately with status:'pending' — a separate cron job
* (run_client_one.php on the ai-portal) does the ingest.
*
* If a PDF's text extraction yields fewer than 200 chars, attempts OCR via the `pdftoppm` and `tesseract` shell utilities.
*/
declare(strict_types=1);
require_once dirname(__DIR__, 2) . '/includes/bootstrap.php';
dbnToolsRequireMethod('POST');
dbnToolsRequireAuth();

// Resolve the caller's tenant. The exception's extra payload is forwarded as
// well, matching the second catch below and corpus-init.php.
try {
    $tenant = dbnToolsEnsureDashboardTenant();
} catch (DbnToolsHttpException $e) {
    dbnToolsError($e->getMessage(), $e->status, $e->errorCode, $e->extra);
}
$clientId = (int)$tenant['client_id'];
$corpusId = (int)$tenant['corpus_id'];

dbnToolsBootCaveau();
$db = getDb();

// multipart/form-data => file upload; anything else is read as a JSON body
// whose "kind" selects text paste (the default) or URL import.
$contentType = (string)($_SERVER['CONTENT_TYPE'] ?? '');
$isMultipart = stripos($contentType, 'multipart/form-data') === 0;

try {
    if ($isMultipart) {
        $result = handleFileUpload($db, $clientId, $corpusId);
    } else {
        $input = dbnToolsJsonInput(2_500_000);
        $kind  = (string)($input['kind'] ?? 'text');
        $result = match ($kind) {
            'text'  => handleTextPaste($db, $clientId, $corpusId, $input),
            'url'   => handleUrlImport($db, $clientId, $corpusId, $input),
            default => dbnToolsError('Unknown kind: ' . $kind, 400, 'unknown_kind'),
        };
    }
} catch (DbnToolsHttpException $e) {
    dbnToolsError($e->getMessage(), $e->status, $e->errorCode, $e->extra);
} catch (Throwable $e) {
    dbnToolsError('Upload failed: ' . $e->getMessage(), 500, 'upload_failed');
}

// A handler reports "saved but indexing failed" as ok:false. That must not go
// out as 201 Created; use 500 like save-from-tool.php does for the same case.
// The body (including document_id) is unchanged.
$httpStatus = !empty($result['ok']) ? 201 : 500;
dbnToolsRespond($result, $httpStatus);
/**
 * Handle a multipart upload: extract text, fall back to OCR for PDFs with
 * little text, then store and index the document.
 *
 * Metadata (title, category, tags, author, language) is read from $_POST. The
 * title defaults to the file name without extension.
 *
 * @param PDO $db       Database connection.
 * @param int $clientId Owning tenant.
 * @param int $corpusId Target corpus.
 * @return array Result of persistAndIngest().
 */
function handleFileUpload(PDO $db, int $clientId, int $corpusId): array
{
    if (empty($_FILES['file'])) {
        dbnToolsError('No file uploaded.', 400, 'missing_file');
    }

    $extract  = dbnToolsExtractUploadedFile($_FILES['file']);
    $text     = (string)($extract['text'] ?? '');
    $filename = (string)($extract['filename'] ?? '');
    $ext      = strtolower(pathinfo($filename, PATHINFO_EXTENSION));
    $sourceType = match ($ext) {
        'pdf'   => 'pdf',
        'docx'  => 'docx',
        default => 'text',
    };

    // OCR fallback for PDFs with fewer than 200 extracted characters. The OCR
    // text is used only when it is longer than the extracted text.
    $importMethod = 'dbn_upload';
    if ($ext === 'pdf' && mb_strlen($text, 'UTF-8') < 200) {
        $tmpName = $_FILES['file']['tmp_name'] ?? '';
        $ocrText = tryOcrPdf(is_string($tmpName) ? $tmpName : '');
        if ($ocrText !== null && mb_strlen($ocrText, 'UTF-8') > mb_strlen($text, 'UTF-8')) {
            $text = $ocrText;
            $importMethod = 'ocr_scan';
        }
    }

    // Do not store a document that has no text at all.
    if (trim($text) === '') {
        dbnToolsError('No text could be extracted from the file.', 422, 'empty_document');
    }

    $title = trim((string)($_POST['title'] ?? '')) ?: pathinfo($filename, PATHINFO_FILENAME);
    // Same 500-character limit that save-from-tool.php enforces for titles.
    if (mb_strlen($title, 'UTF-8') > 500) {
        dbnToolsError('title too long (max 500).', 422, 'title_too_long');
    }
    $category = sanitizeCategory((string)($_POST['category'] ?? 'uncategorized'));
    $tags     = sanitizeTagsCsv((string)($_POST['tags'] ?? ''));
    $author   = trim((string)($_POST['author'] ?? '')) ?: null;
    $language = trim((string)($_POST['language'] ?? 'no')) ?: 'no';

    return persistAndIngest($db, $clientId, $corpusId, [
        'title'             => $title,
        'source_type'       => $sourceType,
        'content'           => $text,
        'category'          => $category,
        'tags'              => $tags,
        'author'            => $author,
        'language'          => $language,
        'import_method'     => $importMethod,
        'original_filename' => $filename,
        'file_size_bytes'   => (int)($_FILES['file']['size'] ?? 0),
        'source_tool'       => 'dashboard-upload',
    ]);
}
/**
 * Handle a pasted-text upload (JSON body with kind "text"): validate, then
 * store and index the text.
 *
 * @param PDO   $db       Database connection.
 * @param int   $clientId Owning tenant.
 * @param int   $corpusId Target corpus.
 * @param array $input    Decoded JSON body (title, content, category?, tags?, author?, language?).
 * @return array Result of persistAndIngest().
 */
function handleTextPaste(PDO $db, int $clientId, int $corpusId, array $input): array
{
    $title   = trim((string)($input['title'] ?? ''));
    $content = trim((string)($input['content'] ?? ''));

    if ($title === '') {
        dbnToolsError('title is required.', 400, 'missing_title');
    }
    // Same 500-character limit that save-from-tool.php enforces for titles.
    if (mb_strlen($title, 'UTF-8') > 500) {
        dbnToolsError('title too long (max 500).', 422, 'title_too_long');
    }
    if (mb_strlen($content, 'UTF-8') < 30) {
        dbnToolsError('content too short (min 30 chars).', 400, 'content_too_short');
    }
    if (mb_strlen($content, 'UTF-8') > 2_000_000) {
        dbnToolsError('content exceeds 2 MB.', 400, 'content_too_large');
    }

    // Tags may be a JSON array or a CSV string. Casting an array to string
    // would yield the single tag "Array", so join its scalar entries instead.
    $rawTags = $input['tags'] ?? '';
    $tagsCsv = is_array($rawTags)
        ? implode(',', array_filter($rawTags, 'is_scalar'))
        : (string)$rawTags;

    return persistAndIngest($db, $clientId, $corpusId, [
        'title'         => $title,
        'source_type'   => 'text',
        'content'       => $content,
        'category'      => sanitizeCategory((string)($input['category'] ?? 'uncategorized')),
        'tags'          => sanitizeTagsCsv($tagsCsv),
        'author'        => trim((string)($input['author'] ?? '')) ?: null,
        'language'      => trim((string)($input['language'] ?? 'no')) ?: 'no',
        'import_method' => 'manual',
        'source_tool'   => 'dashboard-paste',
    ]);
}
/**
 * Queue a URL for background ingest (JSON body with kind "url").
 *
 * Only validates the URL and inserts a 'pending' row with empty content; the
 * page is not fetched here.
 *
 * @param PDO   $db       Database connection.
 * @param int   $clientId Owning tenant.
 * @param int   $corpusId Target corpus.
 * @param array $input    Decoded JSON body (url, title?, category?, tags?, language?).
 * @return array { ok, document_id, status:'pending', chunks:0, note }
 */
function handleUrlImport(PDO $db, int $clientId, int $corpusId, array $input): array
{
    $url   = trim((string)($input['url'] ?? ''));
    $title = trim((string)($input['title'] ?? ''));

    // FILTER_FLAG_SCHEME_REQUIRED was removed in PHP 8.0 (scheme and host are
    // always required by FILTER_VALIDATE_URL). Referencing it throws an Error
    // on PHP 8, which made every URL import fail.
    if ($url === '' || filter_var($url, FILTER_VALIDATE_URL) === false) {
        dbnToolsError('Valid URL is required.', 400, 'invalid_url');
    }
    $scheme = strtolower((string)parse_url($url, PHP_URL_SCHEME));
    if (!in_array($scheme, ['http', 'https'], true)) {
        dbnToolsError('URL must use http or https.', 400, 'invalid_scheme');
    }

    // The URL is fetched server-side later, so refuse obvious internal targets:
    // localhost names and loopback/private/reserved IP literals.
    // NOTE(review): host names that resolve to internal addresses are not
    // caught here — the background fetcher must re-check after DNS resolution.
    $host = strtolower(trim((string)parse_url($url, PHP_URL_HOST), '[]'));
    $isIpLiteral = filter_var($host, FILTER_VALIDATE_IP) !== false;
    $isPublicIp  = filter_var(
        $host,
        FILTER_VALIDATE_IP,
        FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE
    ) !== false;
    if (
        $host === ''
        || $host === 'localhost'
        || str_ends_with($host, '.localhost')
        || ($isIpLiteral && !$isPublicIp)
    ) {
        dbnToolsError('URL host is not allowed.', 400, 'forbidden_host');
    }

    // Title defaults to the URL (cut to the 500-character title limit); an
    // explicit title is held to the limit save-from-tool.php enforces.
    if ($title === '') {
        $title = mb_substr($url, 0, 500, 'UTF-8');
    } elseif (mb_strlen($title, 'UTF-8') > 500) {
        dbnToolsError('title too long (max 500).', 422, 'title_too_long');
    }

    // Tags may be a JSON array or a CSV string; see sanitizeTagsCsv() for the limits.
    $rawTags = $input['tags'] ?? '';
    $tagsCsv = is_array($rawTags)
        ? implode(',', array_filter($rawTags, 'is_scalar'))
        : (string)$rawTags;

    $stmt = $db->prepare("
        INSERT INTO client_documents
            (client_id, corpus_id, title, source_type, source_url, content,
             category, tags, language, import_method, source_tool, status)
        VALUES (?, ?, ?, 'url', ?, '', ?, ?, ?, 'url', 'dashboard-url', 'pending')
    ");
    $stmt->execute([
        $clientId, $corpusId, $title, $url,
        sanitizeCategory((string)($input['category'] ?? 'uncategorized')),
        sanitizeTagsCsv($tagsCsv),
        trim((string)($input['language'] ?? 'no')) ?: 'no',
    ]);

    return [
        'ok'          => true,
        'document_id' => (int)$db->lastInsertId(),
        'status'      => 'pending',
        'chunks'      => 0,
        'note'        => 'URL queued for background ingest.',
    ];
}
/**
 * Insert a document as 'pending' and index it synchronously.
 *
 * On success returns { ok:true, document_id, chunks, status:'ready', word_count }.
 * If indexing throws, the row is marked status='error' with the message and
 * { ok:false, document_id, status:'error', error:{code,message} } is returned.
 * The row is kept either way.
 *
 * @param PDO   $db       Database connection.
 * @param int   $clientId Owning tenant.
 * @param int   $corpusId Target corpus.
 * @param array $doc      Keys: title, source_type, content, category, tags,
 *                        language, import_method, source_tool; optional
 *                        original_filename, file_size_bytes, author.
 * @return array Result payload as described above.
 */
function persistAndIngest(PDO $db, int $clientId, int $corpusId, array $doc): array
{
    // Unicode-aware word count. str_word_count() does not support multi-byte
    // text and splits words at every non-ASCII letter (æ, ø, å). It is used
    // only as a fallback when the content is not valid UTF-8.
    $content = (string)$doc['content'];
    $matched = preg_match_all('/[\p{L}\p{N}]+(?:[\'\-][\p{L}\p{N}]+)*/u', $content);
    $wordCount = $matched === false ? str_word_count($content) : $matched;

    $stmt = $db->prepare("
        INSERT INTO client_documents
            (client_id, corpus_id, title, source_type, original_filename, file_size_bytes,
             content, category, tags, author, language,
             import_method, source_tool, word_count, status)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'pending')
    ");
    $stmt->execute([
        $clientId,
        $corpusId,
        $doc['title'],
        $doc['source_type'],
        $doc['original_filename'] ?? null,
        $doc['file_size_bytes'] ?? 0,
        $doc['content'],
        $doc['category'],
        $doc['tags'],
        $doc['author'] ?? null,
        $doc['language'],
        $doc['import_method'],
        $doc['source_tool'],
        $wordCount,
    ]);
    $docId = (int)$db->lastInsertId();

    try {
        $rag = new ClientRagPipeline($clientId);
        $chunks = $rag->ingestDocument($docId);
        return [
            'ok'          => true,
            'document_id' => $docId,
            'chunks'      => (int)$chunks,
            'status'      => 'ready',
            'word_count'  => $wordCount,
        ];
    } catch (Throwable $e) {
        // Recording the failure is best-effort: if this UPDATE throws too, the
        // caller must still receive the index_failed result with the document id.
        try {
            // mb_strcut keeps the 1000-byte limit without cutting a character in half.
            $db->prepare("UPDATE client_documents SET status='error', error_message=? WHERE id=?")
                ->execute([mb_strcut($e->getMessage(), 0, 1000, 'UTF-8'), $docId]);
        } catch (Throwable $ignored) {
            error_log('upload.php: could not record index failure for document ' . $docId . ': ' . $ignored->getMessage());
        }
        return [
            'ok'          => false,
            'document_id' => $docId,
            'status'      => 'error',
            'error'       => ['code' => 'index_failed', 'message' => 'Saved, but indexing failed: ' . $e->getMessage()],
        ];
    }
}
/**
 * Reduce a category to a lowercase slug of at most 50 characters.
 *
 * Only a-z, 0-9, "-" and "_" are kept. If nothing usable remains (including
 * the single character "0", which PHP treats as falsy), 'uncategorized' is
 * returned.
 *
 * @param string $cat Raw category from the request.
 * @return string Sanitized category slug.
 */
function sanitizeCategory(string $cat): string
{
    $slug = preg_replace('/[^a-z0-9\-_]/', '', strtolower(trim($cat)));
    if (!$slug) {
        $slug = 'uncategorized';
    }
    return substr($slug, 0, 50);
}
/**
 * Normalize a comma-separated tag list: trim each tag, drop empty ones, limit
 * each tag to 32 characters and the list to 20 tags.
 *
 * Tags are cut per character (mb_substr), not per byte, so a multi-byte letter
 * such as æ/ø/å is never split into invalid UTF-8. Emptiness is tested
 * explicitly so the tag "0" is kept.
 *
 * NOTE(review): 20 tags of 32 characters can exceed the 500-character limit
 * that respondUpdate() in documents.php applies to tags — confirm the column size.
 *
 * @param string $raw Comma-separated tags from the request.
 * @return string Normalized comma-separated tags ('' when none remain).
 */
function sanitizeTagsCsv(string $raw): string
{
    $tags = [];
    foreach (explode(',', $raw) as $piece) {
        // Trim again after cutting: the cut may land right after a space.
        $tag = trim(mb_substr(trim($piece), 0, 32, 'UTF-8'));
        if ($tag === '') {
            continue;
        }
        $tags[] = $tag;
        if (count($tags) === 20) {
            break;
        }
    }
    return implode(',', $tags);
}
/**
 * Try to OCR a PDF by piping `pdftoppm` output into `tesseract`.
 *
 * Returns null when the path is empty or unreadable, shell_exec() is not
 * available, `tesseract` is not found on the PATH, or the pipeline produces
 * no text.
 *
 * NOTE(review): all rendered pages go through one stdin stream to a single
 * tesseract run — confirm that multi-page PDFs are fully recognized.
 *
 * @param string $tmpPath Path of the uploaded PDF.
 * @return string|null Recognized text (Norwegian + English models), or null.
 */
function tryOcrPdf(string $tmpPath): ?string
{
    $canRun = $tmpPath !== '' && is_readable($tmpPath) && function_exists('shell_exec');
    if (!$canRun) {
        return null;
    }

    // Only the presence of tesseract is probed; a missing pdftoppm simply
    // results in empty output below.
    $tesseractPath = @shell_exec('command -v tesseract 2>/dev/null');
    if (!$tesseractPath) {
        return null;
    }

    $pipeline = 'pdftoppm -r 200 ' . escapeshellarg($tmpPath) . ' - -png 2>/dev/null | '
        . 'tesseract -l nor+eng stdin stdout 2>/dev/null';
    $recognized = trim((string)@shell_exec($pipeline));

    return $recognized === '' ? null : $recognized;
}