getMessage(), $e->status, $e->errorCode); }

$clientId = (int)$tenant['client_id'];
$corpusId = (int)$tenant['corpus_id'];

dbnToolsBootCaveau();
$db = getDb();

// A multipart body carries a file upload; any other body is read as JSON.
$contentType = (string)($_SERVER['CONTENT_TYPE'] ?? '');
$isMultipart = stripos($contentType, 'multipart/form-data') === 0;

try {
    if ($isMultipart) {
        $result = handleFileUpload($db, $clientId, $corpusId);
    } else {
        $input = dbnToolsJsonInput(2_500_000);
        $kind = (string)($input['kind'] ?? 'text');
        // NOTE(review): dbnToolsError() is assumed not to return (it is used as a
        // match arm and as a guard throughout this file) — confirm in its definition.
        $result = match ($kind) {
            'text' => handleTextPaste($db, $clientId, $corpusId, $input),
            'url' => handleUrlImport($db, $clientId, $corpusId, $input),
            default => dbnToolsError('Unknown kind: ' . $kind, 400, 'unknown_kind'),
        };
    }
} catch (DbnToolsHttpException $e) {
    dbnToolsError($e->getMessage(), $e->status, $e->errorCode, $e->extra);
} catch (Throwable $e) {
    dbnToolsError('Upload failed: ' . $e->getMessage(), 500, 'upload_failed');
}

dbnToolsRespond($result, 201);

/**
 * Handles a multipart upload: extracts the text of $_FILES['file'], optionally
 * replaces it with OCR output for PDFs that yielded little text, then stores
 * and indexes the document.
 *
 * Reads optional form fields title, category, tags, author and language from $_POST.
 *
 * @return array Result of persistAndIngest().
 */
function handleFileUpload(PDO $db, int $clientId, int $corpusId): array
{
    if (empty($_FILES['file'])) {
        dbnToolsError('No file uploaded.', 400, 'missing_file');
    }

    $extract = dbnToolsExtractUploadedFile($_FILES['file']);
    $text = (string)$extract['text'];
    $filename = (string)$extract['filename'];
    $ext = strtolower(pathinfo($filename, PATHINFO_EXTENSION));

    $sourceType = match ($ext) {
        'pdf' => 'pdf',
        'docx' => 'docx',
        default => 'text',
    };

    // A PDF that produced fewer than 200 characters is treated as a probable
    // scan; OCR output is used only when it is longer than the extracted text.
    $importMethod = 'dbn_upload';
    if ($ext === 'pdf' && mb_strlen($text, 'UTF-8') < 200) {
        $ocrText = tryOcrPdf((string)($_FILES['file']['tmp_name'] ?? ''));
        if ($ocrText !== null && mb_strlen($ocrText, 'UTF-8') > mb_strlen($text, 'UTF-8')) {
            $text = $ocrText;
            $importMethod = 'ocr_scan';
        }
    }

    // The title falls back to the file name without its extension.
    $title = trim((string)($_POST['title'] ?? '')) ?: pathinfo($filename, PATHINFO_FILENAME);
    $category = sanitizeCategory((string)($_POST['category'] ?? 'uncategorized'));
    $tags = sanitizeTagsCsv((string)($_POST['tags'] ?? ''));
    $author = trim((string)($_POST['author'] ?? '')) ?: null;
    $language = trim((string)($_POST['language'] ?? 'no')) ?: 'no';

    return persistAndIngest($db, $clientId, $corpusId, [
        'title' => $title,
        'source_type' => $sourceType,
        'content' => $text,
        'category' => $category,
        'tags' => $tags,
        'author' => $author,
        'language' => $language,
        'import_method' => $importMethod,
        'original_filename' => $filename,
        'file_size_bytes' => (int)($_FILES['file']['size'] ?? 0),
        'source_tool' => 'dashboard-upload',
    ]);
}

/**
 * Handles a JSON "text" submission: validates title and content, then stores
 * and indexes the pasted text.
 *
 * @param array $input Decoded JSON body; reads title, content, category, tags,
 *                     author and language.
 * @return array Result of persistAndIngest().
 */
function handleTextPaste(PDO $db, int $clientId, int $corpusId, array $input): array
{
    $title = trim((string)($input['title'] ?? ''));
    $content = trim((string)($input['content'] ?? ''));

    if ($title === '') {
        dbnToolsError('title is required.', 400, 'missing_title');
    }

    // Length limits are measured in UTF-8 characters, not bytes.
    $length = mb_strlen($content, 'UTF-8');
    if ($length < 30) {
        dbnToolsError('content too short (min 30 chars).', 400, 'content_too_short');
    }
    if ($length > 2_000_000) {
        dbnToolsError('content exceeds 2 MB.', 400, 'content_too_large');
    }

    return persistAndIngest($db, $clientId, $corpusId, [
        'title' => $title,
        'source_type' => 'text',
        'content' => $content,
        'category' => sanitizeCategory((string)($input['category'] ?? 'uncategorized')),
        'tags' => sanitizeTagsCsv((string)($input['tags'] ?? '')),
        'author' => trim((string)($input['author'] ?? '')) ?: null,
        'language' => trim((string)($input['language'] ?? 'no')) ?: 'no',
        'import_method' => 'manual',
        'source_tool' => 'dashboard-paste',
    ]);
}

/**
 * Handles a JSON "url" submission: validates the URL and inserts a 'pending'
 * row with empty content. Nothing is fetched or indexed here.
 *
 * NOTE(review): only syntax and scheme are validated. Whatever later fetches
 * the URL should guard against internal/private hosts (SSRF) — confirm.
 *
 * @param array $input Decoded JSON body; reads url, title, category, tags and language.
 * @return array{ok: bool, document_id: int, status: string, chunks: int, note: string}
 */
function handleUrlImport(PDO $db, int $clientId, int $corpusId, array $input): array
{
    $url = trim((string)($input['url'] ?? ''));
    $title = trim((string)($input['title'] ?? ''));

    // FILTER_FLAG_SCHEME_REQUIRED was removed in PHP 8.0 and referencing it
    // throws an Error; FILTER_VALIDATE_URL already requires scheme and host.
    if ($url === '' || filter_var($url, FILTER_VALIDATE_URL) === false) {
        dbnToolsError('Valid URL is required.', 400, 'invalid_url');
    }

    $scheme = strtolower((string)parse_url($url, PHP_URL_SCHEME));
    if (!in_array($scheme, ['http', 'https'], true)) {
        dbnToolsError('URL must use http or https.', 400, 'invalid_scheme');
    }

    if ($title === '') {
        $title = $url;
    }

    $stmt = $db->prepare(
        " INSERT INTO client_documents"
        . " (client_id, corpus_id, title, source_type, source_url, content,"
        . " category, tags, language, import_method, source_tool, status)"
        . " VALUES (?, ?, ?, 'url', ?, '', ?, ?, ?, 'url', 'dashboard-url', 'pending') "
    );
    $stmt->execute([
        $clientId,
        $corpusId,
        $title,
        $url,
        sanitizeCategory((string)($input['category'] ?? 'uncategorized')),
        sanitizeTagsCsv((string)($input['tags'] ?? '')),
        trim((string)($input['language'] ?? 'no')) ?: 'no',
    ]);

    return [
        'ok' => true,
        'document_id' => (int)$db->lastInsertId(),
        'status' => 'pending',
        'chunks' => 0,
        'note' => 'URL queued for background ingest.',
    ];
}

/**
 * Inserts a document row with status 'pending', then runs it through
 * ClientRagPipeline::ingestDocument().
 *
 * If indexing throws, the row is kept, marked status='error' with the
 * (truncated) exception message, and an ok=false result is returned instead
 * of propagating the exception.
 *
 * @param array $doc Requires title, source_type, content, category, tags,
 *                   language, import_method and source_tool; optional
 *                   original_filename, file_size_bytes and author.
 * @return array On success: ok, document_id, chunks, status='ready', word_count.
 *               On indexing failure: ok=false, document_id, status='error', error.
 */
function persistAndIngest(PDO $db, int $clientId, int $corpusId, array $doc): array
{
    // str_word_count() is byte/locale based and splits words at non-ASCII
    // letters (e.g. æ, ø, å), so words are counted with a Unicode-aware pattern.
    // preg_match_all() returns false on invalid UTF-8; fall back in that case.
    $content = (string)$doc['content'];
    $matched = preg_match_all('/[\p{L}\p{N}]+(?:[\'\-][\p{L}\p{N}]+)*/u', $content);
    $wordCount = $matched === false ? str_word_count($content) : $matched;

    $stmt = $db->prepare(
        " INSERT INTO client_documents"
        . " (client_id, corpus_id, title, source_type, original_filename, file_size_bytes,"
        . " content, category, tags, author, language, import_method, source_tool, word_count, status)"
        . " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'pending') "
    );
    $stmt->execute([
        $clientId,
        $corpusId,
        $doc['title'],
        $doc['source_type'],
        $doc['original_filename'] ?? null,
        $doc['file_size_bytes'] ?? 0,
        $doc['content'],
        $doc['category'],
        $doc['tags'],
        $doc['author'] ?? null,
        $doc['language'],
        $doc['import_method'],
        $doc['source_tool'],
        $wordCount,
    ]);
    $docId = (int)$db->lastInsertId();

    try {
        $rag = new ClientRagPipeline($clientId);
        $chunks = $rag->ingestDocument($docId);

        return [
            'ok' => true,
            'document_id' => $docId,
            'chunks' => (int)$chunks,
            'status' => 'ready',
            'word_count' => $wordCount,
        ];
    } catch (Throwable $e) {
        // mb_strcut() keeps the 1000-byte budget but never cuts inside a
        // multibyte character, so the stored message stays valid UTF-8.
        $storedMessage = mb_strcut($e->getMessage(), 0, 1000, 'UTF-8');
        $db->prepare("UPDATE client_documents SET status='error', error_message=? WHERE id=?")
            ->execute([$storedMessage, $docId]);

        return [
            'ok' => false,
            'document_id' => $docId,
            'status' => 'error',
            'error' => [
                'code' => 'index_failed',
                'message' => 'Saved, but indexing failed: ' . $e->getMessage(),
            ],
        ];
    }
}

/**
 * Normalises a category slug: lower-cased, restricted to a-z, 0-9, '-' and '_',
 * at most 50 characters. Returns 'uncategorized' when nothing usable remains.
 */
function sanitizeCategory(string $cat): string
{
    $cleaned = preg_replace('/[^a-z0-9\-_]/', '', strtolower(trim($cat)));

    // Explicit check rather than ?: so that the valid slug "0" is kept.
    if ($cleaned === null || $cleaned === '') {
        return 'uncategorized';
    }

    // Only ASCII survives the filter above, so a byte-wise cut is safe here.
    return substr($cleaned, 0, 50);
}

/**
 * Normalises a comma-separated tag list: trims each tag, drops empty entries,
 * limits each tag to 32 characters and keeps at most the first 20 tags.
 *
 * @return string Tags re-joined with ',' (empty string when there are none).
 */
function sanitizeTagsCsv(string $raw): string
{
    $tags = [];
    foreach (explode(',', $raw) as $candidate) {
        $candidate = trim($candidate);
        // Compare against '' explicitly: a truthiness test would drop the tag "0".
        if ($candidate === '') {
            continue;
        }
        // Character-wise cut so multibyte characters are never split.
        $tags[] = mb_substr($candidate, 0, 32, 'UTF-8');
        if (count($tags) === 20) {
            break;
        }
    }

    return implode(',', $tags);
}

/**
 * Best-effort OCR of a PDF by piping pdftoppm output into tesseract.
 *
 * @param string $tmpPath Path of the uploaded PDF on disk.
 * @return string|null Trimmed OCR text, or null when the file is unreadable,
 *                     shell_exec/tesseract is unavailable, or no text came out.
 */
function tryOcrPdf(string $tmpPath): ?string
{
    if ($tmpPath === '' || !is_readable($tmpPath)) {
        return null;
    }
    if (!function_exists('shell_exec')) {
        return null;
    }

    // Skip quietly when tesseract is not installed; failures here must never
    // break the upload, hence the suppressed warnings.
    $located = @shell_exec('command -v tesseract 2>/dev/null');
    if (!is_string($located) || trim($located) === '') {
        return null;
    }

    $command = 'pdftoppm -r 200 ' . escapeshellarg($tmpPath) . ' - -png 2>/dev/null | '
        . 'tesseract -l nor+eng stdin stdout 2>/dev/null';
    $out = trim((string)@shell_exec($command));

    return $out !== '' ? $out : null;
}