Add Deep Research tool — agent + rank/rerank RAG

New surface at /deep-research.php where the user pastes a question or
uploads PDF/DOCX/TXT case files and an LLM-orchestrated agent researches
the Do Better Norge legal corpus from 3-5 angles, with hybrid retrieval,
cross-encoder rerank, and synthesis that emits an inline-[n]-cited
markdown brief plus a numbered sources panel.

Uploaded documents are chunked + embedded in memory only (nomic-embed-text
via LiteLLM) and searched alongside the shared corpus during the same
request — never persisted to disk, DB, or Qdrant.

Reuses ClientRagPipeline::searchAll (hybrid + rerank), dbnV6 slice
helpers, and the existing extract.php text-extraction logic via a new
dbnToolsExtractUploadedFile() helper. Also adds dbnToolsCallGpuLlm()
helper in bootstrap.php — fixes a latent bug where LegalTools.php
was already calling that name with no definition.

Search.php is unchanged.
This commit is contained in:
2026-05-15 10:30:47 +02:00
parent 55e11cb649
commit 4cbe0a4ac4
10 changed files with 2119 additions and 125 deletions
+67
View File
@@ -0,0 +1,67 @@
<?php
declare(strict_types=1);
require_once __DIR__ . '/../includes/bootstrap.php';
require_once __DIR__ . '/../includes/DeepResearchAgent.php';
// Gate the endpoint: POST only, authenticated callers only.
dbnToolsRequireMethod('POST');
dbnToolsRequireAuth();

// Requests that carry file uploads arrive as multipart/form-data with the
// JSON options tucked into a "payload" form field; every other request is
// read through dbnToolsJsonInput().
$contentType = (string)($_SERVER['CONTENT_TYPE'] ?? '');
$isMultipart = stripos($contentType, 'multipart/form-data') !== false;

if (!$isMultipart) {
    $input = dbnToolsJsonInput(120000);
} else {
    $payloadRaw = (string)($_POST['payload'] ?? '');
    if ($payloadRaw === '') {
        dbnToolsError('Multipart request is missing the "payload" JSON field.', 422, 'missing_payload');
    }

    // The payload must decode to an array (JSON object or list).
    $input = json_decode($payloadRaw, true);
    if (!is_array($input)) {
        dbnToolsError('Multipart "payload" field must be valid JSON.', 422, 'invalid_payload_json');
    }
}
$language = dbnToolsNormalizeLanguage($input['language'] ?? 'en');

// Run the whole research request inside the telemetry wrapper so the
// 'deep_research' tool invocation is recorded together with its language.
dbnToolsWithTelemetry('deep_research', $language, function () use ($input, $language) {
    // Request fields. The numeric arguments are the limits passed to
    // dbnToolsString(); both fields are passed with the final flag set to false.
    $seedQuery = dbnToolsString($input, 'query', 4000, false);
    $pastedText = dbnToolsString($input, 'paste_text', 64000, false);
    $sliceInput = $input['slices'] ?? null;
    $engine = (string)($input['engine'] ?? 'azure_mini');
    $controls = is_array($input['controls'] ?? null) ? $input['controls'] : [];

    // Collect uploads sent as files[] (PHP exposes them as parallel arrays
    // keyed by slot: name[], type[], tmp_name[], error[], size[]).
    $uploadedFiles = [];
    $upload = $_FILES['files'] ?? null;
    if (is_array($upload) && is_array($upload['tmp_name'] ?? null)) {
        // A form that includes the file input but leaves it empty still
        // produces one slot with UPLOAD_ERR_NO_FILE. Such slots are not real
        // uploads: skip them so they neither count toward the limit nor get
        // handed to the extractor (where they would fail the whole request).
        // Iterating by key also copes with non-sequential slot keys.
        $slots = [];
        foreach (array_keys($upload['tmp_name']) as $slot) {
            $errorCode = (int)($upload['error'][$slot] ?? UPLOAD_ERR_NO_FILE);
            if ($errorCode !== UPLOAD_ERR_NO_FILE) {
                $slots[] = $slot;
            }
        }

        if (count($slots) > 5) {
            dbnToolsAbort('At most 5 files can be uploaded per request.', 413, 'too_many_files');
        }

        foreach ($slots as $slot) {
            // Re-shape the parallel arrays into the single-file layout that
            // dbnToolsExtractUploadedFile() is given elsewhere ($_FILES['file']).
            $file = [
                'name' => $upload['name'][$slot] ?? '',
                'type' => $upload['type'][$slot] ?? '',
                'tmp_name' => $upload['tmp_name'][$slot] ?? '',
                'error' => $upload['error'][$slot] ?? UPLOAD_ERR_NO_FILE,
                'size' => $upload['size'][$slot] ?? 0,
            ];
            $extracted = dbnToolsExtractUploadedFile($file);

            // Keep only the fields the agent consumes.
            $uploadedFiles[] = [
                'filename' => $extracted['filename'],
                'text' => $extracted['text'],
                'chars' => $extracted['chars'],
                'truncated' => $extracted['truncated'],
            ];
        }
    }

    // Hand everything to the agent; its return value becomes the closure's
    // result for dbnToolsWithTelemetry().
    return (new DbnDeepResearchAgent())->run(
        $seedQuery,
        $pastedText,
        $uploadedFiles,
        is_array($sliceInput) ? $sliceInput : [],
        $engine,
        $language,
        $controls
    );
});
+2 -118
View File
@@ -6,132 +6,16 @@ require_once __DIR__ . '/../includes/bootstrap.php';
// Text-extraction endpoint: accepts one uploaded file in the "file" field and
// responds with its extracted text. POST only, authenticated callers only.
dbnToolsRequireMethod('POST');
dbnToolsRequireAuth();
// Upload size cap in bytes (4 * 1024 * 1024).
const EXTRACT_MAX_BYTES = 4 * 1024 * 1024;
// Maximum number of characters (measured with mb_strlen) returned to the caller.
const EXTRACT_TEXT_LIMIT = 128000;
// File extensions accepted by the endpoint, compared in lower case.
const EXTRACT_ALLOWED_EXTS = ['txt', 'pdf', 'docx'];
try {
// The upload must be present and in PHP's single-file array layout.
if (empty($_FILES['file']) || !is_array($_FILES['file'])) {
dbnToolsError('No file was uploaded.', 422, 'missing_file');
}
$file = $_FILES['file'];
// Translate PHP's upload error code into a caller-facing message.
$errCode = (int)($file['error'] ?? UPLOAD_ERR_NO_FILE);
if ($errCode !== UPLOAD_ERR_OK) {
$msg = match ($errCode) {
UPLOAD_ERR_INI_SIZE, UPLOAD_ERR_FORM_SIZE => 'The file exceeds the allowed size limit.',
UPLOAD_ERR_NO_TMP_DIR => 'No temporary directory is available.',
UPLOAD_ERR_CANT_WRITE => 'Unable to save the uploaded file.',
default => 'File upload failed.',
};
dbnToolsError($msg, 422, 'upload_error');
}
// basename() strips any directory components from the client-supplied name.
$originalName = basename((string)($file['name'] ?? ''));
$tmpPath = (string)($file['tmp_name'] ?? '');
$size = (int)($file['size'] ?? 0);
// Refuse paths that did not come from an HTTP POST upload in this request.
if (!is_uploaded_file($tmpPath)) {
dbnToolsError('Invalid file upload.', 400, 'invalid_upload');
}
if ($size === 0) {
dbnToolsError('The uploaded file is empty.', 422, 'file_empty');
}
if ($size > EXTRACT_MAX_BYTES) {
dbnToolsError('File exceeds the 4 MB limit.', 413, 'file_too_large');
}
// The file type is decided from the (lower-cased) filename extension only.
$ext = strtolower(pathinfo($originalName, PATHINFO_EXTENSION));
if (!in_array($ext, EXTRACT_ALLOWED_EXTS, true)) {
dbnToolsError('Unsupported file type. Upload a .pdf, .docx, or .txt file.', 422, 'unsupported_type');
}
// Dispatch to the per-format extractor. The match has no default arm, so an
// extension outside these three would raise UnhandledMatchError.
$text = match ($ext) {
'txt' => extractTxt($tmpPath),
'pdf' => extractPdf($tmpPath),
'docx' => extractDocx($tmpPath),
};
$text = trim($text);
if ($text === '') {
dbnToolsError('No text could be extracted from this file.', 422, 'no_text');
}
// Cap the returned text at EXTRACT_TEXT_LIMIT characters and flag the cut.
$truncated = false;
if (mb_strlen($text, 'UTF-8') > EXTRACT_TEXT_LIMIT) {
$text = mb_substr($text, 0, EXTRACT_TEXT_LIMIT, 'UTF-8');
$truncated = true;
}
// 'chars' reports the length of the (possibly truncated) text.
dbnToolsRespond([
'ok' => true,
'text' => $text,
'filename' => $originalName,
'chars' => mb_strlen($text, 'UTF-8'),
'truncated' => $truncated,
]);
// NOTE(review): the two statements below repeat the extract-and-respond step
// through dbnToolsExtractUploadedFile() after a response has already been
// sent above. If dbnToolsRespond() ends the request they never run — confirm
// which of the two paths is meant to remain.
$result = dbnToolsExtractUploadedFile($_FILES['file']);
dbnToolsRespond($result);
} catch (DbnToolsHttpException $e) {
// Extractor failures carry their own HTTP status, error code and extras.
dbnToolsError($e->getMessage(), $e->status, $e->errorCode, $e->extra);
} catch (Throwable $e) {
// Anything unexpected is logged server-side and reported as a generic 500.
error_log('DBN extract error: ' . $e->getMessage());
dbnToolsError('Text extraction failed.', 500, 'extract_error');
}
/**
 * Read a plain-text file and return its contents as UTF-8.
 *
 * Content that is already valid UTF-8 is returned unchanged (minus a leading
 * byte-order mark). Anything else is treated as Windows-1252, which is a
 * superset of the printable ISO-8859-1 range and additionally maps the
 * 0x80-0x9F bytes to typographic characters (curly quotes, dashes, the euro
 * sign). This replaces an auto-detect list in which ISO-8859-1 preceded
 * Windows-1252, making the outcome depend on mbstring's detection behaviour.
 *
 * @param string $path Filesystem path of the file to read.
 *
 * @return string The file contents encoded as UTF-8.
 *
 * @throws DbnToolsHttpException With status 500 / 'read_error' when the file cannot be read.
 */
function extractTxt(string $path): string
{
    $content = file_get_contents($path);
    if ($content === false) {
        throw new DbnToolsHttpException('Unable to read the file.', 500, 'read_error');
    }

    // A UTF-8 BOM is not whitespace, so trim() in callers would leave it
    // glued to the first word of the text.
    if (str_starts_with($content, "\xEF\xBB\xBF")) {
        $content = substr($content, 3);
    }

    if (mb_check_encoding($content, 'UTF-8')) {
        return $content;
    }

    return mb_convert_encoding($content, 'UTF-8', 'Windows-1252');
}
/**
 * Extract the text of a PDF by running the `pdftotext` command-line tool.
 *
 * The tool writes the text to stdout ("-") and its stderr is discarded.
 *
 * @param string $path Filesystem path of the PDF; shell-escaped before use.
 *
 * @return string The raw text printed by pdftotext.
 *
 * @throws DbnToolsHttpException With status 422 / 'pdf_extract_failed' when the
 *                               command yields no output or only whitespace.
 */
function extractPdf(string $path): string
{
    $text = shell_exec(sprintf('pdftotext %s - 2>/dev/null', escapeshellarg($path)));

    // shell_exec() gives null/false when nothing usable came back.
    $hasText = is_string($text) && trim($text) !== '';
    if ($hasText) {
        return $text;
    }

    throw new DbnToolsHttpException(
        'PDF text extraction failed. The file may be image-only or encrypted.',
        422,
        'pdf_extract_failed'
    );
}
/**
 * Extract the plain text of a .docx file.
 *
 * Reads word/document.xml from the archive and joins the text of every
 * paragraph (w:p) with newlines. Within a paragraph, text runs (w:t) are
 * concatenated, and run-level tabs (w:tab) and breaks (w:br, w:cr) are
 * rendered as "\t" and "\n" so words on either side are not glued together.
 *
 * @param string $path Filesystem path of the .docx file.
 *
 * @return string One line per paragraph, separated by "\n".
 *
 * @throws DbnToolsHttpException With status 422 and one of:
 *                               'docx_open_failed' (not a readable zip archive),
 *                               'docx_no_content'  (document.xml missing or empty),
 *                               'docx_invalid_xml' (document.xml is not well-formed).
 */
function extractDocx(string $path): string
{
    $zip = new ZipArchive();
    if ($zip->open($path) !== true) {
        throw new DbnToolsHttpException('Unable to open the .docx file.', 422, 'docx_open_failed');
    }
    $xml = $zip->getFromName('word/document.xml');
    $zip->close();

    // An empty entry is treated like a missing one: DOMDocument::loadXML()
    // raises a ValueError on an empty string instead of returning false.
    if ($xml === false || $xml === '') {
        throw new DbnToolsHttpException('No document content found in this .docx file.', 422, 'docx_no_content');
    }

    $doc = new DOMDocument();
    // Collect parse errors internally instead of emitting warnings, and put
    // the process-wide libxml setting back afterwards.
    $previousSetting = libxml_use_internal_errors(true);
    try {
        // LIBXML_NONET: never fetch network resources for untrusted uploads.
        $loaded = $doc->loadXML($xml, LIBXML_NONET);
    } finally {
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);
    }
    if ($loaded === false) {
        throw new DbnToolsHttpException('The .docx document content is not valid XML.', 422, 'docx_invalid_xml');
    }

    $xpath = new DOMXPath($doc);
    $xpath->registerNamespace('w', 'http://schemas.openxmlformats.org/wordprocessingml/2006/main');

    // Single location path so nodes come back in document order. Tabs and
    // breaks only count when they sit directly in a run (w:r); w:tab also
    // appears under w:pPr/w:tabs as a tab-stop definition, which is not text.
    $inlineQuery = './/*[self::w:t or (parent::w:r and (self::w:tab or self::w:br or self::w:cr))]';

    $paragraphs = [];
    foreach ($xpath->query('//w:p') as $para) {
        $line = '';
        foreach ($xpath->query($inlineQuery, $para) as $node) {
            $line .= match ($node->localName) {
                't' => $node->textContent,
                'tab' => "\t",
                default => "\n",
            };
        }
        $paragraphs[] = $line;
    }

    return implode("\n", $paragraphs);
}