Add Deep Research tool — agent + rank/rerank RAG
New surface at /deep-research.php where the user pastes a question or uploads PDF/DOCX/TXT case files and an LLM-orchestrated agent researches the Do Better Norge legal corpus from 3-5 angles, with hybrid retrieval, cross-encoder rerank, and synthesis that emits an inline-[n]-cited markdown brief plus a numbered sources panel. Uploaded documents are chunked + embedded in memory only (nomic-embed-text via LiteLLM) and searched alongside the shared corpus during the same request — never persisted to disk, DB, or Qdrant. Reuses ClientRagPipeline::searchAll (hybrid + rerank), dbnV6 slice helpers, and the existing extract.php text-extraction logic via a new dbnToolsExtractUploadedFile() helper. Also adds dbnToolsCallGpuLlm() helper in bootstrap.php — fixes a latent bug where LegalTools.php was already calling that name with no definition. Search.php is unchanged.
This commit is contained in:
@@ -487,3 +487,192 @@ function dbnToolsExcerpt(string $text, int $limit = 520): string
|
||||
}
|
||||
return rtrim(mb_substr($text, 0, $limit - 1, 'UTF-8')) . '…';
|
||||
}
|
||||
|
||||
// Largest upload accepted for text extraction, in bytes (4 * 1024 * 1024).
const DBN_TOOLS_EXTRACT_MAX_BYTES = 4 * 1024 * 1024;
|
||||
// Extracted text longer than this many characters (measured with mb_strlen) is truncated.
const DBN_TOOLS_EXTRACT_TEXT_LIMIT = 128000;
|
||||
// Lower-case file extensions accepted for upload; each one has a matching extractor.
const DBN_TOOLS_EXTRACT_ALLOWED_EXTS = ['txt', 'pdf', 'docx'];
|
||||
|
||||
/**
 * Validate one $_FILES-style upload entry and return the text it contains.
 *
 * Checks run in a fixed order: PHP upload status, genuine-upload check,
 * empty file, size cap, extension allow-list, extraction, empty-text check.
 * Each failure is reported through dbnToolsAbort() (defined elsewhere;
 * presumably it ends the request — confirm against its definition).
 *
 * @param array $file Entry with the keys `error`, `name`, `tmp_name` and `size`.
 *
 * @return array{ok: bool, text: string, filename: string, chars: int, truncated: bool}
 *         `text` is trimmed and cut to DBN_TOOLS_EXTRACT_TEXT_LIMIT characters;
 *         `truncated` tells whether that cut happened.
 */
function dbnToolsExtractUploadedFile(array $file): array
{
    $uploadStatus = (int)($file['error'] ?? UPLOAD_ERR_NO_FILE);
    if ($uploadStatus !== UPLOAD_ERR_OK) {
        // Upload failures with a dedicated message; every other status gets the generic one.
        $knownFailures = [
            UPLOAD_ERR_INI_SIZE   => 'The file exceeds the allowed size limit.',
            UPLOAD_ERR_FORM_SIZE  => 'The file exceeds the allowed size limit.',
            UPLOAD_ERR_NO_TMP_DIR => 'No temporary directory is available.',
            UPLOAD_ERR_CANT_WRITE => 'Unable to save the uploaded file.',
        ];
        dbnToolsAbort($knownFailures[$uploadStatus] ?? 'File upload failed.', 422, 'upload_error');
    }

    // basename() discards any directory components sent with the client file name.
    $clientName   = basename((string)($file['name'] ?? ''));
    $uploadedPath = (string)($file['tmp_name'] ?? '');
    $byteCount    = (int)($file['size'] ?? 0);

    if (!is_uploaded_file($uploadedPath)) {
        dbnToolsAbort('Invalid file upload.', 400, 'invalid_upload');
    }
    if ($byteCount === 0) {
        dbnToolsAbort('The uploaded file is empty.', 422, 'file_empty');
    }
    if ($byteCount > DBN_TOOLS_EXTRACT_MAX_BYTES) {
        dbnToolsAbort('File exceeds the 4 MB limit.', 413, 'file_too_large');
    }

    $extension = strtolower(pathinfo($clientName, PATHINFO_EXTENSION));
    $isAllowed = in_array($extension, DBN_TOOLS_EXTRACT_ALLOWED_EXTS, true);
    if (!$isAllowed) {
        dbnToolsAbort('Unsupported file type. Upload a .pdf, .docx, or .txt file.', 422, 'unsupported_type');
    }

    // Dispatch to the extractor that matches the (already allow-listed) extension.
    $extracted = trim(match ($extension) {
        'txt'  => dbnToolsExtractTxt($uploadedPath),
        'pdf'  => dbnToolsExtractPdf($uploadedPath),
        'docx' => dbnToolsExtractDocx($uploadedPath),
    });
    if ($extracted === '') {
        dbnToolsAbort('No text could be extracted from this file.', 422, 'no_text');
    }

    // Cap the text by character count, remembering whether anything was cut off.
    $wasTruncated = mb_strlen($extracted, 'UTF-8') > DBN_TOOLS_EXTRACT_TEXT_LIMIT;
    if ($wasTruncated) {
        $extracted = mb_substr($extracted, 0, DBN_TOOLS_EXTRACT_TEXT_LIMIT, 'UTF-8');
    }

    return [
        'ok'        => true,
        'text'      => $extracted,
        'filename'  => $clientName,
        'chars'     => mb_strlen($extracted, 'UTF-8'),
        'truncated' => $wasTruncated,
    ];
}
|
||||
|
||||
/**
 * Read a plain-text file and return its contents as UTF-8.
 *
 * A leading UTF-8 byte-order mark is removed. Content that is already valid
 * UTF-8 is returned unchanged; anything else is decoded as Windows-1252.
 * Windows-1252 is used rather than ISO-8859-1 because the two differ only in
 * bytes 0x80-0x9F, where Windows-1252 carries printable characters (curly
 * quotes, dashes, the euro sign) and ISO-8859-1 carries control codes.
 *
 * @param string $path Path of the file to read.
 *
 * @return string The file contents encoded as UTF-8.
 *
 * @throws DbnToolsHttpException With code 500 / 'read_error' when the file cannot be read.
 */
function dbnToolsExtractTxt(string $path): string
{
    $content = file_get_contents($path);
    if ($content === false) {
        throw new DbnToolsHttpException('Unable to read the file.', 500, 'read_error');
    }

    // A BOM is not whitespace, so a later trim() would not remove it.
    if (str_starts_with($content, "\xEF\xBB\xBF")) {
        $content = substr($content, 3);
    }

    // Decide the source encoding explicitly instead of letting mbstring guess
    // from a candidate list in which ISO-8859-1 accepts every byte sequence.
    if (mb_check_encoding($content, 'UTF-8')) {
        return $content;
    }

    return mb_convert_encoding($content, 'UTF-8', 'Windows-1252');
}
|
||||
|
||||
/**
 * Extract the text of a PDF by running the external `pdftotext` command.
 *
 * The command writes the text to standard output ("-") and its error output
 * is discarded, so a missing binary, an unreadable file and a PDF without a
 * text layer all look the same here: no usable output.
 *
 * @param string $path Path of the PDF file.
 *
 * @return string The raw text printed by `pdftotext`.
 *
 * @throws DbnToolsHttpException With code 422 / 'pdf_extract_failed' when no text is produced.
 */
function dbnToolsExtractPdf(string $path): string
{
    // escapeshellarg() quotes the path so it reaches the command as one argument.
    $extracted = shell_exec(sprintf('pdftotext %s - 2>/dev/null', escapeshellarg($path)));

    $hasText = is_string($extracted) && trim($extracted) !== '';
    if ($hasText) {
        return $extracted;
    }

    throw new DbnToolsHttpException(
        'PDF text extraction failed. The file may be image-only or encrypted.',
        422,
        'pdf_extract_failed'
    );
}
|
||||
|
||||
/**
 * Extract the paragraph text of a .docx file.
 *
 * Reads `word/document.xml` from the archive, then joins the text of every
 * `w:t` element inside each `w:p` paragraph; paragraphs are separated by "\n".
 *
 * The file comes from an upload, so the entry's declared uncompressed size is
 * checked before it is inflated, network access is disabled while parsing,
 * and the process-wide libxml error mode is put back the way it was found.
 *
 * @param string $path Path of the .docx file.
 *
 * @return string One line per paragraph, in document order.
 *
 * @throws DbnToolsHttpException 422 'docx_open_failed' when the archive cannot be opened,
 *                               422 'docx_no_content' when the document part is missing or empty,
 *                               413 'docx_too_large' when the document part is over the size cap,
 *                               422 'docx_parse_failed' when the document part is not well-formed XML.
 */
function dbnToolsExtractDocx(string $path): string
{
    // Upper bound for the inflated document part; guards against zip bombs.
    $maxXmlBytes = 32 * 1024 * 1024;
    $entryName = 'word/document.xml';

    $zip = new ZipArchive();
    if ($zip->open($path) !== true) {
        throw new DbnToolsHttpException('Unable to open the .docx file.', 422, 'docx_open_failed');
    }

    try {
        $entryInfo = $zip->statName($entryName);
        if ($entryInfo === false) {
            throw new DbnToolsHttpException('No document content found in this .docx file.', 422, 'docx_no_content');
        }
        if ((int)($entryInfo['size'] ?? 0) > $maxXmlBytes) {
            throw new DbnToolsHttpException('The .docx document is too large to process.', 413, 'docx_too_large');
        }
        $xml = $zip->getFromName($entryName);
    } finally {
        // Close the archive on every path, including the failures above.
        $zip->close();
    }

    // An empty entry is treated like a missing one; loadXML() rejects an empty string.
    if ($xml === false || $xml === '') {
        throw new DbnToolsHttpException('No document content found in this .docx file.', 422, 'docx_no_content');
    }

    $doc = new DOMDocument();
    // Collect parse problems silently, then restore the caller's libxml setting.
    $previousErrorMode = libxml_use_internal_errors(true);
    try {
        $loaded = $doc->loadXML($xml, LIBXML_NONET);
    } finally {
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);
    }
    if ($loaded === false) {
        throw new DbnToolsHttpException('The .docx document content could not be parsed.', 422, 'docx_parse_failed');
    }

    $xpath = new DOMXPath($doc);
    $xpath->registerNamespace('w', 'http://schemas.openxmlformats.org/wordprocessingml/2006/main');

    $paragraphs = [];
    foreach ($xpath->query('//w:p') as $para) {
        $runs = [];
        foreach ($xpath->query('.//w:t', $para) as $t) {
            $runs[] = $t->textContent;
        }
        $paragraphs[] = implode('', $runs);
    }

    return implode("\n", $paragraphs);
}
|
||||
|
||||
/**
 * Send a chat-completion request to the GPU LiteLLM endpoint and return the decoded reply.
 *
 * The API key is read from the LITELLM_MASTER_KEY setting through dbnToolsEnv();
 * no credential is embedded in the source, so a missing key is an error.
 * cURL is used when available, otherwise an HTTP stream context.
 *
 * @param array $messages Chat messages, sent as the `messages` field of the request.
 * @param array $options  Optional overrides:
 *                        - `model` (string, default 'qwen2.5:14b')
 *                        - `timeout` (int seconds, default 90)
 *                        - `temperature` (default 0.1)
 *                        - `max_tokens` (default 8000)
 *                        - `json` (truthy to request a `json_object` response format)
 *
 * @return array The JSON response body decoded to an associative array.
 *
 * @throws RuntimeException When the key is missing, the request cannot be encoded or sent,
 *                          the endpoint answers with a non-2xx status, or the body is not JSON.
 */
function dbnToolsCallGpuLlm(array $messages, array $options = []): array
{
    $url = 'http://10.0.1.10:4000/v1/chat/completions';

    // Credentials must come from configuration, never from a literal in the code.
    $apiKey = (string)(dbnToolsEnv('LITELLM_MASTER_KEY') ?: '');
    if ($apiKey === '') {
        throw new RuntimeException('GPU LiteLLM API key is not configured; set LITELLM_MASTER_KEY.');
    }

    $model   = (string)($options['model'] ?? 'qwen2.5:14b');
    $timeout = (int)($options['timeout'] ?? 90);

    $payload = [
        'model'       => $model,
        'messages'    => $messages,
        'temperature' => $options['temperature'] ?? 0.1,
        'max_tokens'  => $options['max_tokens'] ?? 8000,
    ];
    if (!empty($options['json'])) {
        $payload['response_format'] = ['type' => 'json_object'];
    }

    // Message text can originate from uploaded documents; substitute malformed
    // UTF-8 rather than letting json_encode() fail and post an empty body.
    $body = json_encode(
        $payload,
        JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_SUBSTITUTE
    );
    if ($body === false) {
        throw new RuntimeException('GPU LiteLLM request could not be encoded: ' . json_last_error_msg());
    }

    $headers = [
        'Content-Type: application/json',
        'Authorization: Bearer ' . $apiKey,
    ];

    if (function_exists('curl_init')) {
        $ch = curl_init($url);
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_POST           => true,
            CURLOPT_POSTFIELDS     => $body,
            CURLOPT_HTTPHEADER     => $headers,
            // Fail fast when the host is unreachable instead of waiting out the full timeout.
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_TIMEOUT        => $timeout,
        ]);
        $response = curl_exec($ch);
        $code     = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
        $err      = curl_error($ch);
        // No curl_close(): it has no effect on PHP 8, which this file already requires (match).

        if ($response === false) {
            throw new RuntimeException('GPU LiteLLM request failed: ' . $err);
        }
    } else {
        $ctx = stream_context_create(['http' => [
            'method'        => 'POST',
            'header'        => implode("\r\n", $headers),
            'content'       => $body,
            'timeout'       => $timeout,
            // Return the body of error responses too, so the API's message can be reported.
            'ignore_errors' => true,
        ]]);
        $response = @file_get_contents($url, false, $ctx);
        if ($response === false) {
            throw new RuntimeException('GPU LiteLLM request failed.');
        }

        // Use the last status line (the final response) and accept one without a reason phrase.
        $code = 0;
        foreach ($http_response_header ?? [] as $headerLine) {
            if (preg_match('#^HTTP/\S+\s+(\d{3})\b#', $headerLine, $m) === 1) {
                $code = (int)$m[1];
            }
        }
    }

    $decoded = json_decode($response, true);

    // Check the status first so a non-JSON error page still reports its HTTP code.
    if ($code < 200 || $code >= 300) {
        $detail = is_array($decoded) ? ($decoded['error']['message'] ?? null) : null;
        if (!is_string($detail) || $detail === '') {
            $detail = 'HTTP ' . $code;
        }
        throw new RuntimeException('GPU LiteLLM error: ' . $detail);
    }
    if (!is_array($decoded)) {
        throw new RuntimeException('GPU LiteLLM returned non-JSON response.');
    }

    return $decoded;
}
|
||||
|
||||
Reference in New Issue
Block a user