Add Deep Research tool — agent + rank/rerank RAG

New surface at /deep-research.php where the user pastes a question or
uploads PDF/DOCX/TXT case files and an LLM-orchestrated agent researches
the Do Better Norge legal corpus from 3-5 angles, with hybrid retrieval,
cross-encoder rerank, and synthesis that emits an inline-[n]-cited
markdown brief plus a numbered sources panel.

Uploaded documents are chunked + embedded in memory only (nomic-embed-text
via LiteLLM) and searched alongside the shared corpus during the same
request — never persisted to disk, DB, or Qdrant.

Reuses ClientRagPipeline::searchAll (hybrid + rerank), dbnV6 slice
helpers, and the existing extract.php text-extraction logic via a new
dbnToolsExtractUploadedFile() helper. Also adds dbnToolsCallGpuLlm()
helper in bootstrap.php — fixes a latent bug where LegalTools.php
was already calling that name with no definition.

Search.php is unchanged.
This commit is contained in:
2026-05-15 10:30:47 +02:00
parent 55e11cb649
commit 4cbe0a4ac4
10 changed files with 2119 additions and 125 deletions
+67
View File
@@ -0,0 +1,67 @@
<?php
declare(strict_types=1);

/*
 * POST endpoint for the Deep Research tool.
 *
 * Accepts either a JSON body, or a multipart/form-data request whose
 * "payload" field carries the same JSON and whose "files[]" parts carry up
 * to five uploaded documents. The extracted upload text is handed to
 * DbnDeepResearchAgent::run() together with the query, slices, engine,
 * language and controls.
 */
require_once __DIR__ . '/../includes/bootstrap.php';
require_once __DIR__ . '/../includes/DeepResearchAgent.php';

dbnToolsRequireMethod('POST');
dbnToolsRequireAuth();

$isMultipart = stripos((string)($_SERVER['CONTENT_TYPE'] ?? ''), 'multipart/form-data') !== false;
if ($isMultipart) {
    // Multipart requests cannot carry a JSON body, so the JSON travels in a form field.
    $payloadRaw = (string)($_POST['payload'] ?? '');
    if ($payloadRaw === '') {
        dbnToolsError('Multipart request is missing the "payload" JSON field.', 422, 'missing_payload');
    }
    $input = json_decode($payloadRaw, true);
    if (!is_array($input)) {
        dbnToolsError('Multipart "payload" field must be valid JSON.', 422, 'invalid_payload_json');
    }
} else {
    $input = dbnToolsJsonInput(120000);
}

$language = dbnToolsNormalizeLanguage($input['language'] ?? 'en');

dbnToolsWithTelemetry('deep_research', $language, function () use ($input, $language) {
    $seedQuery = dbnToolsString($input, 'query', 4000, false);
    $pastedText = dbnToolsString($input, 'paste_text', 64000, false);
    $sliceInput = $input['slices'] ?? null;
    // A non-string engine (e.g. an array) falls back to the default instead of
    // being cast to the literal "Array" with a conversion warning.
    $engine = is_string($input['engine'] ?? null) ? $input['engine'] : 'azure_mini';
    $controls = is_array($input['controls'] ?? null) ? $input['controls'] : [];

    // Re-shape PHP's column-oriented $_FILES['files'] into one array per file.
    // Slots reporting UPLOAD_ERR_NO_FILE are empty form fields, not failed
    // uploads, so they are skipped rather than sent to the extractor.
    $pendingFiles = [];
    if (!empty($_FILES['files']) && is_array($_FILES['files']['tmp_name'] ?? null)) {
        foreach (array_keys($_FILES['files']['tmp_name']) as $i) {
            $errorCode = $_FILES['files']['error'][$i] ?? UPLOAD_ERR_NO_FILE;
            if ((int)$errorCode === UPLOAD_ERR_NO_FILE) {
                continue;
            }
            $pendingFiles[] = [
                'name' => $_FILES['files']['name'][$i] ?? '',
                'type' => $_FILES['files']['type'][$i] ?? '',
                'tmp_name' => $_FILES['files']['tmp_name'][$i] ?? '',
                'error' => $errorCode,
                'size' => $_FILES['files']['size'][$i] ?? 0,
            ];
        }
    }

    if (count($pendingFiles) > 5) {
        // NOTE(review): every other failure path in this file uses dbnToolsError();
        // confirm dbnToolsAbort() is defined in bootstrap.php and is the intended
        // call inside the telemetry callback.
        dbnToolsAbort('At most 5 files can be uploaded per request.', 413, 'too_many_files');
    }

    $uploadedFiles = [];
    foreach ($pendingFiles as $file) {
        $extracted = dbnToolsExtractUploadedFile($file);
        $uploadedFiles[] = [
            'filename' => $extracted['filename'],
            'text' => $extracted['text'],
            'chars' => $extracted['chars'],
            'truncated' => $extracted['truncated'],
        ];
    }

    return (new DbnDeepResearchAgent())->run(
        $seedQuery,
        $pastedText,
        $uploadedFiles,
        is_array($sliceInput) ? $sliceInput : [],
        $engine,
        $language,
        $controls
    );
});
+2 -118
View File
@@ -6,132 +6,16 @@ require_once __DIR__ . '/../includes/bootstrap.php';
dbnToolsRequireMethod('POST');
dbnToolsRequireAuth();
const EXTRACT_MAX_BYTES = 4 * 1024 * 1024;
const EXTRACT_TEXT_LIMIT = 128000;
const EXTRACT_ALLOWED_EXTS = ['txt', 'pdf', 'docx'];
try {
if (empty($_FILES['file']) || !is_array($_FILES['file'])) {
dbnToolsError('No file was uploaded.', 422, 'missing_file');
}
$file = $_FILES['file'];
$errCode = (int)($file['error'] ?? UPLOAD_ERR_NO_FILE);
if ($errCode !== UPLOAD_ERR_OK) {
$msg = match ($errCode) {
UPLOAD_ERR_INI_SIZE, UPLOAD_ERR_FORM_SIZE => 'The file exceeds the allowed size limit.',
UPLOAD_ERR_NO_TMP_DIR => 'No temporary directory is available.',
UPLOAD_ERR_CANT_WRITE => 'Unable to save the uploaded file.',
default => 'File upload failed.',
};
dbnToolsError($msg, 422, 'upload_error');
}
$originalName = basename((string)($file['name'] ?? ''));
$tmpPath = (string)($file['tmp_name'] ?? '');
$size = (int)($file['size'] ?? 0);
if (!is_uploaded_file($tmpPath)) {
dbnToolsError('Invalid file upload.', 400, 'invalid_upload');
}
if ($size === 0) {
dbnToolsError('The uploaded file is empty.', 422, 'file_empty');
}
if ($size > EXTRACT_MAX_BYTES) {
dbnToolsError('File exceeds the 4 MB limit.', 413, 'file_too_large');
}
$ext = strtolower(pathinfo($originalName, PATHINFO_EXTENSION));
if (!in_array($ext, EXTRACT_ALLOWED_EXTS, true)) {
dbnToolsError('Unsupported file type. Upload a .pdf, .docx, or .txt file.', 422, 'unsupported_type');
}
$text = match ($ext) {
'txt' => extractTxt($tmpPath),
'pdf' => extractPdf($tmpPath),
'docx' => extractDocx($tmpPath),
};
$text = trim($text);
if ($text === '') {
dbnToolsError('No text could be extracted from this file.', 422, 'no_text');
}
$truncated = false;
if (mb_strlen($text, 'UTF-8') > EXTRACT_TEXT_LIMIT) {
$text = mb_substr($text, 0, EXTRACT_TEXT_LIMIT, 'UTF-8');
$truncated = true;
}
dbnToolsRespond([
'ok' => true,
'text' => $text,
'filename' => $originalName,
'chars' => mb_strlen($text, 'UTF-8'),
'truncated' => $truncated,
]);
$result = dbnToolsExtractUploadedFile($_FILES['file']);
dbnToolsRespond($result);
} catch (DbnToolsHttpException $e) {
dbnToolsError($e->getMessage(), $e->status, $e->errorCode, $e->extra);
} catch (Throwable $e) {
error_log('DBN extract error: ' . $e->getMessage());
dbnToolsError('Text extraction failed.', 500, 'extract_error');
}
/**
 * Read a plain-text upload and return its contents converted to UTF-8.
 *
 * The source encoding is chosen by mbstring from the candidate list
 * UTF-8, ISO-8859-1, Windows-1252.
 *
 * @throws DbnToolsHttpException when the file cannot be read.
 */
function extractTxt(string $path): string
{
    $raw = file_get_contents($path);
    if ($raw !== false) {
        return mb_convert_encoding($raw, 'UTF-8', 'UTF-8, ISO-8859-1, Windows-1252');
    }

    throw new DbnToolsHttpException('Unable to read the file.', 500, 'read_error');
}
/**
 * Extract text from a PDF by running the `pdftotext` command-line tool and
 * capturing what it writes to standard output.
 *
 * @throws DbnToolsHttpException when the command yields no usable text.
 */
function extractPdf(string $path): string
{
    // The path is shell-escaped; stderr is discarded so only extracted text is captured.
    $output = shell_exec(sprintf('pdftotext %s - 2>/dev/null', escapeshellarg($path)));

    $hasText = is_string($output) && trim($output) !== '';
    if ($hasText) {
        return $output;
    }

    throw new DbnToolsHttpException(
        'PDF text extraction failed. The file may be image-only or encrypted.',
        422,
        'pdf_extract_failed'
    );
}
/**
 * Extract the text of a .docx file.
 *
 * Opens the archive, reads word/document.xml, and returns one line per
 * <w:p> paragraph, each line being the concatenation of that paragraph's
 * <w:t> text runs.
 *
 * @throws DbnToolsHttpException when the archive cannot be opened, has no
 *                               document part, or the part is not parseable XML.
 */
function extractDocx(string $path): string
{
    $zip = new ZipArchive();
    if ($zip->open($path) !== true) {
        throw new DbnToolsHttpException('Unable to open the .docx file.', 422, 'docx_open_failed');
    }
    $xml = $zip->getFromName('word/document.xml');
    $zip->close();

    // An empty part is treated like a missing one: DOMDocument::loadXML()
    // raises a ValueError on an empty string, which would surface as a 500.
    if ($xml === false || $xml === '') {
        throw new DbnToolsHttpException('No document content found in this .docx file.', 422, 'docx_no_content');
    }

    $doc = new DOMDocument();
    // Silence libxml warnings only for this parse, then restore the caller's
    // setting so the process-wide error mode is not left changed.
    $previousErrorMode = libxml_use_internal_errors(true);
    try {
        // LIBXML_NONET: the uploaded XML must never trigger network access.
        $loaded = $doc->loadXML($xml, LIBXML_NONET);
    } finally {
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);
    }
    if ($loaded === false) {
        throw new DbnToolsHttpException(
            'The .docx document content could not be parsed.',
            422,
            'docx_parse_failed'
        );
    }

    $xpath = new DOMXPath($doc);
    $xpath->registerNamespace('w', 'http://schemas.openxmlformats.org/wordprocessingml/2006/main');

    $paragraphs = [];
    foreach ($xpath->query('//w:p') as $para) {
        $runs = [];
        foreach ($xpath->query('.//w:t', $para) as $t) {
            $runs[] = $t->textContent;
        }
        $paragraphs[] = implode('', $runs);
    }

    return implode("\n", $paragraphs);
}
+475
View File
@@ -1701,3 +1701,478 @@ p {
font-weight: 500;
margin: 0;
}
/* =========================================================================
Deep Research — agent + rank/rerank RAG surface
========================================================================= */
.deep-research .lang-switcher {
display: inline-flex;
gap: 6px;
}
.deep-research .lang-btn {
padding: 6px 10px;
border-radius: 999px;
background: #fff;
border: 1px solid var(--line);
color: var(--muted);
font-weight: 700;
}
.deep-research .lang-btn.is-active {
background: var(--soft-teal);
color: var(--teal-dark);
border-color: rgba(15, 118, 110, 0.30);
}
.dr-slice-section {
display: grid;
gap: 8px;
}
.dr-slice-grid {
display: grid;
grid-template-columns: repeat(2, minmax(0, 1fr));
gap: 10px;
}
.dr-slice {
text-align: left;
background: #fbfcfe;
border: 1px solid var(--line);
border-radius: 8px;
padding: 12px 13px;
cursor: pointer;
min-height: 96px;
display: grid;
gap: 6px;
align-content: start;
transition: border-color 120ms ease, background 120ms ease;
}
.dr-slice:hover {
border-color: rgba(15, 118, 110, 0.30);
}
.dr-slice.is-on {
background: var(--soft-teal);
border-color: rgba(15, 118, 110, 0.45);
}
.dr-slice__head {
display: flex;
align-items: center;
justify-content: space-between;
gap: 8px;
}
.dr-slice__title {
font-weight: 800;
color: var(--ink);
}
.dr-slice__badge {
background: #fff;
border: 1px solid var(--line);
border-radius: 999px;
color: var(--muted);
font-size: 0.66rem;
font-weight: 800;
letter-spacing: 0.06em;
padding: 3px 8px;
text-transform: uppercase;
}
.dr-slice.is-on .dr-slice__badge {
background: var(--teal);
border-color: var(--teal);
color: #fff;
}
.dr-slice__tagline {
margin: 0;
color: var(--muted);
font-size: 0.86rem;
line-height: 1.4;
}
.advanced-panel .dr-control-grid {
display: grid;
grid-template-columns: repeat(5, minmax(0, 1fr));
gap: 8px;
margin-top: 10px;
}
.dr-control-card {
background: #fbfcfe;
border: 1px solid var(--line);
border-radius: 8px;
padding: 10px;
}
.dr-control-card label {
display: flex;
justify-content: space-between;
gap: 8px;
align-items: center;
font-weight: 800;
color: var(--ink);
font-size: 0.85rem;
}
.dr-control-card small {
display: block;
margin-top: 8px;
color: var(--muted);
font-size: 0.74rem;
line-height: 1.4;
}
.dr-control-card input[type="range"] {
width: 100%;
margin-top: 8px;
accent-color: var(--teal);
}
.dr-control-value {
color: var(--coral);
font-variant-numeric: tabular-nums;
}
@media (max-width: 980px) {
.advanced-panel .dr-control-grid {
grid-template-columns: repeat(2, minmax(0, 1fr));
}
.dr-slice-grid {
grid-template-columns: 1fr;
}
}
.deep-research-results {
display: grid;
gap: 14px;
}
.dr-result-block {
border: 1px solid var(--line);
border-radius: 8px;
padding: 16px;
background: #fff;
}
.dr-brief {
line-height: 1.65;
color: var(--ink);
font-size: 1.0rem;
}
.dr-brief p {
margin: 0 0 12px;
}
.dr-brief code {
font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
background: var(--soft-teal);
padding: 1px 5px;
border-radius: 4px;
font-size: 0.86em;
}
.dr-brief strong { color: var(--ink); }
.dr-brief em { color: var(--muted); }
.dr-cite {
display: inline-flex;
align-items: center;
justify-content: center;
min-width: 18px;
height: 18px;
margin: 0 1px;
padding: 0 5px;
border-radius: 999px;
background: var(--soft-coral);
color: var(--coral);
font-size: 0.72rem;
font-weight: 800;
font-variant-numeric: tabular-nums;
cursor: pointer;
border: 1px solid rgba(194, 65, 12, 0.25);
vertical-align: 1px;
}
.dr-cite:hover { background: var(--coral); color: #fff; }
.dr-sources-head {
display: flex;
align-items: baseline;
justify-content: space-between;
margin-bottom: 10px;
}
.dr-sources-head h3 {
margin: 0;
font-size: 1rem;
}
.dr-sources-head small {
color: var(--muted);
font-size: 0.82rem;
}
.dr-source-list {
display: grid;
gap: 10px;
}
.dr-source-card {
display: grid;
grid-template-columns: 34px 1fr auto;
gap: 12px;
align-items: start;
border: 1px solid var(--line);
border-radius: 8px;
padding: 12px;
background: #fbfcfe;
cursor: pointer;
text-align: left;
width: 100%;
}
.dr-source-card:hover { border-color: rgba(15, 118, 110, 0.40); }
.dr-source-card.is-highlight {
border-color: var(--coral);
background: var(--soft-coral);
}
.dr-source-number {
display: inline-flex;
align-items: center;
justify-content: center;
width: 28px;
height: 28px;
border-radius: 999px;
background: var(--soft-coral);
color: var(--coral);
font-weight: 900;
font-variant-numeric: tabular-nums;
}
.dr-source-body {
min-width: 0;
}
.dr-source-title {
font-weight: 800;
color: var(--ink);
line-height: 1.35;
}
.dr-source-meta {
display: flex;
flex-wrap: wrap;
gap: 6px;
margin-top: 6px;
}
.dr-source-tag {
background: var(--soft-teal);
color: var(--teal-dark);
border-radius: 999px;
font-size: 0.7rem;
font-weight: 800;
padding: 3px 8px;
text-transform: uppercase;
}
.dr-source-tag--upload { background: #fff0e8; color: #8a4524; }
.dr-source-tag--score { background: #eef3fb; color: #314158; }
.dr-source-excerpt {
color: var(--muted);
margin-top: 8px;
line-height: 1.5;
font-size: 0.92rem;
}
.dr-source-aside {
align-self: stretch;
display: grid;
grid-template-rows: auto auto;
gap: 6px;
font-size: 0.78rem;
color: var(--muted);
text-align: right;
min-width: 90px;
}
.dr-source-aside b {
color: var(--ink);
font-variant-numeric: tabular-nums;
font-size: 0.92rem;
}
/* Method trace — overrides for #traceList rendered in rich mode */
.trace-list.is-rich {
display: grid;
gap: 8px;
}
.trace-list.is-rich .trace-step {
display: grid;
grid-template-columns: 28px 1fr;
gap: 10px;
align-items: start;
padding: 10px 12px;
border: 1px solid var(--line);
border-radius: 8px;
background: #fbfcfe;
list-style: none;
}
.trace-list.is-rich .trace-step__marker {
display: inline-flex;
align-items: center;
justify-content: center;
width: 22px;
height: 22px;
border-radius: 999px;
border: 1px solid var(--line);
background: #fff;
color: var(--muted);
font-size: 0.72rem;
font-weight: 900;
font-variant-numeric: tabular-nums;
}
.trace-list.is-rich .trace-step__label {
display: block;
font-weight: 800;
color: var(--ink);
font-size: 0.94rem;
}
.trace-list.is-rich .trace-step__detail {
display: block;
margin-top: 4px;
color: var(--muted);
font-size: 0.83rem;
line-height: 1.45;
}
.trace-list.is-rich .trace-step.is-running {
background: var(--soft-coral);
}
.trace-list.is-rich .trace-step.is-running .trace-step__marker {
background: rgba(194, 65, 12, 0.18);
border-color: rgba(194, 65, 12, 0.35);
color: var(--coral);
animation: drTracePulse 950ms ease-in-out infinite;
}
.trace-list.is-rich .trace-step.is-done .trace-step__marker {
background: var(--soft-teal);
border-color: rgba(15, 118, 110, 0.30);
color: var(--teal-dark);
}
.trace-list.is-rich .trace-step.is-warning .trace-step__marker {
background: #fff4dc;
border-color: rgba(183, 121, 31, 0.35);
color: var(--amber);
}
.trace-list.is-rich .trace-step.is-error {
background: #fff0e8;
}
.trace-list.is-rich .trace-step.is-error .trace-step__marker {
background: rgba(180, 30, 30, 0.10);
border-color: rgba(180, 30, 30, 0.30);
color: #b41e1e;
}
@keyframes drTracePulse {
0%, 100% { opacity: 0.55; transform: scale(0.92); }
50% { opacity: 1; transform: scale(1.04); }
}
/* Source modal */
.dr-source-modal {
  position: fixed;
  inset: 0;
  background: rgba(23, 32, 51, 0.62);
  display: flex;
  align-items: center;
  justify-content: center;
  padding: 24px;
  z-index: 9999;
}
/* The page script shows and hides the modal by toggling `is-hidden`. The
   `display: flex` above has the same specificity as a generic single-class
   `.is-hidden` rule, so whichever comes later in the stylesheet would win.
   This two-class selector keeps the modal hidden regardless of rule order. */
.dr-source-modal.is-hidden {
  display: none;
}
.dr-source-modal__dialog {
width: min(960px, 100%);
max-height: 90vh;
background: #fff;
border-radius: 8px;
box-shadow: 0 28px 92px rgba(0, 0, 0, 0.34);
overflow: hidden;
display: grid;
grid-template-rows: auto 1fr;
}
.dr-source-modal__head {
display: flex;
align-items: start;
justify-content: space-between;
gap: 14px;
padding: 16px 18px;
border-bottom: 1px solid var(--line);
}
.dr-source-modal__head h3 {
margin: 0;
color: var(--ink);
line-height: 1.25;
font-size: 1.2rem;
}
.dr-source-modal__body {
display: grid;
grid-template-columns: 260px minmax(0, 1fr);
overflow: hidden;
}
.dr-source-modal__meta,
.dr-source-modal__text {
padding: 16px 18px;
overflow: auto;
}
.dr-source-modal__meta {
border-right: 1px solid var(--line);
background: #fbfcfe;
color: var(--muted);
font-size: 0.88rem;
line-height: 1.55;
}
.dr-source-modal__meta dt {
color: var(--ink);
font-weight: 800;
margin-top: 8px;
}
.dr-source-modal__meta dt:first-of-type { margin-top: 0; }
.dr-source-modal__text {
white-space: pre-wrap;
line-height: 1.7;
color: var(--ink);
}
@media (max-width: 720px) {
.dr-source-modal__body { grid-template-columns: 1fr; }
.dr-source-modal__meta { border-right: 0; border-bottom: 1px solid var(--line); }
.dr-source-card { grid-template-columns: 32px 1fr; }
.dr-source-aside { display: none; }
}
+481
View File
@@ -0,0 +1,481 @@
/* deep-research.js — page-scoped UI for /deep-research.php */
(function () {
'use strict';
// Cache of DOM nodes used by this page; filled in once on DOMContentLoaded.
const els = {};
// Language code sent with each request; updated by the language switcher buttons.
let lang = 'en';
// File objects queued by the upload zone and attached to the next request.
let uploadFiles = [];
// Payload of the most recent successful response (assigned in onSubmit).
let lastResult = null;
// Corpus slices; each id is matched against a slice button's data-slice attribute.
const SLICE_DEFS = [
{ id: 'family_core', label: 'Family Law Core' },
{ id: 'child_welfare', label: 'Child Welfare' },
{ id: 'echr_hague', label: 'ECHR and Hague' },
{ id: 'broader_legal', label: 'Broader Legal Support' },
];
// Labels of the placeholder trace steps rendered before and while a request runs.
const STEP_LABELS = [
'Query interpretation',
'Query expansion',
'Slice resolution',
'Upload indexing',
'Retrieval',
'Synthesis',
'Citation confidence',
];
// Page bootstrap: runs only when <body data-active-tool="deep-research">,
// caches the DOM nodes the page needs, wires up every control, and draws
// the idle placeholder trace.
document.addEventListener('DOMContentLoaded', () => {
  if (document.body.dataset.activeTool !== 'deep-research') return;

  const byId = (id) => document.getElementById(id);
  const queryAll = (selector) => Array.from(document.querySelectorAll(selector));

  Object.assign(els, {
    form: byId('deepResearchForm'),
    input: byId('drInput'),
    status: byId('drStatus'),
    runButton: byId('drRunButton'),
    results: byId('drResults'),
    traceList: byId('traceList'),
    slices: queryAll('.dr-slice'),
    langButtons: queryAll('#drLangSwitcher .lang-btn'),
    engineRadios: queryAll('input[name="drEngine"]'),
    subQ: byId('drSubQ'),
    subQVal: byId('drSubQValue'),
    chunkLimit: byId('drChunkLimit'),
    chunkLimitVal: byId('drChunkLimitValue'),
    sim: byId('drSim'),
    simVal: byId('drSimValue'),
    topK: byId('drTopK'),
    topKVal: byId('drTopKValue'),
    temp: byId('drTemp'),
    tempVal: byId('drTempValue'),
    uploadZone: byId('drUploadZone'),
    uploadInput: byId('drUploadInput'),
    uploadPrompt: byId('drUploadPrompt'),
    uploadFileInfo: byId('drUploadFileInfo'),
    uploadFileList: byId('drUploadFileList'),
    uploadClear: byId('drUploadClear'),
    modal: byId('drSourceModal'),
    modalClose: byId('drSourceModalClose'),
    modalTitle: byId('drSourceModalTitle'),
    modalEyebrow: byId('drSourceModalEyebrow'),
    modalMeta: byId('drSourceModalMeta'),
    modalText: byId('drSourceModalText'),
  });

  // Without the form there is nothing to wire up.
  if (!els.form) return;

  bindSlices();
  bindLang();
  bindRanges();
  bindUpload();
  bindModal();
  els.form.addEventListener('submit', onSubmit);

  // Show every step as idle until the first run.
  const idleSteps = [];
  for (const label of STEP_LABELS) {
    idleSteps.push({ label, detail: 'Waiting…', status: 'idle' });
  }
  renderTrace(idleSteps);
});
// Turns each corpus-slice button into a toggle: flips the `is-on` class and
// keeps aria-pressed and the on/off badge text in step with it.
function bindSlices() {
  for (const button of els.slices) {
    button.addEventListener('click', () => {
      const active = button.classList.toggle('is-on');
      button.setAttribute('aria-pressed', String(active));
      const badgeNode = button.querySelector('.dr-slice__badge');
      if (badgeNode) {
        badgeNode.textContent = active ? 'on' : 'off';
      }
    });
  }
}
// Language switcher: clicking a button marks it as the only active one and
// stores its data-lang value (defaulting to 'en') in `lang`.
function bindLang() {
  const activate = (chosen) => {
    for (const other of els.langButtons) {
      other.classList.remove('is-active');
    }
    chosen.classList.add('is-active');
    lang = chosen.dataset.lang || 'en';
  };

  for (const button of els.langButtons) {
    button.addEventListener('click', () => activate(button));
  }
}
// Mirrors every range slider's current value into its read-out label, both
// immediately and on each `input` event. Pairs with a missing node are skipped.
function bindRanges() {
  const asIs = (value) => value;
  const twoDecimals = (value) => Number(value).toFixed(2);

  const bindings = [
    { slider: els.subQ, readout: els.subQVal, format: asIs },
    { slider: els.chunkLimit, readout: els.chunkLimitVal, format: asIs },
    { slider: els.sim, readout: els.simVal, format: twoDecimals },
    { slider: els.topK, readout: els.topKVal, format: asIs },
    { slider: els.temp, readout: els.tempVal, format: twoDecimals },
  ];

  for (const { slider, readout, format } of bindings) {
    if (!slider || !readout) continue;
    const refresh = () => {
      readout.textContent = format(slider.value);
    };
    slider.addEventListener('input', refresh);
    refresh();
  }
}
// Wires the upload zone: file-picker changes, drag-and-drop, and the clear
// button. Accepted files are appended to `uploadFiles` (max 5 per request,
// 4 MB each, .pdf/.docx/.txt only) and the visible list is re-rendered.
function bindUpload() {
  // Both the drop zone and its <input type="file"> are needed below.
  if (!els.uploadZone || !els.uploadInput) return;

  const MAX_FILES = 5;
  const MAX_BYTES = 4 * 1024 * 1024;
  const ALLOWED_EXTS = ['pdf', 'docx', 'txt'];

  const onFiles = (fileList) => {
    const files = Array.from(fileList || []);
    // Check the limit against the full selection. Previously the selection was
    // first cut to five entries, so extra files were dropped without notice.
    if (uploadFiles.length + files.length > MAX_FILES) {
      setStatus('At most 5 files can be uploaded per request.', 'error');
      return;
    }
    files.forEach((f) => {
      if (f.size > MAX_BYTES) {
        setStatus(`${f.name} exceeds the 4 MB limit.`, 'error');
        return;
      }
      const ext = (f.name.split('.').pop() || '').toLowerCase();
      if (!ALLOWED_EXTS.includes(ext)) {
        setStatus(`${f.name} is not a supported file type.`, 'error');
        return;
      }
      uploadFiles.push(f);
    });
    renderUploadList();
  };

  els.uploadInput.addEventListener('change', (e) => {
    onFiles(e.target.files);
    // The queued File objects live in `uploadFiles`, so the input can be reset.
    // Without this, picking the same file again fires no `change` event.
    els.uploadInput.value = '';
  });
  els.uploadZone.addEventListener('dragover', (e) => {
    e.preventDefault();
    els.uploadZone.classList.add('is-drop');
  });
  els.uploadZone.addEventListener('dragleave', () => els.uploadZone.classList.remove('is-drop'));
  els.uploadZone.addEventListener('drop', (e) => {
    e.preventDefault();
    els.uploadZone.classList.remove('is-drop');
    onFiles(e.dataTransfer?.files);
  });
  els.uploadClear?.addEventListener('click', () => {
    uploadFiles = [];
    els.uploadInput.value = '';
    renderUploadList();
  });
}
// Shows either the upload prompt (no files queued) or the queued-file list
// with each file's escaped name and size in whole kilobytes.
function renderUploadList() {
  if (uploadFiles.length === 0) {
    els.uploadFileInfo.classList.add('is-hidden');
    els.uploadPrompt.classList.remove('is-hidden');
    return;
  }

  els.uploadPrompt.classList.add('is-hidden');
  els.uploadFileInfo.classList.remove('is-hidden');

  const rows = [];
  for (const file of uploadFiles) {
    const sizeKb = (file.size / 1024).toFixed(0);
    rows.push(`<li><span class="upload-filename">${escapeHtml(file.name)}</span><span class="upload-chars">${sizeKb} KB</span></li>`);
  }
  els.uploadFileList.innerHTML = rows.join('');
}
// Closes the source modal via its close button, a click on the backdrop
// itself (not on its children), or the Escape key while it is visible.
function bindModal() {
  if (els.modalClose) {
    els.modalClose.addEventListener('click', closeModal);
  }
  if (els.modal) {
    els.modal.addEventListener('click', (event) => {
      if (event.target === els.modal) closeModal();
    });
  }
  document.addEventListener('keydown', (event) => {
    if (event.key !== 'Escape') return;
    if (!els.modal || els.modal.classList.contains('is-hidden')) return;
    closeModal();
  });
}
// Hides the source modal, if it exists, by adding the `is-hidden` class.
function closeModal() {
  const { modal } = els;
  if (modal) {
    modal.classList.add('is-hidden');
  }
}
// Fills the source modal with one source's title, metadata rows and full
// chunk text (falling back to the excerpt), then reveals it. Optional
// metadata rows are only added when the source carries that field.
function openModal(source) {
  if (!source) return;

  const fromUpload = source.source_origin === 'upload';
  els.modalEyebrow.textContent = fromUpload ? 'Uploaded file' : 'Corpus source';
  els.modalTitle.textContent = source.title || 'Source';

  const rows = [];
  rows.push(['Number', `[${source.n}]`]);
  if (source.section) rows.push(['Section', source.section]);
  rows.push(['Corpus / package', source.package_or_corpus || '—']);
  if (source.authority_type) rows.push(['Authority', source.authority_type]);
  if (source.jurisdiction) rows.push(['Jurisdiction', source.jurisdiction]);
  if (source.similarity != null) rows.push(['Similarity', String(source.similarity)]);
  if (source.reranker_score != null) rows.push(['Rerank score', String(source.reranker_score)]);
  if (source.matched_sub_questions?.length) {
    rows.push(['Matched sub-Q', source.matched_sub_questions.join(', ')]);
  }

  let definitionList = '<dl>';
  for (const [term, value] of rows) {
    definitionList += `<dt>${escapeHtml(term)}</dt><dd>${escapeHtml(String(value))}</dd>`;
  }
  definitionList += '</dl>';

  els.modalMeta.innerHTML = definitionList;
  els.modalText.textContent = source.chunk_text || source.excerpt || '';
  els.modal.classList.remove('is-hidden');
}
// Returns an object mapping every slice id in SLICE_DEFS to a boolean:
// true when the matching slice button exists and carries the `is-on` class.
function getSelectedSlices() {
  const entries = SLICE_DEFS.map(({ id }) => {
    const button = els.slices.find((candidate) => candidate.dataset.slice === id);
    const enabled = Boolean(button && button.classList.contains('is-on'));
    return [id, enabled];
  });
  return Object.fromEntries(entries);
}
// Value of the checked engine radio button, or 'azure_mini' when none is checked.
function getEngine() {
  for (const radio of els.engineRadios) {
    if (radio.checked) {
      return radio.value;
    }
  }
  return 'azure_mini';
}
// Reads the five advanced sliders into the `controls` object sent with the
// request: three base-10 integers and two floats.
function getControls() {
  const intOf = (node) => parseInt(node.value, 10);
  const floatOf = (node) => parseFloat(node.value);

  const subQuestionCount = intOf(els.subQ);
  const chunkLimit = intOf(els.chunkLimit);
  const similarityThreshold = floatOf(els.sim);
  const rerankerTopK = intOf(els.topK);
  const temperature = floatOf(els.temp);

  return {
    sub_q_count: subQuestionCount,
    chunk_limit: chunkLimit,
    similarity_threshold: similarityThreshold,
    reranker_top_k: rerankerTopK,
    temperature,
  };
}
// Form submit handler. Validates that there is a question or an upload and at
// least one enabled slice, then POSTs to api/deep-research.php — as multipart
// form data when files are queued, as JSON otherwise — and renders either the
// result or the failure.
async function onSubmit(e) {
  e.preventDefault();
  const query = (els.input.value || '').trim();
  if (!query && uploadFiles.length === 0) {
    setStatus('Type a question or upload a file before running deep research.', 'error');
    return;
  }
  const slices = getSelectedSlices();
  if (!Object.values(slices).some(Boolean)) {
    setStatus('Enable at least one corpus slice.', 'error');
    return;
  }
  setStatus('Running deep research…', 'busy');
  els.runButton.disabled = true;
  // NOTE(review): the estimate previously read "615 seconds", taken here to be
  // a typo for the range "6–15" — confirm the intended figure.
  els.results.innerHTML = `<div class="empty-state"><h3>Working…</h3><p>The agent is expanding the question, retrieving from the corpus, and synthesising the brief. This usually takes 6–15 seconds.</p></div>`;
  // Render placeholder trace with first step running
  const placeholder = STEP_LABELS.map((label, i) => ({
    label,
    detail: i === 0 ? 'Running…' : 'Queued',
    status: i === 0 ? 'running' : 'idle',
  }));
  renderTrace(placeholder);
  const payload = {
    query,
    paste_text: '',
    slices,
    engine: getEngine(),
    language: lang,
    controls: getControls(),
  };

  // Single failure path: status line, results pane and trace all leave the
  // "working" state, so nothing keeps spinning after an error.
  const fail = (msg) => {
    setStatus(msg, 'error');
    els.results.innerHTML = `<div class="empty-state"><h3>Request failed</h3><p>${escapeHtml(msg)}</p></div>`;
    renderTrace(placeholder.map((s, i) => i === 0 ? { ...s, status: 'error', detail: msg } : s));
  };

  try {
    let response;
    try {
      if (uploadFiles.length > 0) {
        // No explicit Content-Type: the browser adds the multipart boundary itself.
        const form = new FormData();
        form.append('payload', JSON.stringify(payload));
        uploadFiles.forEach((f) => form.append('files[]', f));
        response = await fetch('api/deep-research.php', { method: 'POST', body: form, credentials: 'same-origin' });
      } else {
        response = await fetch('api/deep-research.php', {
          method: 'POST',
          headers: { 'Content-Type': 'application/json' },
          body: JSON.stringify(payload),
          credentials: 'same-origin',
        });
      }
    } catch (err) {
      fail(`Network error: ${err.message || err}`);
      return;
    }

    let data = null;
    // A non-JSON body (e.g. an HTML error page) leaves `data` null and is
    // reported through the generic failure message below.
    try { data = await response.json(); } catch (_) {}
    if (!response.ok || !data || data.ok === false) {
      fail((data && data.error && data.error.message) || `Request failed (${response.status}).`);
      return;
    }

    lastResult = data;
    setStatus(`Done in ${data.latency_ms || 0} ms · ${data.trace_metadata?.source_count || 0} sources · confidence ${data.trace_metadata?.citation_confidence || '?'}`, 'ok');
    renderTrace(data.trace || []);
    renderResults(data);
  } finally {
    // Re-enable the button on every exit, including an exception while rendering.
    els.runButton.disabled = false;
  }
}
// Writes the status line and colours it: red for 'error', dark teal for 'ok',
// muted for any other kind.
function setStatus(message, kind) {
  let color = 'var(--muted)';
  if (kind === 'error') {
    color = '#b41e1e';
  } else if (kind === 'ok') {
    color = 'var(--teal-dark)';
  }
  els.status.textContent = message;
  els.status.style.color = color;
}
// Renders the method trace as a list of steps. Each step gets a state class
// and a marker: a check, "!" or "×" for complete/warning/error, otherwise its
// 1-based position. Labels and details are HTML-escaped.
function renderTrace(steps) {
  if (!els.traceList) return;

  const classFor = (status) => {
    switch (status) {
      case 'running': return 'is-running';
      case 'complete': return 'is-done';
      case 'warning': return 'is-warning';
      case 'error': return 'is-error';
      default: return '';
    }
  };
  const markerFor = (status, position) => {
    switch (status) {
      case 'complete': return '✓';
      case 'warning': return '!';
      case 'error': return '×';
      default: return position;
    }
  };

  els.traceList.classList.add('is-rich');
  els.traceList.innerHTML = steps.map((step, index) => {
    const stateClass = classFor(step.status);
    const badge = markerFor(step.status, index + 1);
    return `<li class="trace-step ${stateClass}">
<span class="trace-step__marker">${badge}</span>
<div>
<span class="trace-step__label">${escapeHtml(step.label || '')}</span>
<span class="trace-step__detail">${escapeHtml(step.detail || '')}</span>
</div>
</li>`;
  }).join('');
}
// Renders a successful response into the results pane: the brief, the
// explored sub-questions, the sources list, open uncertainties and the next
// practical step. It then makes every element carrying data-source-n (source
// cards and inline citation badges) open the matching source in the modal.
function renderResults(data) {
  const sources = data.sources || [];
  const subs = data.sub_questions || [];
  const briefHtml = renderBrief(data.brief_markdown || '', sources);
  const subQHtml = subs.length ? `
<div class="dr-result-block">
<h3 style="margin:0 0 8px;font-size:1rem">Angles the agent explored</h3>
<ol style="padding-left:1.2em;margin:0;color:var(--muted);line-height:1.55">
${subs.map((sq) => `<li><strong style="color:var(--ink)">${escapeHtml(sq.question ?? '')}</strong>${sq.rationale ? `<br><small>${escapeHtml(sq.rationale)}</small>` : ''}</li>`).join('')}
</ol>
</div>` : '';
  const sourcesHtml = `
<div class="dr-result-block">
<div class="dr-sources-head">
<h3>Sources (${sources.length})</h3>
<small>Click a card to see the full chunk + scores</small>
</div>
<div class="dr-source-list">
${sources.map((s) => renderSourceCard(s)).join('')}
</div>
</div>`;
  const uncertHtml = (data.what_remains_uncertain || []).length ? `
<div class="dr-result-block">
<h3 style="margin:0 0 8px;font-size:0.95rem;color:var(--muted)">What remains uncertain</h3>
<ul style="padding-left:1.2em;margin:0;color:var(--muted);line-height:1.55">
${(data.what_remains_uncertain || []).map((u) => `<li>${escapeHtml(String(u))}</li>`).join('')}
</ul>
</div>` : '';
  const nextHtml = data.next_practical_step ? `
<div class="dr-result-block">
<h3 style="margin:0 0 6px;font-size:0.95rem">Next practical step</h3>
<p style="margin:0;color:var(--ink);line-height:1.5">${escapeHtml(data.next_practical_step)}</p>
</div>` : '';
  els.results.innerHTML = `
<div class="dr-result-block">
<div class="dr-brief">${briefHtml}</div>
</div>
${subQHtml}
${sourcesHtml}
${uncertHtml}
${nextHtml}
`;

  // Opens the source referenced by a node's data-source-n attribute.
  const activate = (node) => {
    const n = parseInt(node.dataset.sourceN, 10);
    // Number() tolerates a server that sends `n` as a numeric string.
    const src = sources.find((s) => Number(s.n) === n);
    if (src) {
      openModal(src);
      flashSource(n);
    }
  };

  // Bind source-card click handlers + citation marker click handlers
  els.results.querySelectorAll('[data-source-n]').forEach((node) => {
    node.addEventListener('click', () => activate(node));
    // Citation badges are focusable <span role="button"> elements. Unlike a
    // native <button>, they do not turn Enter/Space into a click on their own.
    if (node.tagName !== 'BUTTON') {
      node.addEventListener('keydown', (event) => {
        if (event.key === 'Enter' || event.key === ' ') {
          event.preventDefault();
          activate(node);
        }
      });
    }
  });
}
// Briefly highlights source card `n`: clears any existing highlight, marks the
// card, scrolls it into view, and removes the mark again after 1.8 seconds.
function flashSource(n) {
  for (const stale of document.querySelectorAll('.dr-source-card.is-highlight')) {
    stale.classList.remove('is-highlight');
  }

  const card = document.querySelector(`.dr-source-card[data-source-n="${n}"]`);
  if (!card) return;

  card.classList.add('is-highlight');
  card.scrollIntoView({ behavior: 'smooth', block: 'center' });
  setTimeout(() => card.classList.remove('is-highlight'), 1800);
}
// Builds the HTML for one source card: number, title, optional section tag,
// origin and corpus tags, matched sub-questions, a 240-character excerpt and
// the score column (rerank score when present, otherwise similarity).
function renderSourceCard(s) {
  const score = s.reranker_score != null ? s.reranker_score : s.similarity;
  const originTagClass = s.source_origin === 'upload' ? 'dr-source-tag dr-source-tag--upload' : 'dr-source-tag';
  const originLabel = s.source_origin === 'upload' ? 'upload' : 'corpus';
  // `n` comes from the response payload like every other field, so it is
  // escaped before going into an attribute and into markup.
  const num = escapeHtml(s.n);
  return `<button type="button" class="dr-source-card" data-source-n="${num}">
<span class="dr-source-number">${num}</span>
<div class="dr-source-body">
<div class="dr-source-title">${escapeHtml(s.title || 'Untitled')}</div>
${s.section ? `<div class="dr-source-meta"><span class="dr-source-tag">${escapeHtml(s.section)}</span></div>` : ''}
<div class="dr-source-meta">
<span class="${originTagClass}">${originLabel}</span>
<span class="dr-source-tag dr-source-tag--score">${escapeHtml(s.package_or_corpus || '—')}</span>
${(s.matched_sub_questions || []).map((q) => `<span class="dr-source-tag">${escapeHtml(q)}</span>`).join('')}
</div>
<p class="dr-source-excerpt">${escapeHtml(truncate(s.excerpt || '', 240))}</p>
</div>
<div class="dr-source-aside">
<span>score<br><b>${score != null ? Number(score).toFixed(2) : '—'}</b></span>
${s.reranker_score != null && s.similarity != null ? `<span>sim<br><b>${Number(s.similarity).toFixed(2)}</b></span>` : ''}
</div>
</button>`;
}
// Markdown renderer — minimal: paragraphs, "### " headings, bold/italic,
// inline code, and [n] citation badges. The markdown is HTML-escaped first,
// so every tag in the output is one this function produced itself.
function renderBrief(markdown, sources) {
  if (!markdown) return '<p><em>No brief was returned.</em></p>';
  // Number() tolerates a server that sends `n` as a numeric string.
  const sourceSet = new Set((sources || []).map((s) => Number(s.n)));
  const escaped = escapeHtml(markdown);
  // Citation markers [1], [1,2], [1-3]
  const withCites = escaped.replace(/\[(\d+(?:\s*[-,]\s*\d+)*)\]/g, (_, group) => {
    return expandCiteGroup(group).map((n) => {
      if (sourceSet.has(n)) {
        return `<span class="dr-cite" data-source-n="${n}" role="button" tabindex="0">${n}</span>`;
      }
      // No source carries this number, so the badge must not pose as a
      // working button: no data-source-n, no role, no tab stop.
      return `<span class="dr-cite dr-cite--unknown" aria-disabled="true" title="No matching source" style="cursor:default;opacity:0.55">${n}</span>`;
    }).join('');
  });
  // Bold/italic
  const withBold = withCites
    .replace(/\*\*([^*]+)\*\*/g, '<strong>$1</strong>')
    .replace(/(^|[^*])\*([^*]+)\*(?!\*)/g, '$1<em>$2</em>')
    .replace(/`([^`]+)`/g, '<code>$1</code>');
  // Paragraphs: blank lines separate blocks; single newlines become <br>.
  const paragraphs = withBold.split(/\n{2,}/).map((p) => {
    const t = p.trim();
    if (!t) return '';
    if (/^### /.test(t)) return `<h4 style="margin:14px 0 6px;color:var(--ink);font-size:1rem">${t.replace(/^### /, '')}</h4>`;
    return `<p>${t.replace(/\n/g, '<br>')}</p>`;
  }).join('');
  return paragraphs;
}
// Expands the inside of a citation marker into a de-duplicated list of source
// numbers in first-seen order, e.g. "1, 3-5" -> [1, 3, 4, 5].
//
// The text comes from model output, so ranges are treated defensively:
//   - a reversed range ("5-3") is read as 3..5 rather than expanding to
//     nothing, which would make the citation vanish from the brief;
//   - a range spanning MAX_RANGE_SPAN numbers or more is reduced to its two
//     endpoints, so a marker like "1-100000000" cannot stall the page.
function expandCiteGroup(group) {
  const MAX_RANGE_SPAN = 50;
  const out = [];
  group.split(',').forEach((part) => {
    const token = part.trim();
    const range = token.match(/^(\d+)\s*-\s*(\d+)$/);
    if (!range) {
      const n = parseInt(token, 10);
      if (!Number.isNaN(n)) out.push(n);
      return;
    }
    const a = parseInt(range[1], 10);
    const b = parseInt(range[2], 10);
    const lo = Math.min(a, b);
    const hi = Math.max(a, b);
    if (hi - lo >= MAX_RANGE_SPAN) {
      out.push(lo, hi);
      return;
    }
    for (let i = lo; i <= hi; i++) out.push(i);
  });
  return Array.from(new Set(out));
}
// Converts any value to a string and replaces the five HTML-significant
// characters (& < > " ') with their entity forms.
function escapeHtml(s) {
  const entityFor = (ch) => {
    switch (ch) {
      case '&': return '&amp;';
      case '<': return '&lt;';
      case '>': return '&gt;';
      case '"': return '&quot;';
      default: return '&#039;';
    }
  };
  return String(s).replace(/[&<>"']/g, entityFor);
}
// Shortens `s` to at most `n` characters, ending in an ellipsis when text was
// cut. Falsy input yields an empty string.
function truncate(s, n) {
  if (!s) return '';
  return s.length > n ? `${s.slice(0, n - 1)}…` : s;
}
})();
+162
View File
@@ -0,0 +1,162 @@
<?php
declare(strict_types=1);
// Page shell for the Deep Research tool. The variables below are set before
// includes/layout.php is pulled in — presumably layout.php reads them to
// render the page chrome; confirm against that file.
$toolName = 'deep-research';
$toolTitle = 'Deep Research';
$toolKind = 'Agent + Rank/Rerank RAG';
$toolBadge = 'family-legal';
// Page-specific front-end script for this tool.
$extraScripts = ['assets/js/deep-research.js'];
require_once __DIR__ . '/includes/layout.php';
?>
<form id="deepResearchForm" class="tool-form deep-research" enctype="multipart/form-data">
<div class="lang-switcher" id="drLangSwitcher" role="group" aria-label="UI language">
<button type="button" class="lang-btn is-active" data-lang="en">&#127468;&#127463; EN</button>
<button type="button" class="lang-btn" data-lang="no">&#127475;&#127476; NO</button>
</div>
<div class="control-row" id="drEngineControl">
<span class="control-label">Engine</span>
<label><input type="radio" name="drEngine" value="azure_mini" checked> Azure gpt-4o-mini &#9733; <small class="control-hint">(fast)</small></label>
<label><input type="radio" name="drEngine" value="azure_full"> Azure gpt-4o <small class="control-hint">(best)</small></label>
<label><input type="radio" name="drEngine" value="gpu"> GPU (cuttlefish) <small class="control-hint">(local)</small></label>
</div>
<p class="upload-hint">Azure engines use your BNL Azure credits. GPU runs qwen2.5:14b via LiteLLM on cuttlefish.</p>
<div class="dr-slice-section">
<p class="control-label">Corpus slices</p>
<p class="upload-hint">Select which slices of the Do Better Norge legal corpus the agent searches. Toggle Broader Legal on when the question reaches beyond family law.</p>
<div class="dr-slice-grid">
<button type="button" class="dr-slice is-on" data-slice="family_core" aria-pressed="true">
<div class="dr-slice__head">
<span class="dr-slice__title">Family Law Core</span>
<span class="dr-slice__badge">on</span>
</div>
<p class="dr-slice__tagline">Barneloven, custody, samvær, mediation</p>
</button>
<button type="button" class="dr-slice is-on" data-slice="child_welfare" aria-pressed="true">
<div class="dr-slice__head">
<span class="dr-slice__title">Child Welfare</span>
<span class="dr-slice__badge">on</span>
</div>
<p class="dr-slice__tagline">Barnevern, omsorgsovertakelse, foster care</p>
</button>
<button type="button" class="dr-slice is-on" data-slice="echr_hague" aria-pressed="true">
<div class="dr-slice__head">
<span class="dr-slice__title">ECHR and Hague</span>
<span class="dr-slice__badge">on</span>
</div>
<p class="dr-slice__tagline">Article 8, EMD, HCCH, cross-border family</p>
</button>
<button type="button" class="dr-slice" data-slice="broader_legal" aria-pressed="false">
<div class="dr-slice__head">
<span class="dr-slice__title">Broader Legal Support</span>
<span class="dr-slice__badge">off</span>
</div>
<p class="dr-slice__tagline">Arbeidsmiljøloven, NOUer, statutes, government background</p>
</button>
</div>
</div>
<details class="advanced-panel" id="drAdvanced">
<summary class="advanced-toggle">Advanced controls</summary>
<div class="dr-control-grid">
<div class="dr-control-card">
<label>Sub-questions <span id="drSubQValue" class="dr-control-value">4</span></label>
<input type="range" id="drSubQ" min="3" max="5" step="1" value="4">
<small>How many angles the agent expands the question into before retrieval.</small>
</div>
<div class="dr-control-card">
<label>Chunks / sub-Q <span id="drChunkLimitValue" class="dr-control-value">6</span></label>
<input type="range" id="drChunkLimit" min="4" max="10" step="1" value="6">
<small>How many corpus chunks the hybrid retriever pulls per sub-question.</small>
</div>
<div class="dr-control-card">
<label>Similarity floor <span id="drSimValue" class="dr-control-value">0.30</span></label>
<input type="range" id="drSim" min="0.20" max="0.60" step="0.05" value="0.30">
<small>Minimum cosine similarity for uploaded-doc chunks to count as a match.</small>
</div>
<div class="dr-control-card">
<label>Sources kept <span id="drTopKValue" class="dr-control-value">12</span></label>
<input type="range" id="drTopK" min="8" max="14" step="1" value="12">
<small>Top sources kept after dedupe + rerank to feed synthesis.</small>
</div>
<div class="dr-control-card">
<label>Temperature <span id="drTempValue" class="dr-control-value">0.15</span></label>
<input type="range" id="drTemp" min="0.05" max="0.40" step="0.05" value="0.15">
<small>Synthesis creativity. Keep low for grounded legal briefs.</small>
</div>
</div>
</details>
<div class="upload-zone" id="drUploadZone" role="region" aria-label="File upload">
<input type="file" id="drUploadInput" multiple accept=".pdf,.docx,.txt" aria-label="Choose files">
<div id="drUploadPrompt" class="upload-prompt">
<span class="upload-icon" aria-hidden="true">&#8679;</span>
<p>Drop up to 5 case files here, or <label for="drUploadInput" class="upload-browse">browse</label></p>
<p class="upload-hint"><strong>PDF</strong>, <strong>DOCX</strong>, <strong>TXT</strong> &mdash; chunked + embedded in memory only, never stored.</p>
</div>
<div id="drUploadFileInfo" class="upload-file is-hidden">
<ul id="drUploadFileList" class="upload-file-list"></ul>
<button type="button" id="drUploadClear" class="upload-clear">&times; Clear</button>
</div>
</div>
<label class="input-label" for="drInput">Question or pasted text</label>
<textarea id="drInput" name="drInput" rows="8" placeholder="Describe the legal question, paste case notes, or both. The agent will research the corpus from 3–5 angles."></textarea>
<div class="form-footer">
<p id="drStatus" class="form-status" role="status" aria-live="polite"></p>
<button id="drRunButton" type="submit">Run deep research</button>
</div>
</form>
<section id="drResults" class="results deep-research-results" aria-live="polite">
<div class="empty-state">
<h3>Ready</h3>
<p>Pick slices, drop a case file or paste a question, then run. The agent will expand the question, retrieve from the corpus + your upload, rerank, and synthesise a cited brief.</p>
</div>
</section>
<!-- Source modal -->
<div id="drSourceModal" class="dr-source-modal is-hidden" role="dialog" aria-modal="true" aria-labelledby="drSourceModalTitle">
<div class="dr-source-modal__dialog">
<header class="dr-source-modal__head">
<div>
<p class="eyebrow" id="drSourceModalEyebrow">Source</p>
<h3 id="drSourceModalTitle"></h3>
</div>
<button type="button" id="drSourceModalClose" class="upload-clear" aria-label="Close">&times;</button>
</header>
<div class="dr-source-modal__body">
<aside class="dr-source-modal__meta" id="drSourceModalMeta"></aside>
<div class="dr-source-modal__text" id="drSourceModalText"></div>
</div>
</div>
</div>
<!-- Hidden stubs so tools.js element refs don't crash on this page -->
<div class="is-hidden" id="languageControl" aria-hidden="true"><input type="radio" name="language" value="en" checked></div>
<div class="is-hidden" id="redactionControl" aria-hidden="true"></div>
<div class="is-hidden" id="audioZone" aria-hidden="true">
<input type="file" id="audioInput" style="display:none">
<div id="audioPrompt"></div>
<div id="audioFileInfo"><ol id="audioQueueList"></ol><button type="button" id="audioClear"></button></div>
</div>
<div class="is-hidden" id="diarizeControl" aria-hidden="true">
<input type="checkbox" id="diarizeCheck">
<input type="number" id="numSpeakersInput">
</div>
<div class="is-hidden" id="transcribeLangControl" aria-hidden="true"><input type="radio" name="transcribeLang" value="no" checked></div>
<div class="is-hidden" id="vocabControl" aria-hidden="true">
<div id="vocabPresets"></div>
<textarea id="initPromptInput"></textarea>
</div>
<div class="is-hidden" id="aliasSection" aria-hidden="true">
<button type="button" id="addAliasRow"></button>
<div id="aliasRows"></div>
</div>
<div class="is-hidden" id="exemptSection" aria-hidden="true">
<button type="button" id="addExemptRow"></button>
<div id="exemptRows"></div>
</div>
<?php require_once __DIR__ . '/includes/layout_footer.php'; ?>
+727
View File
@@ -0,0 +1,727 @@
<?php
declare(strict_types=1);
require_once __DIR__ . '/bootstrap.php';
require_once __DIR__ . '/AzureOpenAiGateway.php';
final class DbnDeepResearchAgent
{
// Character cap applied separately to the question and to the pasted text
// when the seed description is built.
private const MAX_SEED_CHARS = 16000;
// Character cap applied to each uploaded file's text in the seed description.
private const MAX_UPLOAD_CHARS = 64000;
// Upload chunking: words per chunk, words shared between consecutive chunks,
// and the minimum size for a non-first chunk to be kept.
private const CHUNK_WORDS = 600;
private const CHUNK_OVERLAP_WORDS = 75;
private const MIN_CHUNK_WORDS = 50;
// Maximum number of unique chunks kept after merge/dedupe in run().
private const POOL_CAP = 30;
// Azure gateway used for interpretation, expansion and (Azure engines) synthesis.
private DbnAzureOpenAiGateway $azure;
// Embedding gateway; created inside run() once AiGateway.php has been loaded.
private ?AiGateway $ai = null;
// In-memory upload index: list of ['meta' => chunk array, 'vec' => embedding].
private array $uploadVecs = [];
// Step name => elapsed milliseconds, reported in trace_metadata.
private array $stepTimings = [];
/**
 * @param DbnAzureOpenAiGateway|null $azure Gateway to use; a default one is
 *                                          created when none is supplied.
 */
public function __construct(?DbnAzureOpenAiGateway $azure = null)
{
    if ($azure === null) {
        $azure = new DbnAzureOpenAiGateway();
    }
    $this->azure = $azure;
}
/**
 * Run one deep-research pass and build the response payload.
 *
 * Steps, in order: interpret the seed input, expand it into sub-questions,
 * resolve the selected corpus slices to document ids, chunk + embed uploads
 * in memory, retrieve per sub-question (corpus via ClientRagPipeline::searchAll,
 * uploads via cosine similarity), merge/dedupe, synthesise a cited brief and
 * grade citation confidence. Every step appends an entry to the returned trace
 * and most record their elapsed time in trace_metadata.
 *
 * @param string $seedQuery      The user's question (may be empty).
 * @param string $pastedText     Free text pasted by the user (may be empty).
 * @param array  $uploadedFiles  Entries are read through their 'filename' and 'text' keys.
 * @param array  $sliceSelection Raw slice toggles; normalised by dbnV6NormalizeSliceSelection().
 * @param string $engine         'azure_mini', 'azure_full' or 'gpu'; anything else becomes 'azure_mini'.
 * @param string $language       'en' or 'no'; anything else becomes 'en'.
 * @param array  $controls       Tuning knobs; clamped by normalizeControls().
 * @return array Payload with the brief, sub-questions, numbered sources, trace,
 *               trace metadata and disclaimer.
 */
public function run(
string $seedQuery,
string $pastedText,
array $uploadedFiles,
array $sliceSelection,
string $engine,
string $language,
array $controls
): array {
$seedQuery = trim($seedQuery);
$pastedText = trim($pastedText);
// Unknown engine/language values silently fall back to the defaults.
$engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu'], true) ? $engine : 'azure_mini';
$language = in_array($language, ['en', 'no'], true) ? $language : 'en';
$controls = $this->normalizeControls($controls);
// NOTE(review): the code below assumes dbnToolsAbort() never returns — confirm in bootstrap.php.
if ($seedQuery === '' && $pastedText === '' && empty($uploadedFiles)) {
dbnToolsAbort('Provide a question, paste text, or upload at least one file.', 422, 'missing_seed');
}
$client = dbnToolsRequireClient();
$package = $this->requireFamilyPackage((int)$client['id']);
dbnToolsBootCaveau();
$aiPortalRoot = dbnToolsAiPortalRoot();
require_once $aiPortalRoot . '/platform/includes/dbn_v6.php';
require_once $aiPortalRoot . '/lib/ai/AiGateway.php';
$this->ai = new AiGateway();
// Reset per-run state so a reused agent instance starts clean.
$this->uploadVecs = [];
$this->stepTimings = [];
$trace = [];
$seedDescription = $this->buildSeedDescription($seedQuery, $pastedText, $uploadedFiles);
// STEP 1: Query interpretation — build research brief
$stepStart = microtime(true);
$interpretation = $this->interpretSeed($seedDescription, $language);
$this->stepTimings['interpretation'] = $this->elapsedMs($stepStart);
$trace[] = $this->trace(
'Query interpretation',
$interpretation['detail'],
'complete'
);
// STEP 2: Query expansion
$stepStart = microtime(true);
$expansion = $this->expandQueries($seedDescription, $interpretation['brief'], $controls['sub_q_count'], $language);
$this->stepTimings['expansion'] = $this->elapsedMs($stepStart);
$subQuestions = $expansion['questions'];
$expansionStatus = $expansion['fallback'] ? 'warning' : 'complete';
$trace[] = $this->trace(
'Query expansion',
$expansion['fallback']
? 'Could not parse sub-questions; falling back to retrieving on the seed query alone.'
: sprintf('Generated %d sub-questions to research the corpus from multiple angles.', count($subQuestions)),
$expansionStatus
);
// STEP 3: Slice resolution
// NOTE(review): the "no slices enabled" check only runs here, after the two
// model calls in steps 1 and 2 have already been made.
$stepStart = microtime(true);
$sliceSelectionNormalized = dbnV6NormalizeSliceSelection($sliceSelection);
if (!array_filter($sliceSelectionNormalized)) {
dbnToolsAbort('Enable at least one corpus slice before running deep research.', 422, 'no_slices');
}
$ragDb = dbnToolsRagDb();
try {
$sharedDocIds = dbnV6ResolveSelectedDocIds($ragDb, $sliceSelectionNormalized);
$sliceStatus = 'complete';
$sliceDetail = sprintf(
'%d slice(s) active → %d candidate documents constrain the corpus search.',
count(array_filter($sliceSelectionNormalized)),
count($sharedDocIds)
);
} catch (Throwable $e) {
// Best effort: a failed resolve is logged and downgraded to a trace warning.
error_log('DBN deep research slice resolve failed: ' . $e->getMessage());
$sharedDocIds = [];
$sliceStatus = 'warning';
$sliceDetail = 'Slice resolution failed; corpus search will run unconstrained.';
}
$this->stepTimings['slice_resolution'] = $this->elapsedMs($stepStart);
$trace[] = $this->trace('Slice resolution', $sliceDetail, $sliceStatus);
// STEP 4: Upload indexing (in-memory, ephemeral)
$stepStart = microtime(true);
$uploadChunks = [];
foreach ($uploadedFiles as $idx => $file) {
$filename = (string)($file['filename'] ?? sprintf('upload-%d', $idx + 1));
$text = (string)($file['text'] ?? '');
$uploadChunks = array_merge($uploadChunks, $this->splitIntoChunks($text, $filename, $idx));
}
$uploadStatus = 'complete';
$uploadDetail = sprintf('%d upload file(s) → %d in-memory chunks indexed with nomic-embed-text.', count($uploadedFiles), count($uploadChunks));
if ($uploadChunks) {
try {
$texts = array_map(fn(array $c) => $c['text'], $uploadChunks);
$vecs = $this->ai->embedBatch($texts, 'nomic-embed-text');
// Vectors are paired with chunks by position, so the counts must match.
if (count($vecs) === count($uploadChunks)) {
foreach ($uploadChunks as $i => $chunk) {
$this->uploadVecs[] = [
'meta' => $chunk,
'vec' => $vecs[$i],
];
}
} else {
$uploadStatus = 'warning';
$uploadDetail = 'Upload embedding returned an unexpected count; uploaded chunks will not participate in retrieval.';
}
} catch (Throwable $e) {
error_log('DBN deep research upload embed failed: ' . $e->getMessage());
$uploadStatus = 'warning';
$uploadDetail = 'Upload embedding gateway unreachable; uploaded chunks will not participate in retrieval.';
$this->uploadVecs = [];
}
} elseif (empty($uploadedFiles)) {
$uploadDetail = 'No files uploaded; agent will research the corpus only.';
}
$this->stepTimings['upload_indexing'] = $this->elapsedMs($stepStart);
$trace[] = $this->trace('Upload indexing', $uploadDetail, $uploadStatus);
// STEP 5: Retrieval (per sub-question)
$stepStart = microtime(true);
// When expansion produced nothing, retrieve once on the seed query, then the
// interpreted brief, then a generic placeholder.
$retrievalQueries = $subQuestions ?: [[
'id' => 'q1',
'question' => $seedQuery !== '' ? $seedQuery : ($interpretation['brief'] ?: 'legal research'),
'rationale' => 'Seed query (no sub-question expansion).',
]];
try {
$rag = new ClientRagPipeline((int)$client['id'], 'http://10.0.1.10:4000', 60);
} catch (Throwable $e) {
dbnToolsAbort('Could not initialise the retrieval pipeline.', 503, 'rag_init_failed');
}
$rawPool = [];
$retrievalWarnings = 0;
foreach ($retrievalQueries as $sq) {
try {
$corpusChunks = $rag->searchAll(
$sq['question'],
$controls['chunk_limit'],
null,
[
'search_private' => false,
'search_shared' => true,
'package_ids' => [(int)$package['id']],
'shared_doc_ids' => $sharedDocIds,
'chunk_limit' => $controls['chunk_limit'],
'search_method' => 'hybrid',
'reranker_enabled' => true,
]
);
} catch (Throwable $e) {
// A failed sub-question is counted and skipped; the run continues.
error_log('DBN deep research sub-Q retrieval failed: ' . $e->getMessage());
$corpusChunks = [];
$retrievalWarnings++;
}
foreach ($corpusChunks as $chunk) {
$rawPool[] = $this->normalizeCorpusChunk($chunk, $sq['id']);
}
// Upload chunk retrieval via cosine sim
if (!empty($this->uploadVecs)) {
$uploadHits = $this->retrieveFromUploads($sq['question'], $controls['chunk_limit'], $controls['similarity_threshold']);
foreach ($uploadHits as $hit) {
$hit['matched_sub_questions'] = [$sq['id']];
$rawPool[] = $hit;
}
}
}
$merged = $this->mergeAndDedupe($rawPool, self::POOL_CAP);
$this->stepTimings['retrieval'] = $this->elapsedMs($stepStart);
$retrievalStatus = $retrievalWarnings > 0 ? 'warning' : 'complete';
$trace[] = $this->trace(
'Retrieval',
sprintf(
'%d sub-question(s) × hybrid + RRF + rerank → %d raw chunks → %d unique after dedupe.',
count($retrievalQueries),
count($rawPool),
count($merged)
),
$retrievalStatus
);
// Cap pool to reranker top-K for synthesis
$synthesisPool = array_slice($merged, 0, $controls['reranker_top_k']);
$numberedSources = $this->numberSources($synthesisPool);
// STEP 6: Synthesis
$stepStart = microtime(true);
$synthesis = $this->synthesise(
$seedDescription,
$interpretation['brief'],
$retrievalQueries,
$numberedSources,
$engine,
$language,
$controls['temperature']
);
$this->stepTimings['synthesis'] = $this->elapsedMs($stepStart);
$trace[] = $this->trace(
'Synthesis',
sprintf('%s synthesised the brief using %d grounded source(s).', $synthesis['deploy_label'], count($numberedSources)),
'complete'
);
// STEP 7: Confidence
$confidence = $this->citationConfidence($numberedSources);
$trace[] = $this->trace(
'Citation confidence',
sprintf('%s confidence based on %d source(s) and reranker score distribution.', ucfirst($confidence), count($numberedSources)),
$confidence === 'low' ? 'warning' : 'complete'
);
// Stitch sub-question chunk_ids
$subQOut = [];
foreach ($retrievalQueries as $sq) {
$matchedChunks = array_values(array_filter(
$numberedSources,
fn(array $s) => in_array($sq['id'], $s['matched_sub_questions'] ?? [], true)
));
$subQOut[] = [
'id' => $sq['id'],
'question' => $sq['question'],
'rationale' => $sq['rationale'] ?? '',
'chunk_ids' => array_values(array_map(fn(array $s) => $s['chunk_id'], $matchedChunks)),
];
}
// 'sources' and 'evidence_trail' carry the same numbered list.
return [
'tool' => 'deep_research',
'language' => $language,
'brief_markdown' => (string)($synthesis['json']['brief_markdown'] ?? $synthesis['json']['answer'] ?? ''),
'sub_questions' => $subQOut,
'sources' => $numberedSources,
'what_we_found' => (string)($synthesis['json']['what_we_found'] ?? ''),
'evidence_trail' => $numberedSources,
'what_remains_uncertain' => $synthesis['json']['what_remains_uncertain'] ?? [],
'next_practical_step' => (string)($synthesis['json']['next_practical_step'] ?? ''),
'trace' => $trace,
'trace_metadata' => [
'chunk_count' => count($merged),
'source_count' => count($numberedSources),
'sub_question_count' => count($retrievalQueries),
'upload_chunk_count' => count($this->uploadVecs),
'deployment' => $synthesis['deploy_label'],
'engine_used' => $engine,
'citation_confidence' => $confidence,
'elapsed_ms_per_step' => $this->stepTimings,
'slices_active' => array_keys(array_filter($sliceSelectionNormalized)),
],
'disclaimer' => dbnToolsDisclaimer($language),
];
}
/**
 * Clamp the caller-supplied tuning knobs to their allowed ranges, filling
 * in defaults for anything missing.
 *
 * @param array $controls Raw controls from the request.
 * @return array Keys: sub_q_count (3-5), chunk_limit (4-10),
 *               similarity_threshold (0.2-0.6), reranker_top_k (8-14),
 *               temperature (0.05-0.4).
 */
private function normalizeControls(array $controls): array
{
    $intKnob = static fn (string $key, int $low, int $high, int $default): int
        => max($low, min($high, (int)($controls[$key] ?? $default)));
    $floatKnob = static fn (string $key, float $low, float $high, float $default): float
        => max($low, min($high, (float)($controls[$key] ?? $default)));

    return [
        'sub_q_count' => $intKnob('sub_q_count', 3, 5, 4),
        'chunk_limit' => $intKnob('chunk_limit', 4, 10, 6),
        'similarity_threshold' => $floatKnob('similarity_threshold', 0.2, 0.6, 0.30),
        'reranker_top_k' => $intKnob('reranker_top_k', 8, 14, 12),
        'temperature' => $floatKnob('temperature', 0.05, 0.4, 0.15),
    ];
}
/**
 * Fetch the 'family-legal' corpus package and abort the request unless it
 * is active and the client holds an active subscription to it.
 *
 * @param int $clientId Client whose subscription is checked.
 * @return array The package row.
 */
private function requireFamilyPackage(int $clientId): array
{
    $package = dbnToolsFetchPackage('family-legal');

    $packageIsActive = $package && !empty($package['is_active']);
    if (!$packageIsActive) {
        dbnToolsAbort('The family-legal corpus package is not active.', 503, 'package_unavailable');
    }

    $subscribed = dbnToolsHasActiveSubscription($clientId, (int)$package['id']);
    if (!$subscribed) {
        dbnToolsAbort('Do Better Norge does not have an active family-legal subscription.', 503, 'subscription_missing');
    }

    return $package;
}
/**
 * Combine the question, pasted text and uploaded file texts into one
 * labelled text block for the prompts.
 *
 * Question and pasted text are each cut to MAX_SEED_CHARS; every upload is
 * cut to MAX_UPLOAD_CHARS. Empty parts and uploads without text are left out.
 *
 * @param string $seedQuery     The user's question.
 * @param string $pastedText    Free text pasted by the user.
 * @param array  $uploadedFiles Entries read through 'filename' and 'text'.
 * @return string Sections separated by a blank line; '' when nothing was supplied.
 */
private function buildSeedDescription(string $seedQuery, string $pastedText, array $uploadedFiles): string
{
    $sections = [];

    if ($seedQuery !== '') {
        $question = mb_substr($seedQuery, 0, self::MAX_SEED_CHARS, 'UTF-8');
        $sections[] = "Question:\n" . $question;
    }

    if ($pastedText !== '') {
        $pasted = mb_substr($pastedText, 0, self::MAX_SEED_CHARS, 'UTF-8');
        $sections[] = "Pasted text:\n" . $pasted;
    }

    foreach ($uploadedFiles as $position => $upload) {
        // Uploads without a filename are labelled by their 1-based position.
        $label = (string)($upload['filename'] ?? sprintf('upload-%d', $position + 1));
        $body = (string)($upload['text'] ?? '');
        if ($body !== '') {
            $sections[] = sprintf(
                "Uploaded file [%s]:\n%s",
                $label,
                mb_substr($body, 0, self::MAX_UPLOAD_CHARS, 'UTF-8')
            );
        }
    }

    return implode("\n\n", $sections);
}
/**
 * Ask the Azure model for a short research brief and retrieval keywords
 * describing what the user is trying to research.
 *
 * This step is best effort: on any failure, or when the model returns no
 * usable brief, an empty brief is returned and the run continues.
 *
 * The model's JSON is treated as untrusted: `brief` must be a non-blank
 * string, and only scalar, non-blank `key_signals` entries are used, so a
 * nested array in the response cannot trigger an array-to-string conversion.
 *
 * @param string $seedDescription Combined seed text from buildSeedDescription().
 * @param string $language        'no' for Norwegian output, anything else English.
 * @return array{brief: string, detail: string} Brief plus the trace detail line.
 */
private function interpretSeed(string $seedDescription, string $language): array
{
    $locale = $language === 'no' ? 'Norwegian' : 'English';
    $prompt = <<<PROMPT
You are reviewing the input below to set up a deep legal research pass against the Do Better Norge family-law corpus.
Input:
{$seedDescription}
In {$locale}, produce JSON with:
{
"brief": "1-3 sentence description of what the user is trying to research (≤ 220 chars)",
"key_signals": ["short keywords or terms that should drive retrieval"]
}
PROMPT;

    try {
        $raw = $this->azure->chatText([
            ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
            ['role' => 'user', 'content' => $prompt],
        ], ['json' => true, 'temperature' => 0.1, 'max_tokens' => 400, 'timeout' => 30]);
        $json = $this->azure->decodeJsonObject($raw);

        $brief = is_array($json) && is_string($json['brief'] ?? null) ? trim($json['brief']) : '';
        if ($brief !== '') {
            $signals = is_array($json['key_signals'] ?? null) ? $json['key_signals'] : [];
            // Keep only printable scalar keywords; drop nested structures and blanks.
            $signals = array_values(array_filter(
                array_map(
                    static fn ($signal): string => is_scalar($signal) ? trim((string)$signal) : '',
                    $signals
                ),
                static fn (string $signal): bool => $signal !== ''
            ));
            $signalText = implode(', ', array_slice($signals, 0, 6));

            return [
                'brief' => $brief,
                'detail' => sprintf(
                    'Research focus: %s%s',
                    $brief,
                    $signalText !== '' ? ' — signals: ' . $signalText : ''
                ),
            ];
        }
    } catch (Throwable $e) {
        // Deliberately non-fatal: log and fall through to the empty brief.
        error_log('DBN deep research interpretation failed: ' . $e->getMessage());
    }

    return [
        'brief' => '',
        'detail' => 'Interpretation step skipped — proceeding with raw seed input.',
    ];
}
/**
 * Ask the Azure model to decompose the request into focused sub-questions.
 *
 * The model's JSON is treated as untrusted: items must be arrays with a
 * non-blank string `question`. Ids are assigned sequentially (q1, q2, ...)
 * from the accepted items rather than from the raw array key, so skipped
 * items or string keys in the response cannot produce gaps or a type error.
 *
 * Fewer than two usable sub-questions, or any failure, yields the fallback
 * result and the caller retrieves on the seed query alone.
 *
 * @param string $seedDescription Combined seed text from buildSeedDescription().
 * @param string $brief           Research brief from interpretSeed() (may be empty).
 * @param int    $targetCount     Number of sub-questions requested; also the maximum kept.
 * @param string $language        'no' for Norwegian output, anything else English.
 * @return array{questions: array, fallback: bool} Each question has id, question, rationale.
 */
private function expandQueries(string $seedDescription, string $brief, int $targetCount, string $language): array
{
    $locale = $language === 'no' ? 'Norwegian' : 'English';
    $prompt = <<<PROMPT
You are decomposing a Do Better Norge legal-research request into {$targetCount} focused sub-questions that should each be answered by the legal corpus (Norwegian family law, child welfare, ECHR/Hague).
Research brief:
{$brief}
Raw input:
{$seedDescription}
Return JSON only:
{
"sub_questions": [
{"id":"q1","question":"... ({$locale})","rationale":"why this angle matters (≤ 140 chars)"}
]
}
Rules:
- Exactly {$targetCount} sub-questions, no more, no fewer.
- Each sub-question must be answerable with Norwegian family-law, child-welfare, or ECHR sources.
- Each sub-question must explore a DIFFERENT angle (statute interpretation, procedural fairness, ECHR case law, evidence/factual frame, comparative authority).
- Sub-questions must be self-contained — readable without seeing the seed text.
- Write the questions in {$locale}.
PROMPT;

    try {
        $raw = $this->azure->chatText([
            ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
            ['role' => 'user', 'content' => $prompt],
        ], ['json' => true, 'temperature' => 0.2, 'max_tokens' => 700, 'timeout' => 35]);
        $json = $this->azure->decodeJsonObject($raw);
        $items = is_array($json) && is_array($json['sub_questions'] ?? null) ? $json['sub_questions'] : [];

        $normalized = [];
        foreach ($items as $item) {
            if (!is_array($item) || !is_string($item['question'] ?? null)) {
                continue;
            }
            // A whitespace-only question would become an empty retrieval query.
            $question = trim($item['question']);
            if ($question === '') {
                continue;
            }
            $rationale = $item['rationale'] ?? '';
            $normalized[] = [
                'id' => 'q' . (count($normalized) + 1),
                'question' => $question,
                'rationale' => is_scalar($rationale) ? trim((string)$rationale) : '',
            ];
            if (count($normalized) >= $targetCount) {
                break;
            }
        }

        if (count($normalized) >= 2) {
            return ['questions' => $normalized, 'fallback' => false];
        }
    } catch (Throwable $e) {
        // Deliberately non-fatal: log and fall through to the fallback result.
        error_log('DBN deep research expansion failed: ' . $e->getMessage());
    }

    return ['questions' => [], 'fallback' => true];
}
/**
 * Split one uploaded document into overlapping word-window chunks.
 *
 * Whitespace is collapsed first. Windows are CHUNK_WORDS long and start
 * every (CHUNK_WORDS - CHUNK_OVERLAP_WORDS) words. The first window is
 * always kept; later ones only when they hold at least MIN_CHUNK_WORDS.
 *
 * Chunking stops as soon as a window reaches the end of the text. Stopping
 * only on a short window would, when a full window ends exactly on the last
 * word, emit one more chunk made purely of the overlap words already
 * contained in the previous chunk.
 *
 * @param string $text     Extracted document text.
 * @param string $filename Original filename, copied into each chunk.
 * @param int    $fileIdx  Position of the file in the upload list; part of the chunk id.
 * @return array List of chunks with chunk_id ("upload:<file>:<chunk>"),
 *               file_index, chunk_index, filename and text. Empty when the
 *               text has no words.
 */
private function splitIntoChunks(string $text, string $filename, int $fileIdx): array
{
    // preg_replace() returns null on failure (e.g. malformed UTF-8 with /u).
    $text = preg_replace('/\s+/u', ' ', trim($text)) ?? '';
    if ($text === '') {
        return [];
    }
    $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY) ?: [];
    if (!$words) {
        return [];
    }

    $total = count($words);
    $step = max(1, self::CHUNK_WORDS - self::CHUNK_OVERLAP_WORDS);
    $chunks = [];
    $chunkIdx = 0;

    for ($offset = 0; $offset < $total; $offset += $step) {
        $window = array_slice($words, $offset, self::CHUNK_WORDS);
        if ($offset === 0 || count($window) >= self::MIN_CHUNK_WORDS) {
            $chunks[] = [
                'chunk_id' => sprintf('upload:%d:%d', $fileIdx, $chunkIdx),
                'file_index' => $fileIdx,
                'chunk_index' => $chunkIdx,
                'filename' => $filename,
                'text' => implode(' ', $window),
            ];
            $chunkIdx++;
        }
        // This window already covers the last word: nothing new remains.
        if ($offset + count($window) >= $total) {
            break;
        }
    }

    return $chunks;
}
/**
 * Find the uploaded chunks most similar to a sub-question.
 *
 * The question is embedded with nomic-embed-text and compared with every
 * in-memory upload vector by cosine similarity. Chunks below the threshold
 * are dropped; the best ceil(limit / 2) (at least one) are returned.
 * An embedding failure is logged and treated as "no hits".
 *
 * @param string $question     Sub-question text.
 * @param int    $limitPerSubQ Corpus chunk limit; uploads get half of it.
 * @param float  $threshold    Minimum cosine similarity to count as a match.
 * @return array Source rows in the same shape as corpus chunks, best first.
 */
private function retrieveFromUploads(string $question, int $limitPerSubQ, float $threshold): array
{
    if (empty($this->uploadVecs)) {
        return [];
    }

    try {
        $queryVector = $this->ai->embed($question, 'nomic-embed-text');
    } catch (Throwable $e) {
        error_log('DBN deep research sub-Q embed failed: ' . $e->getMessage());
        return [];
    }
    if (empty($queryVector)) {
        return [];
    }

    $hits = [];
    foreach ($this->uploadVecs as ['meta' => $meta, 'vec' => $vector]) {
        $score = $this->cosineSim($queryVector, $vector);
        if ($score < $threshold) {
            continue;
        }
        $hits[] = [
            'chunk_id' => $meta['chunk_id'],
            'title' => 'uploaded: ' . $meta['filename'],
            'section' => null,
            'package_or_corpus' => 'Your upload',
            'excerpt' => dbnToolsExcerpt($meta['text'], 620),
            'chunk_text' => $meta['text'],
            'similarity' => round($score, 4),
            'reranker_score' => null,
            'document_id' => null,
            'source_origin' => 'upload',
            'authority_type' => null,
            'jurisdiction' => null,
        ];
    }

    // Highest similarity first.
    usort($hits, static fn (array $left, array $right): int => $right['similarity'] <=> $left['similarity']);

    $quota = max(1, (int)ceil($limitPerSubQ / 2));
    return array_slice($hits, 0, $quota);
}
/**
 * Cosine similarity of two numeric vectors.
 *
 * Only the first min(count($a), count($b)) components are compared.
 * Returns 0.0 when either vector is empty or has zero magnitude.
 *
 * @param array $a First vector (0-indexed list of numbers).
 * @param array $b Second vector (0-indexed list of numbers).
 * @return float Similarity value.
 */
private function cosineSim(array $a, array $b): float
{
    $dims = min(count($a), count($b));
    if ($dims === 0) {
        return 0.0;
    }

    [$dot, $sumSqA, $sumSqB] = [0.0, 0.0, 0.0];
    foreach (range(0, $dims - 1) as $k) {
        $left = (float)$a[$k];
        $right = (float)$b[$k];
        $dot += $left * $right;
        $sumSqA += $left * $left;
        $sumSqB += $right * $right;
    }

    // A zero-length vector has no direction; avoid dividing by zero.
    if ($sumSqA === 0.0 || $sumSqB === 0.0) {
        return 0.0;
    }
    return $dot / (sqrt($sumSqA) * sqrt($sumSqB));
}
/**
 * Map a raw corpus search hit onto the source-row shape used by the agent.
 *
 * Scores are rounded to four decimals; missing scores and ids become null.
 *
 * @param array  $chunk  Raw hit as returned by the retrieval pipeline.
 * @param string $subQId Id of the sub-question that produced the hit.
 * @return array Normalised source row with source_origin 'corpus'.
 */
private function normalizeCorpusChunk(array $chunk, string $subQId): array
{
    $content = (string)($chunk['content'] ?? '');
    $scoreOrNull = static fn (string $key): ?float
        => isset($chunk[$key]) ? round((float)$chunk[$key], 4) : null;
    $idOrNull = static fn (string $key): ?int
        => isset($chunk[$key]) ? (int)$chunk[$key] : null;

    return [
        'chunk_id' => $idOrNull('id'),
        'title' => (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source'),
        'section' => $chunk['section_title'] ?? null,
        'package_or_corpus' => (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Norge'),
        'excerpt' => dbnToolsExcerpt($content, 620),
        'chunk_text' => $content,
        'similarity' => $scoreOrNull('similarity'),
        'reranker_score' => $scoreOrNull('reranker_score'),
        'document_id' => $idOrNull('document_id'),
        'source_origin' => 'corpus',
        'authority_type' => $chunk['authority_type'] ?? null,
        'jurisdiction' => $chunk['jurisdiction'] ?? null,
        'matched_sub_questions' => [$subQId],
    ];
}
/**
 * Collapse duplicate chunks from different sub-questions and rank the pool.
 *
 * Chunks are identified by "<source_origin>:<chunk_id>"; a chunk without an
 * id gets a random key and is therefore never merged. For duplicates, the
 * matched sub-question ids are unioned and the higher similarity and
 * reranker score are kept. The result is sorted by reranker score (falling
 * back to similarity, then 0), highest first, and cut to $cap entries.
 *
 * @param array $rawPool All retrieved source rows, duplicates included.
 * @param int   $cap     Maximum number of rows to return.
 * @return array Unique rows, best first.
 */
private function mergeAndDedupe(array $rawPool, int $cap): array
{
    $unique = [];

    foreach ($rawPool as $candidate) {
        $origin = $candidate['source_origin'] ?? 'corpus';
        $identity = $candidate['chunk_id'] ?? bin2hex(random_bytes(4));
        $key = $origin . ':' . $identity;

        if (!isset($unique[$key])) {
            $unique[$key] = $candidate;
            continue;
        }

        // Fold the duplicate into the row that is already stored.
        $kept = &$unique[$key];
        $kept['matched_sub_questions'] = array_values(array_unique(array_merge(
            $kept['matched_sub_questions'] ?? [],
            $candidate['matched_sub_questions'] ?? []
        )));
        foreach (['similarity', 'reranker_score'] as $metric) {
            if (($candidate[$metric] ?? 0) > ($kept[$metric] ?? 0)) {
                $kept[$metric] = $candidate[$metric];
            }
        }
        unset($kept);
    }

    $ranked = array_values($unique);
    $rankScore = static fn (array $row) => $row['reranker_score'] ?? $row['similarity'] ?? 0;
    usort($ranked, static fn (array $a, array $b): int => $rankScore($b) <=> $rankScore($a));

    return array_slice($ranked, 0, $cap);
}
/**
 * Give each source its 1-based citation number in the 'n' key.
 *
 * @param array $chunks Ranked source rows.
 * @return array The same rows, re-indexed as a list, each with 'n' set.
 */
private function numberSources(array $chunks): array
{
    return array_map(
        static function ($source, $position) {
            $source['n'] = $position + 1;
            return $source;
        },
        $chunks,
        array_keys($chunks)
    );
}
/**
 * Produce the cited brief from the numbered sources with the chosen engine.
 *
 * With no sources, a canned "not enough support" payload is returned and no
 * model is called. Otherwise the prompt lists every source (number, origin,
 * title, section, corpus name and excerpt) and asks for a JSON object.
 * A response that cannot be decoded into an object with `brief_markdown` is
 * salvaged by returning the raw text as the brief with explanatory notes.
 *
 * @param string $seedDescription Combined seed text shown to the model.
 * @param string $brief           Research brief from the interpretation step.
 * @param array  $subQuestions    Sub-questions explored (id, question).
 * @param array  $numberedSources Source rows carrying their citation number in 'n'.
 * @param string $engine          'gpu', 'azure_full', or anything else for the default Azure deployment.
 * @param string $language        'no' for Norwegian output, anything else English.
 * @param float  $temperature     Sampling temperature for the synthesis call.
 * @return array{json: array, deploy_label: string} Decoded payload and a label for the engine used.
 */
private function synthesise(
string $seedDescription,
string $brief,
array $subQuestions,
array $numberedSources,
string $engine,
string $language,
float $temperature
): array {
$locale = $language === 'no' ? 'Norwegian' : 'English';
// Nothing to ground the answer in: return a fixed payload without calling a model.
if (empty($numberedSources)) {
return [
'json' => [
'brief_markdown' => $language === 'no'
? 'Jeg fant ikke tilstrekkelig kildestøtte i korpuset til å gi et grunnlagsbasert svar.'
: 'I did not find enough source support in the corpus to give a grounded answer.',
'what_we_found' => 'No retrieved sources passed the similarity threshold.',
'what_remains_uncertain' => ['No corpus evidence retrieved for the given query and slice selection.'],
'next_practical_step' => 'Try widening slice selection or rephrasing with more specific statutory or party terms.',
],
'deploy_label' => $engine === 'gpu' ? 'GPU (cuttlefish)' : ($engine === 'azure_full' ? 'gpt-4o' : $this->azure->chatDeployment()),
];
}
// Only the excerpt of each source is sent to the model, not the full chunk text.
$sourcesContext = [];
foreach ($numberedSources as $s) {
$sourcesContext[] = sprintf(
"[%d] (%s) %s%s\n Corpus: %s\n Excerpt: %s",
$s['n'],
$s['source_origin'] === 'upload' ? 'uploaded doc' : 'corpus',
$s['title'],
!empty($s['section']) ? ' — ' . $s['section'] : '',
$s['package_or_corpus'],
$s['excerpt']
);
}
$sourcesText = implode("\n\n", $sourcesContext);
$subQText = '';
if ($subQuestions) {
$lines = array_map(
fn(array $sq, int $i): string => sprintf('%d. (%s) %s', $i + 1, $sq['id'], $sq['question']),
$subQuestions,
array_keys($subQuestions)
);
$subQText = "\nSub-questions explored:\n" . implode("\n", $lines);
}
$prompt = <<<PROMPT
You are Do Better Norge Legal Tools running a deep-research synthesis. You MUST ground every claim in the numbered sources below, using inline `[n]` citation markers that map to the source list. Do NOT cite a source you did not use. Do NOT invent statutes, paragraph numbers, case names, dates, or parties.
User input:
{$seedDescription}
Research brief:
{$brief}
{$subQText}
Sources (numbered):
{$sourcesText}
Return JSON only in {$locale}:
{
"brief_markdown": "Markdown legal brief, 250-700 words, with inline [n] citation markers keyed to the sources above. Use short paragraphs. End with a one-line caveat. Do NOT include headings above level 3 (###).",
"what_we_found": "1-2 sentence plain-language summary of the grounded finding",
"what_remains_uncertain": ["gaps or caveats — what the corpus did not cover or where confidence is limited"],
"next_practical_step": "one concrete next action the user can take"
}
Rules:
- Every factual claim in `brief_markdown` must end with one or more `[n]` markers.
- If no source supports a point, omit the point.
- Respond in {$locale}.
- Output valid JSON only — no markdown fences around the JSON.
PROMPT;
$messages = [
['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
['role' => 'user', 'content' => $prompt],
];
$opts = ['json' => true, 'temperature' => $temperature, 'max_tokens' => 2200, 'timeout' => 120];
// Engine dispatch: the GPU helper returns a chat-completions style array, the
// Azure gateway returns the message text directly.
try {
if ($engine === 'gpu') {
$response = dbnToolsCallGpuLlm($messages, $opts);
$deployLabel = 'GPU (cuttlefish)';
$raw = (string)($response['choices'][0]['message']['content'] ?? '');
} elseif ($engine === 'azure_full') {
$raw = $this->azure->withDeployment('gpt-4o')->chatText($messages, $opts);
$deployLabel = 'gpt-4o';
} else {
$raw = $this->azure->chatText($messages, $opts);
$deployLabel = $this->azure->chatDeployment();
}
} catch (Throwable $e) {
// NOTE(review): the underlying exception message is passed through to the
// abort message — confirm it cannot expose internal details to the client.
// Also assumes dbnToolsAbort() never returns; otherwise $raw is undefined below.
dbnToolsAbort('Synthesis LLM request failed: ' . $e->getMessage(), 502, 'llm_error');
}
$json = $this->azure->decodeJsonObject($raw);
if (!is_array($json) || empty($json['brief_markdown'])) {
// Salvage as plain markdown
$json = [
'brief_markdown' => $raw,
'what_we_found' => 'Synthesis returned non-structured output; rendered as raw markdown.',
'what_remains_uncertain' => ['Response format could not be validated as structured JSON.'],
'next_practical_step' => 'Review the brief manually before relying on it.',
];
}
return [
'json' => $json,
'deploy_label' => $deployLabel,
];
}
/**
 * Grade how well the brief is supported, from source count and best score.
 *
 * Each source contributes its reranker score, or its similarity when no
 * reranker score is set. 'high' needs at least 6 sources and a best score
 * of 0.5 or more; 'medium' needs at least 3 sources and 0.35 or more;
 * everything else, including no sources, is 'low'.
 *
 * @param array $sources Numbered source rows.
 * @return string 'high', 'medium' or 'low'.
 */
private function citationConfidence(array $sources): string
{
    if (!$sources) {
        return 'low';
    }

    $scores = [];
    foreach ($sources as $source) {
        $value = $source['reranker_score'] ?? $source['similarity'] ?? null;
        if (is_numeric($value)) {
            $scores[] = $value;
        }
    }

    $best = $scores ? max($scores) : 0;
    $sourceCount = count($sources);

    return match (true) {
        $sourceCount >= 6 && $best >= 0.5 => 'high',
        $sourceCount >= 3 && $best >= 0.35 => 'medium',
        default => 'low',
    };
}
/**
 * Build one entry for the step trace shown to the user.
 *
 * @param string $label  Step name.
 * @param string $detail Human-readable description of what happened.
 * @param string $status Step outcome, e.g. 'complete' or 'warning'.
 * @return array{label: string, detail: string, status: string}
 */
private function trace(string $label, string $detail, string $status = 'complete'): array
{
    return compact('label', 'detail', 'status');
}
/**
 * Whole milliseconds elapsed since a microtime(true) timestamp.
 *
 * @param float $start Start time as returned by microtime(true).
 * @return int Elapsed time in milliseconds, rounded to the nearest integer.
 */
private function elapsedMs(float $start): int
{
    $elapsedSeconds = microtime(true) - $start;

    return (int)round($elapsedSeconds * 1000);
}
}
+189
View File
@@ -487,3 +487,192 @@ function dbnToolsExcerpt(string $text, int $limit = 520): string
}
return rtrim(mb_substr($text, 0, $limit - 1, 'UTF-8')) . '…';
}
// Largest upload accepted for text extraction, in bytes (4 MB).
const DBN_TOOLS_EXTRACT_MAX_BYTES = 4 * 1024 * 1024;
// Extracted text longer than this many characters is truncated.
const DBN_TOOLS_EXTRACT_TEXT_LIMIT = 128000;
// Lower-case file extensions that have an extractor.
const DBN_TOOLS_EXTRACT_ALLOWED_EXTS = ['txt', 'pdf', 'docx'];
/**
 * Validate one uploaded file and extract its plain text.
 *
 * Checks run in this order: PHP upload error code, genuine-upload check,
 * empty file, size limit, allowed extension. The text is then extracted
 * with the extractor matching the extension, trimmed, and cut to
 * DBN_TOOLS_EXTRACT_TEXT_LIMIT characters. Every failed check is reported
 * through dbnToolsAbort().
 *
 * @param array $file One entry in the shape of $_FILES (reads 'error', 'name', 'tmp_name', 'size').
 * @return array{ok: bool, text: string, filename: string, chars: int, truncated: bool}
 */
function dbnToolsExtractUploadedFile(array $file): array
{
    $uploadStatus = (int)($file['error'] ?? UPLOAD_ERR_NO_FILE);
    if ($uploadStatus !== UPLOAD_ERR_OK) {
        $knownFailures = [
            UPLOAD_ERR_INI_SIZE => 'The file exceeds the allowed size limit.',
            UPLOAD_ERR_FORM_SIZE => 'The file exceeds the allowed size limit.',
            UPLOAD_ERR_NO_TMP_DIR => 'No temporary directory is available.',
            UPLOAD_ERR_CANT_WRITE => 'Unable to save the uploaded file.',
        ];
        dbnToolsAbort($knownFailures[$uploadStatus] ?? 'File upload failed.', 422, 'upload_error');
    }

    $clientName = basename((string)($file['name'] ?? ''));
    $tempFile = (string)($file['tmp_name'] ?? '');
    $byteCount = (int)($file['size'] ?? 0);

    // Only accept paths that PHP itself created for an HTTP upload.
    if (!is_uploaded_file($tempFile)) {
        dbnToolsAbort('Invalid file upload.', 400, 'invalid_upload');
    }
    if ($byteCount === 0) {
        dbnToolsAbort('The uploaded file is empty.', 422, 'file_empty');
    }
    if ($byteCount > DBN_TOOLS_EXTRACT_MAX_BYTES) {
        dbnToolsAbort('File exceeds the 4 MB limit.', 413, 'file_too_large');
    }

    $extension = strtolower(pathinfo($clientName, PATHINFO_EXTENSION));
    $isSupported = in_array($extension, DBN_TOOLS_EXTRACT_ALLOWED_EXTS, true);
    if (!$isSupported) {
        dbnToolsAbort('Unsupported file type. Upload a .pdf, .docx, or .txt file.', 422, 'unsupported_type');
    }

    $extracted = trim(match ($extension) {
        'txt' => dbnToolsExtractTxt($tempFile),
        'pdf' => dbnToolsExtractPdf($tempFile),
        'docx' => dbnToolsExtractDocx($tempFile),
    });
    if ($extracted === '') {
        dbnToolsAbort('No text could be extracted from this file.', 422, 'no_text');
    }

    // Cap the text length (in characters, not bytes) and remember whether we cut it.
    $wasCut = mb_strlen($extracted, 'UTF-8') > DBN_TOOLS_EXTRACT_TEXT_LIMIT;
    if ($wasCut) {
        $extracted = mb_substr($extracted, 0, DBN_TOOLS_EXTRACT_TEXT_LIMIT, 'UTF-8');
    }

    return [
        'ok' => true,
        'text' => $extracted,
        'filename' => $clientName,
        'chars' => mb_strlen($extracted, 'UTF-8'),
        'truncated' => $wasCut,
    ];
}
/**
 * Read a plain-text file and return its content as UTF-8.
 *
 * Content that is already valid UTF-8 is kept as-is; anything else is
 * decoded as Windows-1252, which agrees with ISO-8859-1 for bytes
 * 0xA0-0xFF and additionally maps 0x80-0x9F to printable punctuation
 * (curly quotes, dashes, the euro sign). A leading UTF-8 byte-order mark
 * is removed so it does not end up in the extracted text.
 *
 * @param string $path Path of the file to read.
 * @return string UTF-8 text without a leading BOM.
 * @throws DbnToolsHttpException When the file cannot be read.
 */
function dbnToolsExtractTxt(string $path): string
{
    $content = file_get_contents($path);
    if ($content === false) {
        throw new DbnToolsHttpException('Unable to read the file.', 500, 'read_error');
    }

    // Validate instead of guessing: heuristic detection over a candidate list
    // is version-dependent and can pick a single-byte encoding for short input.
    $text = mb_check_encoding($content, 'UTF-8')
        ? $content
        : (string)mb_convert_encoding($content, 'UTF-8', 'Windows-1252');

    // trim() does not remove U+FEFF, so strip the BOM explicitly.
    if (str_starts_with($text, "\xEF\xBB\xBF")) {
        $text = substr($text, 3);
    }

    return $text;
}
/**
 * Extract text from a PDF by shelling out to the `pdftotext` command.
 *
 * The command's standard output is captured as the result; its standard
 * error is discarded.
 *
 * @param string $path Path of the PDF file; shell-escaped before use.
 * @return string The raw text printed by pdftotext.
 * @throws DbnToolsHttpException When the command yields no output or only whitespace.
 */
function dbnToolsExtractPdf(string $path): string
{
    $command = sprintf('pdftotext %s - 2>/dev/null', escapeshellarg($path));
    $extracted = shell_exec($command);

    if (is_string($extracted) && trim($extracted) !== '') {
        return $extracted;
    }

    throw new DbnToolsHttpException(
        'PDF text extraction failed. The file may be image-only or encrypted.',
        422,
        'pdf_extract_failed'
    );
}
/**
 * Extract the plain text of a .docx file.
 *
 * Opens the file as a ZIP archive, parses `word/document.xml`, and returns
 * one line per `w:p` paragraph. Within a paragraph the text of every `w:t`
 * element is concatenated; tabs (`w:tab`) and line breaks (`w:br`, `w:cr`)
 * inside runs are emitted as "\t" and "\n" so adjacent words are not glued
 * together.
 *
 * If the XML cannot be parsed, an empty string is returned so the caller
 * can report that no text was found.
 *
 * @param string $path Path of the .docx file.
 * @return string Paragraph texts joined with "\n" (may be empty).
 * @throws DbnToolsHttpException When the archive cannot be opened, has no
 *                               usable document part, or the part is too large.
 */
function dbnToolsExtractDocx(string $path): string
{
    // Upper bound for the *uncompressed* document part. Uploads are small,
    // but a crafted archive can inflate to far more than the memory limit.
    $maxXmlBytes = 64 * 1024 * 1024;
    $partName = 'word/document.xml';

    $zip = new ZipArchive();
    if ($zip->open($path) !== true) {
        throw new DbnToolsHttpException('Unable to open the .docx file.', 422, 'docx_open_failed');
    }
    try {
        $stat = $zip->statName($partName);
        if (is_array($stat) && (int)($stat['size'] ?? 0) > $maxXmlBytes) {
            throw new DbnToolsHttpException(
                'The .docx document content is too large to process.',
                422,
                'docx_too_large'
            );
        }
        $xml = $zip->getFromName($partName);
    } finally {
        // Always release the archive handle, including on the error paths.
        $zip->close();
    }
    // loadXML() throws a ValueError on an empty string, so treat it as "no content".
    if ($xml === false || $xml === '') {
        throw new DbnToolsHttpException('No document content found in this .docx file.', 422, 'docx_no_content');
    }

    $doc = new DOMDocument();
    // Silence parser warnings for this parse only, then restore the
    // process-wide libxml setting so other code is not affected.
    $previousSetting = libxml_use_internal_errors(true);
    try {
        $loaded = $doc->loadXML($xml, LIBXML_NONET);
    } finally {
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);
    }
    if ($loaded === false) {
        return '';
    }

    $xpath = new DOMXPath($doc);
    $xpath->registerNamespace('w', 'http://schemas.openxmlformats.org/wordprocessingml/2006/main');

    $paragraphs = [];
    foreach ($xpath->query('//w:p') as $para) {
        $pieces = [];
        // Tabs and breaks are matched only as direct children of a run (w:r),
        // which excludes the tab-stop definitions under w:pPr/w:tabs.
        foreach ($xpath->query('.//w:t | .//w:r/w:tab | .//w:r/w:br | .//w:r/w:cr', $para) as $node) {
            $pieces[] = match ($node->localName) {
                'tab' => "\t",
                'br', 'cr' => "\n",
                default => $node->textContent,
            };
        }
        $paragraphs[] = implode('', $pieces);
    }

    return implode("\n", $paragraphs);
}
/**
 * Send a chat-completion request to the GPU-hosted LiteLLM endpoint.
 *
 * Uses cURL when available and falls back to an HTTP stream context.
 *
 * Recognised $options keys:
 *  - 'model'       (string) model name, default 'qwen2.5:14b'
 *  - 'timeout'     (int)    total request timeout in seconds, default 90
 *  - 'temperature' (number) default 0.1
 *  - 'max_tokens'  (int)    default 8000
 *  - 'json'        (bool)   when truthy, requests a JSON-object response format
 *
 * @param array $messages Chat messages, sent unchanged as the "messages" field.
 * @param array $options  See the list above.
 * @return array The decoded JSON response body.
 * @throws RuntimeException When the API key is not configured, the payload
 *                          cannot be encoded, the transport fails, the HTTP
 *                          status is not 2xx, or the body is not JSON.
 */
function dbnToolsCallGpuLlm(array $messages, array $options = []): array
{
    $url = 'http://10.0.1.10:4000/v1/chat/completions';

    // The credential must come from the environment. Never fall back to a key
    // embedded in source: anything committed to the repository is compromised.
    $apiKey = trim((string)dbnToolsEnv('LITELLM_MASTER_KEY'));
    if ($apiKey === '') {
        throw new RuntimeException('GPU LiteLLM is not configured: LITELLM_MASTER_KEY is not set.');
    }

    $model = (string)($options['model'] ?? 'qwen2.5:14b');
    $timeout = (int)($options['timeout'] ?? 90);

    $payload = [
        'model' => $model,
        'messages' => $messages,
        'temperature' => $options['temperature'] ?? 0.1,
        'max_tokens' => $options['max_tokens'] ?? 8000,
    ];
    if (!empty($options['json'])) {
        $payload['response_format'] = ['type' => 'json_object'];
    }

    // Extracted document text may contain invalid UTF-8; substitute U+FFFD
    // rather than letting the whole encode fail.
    $body = json_encode(
        $payload,
        JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_SUBSTITUTE
    );
    if ($body === false) {
        throw new RuntimeException('GPU LiteLLM request could not be encoded: ' . json_last_error_msg());
    }

    $headers = [
        'Content-Type: application/json',
        'Authorization: Bearer ' . $apiKey,
    ];

    if (function_exists('curl_init')) {
        $ch = curl_init($url);
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => $body,
            CURLOPT_HTTPHEADER => $headers,
            // Fail fast when the host is unreachable instead of waiting out
            // the full generation timeout.
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_TIMEOUT => $timeout,
        ]);
        $response = curl_exec($ch);
        $code = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
        $err = curl_error($ch);
        curl_close($ch);
        if ($response === false) {
            throw new RuntimeException('GPU LiteLLM request failed: ' . $err);
        }
    } else {
        $ctx = stream_context_create(['http' => [
            'method' => 'POST',
            'header' => implode("\r\n", $headers),
            'content' => $body,
            'timeout' => $timeout,
            // Return the body of 4xx/5xx responses so the error message can be read.
            'ignore_errors' => true,
        ]]);
        $response = @file_get_contents($url, false, $ctx);
        $code = 0;
        // Also matches status lines without a reason phrase, e.g. "HTTP/2 200".
        if (isset($http_response_header[0])
            && preg_match('/^HTTP\/\S+\s+(\d{3})\b/', $http_response_header[0], $m)) {
            $code = (int)$m[1];
        }
        if ($response === false) {
            throw new RuntimeException('GPU LiteLLM request failed.');
        }
    }

    $decoded = json_decode((string)$response, true);

    // Check the status first so a non-JSON error page (e.g. from a proxy)
    // is still reported with its HTTP code.
    if ($code < 200 || $code >= 300) {
        $detail = is_array($decoded) ? ($decoded['error']['message'] ?? null) : null;
        if (!is_string($detail) || $detail === '') {
            $detail = 'HTTP ' . $code;
        }
        throw new RuntimeException('GPU LiteLLM error: ' . $detail);
    }
    if (!is_array($decoded)) {
        throw new RuntimeException('GPU LiteLLM returned non-JSON response.');
    }

    return $decoded;
}
+7 -6
View File
@@ -9,12 +9,13 @@ if (!dbnToolsIsAuthenticated()) {
}
$navItems = [
'ask' => ['Ask', 'Source-grounded'],
'search' => ['Search', 'Legal sources'],
'summarize' => ['Summarize', 'Pasted text'],
'timeline' => ['Timeline', 'Events'],
'redact' => ['Redact', 'Privacy'],
'transcribe' => ['Transcribe', 'Audio'],
'ask' => ['Ask', 'Source-grounded'],
'search' => ['Search', 'Legal sources'],
'deep-research' => ['Deep research', 'Agent + RAG'],
'summarize' => ['Summarize', 'Pasted text'],
'timeline' => ['Timeline', 'Events'],
'redact' => ['Redact', 'Privacy'],
'transcribe' => ['Transcribe', 'Audio'],
];
$toolName = $toolName ?? 'ask';
$toolTitle = $toolTitle ?? 'Legal Tools';
+3
View File
@@ -18,5 +18,8 @@
</section><!-- /workspace -->
</main><!-- /appShell -->
<script src="assets/js/tools.js" defer></script>
<?php if (!empty($extraScripts) && is_array($extraScripts)): foreach ($extraScripts as $extraScript): ?>
<script src="<?= htmlspecialchars((string)$extraScript) ?>" defer></script>
<?php endforeach; endif; ?>
</body>
</html>
+6 -1
View File
@@ -91,7 +91,7 @@ if (dbnToolsIsAuthenticated()) {
<section class="cap-section">
<div class="section-inner">
<h2 class="section-heading">Six tools, one suite</h2>
<h2 class="section-heading">Seven tools, one suite</h2>
<div class="cap-grid">
<div class="cap-card">
<span class="cap-label">Ask</span>
@@ -103,6 +103,11 @@ if (dbnToolsIsAuthenticated()) {
<h3>Search</h3>
<p>Retrieve up to seven relevant legal sources with titles, sections, and excerpts.</p>
</div>
<div class="cap-card">
<span class="cap-label">Deep research</span>
<h3>Deep research</h3>
<p>Upload a case file or paste a question. An agent expands it into 3–5 angles, runs hybrid rank/rerank RAG across the corpus + your upload, and returns a cited brief.</p>
</div>
<div class="cap-card">
<span class="cap-label">Summarize</span>
<h3>Summarize</h3>