diff --git a/api/deep-research.php b/api/deep-research.php new file mode 100644 index 0000000..b54366a --- /dev/null +++ b/api/deep-research.php @@ -0,0 +1,67 @@ + 5) { + dbnToolsAbort('At most 5 files can be uploaded per request.', 413, 'too_many_files'); + } + for ($i = 0; $i < $count; $i++) { + $file = [ + 'name' => $_FILES['files']['name'][$i] ?? '', + 'type' => $_FILES['files']['type'][$i] ?? '', + 'tmp_name' => $_FILES['files']['tmp_name'][$i] ?? '', + 'error' => $_FILES['files']['error'][$i] ?? UPLOAD_ERR_NO_FILE, + 'size' => $_FILES['files']['size'][$i] ?? 0, + ]; + $extracted = dbnToolsExtractUploadedFile($file); + $uploadedFiles[] = [ + 'filename' => $extracted['filename'], + 'text' => $extracted['text'], + 'chars' => $extracted['chars'], + 'truncated' => $extracted['truncated'], + ]; + } + } + + return (new DbnDeepResearchAgent())->run( + $seedQuery, + $pastedText, + $uploadedFiles, + is_array($sliceInput) ? $sliceInput : [], + $engine, + $language, + $controls + ); +}); diff --git a/api/extract.php b/api/extract.php index b3cd7e7..613ffdc 100644 --- a/api/extract.php +++ b/api/extract.php @@ -6,132 +6,16 @@ require_once __DIR__ . '/../includes/bootstrap.php'; dbnToolsRequireMethod('POST'); dbnToolsRequireAuth(); -const EXTRACT_MAX_BYTES = 4 * 1024 * 1024; -const EXTRACT_TEXT_LIMIT = 128000; -const EXTRACT_ALLOWED_EXTS = ['txt', 'pdf', 'docx']; - try { if (empty($_FILES['file']) || !is_array($_FILES['file'])) { dbnToolsError('No file was uploaded.', 422, 'missing_file'); } - $file = $_FILES['file']; - $errCode = (int)($file['error'] ?? UPLOAD_ERR_NO_FILE); - - if ($errCode !== UPLOAD_ERR_OK) { - $msg = match ($errCode) { - UPLOAD_ERR_INI_SIZE, UPLOAD_ERR_FORM_SIZE => 'The file exceeds the allowed size limit.', - UPLOAD_ERR_NO_TMP_DIR => 'No temporary directory is available.', - UPLOAD_ERR_CANT_WRITE => 'Unable to save the uploaded file.', - default => 'File upload failed.', - }; - dbnToolsError($msg, 422, 'upload_error'); - } - - $originalName = basename((string)($file['name'] ?? '')); - $tmpPath = (string)($file['tmp_name'] ?? ''); - $size = (int)($file['size'] ?? 0); - - if (!is_uploaded_file($tmpPath)) { - dbnToolsError('Invalid file upload.', 400, 'invalid_upload'); - } - if ($size === 0) { - dbnToolsError('The uploaded file is empty.', 422, 'file_empty'); - } - if ($size > EXTRACT_MAX_BYTES) { - dbnToolsError('File exceeds the 4 MB limit.', 413, 'file_too_large'); - } - - $ext = strtolower(pathinfo($originalName, PATHINFO_EXTENSION)); - if (!in_array($ext, EXTRACT_ALLOWED_EXTS, true)) { - dbnToolsError('Unsupported file type. Upload a .pdf, .docx, or .txt file.', 422, 'unsupported_type'); - } - - $text = match ($ext) { - 'txt' => extractTxt($tmpPath), - 'pdf' => extractPdf($tmpPath), - 'docx' => extractDocx($tmpPath), - }; - - $text = trim($text); - if ($text === '') { - dbnToolsError('No text could be extracted from this file.', 422, 'no_text'); - } - - $truncated = false; - if (mb_strlen($text, 'UTF-8') > EXTRACT_TEXT_LIMIT) { - $text = mb_substr($text, 0, EXTRACT_TEXT_LIMIT, 'UTF-8'); - $truncated = true; - } - - dbnToolsRespond([ - 'ok' => true, - 'text' => $text, - 'filename' => $originalName, - 'chars' => mb_strlen($text, 'UTF-8'), - 'truncated' => $truncated, - ]); + $result = dbnToolsExtractUploadedFile($_FILES['file']); + dbnToolsRespond($result); } catch (DbnToolsHttpException $e) { dbnToolsError($e->getMessage(), $e->status, $e->errorCode, $e->extra); } catch (Throwable $e) { error_log('DBN extract error: ' . $e->getMessage()); dbnToolsError('Text extraction failed.', 500, 'extract_error'); } - -function extractTxt(string $path): string -{ - $content = file_get_contents($path); - if ($content === false) { - throw new DbnToolsHttpException('Unable to read the file.', 500, 'read_error'); - } - return mb_convert_encoding($content, 'UTF-8', 'UTF-8, ISO-8859-1, Windows-1252'); -} - -function extractPdf(string $path): string -{ - $cmd = 'pdftotext ' . escapeshellarg($path) . ' - 2>/dev/null'; - $output = shell_exec($cmd); - if ($output === null || $output === false || trim($output) === '') { - throw new DbnToolsHttpException( - 'PDF text extraction failed. The file may be image-only or encrypted.', - 422, - 'pdf_extract_failed' - ); - } - return $output; -} - -function extractDocx(string $path): string -{ - $zip = new ZipArchive(); - $result = $zip->open($path); - if ($result !== true) { - throw new DbnToolsHttpException('Unable to open the .docx file.', 422, 'docx_open_failed'); - } - - $xml = $zip->getFromName('word/document.xml'); - $zip->close(); - - if ($xml === false) { - throw new DbnToolsHttpException('No document content found in this .docx file.', 422, 'docx_no_content'); - } - - $doc = new DOMDocument(); - libxml_use_internal_errors(true); - $doc->loadXML($xml); - libxml_clear_errors(); - - $xpath = new DOMXPath($doc); - $xpath->registerNamespace('w', 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'); - - $paragraphs = []; - foreach ($xpath->query('//w:p') as $para) { - $runs = []; - foreach ($xpath->query('.//w:t', $para) as $t) { - $runs[] = $t->textContent; - } - $paragraphs[] = implode('', $runs); - } - - return implode("\n", $paragraphs); -} diff --git a/assets/css/tools.css b/assets/css/tools.css index e3536ac..cc1902f 100644 --- a/assets/css/tools.css +++ b/assets/css/tools.css @@ -1701,3 +1701,478 @@ p { font-weight: 500; margin: 0; } + +/* ========================================================================= + Deep Research — agent + rank/rerank RAG surface + ========================================================================= */ + +.deep-research .lang-switcher { + display: inline-flex; + gap: 6px; +} + +.deep-research .lang-btn { + padding: 6px 10px; + border-radius: 999px; + background: #fff; + border: 1px solid var(--line); + color: var(--muted); + font-weight: 700; +} + +.deep-research .lang-btn.is-active { + background: var(--soft-teal); + color: var(--teal-dark); + border-color: rgba(15, 118, 110, 0.30); +} + +.dr-slice-section { + display: grid; + gap: 8px; +} + +.dr-slice-grid { + display: grid; + grid-template-columns: repeat(2, minmax(0, 1fr)); + gap: 10px; +} + +.dr-slice { + text-align: left; + background: #fbfcfe; + border: 1px solid var(--line); + border-radius: 8px; + padding: 12px 13px; + cursor: pointer; + min-height: 96px; + display: grid; + gap: 6px; + align-content: start; + transition: border-color 120ms ease, background 120ms ease; +} + +.dr-slice:hover { + border-color: rgba(15, 118, 110, 0.30); +} + +.dr-slice.is-on { + background: var(--soft-teal); + border-color: rgba(15, 118, 110, 0.45); +} + +.dr-slice__head { + display: flex; + align-items: center; + justify-content: space-between; + gap: 8px; +} + +.dr-slice__title { + font-weight: 800; + color: var(--ink); +} + +.dr-slice__badge { + background: #fff; + border: 1px solid var(--line); + border-radius: 999px; + color: var(--muted); + font-size: 0.66rem; + font-weight: 800; + letter-spacing: 0.06em; + padding: 3px 8px; + text-transform: uppercase; +} + +.dr-slice.is-on .dr-slice__badge { + background: var(--teal); + border-color: var(--teal); + color: #fff; +} + +.dr-slice__tagline { + margin: 0; + color: var(--muted); + font-size: 0.86rem; + line-height: 1.4; +} + +.advanced-panel .dr-control-grid { + display: grid; + grid-template-columns: repeat(5, minmax(0, 1fr)); + gap: 8px; + margin-top: 10px; +} + +.dr-control-card { + background: #fbfcfe; + border: 1px solid var(--line); + border-radius: 8px; + padding: 10px; +} + +.dr-control-card label { + display: flex; + justify-content: space-between; + gap: 8px; + align-items: center; + font-weight: 800; + color: var(--ink); + font-size: 0.85rem; +} + +.dr-control-card small { + display: block; + margin-top: 8px; + color: var(--muted); + font-size: 0.74rem; + line-height: 1.4; +} + +.dr-control-card input[type="range"] { + width: 100%; + margin-top: 8px; + accent-color: var(--teal); +} + +.dr-control-value { + color: var(--coral); + font-variant-numeric: tabular-nums; +} + +@media (max-width: 980px) { + .advanced-panel .dr-control-grid { + grid-template-columns: repeat(2, minmax(0, 1fr)); + } + .dr-slice-grid { + grid-template-columns: 1fr; + } +} + +.deep-research-results { + display: grid; + gap: 14px; +} + +.dr-result-block { + border: 1px solid var(--line); + border-radius: 8px; + padding: 16px; + background: #fff; +} + +.dr-brief { + line-height: 1.65; + color: var(--ink); + font-size: 1.0rem; +} + +.dr-brief p { + margin: 0 0 12px; +} + +.dr-brief code { + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; + background: var(--soft-teal); + padding: 1px 5px; + border-radius: 4px; + font-size: 0.86em; +} + +.dr-brief strong { color: var(--ink); } +.dr-brief em { color: var(--muted); } + +.dr-cite { + display: inline-flex; + align-items: center; + justify-content: center; + min-width: 18px; + height: 18px; + margin: 0 1px; + padding: 0 5px; + border-radius: 999px; + background: var(--soft-coral); + color: var(--coral); + font-size: 0.72rem; + font-weight: 800; + font-variant-numeric: tabular-nums; + cursor: pointer; + border: 1px solid rgba(194, 65, 12, 0.25); + vertical-align: 1px; +} + +.dr-cite:hover { background: var(--coral); color: #fff; } + +.dr-sources-head { + display: flex; + align-items: baseline; + justify-content: space-between; + margin-bottom: 10px; +} + +.dr-sources-head h3 { + margin: 0; + font-size: 1rem; +} + +.dr-sources-head small { + color: var(--muted); + font-size: 0.82rem; +} + +.dr-source-list { + display: grid; + gap: 10px; +} + +.dr-source-card { + display: grid; + grid-template-columns: 34px 1fr auto; + gap: 12px; + align-items: start; + border: 1px solid var(--line); + border-radius: 8px; + padding: 12px; + background: #fbfcfe; + cursor: pointer; + text-align: left; + width: 100%; +} + +.dr-source-card:hover { border-color: rgba(15, 118, 110, 0.40); } + +.dr-source-card.is-highlight { + border-color: var(--coral); + background: var(--soft-coral); +} + +.dr-source-number { + display: inline-flex; + align-items: center; + justify-content: center; + width: 28px; + height: 28px; + border-radius: 999px; + background: var(--soft-coral); + color: var(--coral); + font-weight: 900; + font-variant-numeric: tabular-nums; +} + +.dr-source-body { + min-width: 0; +} + +.dr-source-title { + font-weight: 800; + color: var(--ink); + line-height: 1.35; +} + +.dr-source-meta { + display: flex; + flex-wrap: wrap; + gap: 6px; + margin-top: 6px; +} + +.dr-source-tag { + background: var(--soft-teal); + color: var(--teal-dark); + border-radius: 999px; + font-size: 0.7rem; + font-weight: 800; + padding: 3px 8px; + text-transform: uppercase; +} + +.dr-source-tag--upload { background: #fff0e8; color: #8a4524; } +.dr-source-tag--score { background: #eef3fb; color: #314158; } + +.dr-source-excerpt { + color: var(--muted); + margin-top: 8px; + line-height: 1.5; + font-size: 0.92rem; +} + +.dr-source-aside { + align-self: stretch; + display: grid; + grid-template-rows: auto auto; + gap: 6px; + font-size: 0.78rem; + color: var(--muted); + text-align: right; + min-width: 90px; +} + +.dr-source-aside b { + color: var(--ink); + font-variant-numeric: tabular-nums; + font-size: 0.92rem; +} + +/* Method trace — overrides for #traceList rendered in rich mode */ +.trace-list.is-rich { + display: grid; + gap: 8px; +} + +.trace-list.is-rich .trace-step { + display: grid; + grid-template-columns: 28px 1fr; + gap: 10px; + align-items: start; + padding: 10px 12px; + border: 1px solid var(--line); + border-radius: 8px; + background: #fbfcfe; + list-style: none; +} + +.trace-list.is-rich .trace-step__marker { + display: inline-flex; + align-items: center; + justify-content: center; + width: 22px; + height: 22px; + border-radius: 999px; + border: 1px solid var(--line); + background: #fff; + color: var(--muted); + font-size: 0.72rem; + font-weight: 900; + font-variant-numeric: tabular-nums; +} + +.trace-list.is-rich .trace-step__label { + display: block; + font-weight: 800; + color: var(--ink); + font-size: 0.94rem; +} + +.trace-list.is-rich .trace-step__detail { + display: block; + margin-top: 4px; + color: var(--muted); + font-size: 0.83rem; + line-height: 1.45; +} + +.trace-list.is-rich .trace-step.is-running { + background: var(--soft-coral); +} +.trace-list.is-rich .trace-step.is-running .trace-step__marker { + background: rgba(194, 65, 12, 0.18); + border-color: rgba(194, 65, 12, 0.35); + color: var(--coral); + animation: drTracePulse 950ms ease-in-out infinite; +} + +.trace-list.is-rich .trace-step.is-done .trace-step__marker { + background: var(--soft-teal); + border-color: rgba(15, 118, 110, 0.30); + color: var(--teal-dark); +} + +.trace-list.is-rich .trace-step.is-warning .trace-step__marker { + background: #fff4dc; + border-color: rgba(183, 121, 31, 0.35); + color: var(--amber); +} + +.trace-list.is-rich .trace-step.is-error { + background: #fff0e8; +} +.trace-list.is-rich .trace-step.is-error .trace-step__marker { + background: rgba(180, 30, 30, 0.10); + border-color: rgba(180, 30, 30, 0.30); + color: #b41e1e; +} + +@keyframes drTracePulse { + 0%, 100% { opacity: 0.55; transform: scale(0.92); } + 50% { opacity: 1; transform: scale(1.04); } +} + +/* Source modal */ +.dr-source-modal { + position: fixed; + inset: 0; + background: rgba(23, 32, 51, 0.62); + display: flex; + align-items: center; + justify-content: center; + padding: 24px; + z-index: 9999; +} + +.dr-source-modal__dialog { + width: min(960px, 100%); + max-height: 90vh; + background: #fff; + border-radius: 8px; + box-shadow: 0 28px 92px rgba(0, 0, 0, 0.34); + overflow: hidden; + display: grid; + grid-template-rows: auto 1fr; +} + +.dr-source-modal__head { + display: flex; + align-items: start; + justify-content: space-between; + gap: 14px; + padding: 16px 18px; + border-bottom: 1px solid var(--line); +} + +.dr-source-modal__head h3 { + margin: 0; + color: var(--ink); + line-height: 1.25; + font-size: 1.2rem; +} + +.dr-source-modal__body { + display: grid; + grid-template-columns: 260px minmax(0, 1fr); + overflow: hidden; +} + +.dr-source-modal__meta, +.dr-source-modal__text { + padding: 16px 18px; + overflow: auto; +} + +.dr-source-modal__meta { + border-right: 1px solid var(--line); + background: #fbfcfe; + color: var(--muted); + font-size: 0.88rem; + line-height: 1.55; +} + +.dr-source-modal__meta dt { + color: var(--ink); + font-weight: 800; + margin-top: 8px; +} + +.dr-source-modal__meta dt:first-of-type { margin-top: 0; } + +.dr-source-modal__text { + white-space: pre-wrap; + line-height: 1.7; + color: var(--ink); +} + +@media (max-width: 720px) { + .dr-source-modal__body { grid-template-columns: 1fr; } + .dr-source-modal__meta { border-right: 0; border-bottom: 1px solid var(--line); } + .dr-source-card { grid-template-columns: 32px 1fr; } + .dr-source-aside { display: none; } +} diff --git a/assets/js/deep-research.js b/assets/js/deep-research.js new file mode 100644 index 0000000..fbbb32e --- /dev/null +++ b/assets/js/deep-research.js @@ -0,0 +1,481 @@ +/* deep-research.js — page-scoped UI for /deep-research.php */ +(function () { + 'use strict'; + + const els = {}; + let lang = 'en'; + let uploadFiles = []; + let lastResult = null; + + const SLICE_DEFS = [ + { id: 'family_core', label: 'Family Law Core' }, + { id: 'child_welfare', label: 'Child Welfare' }, + { id: 'echr_hague', label: 'ECHR and Hague' }, + { id: 'broader_legal', label: 'Broader Legal Support' }, + ]; + + const STEP_LABELS = [ + 'Query interpretation', + 'Query expansion', + 'Slice resolution', + 'Upload indexing', + 'Retrieval', + 'Synthesis', + 'Citation confidence', + ]; + + document.addEventListener('DOMContentLoaded', () => { + if (!document.body.dataset.activeTool || document.body.dataset.activeTool !== 'deep-research') return; + + Object.assign(els, { + form: document.getElementById('deepResearchForm'), + input: document.getElementById('drInput'), + status: document.getElementById('drStatus'), + runButton: document.getElementById('drRunButton'), + results: document.getElementById('drResults'), + traceList: document.getElementById('traceList'), + slices: Array.from(document.querySelectorAll('.dr-slice')), + langButtons: Array.from(document.querySelectorAll('#drLangSwitcher .lang-btn')), + engineRadios: Array.from(document.querySelectorAll('input[name="drEngine"]')), + subQ: document.getElementById('drSubQ'), + subQVal: document.getElementById('drSubQValue'), + chunkLimit: document.getElementById('drChunkLimit'), + chunkLimitVal: document.getElementById('drChunkLimitValue'), + sim: document.getElementById('drSim'), + simVal: document.getElementById('drSimValue'), + topK: document.getElementById('drTopK'), + topKVal: document.getElementById('drTopKValue'), + temp: document.getElementById('drTemp'), + tempVal: document.getElementById('drTempValue'), + uploadZone: document.getElementById('drUploadZone'), + uploadInput: document.getElementById('drUploadInput'), + uploadPrompt: document.getElementById('drUploadPrompt'), + uploadFileInfo: document.getElementById('drUploadFileInfo'), + uploadFileList: document.getElementById('drUploadFileList'), + uploadClear: document.getElementById('drUploadClear'), + modal: document.getElementById('drSourceModal'), + modalClose: document.getElementById('drSourceModalClose'), + modalTitle: document.getElementById('drSourceModalTitle'), + modalEyebrow: document.getElementById('drSourceModalEyebrow'), + modalMeta: document.getElementById('drSourceModalMeta'), + modalText: document.getElementById('drSourceModalText'), + }); + + if (!els.form) return; + + bindSlices(); + bindLang(); + bindRanges(); + bindUpload(); + bindModal(); + els.form.addEventListener('submit', onSubmit); + + // Pre-render placeholder trace + renderTrace(STEP_LABELS.map((label) => ({ label, detail: 'Waiting…', status: 'idle' }))); + }); + + function bindSlices() { + els.slices.forEach((btn) => { + btn.addEventListener('click', () => { + const isOn = btn.classList.toggle('is-on'); + btn.setAttribute('aria-pressed', isOn ? 'true' : 'false'); + const badge = btn.querySelector('.dr-slice__badge'); + if (badge) badge.textContent = isOn ? 'on' : 'off'; + }); + }); + } + + function bindLang() { + els.langButtons.forEach((b) => { + b.addEventListener('click', () => { + els.langButtons.forEach((x) => x.classList.remove('is-active')); + b.classList.add('is-active'); + lang = b.dataset.lang || 'en'; + }); + }); + } + + function bindRanges() { + const pairs = [ + [els.subQ, els.subQVal, (v) => v], + [els.chunkLimit, els.chunkLimitVal, (v) => v], + [els.sim, els.simVal, (v) => Number(v).toFixed(2)], + [els.topK, els.topKVal, (v) => v], + [els.temp, els.tempVal, (v) => Number(v).toFixed(2)], + ]; + pairs.forEach(([range, label, fmt]) => { + if (!range || !label) return; + const sync = () => { label.textContent = fmt(range.value); }; + range.addEventListener('input', sync); + sync(); + }); + } + + function bindUpload() { + if (!els.uploadZone) return; + const onFiles = (fileList) => { + const files = Array.from(fileList || []).slice(0, 5); + if (uploadFiles.length + files.length > 5) { + setStatus('At most 5 files can be uploaded per request.', 'error'); + return; + } + files.forEach((f) => { + if (f.size > 4 * 1024 * 1024) { + setStatus(`${f.name} exceeds the 4 MB limit.`, 'error'); + return; + } + const ext = (f.name.split('.').pop() || '').toLowerCase(); + if (!['pdf', 'docx', 'txt'].includes(ext)) { + setStatus(`${f.name} is not a supported file type.`, 'error'); + return; + } + uploadFiles.push(f); + }); + renderUploadList(); + }; + els.uploadInput.addEventListener('change', (e) => onFiles(e.target.files)); + els.uploadZone.addEventListener('dragover', (e) => { e.preventDefault(); els.uploadZone.classList.add('is-drop'); }); + els.uploadZone.addEventListener('dragleave', () => els.uploadZone.classList.remove('is-drop')); + els.uploadZone.addEventListener('drop', (e) => { + e.preventDefault(); + els.uploadZone.classList.remove('is-drop'); + onFiles(e.dataTransfer?.files); + }); + els.uploadClear?.addEventListener('click', () => { + uploadFiles = []; + els.uploadInput.value = ''; + renderUploadList(); + }); + } + + function renderUploadList() { + if (!uploadFiles.length) { + els.uploadFileInfo.classList.add('is-hidden'); + els.uploadPrompt.classList.remove('is-hidden'); + return; + } + els.uploadPrompt.classList.add('is-hidden'); + els.uploadFileInfo.classList.remove('is-hidden'); + els.uploadFileList.innerHTML = uploadFiles.map((f, i) => { + const kb = (f.size / 1024).toFixed(0); + return `
  • ${escapeHtml(f.name)}${kb} KB
  • `; + }).join(''); + } + + function bindModal() { + els.modalClose?.addEventListener('click', closeModal); + els.modal?.addEventListener('click', (e) => { + if (e.target === els.modal) closeModal(); + }); + document.addEventListener('keydown', (e) => { + if (e.key === 'Escape' && els.modal && !els.modal.classList.contains('is-hidden')) closeModal(); + }); + } + + function closeModal() { + els.modal?.classList.add('is-hidden'); + } + + function openModal(source) { + if (!source) return; + els.modalEyebrow.textContent = source.source_origin === 'upload' ? 'Uploaded file' : 'Corpus source'; + els.modalTitle.textContent = source.title || 'Source'; + const metaRows = [ + ['Number', `[${source.n}]`], + source.section ? ['Section', source.section] : null, + ['Corpus / package', source.package_or_corpus || '—'], + source.authority_type ? ['Authority', source.authority_type] : null, + source.jurisdiction ? ['Jurisdiction', source.jurisdiction] : null, + source.similarity != null ? ['Similarity', String(source.similarity)] : null, + source.reranker_score != null ? ['Rerank score', String(source.reranker_score)] : null, + source.matched_sub_questions?.length ? ['Matched sub-Q', source.matched_sub_questions.join(', ')] : null, + ].filter(Boolean); + els.modalMeta.innerHTML = '
    ' + metaRows.map(([k, v]) => `
    ${escapeHtml(k)}
    ${escapeHtml(String(v))}
    `).join('') + '
    '; + els.modalText.textContent = source.chunk_text || source.excerpt || ''; + els.modal.classList.remove('is-hidden'); + } + + function getSelectedSlices() { + const out = {}; + SLICE_DEFS.forEach((s) => { + const btn = els.slices.find((b) => b.dataset.slice === s.id); + out[s.id] = !!(btn && btn.classList.contains('is-on')); + }); + return out; + } + + function getEngine() { + const checked = els.engineRadios.find((r) => r.checked); + return checked ? checked.value : 'azure_mini'; + } + + function getControls() { + return { + sub_q_count: parseInt(els.subQ.value, 10), + chunk_limit: parseInt(els.chunkLimit.value, 10), + similarity_threshold: parseFloat(els.sim.value), + reranker_top_k: parseInt(els.topK.value, 10), + temperature: parseFloat(els.temp.value), + }; + } + + async function onSubmit(e) { + e.preventDefault(); + const query = (els.input.value || '').trim(); + if (!query && uploadFiles.length === 0) { + setStatus('Type a question or upload a file before running deep research.', 'error'); + return; + } + const slices = getSelectedSlices(); + if (!Object.values(slices).some(Boolean)) { + setStatus('Enable at least one corpus slice.', 'error'); + return; + } + + setStatus('Running deep research…', 'busy'); + els.runButton.disabled = true; + els.results.innerHTML = `

    Working…

    The agent is expanding the question, retrieving from the corpus, and synthesising the brief. This usually takes 6–15 seconds.

    `; + + // Render placeholder trace with first step running + const placeholder = STEP_LABELS.map((label, i) => ({ + label, + detail: i === 0 ? 'Running…' : 'Queued', + status: i === 0 ? 'running' : 'idle', + })); + renderTrace(placeholder); + + const payload = { + query, + paste_text: '', + slices, + engine: getEngine(), + language: lang, + controls: getControls(), + }; + + let response; + try { + if (uploadFiles.length > 0) { + const form = new FormData(); + form.append('payload', JSON.stringify(payload)); + uploadFiles.forEach((f) => form.append('files[]', f)); + response = await fetch('api/deep-research.php', { method: 'POST', body: form, credentials: 'same-origin' }); + } else { + response = await fetch('api/deep-research.php', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(payload), + credentials: 'same-origin', + }); + } + } catch (err) { + setStatus(`Network error: ${err.message || err}`, 'error'); + els.runButton.disabled = false; + return; + } + + let data = null; + try { data = await response.json(); } catch (_) {} + + if (!response.ok || !data || data.ok === false) { + const msg = (data && data.error && data.error.message) || `Request failed (${response.status}).`; + setStatus(msg, 'error'); + els.runButton.disabled = false; + renderTrace(placeholder.map((s, i) => i === 0 ? { ...s, status: 'error', detail: msg } : s)); + return; + } + + lastResult = data; + setStatus(`Done in ${data.latency_ms || 0} ms · ${data.trace_metadata?.source_count || 0} sources · confidence ${data.trace_metadata?.citation_confidence || '?'}`, 'ok'); + els.runButton.disabled = false; + renderTrace(data.trace || []); + renderResults(data); + } + + function setStatus(message, kind) { + els.status.textContent = message; + els.status.style.color = kind === 'error' ? '#b41e1e' : (kind === 'ok' ? 'var(--teal-dark)' : 'var(--muted)'); + } + + function renderTrace(steps) { + if (!els.traceList) return; + els.traceList.classList.add('is-rich'); + els.traceList.innerHTML = steps.map((step, i) => { + const statusClass = step.status === 'running' ? 'is-running' + : step.status === 'complete' ? 'is-done' + : step.status === 'warning' ? 'is-warning' + : step.status === 'error' ? 'is-error' + : ''; + const marker = step.status === 'complete' ? '✓' + : step.status === 'warning' ? '!' + : step.status === 'error' ? '×' + : (i + 1); + return `
  • + ${marker} +
    + ${escapeHtml(step.label || '')} + ${escapeHtml(step.detail || '')} +
    +
  • `; + }).join(''); + } + + function renderResults(data) { + const sources = data.sources || []; + const subs = data.sub_questions || []; + + const briefHtml = renderBrief(data.brief_markdown || '', sources); + + const subQHtml = subs.length ? ` +
    +

    Angles the agent explored

    +
      + ${subs.map((sq) => `
    1. ${escapeHtml(sq.question)}${sq.rationale ? `
      ${escapeHtml(sq.rationale)}` : ''}
    2. `).join('')} +
    +
    ` : ''; + + const sourcesHtml = ` +
    +
    +

    Sources (${sources.length})

    + Click a card to see the full chunk + scores +
    +
    + ${sources.map((s) => renderSourceCard(s)).join('')} +
    +
    `; + + const uncertHtml = (data.what_remains_uncertain || []).length ? ` +
    +

    What remains uncertain

    + +
    ` : ''; + + const nextHtml = data.next_practical_step ? ` +
    +

    Next practical step

    +

    ${escapeHtml(data.next_practical_step)}

    +
    ` : ''; + + els.results.innerHTML = ` +
    +
    ${briefHtml}
    +
    + ${subQHtml} + ${sourcesHtml} + ${uncertHtml} + ${nextHtml} + `; + + // Bind source-card click handlers + citation marker click handlers + els.results.querySelectorAll('[data-source-n]').forEach((node) => { + node.addEventListener('click', () => { + const n = parseInt(node.dataset.sourceN, 10); + const src = sources.find((s) => s.n === n); + if (src) { + openModal(src); + flashSource(n); + } + }); + }); + } + + function flashSource(n) { + document.querySelectorAll('.dr-source-card.is-highlight').forEach((c) => c.classList.remove('is-highlight')); + const target = document.querySelector(`.dr-source-card[data-source-n="${n}"]`); + if (target) { + target.classList.add('is-highlight'); + target.scrollIntoView({ behavior: 'smooth', block: 'center' }); + setTimeout(() => target.classList.remove('is-highlight'), 1800); + } + } + + function renderSourceCard(s) { + const score = s.reranker_score != null ? s.reranker_score : s.similarity; + const originTagClass = s.source_origin === 'upload' ? 'dr-source-tag dr-source-tag--upload' : 'dr-source-tag'; + const originLabel = s.source_origin === 'upload' ? 'upload' : 'corpus'; + return ``; + } + + // Markdown renderer — minimal: paragraphs, bold/italic, code, [n] citation badges + function renderBrief(markdown, sources) { + if (!markdown) return '

    No brief was returned.

    '; + const sourceSet = new Set((sources || []).map((s) => s.n)); + const escaped = escapeHtml(markdown); + + // Citation markers [1], [1,2], [1-3] + const withCites = escaped.replace(/\[(\d+(?:\s*[-,]\s*\d+)*)\]/g, (_, group) => { + const nums = expandCiteGroup(group); + return nums.map((n) => { + const known = sourceSet.has(n); + const cls = known ? 'dr-cite' : 'dr-cite'; + return `${n}`; + }).join(''); + }); + + // Bold/italic + const withBold = withCites + .replace(/\*\*([^*]+)\*\*/g, '$1') + .replace(/(^|[^*])\*([^*]+)\*(?!\*)/g, '$1$2') + .replace(/`([^`]+)`/g, '$1'); + + // Paragraphs + const paragraphs = withBold.split(/\n{2,}/).map((p) => { + const t = p.trim(); + if (!t) return ''; + if (/^### /.test(t)) return `

    ${t.replace(/^### /, '')}

    `; + return `

    ${t.replace(/\n/g, '
    ')}

    `; + }).join(''); + + return paragraphs; + } + + function expandCiteGroup(group) { + const out = []; + group.split(',').forEach((part) => { + const range = part.trim().match(/^(\d+)\s*-\s*(\d+)$/); + if (range) { + const a = parseInt(range[1], 10); + const b = parseInt(range[2], 10); + for (let i = a; i <= b; i++) out.push(i); + } else { + const n = parseInt(part.trim(), 10); + if (!Number.isNaN(n)) out.push(n); + } + }); + return Array.from(new Set(out)); + } + + function escapeHtml(s) { + return String(s) + .replace(/&/g, '&') + .replace(//g, '>') + .replace(/"/g, '"') + .replace(/'/g, '''); + } + + function truncate(s, n) { + if (!s) return ''; + if (s.length <= n) return s; + return s.slice(0, n - 1) + '…'; + } +})(); diff --git a/deep-research.php b/deep-research.php new file mode 100644 index 0000000..572e01d --- /dev/null +++ b/deep-research.php @@ -0,0 +1,162 @@ + +
    + +
    + + +
    + +
    + Engine + + + +
    +

    Azure engines use your BNL Azure credits. GPU runs qwen2.5:14b via LiteLLM on cuttlefish.

    + +
    +

    Corpus slices

    +

    Select which slices of the Do Better Norge legal corpus the agent searches. Toggle Broader Legal on when the question reaches beyond family law.

    +
    + + + + +
    +
    + +
    + Advanced controls +
    +
    + + + How many angles the agent expands the question into before retrieval. +
    +
    + + + How many corpus chunks the hybrid retriever pulls per sub-question. +
    +
    + + + Minimum cosine similarity for uploaded-doc chunks to count as a match. +
    +
    + + + Top sources kept after dedupe + rerank to feed synthesis. +
    +
    + + + Synthesis creativity. Keep low for grounded legal briefs. +
    +
    +
    + +
    + +
    + +

    Drop up to 5 case files here, or

    +

    PDF, DOCX, TXT — chunked + embedded in memory only, never stored.

    +
    + +
    + + + + + +
    + +
    +
    +

    Ready

    +

    Pick slices, drop a case file or paste a question, then run. The agent will expand the question, retrieve from the corpus + your upload, rerank, and synthesise a cited brief.

    +
    +
    + + + + + + + + + + + + + + diff --git a/includes/DeepResearchAgent.php b/includes/DeepResearchAgent.php new file mode 100644 index 0000000..d08c4d5 --- /dev/null +++ b/includes/DeepResearchAgent.php @@ -0,0 +1,727 @@ +azure = $azure ?: new DbnAzureOpenAiGateway(); + } + + public function run( + string $seedQuery, + string $pastedText, + array $uploadedFiles, + array $sliceSelection, + string $engine, + string $language, + array $controls + ): array { + $seedQuery = trim($seedQuery); + $pastedText = trim($pastedText); + $engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu'], true) ? $engine : 'azure_mini'; + $language = in_array($language, ['en', 'no'], true) ? $language : 'en'; + + $controls = $this->normalizeControls($controls); + + if ($seedQuery === '' && $pastedText === '' && empty($uploadedFiles)) { + dbnToolsAbort('Provide a question, paste text, or upload at least one file.', 422, 'missing_seed'); + } + + $client = dbnToolsRequireClient(); + $package = $this->requireFamilyPackage((int)$client['id']); + + dbnToolsBootCaveau(); + $aiPortalRoot = dbnToolsAiPortalRoot(); + require_once $aiPortalRoot . '/platform/includes/dbn_v6.php'; + require_once $aiPortalRoot . '/lib/ai/AiGateway.php'; + + $this->ai = new AiGateway(); + $this->uploadVecs = []; + $this->stepTimings = []; + + $trace = []; + $seedDescription = $this->buildSeedDescription($seedQuery, $pastedText, $uploadedFiles); + + // STEP 1: Query interpretation — build research brief + $stepStart = microtime(true); + $interpretation = $this->interpretSeed($seedDescription, $language); + $this->stepTimings['interpretation'] = $this->elapsedMs($stepStart); + $trace[] = $this->trace( + 'Query interpretation', + $interpretation['detail'], + 'complete' + ); + + // STEP 2: Query expansion + $stepStart = microtime(true); + $expansion = $this->expandQueries($seedDescription, $interpretation['brief'], $controls['sub_q_count'], $language); + $this->stepTimings['expansion'] = $this->elapsedMs($stepStart); + $subQuestions = $expansion['questions']; + $expansionStatus = $expansion['fallback'] ? 'warning' : 'complete'; + $trace[] = $this->trace( + 'Query expansion', + $expansion['fallback'] + ? 'Could not parse sub-questions; falling back to retrieving on the seed query alone.' + : sprintf('Generated %d sub-questions to research the corpus from multiple angles.', count($subQuestions)), + $expansionStatus + ); + + // STEP 3: Slice resolution + $stepStart = microtime(true); + $sliceSelectionNormalized = dbnV6NormalizeSliceSelection($sliceSelection); + if (!array_filter($sliceSelectionNormalized)) { + dbnToolsAbort('Enable at least one corpus slice before running deep research.', 422, 'no_slices'); + } + $ragDb = dbnToolsRagDb(); + try { + $sharedDocIds = dbnV6ResolveSelectedDocIds($ragDb, $sliceSelectionNormalized); + $sliceStatus = 'complete'; + $sliceDetail = sprintf( + '%d slice(s) active → %d candidate documents constrain the corpus search.', + count(array_filter($sliceSelectionNormalized)), + count($sharedDocIds) + ); + } catch (Throwable $e) { + error_log('DBN deep research slice resolve failed: ' . $e->getMessage()); + $sharedDocIds = []; + $sliceStatus = 'warning'; + $sliceDetail = 'Slice resolution failed; corpus search will run unconstrained.'; + } + $this->stepTimings['slice_resolution'] = $this->elapsedMs($stepStart); + $trace[] = $this->trace('Slice resolution', $sliceDetail, $sliceStatus); + + // STEP 4: Upload indexing (in-memory, ephemeral) + $stepStart = microtime(true); + $uploadChunks = []; + foreach ($uploadedFiles as $idx => $file) { + $filename = (string)($file['filename'] ?? sprintf('upload-%d', $idx + 1)); + $text = (string)($file['text'] ?? ''); + $uploadChunks = array_merge($uploadChunks, $this->splitIntoChunks($text, $filename, $idx)); + } + $uploadStatus = 'complete'; + $uploadDetail = sprintf('%d upload file(s) → %d in-memory chunks indexed with nomic-embed-text.', count($uploadedFiles), count($uploadChunks)); + if ($uploadChunks) { + try { + $texts = array_map(fn(array $c) => $c['text'], $uploadChunks); + $vecs = $this->ai->embedBatch($texts, 'nomic-embed-text'); + if (count($vecs) === count($uploadChunks)) { + foreach ($uploadChunks as $i => $chunk) { + $this->uploadVecs[] = [ + 'meta' => $chunk, + 'vec' => $vecs[$i], + ]; + } + } else { + $uploadStatus = 'warning'; + $uploadDetail = 'Upload embedding returned an unexpected count; uploaded chunks will not participate in retrieval.'; + } + } catch (Throwable $e) { + error_log('DBN deep research upload embed failed: ' . $e->getMessage()); + $uploadStatus = 'warning'; + $uploadDetail = 'Upload embedding gateway unreachable; uploaded chunks will not participate in retrieval.'; + $this->uploadVecs = []; + } + } elseif (empty($uploadedFiles)) { + $uploadDetail = 'No files uploaded; agent will research the corpus only.'; + } + $this->stepTimings['upload_indexing'] = $this->elapsedMs($stepStart); + $trace[] = $this->trace('Upload indexing', $uploadDetail, $uploadStatus); + + // STEP 5: Retrieval (per sub-question) + $stepStart = microtime(true); + $retrievalQueries = $subQuestions ?: [[ + 'id' => 'q1', + 'question' => $seedQuery !== '' ? $seedQuery : ($interpretation['brief'] ?: 'legal research'), + 'rationale' => 'Seed query (no sub-question expansion).', + ]]; + + try { + $rag = new ClientRagPipeline((int)$client['id'], 'http://10.0.1.10:4000', 60); + } catch (Throwable $e) { + dbnToolsAbort('Could not initialise the retrieval pipeline.', 503, 'rag_init_failed'); + } + + $rawPool = []; + $retrievalWarnings = 0; + foreach ($retrievalQueries as $sq) { + try { + $corpusChunks = $rag->searchAll( + $sq['question'], + $controls['chunk_limit'], + null, + [ + 'search_private' => false, + 'search_shared' => true, + 'package_ids' => [(int)$package['id']], + 'shared_doc_ids' => $sharedDocIds, + 'chunk_limit' => $controls['chunk_limit'], + 'search_method' => 'hybrid', + 'reranker_enabled' => true, + ] + ); + } catch (Throwable $e) { + error_log('DBN deep research sub-Q retrieval failed: ' . $e->getMessage()); + $corpusChunks = []; + $retrievalWarnings++; + } + foreach ($corpusChunks as $chunk) { + $rawPool[] = $this->normalizeCorpusChunk($chunk, $sq['id']); + } + + // Upload chunk retrieval via cosine sim + if (!empty($this->uploadVecs)) { + $uploadHits = $this->retrieveFromUploads($sq['question'], $controls['chunk_limit'], $controls['similarity_threshold']); + foreach ($uploadHits as $hit) { + $hit['matched_sub_questions'] = [$sq['id']]; + $rawPool[] = $hit; + } + } + } + + $merged = $this->mergeAndDedupe($rawPool, self::POOL_CAP); + $this->stepTimings['retrieval'] = $this->elapsedMs($stepStart); + $retrievalStatus = $retrievalWarnings > 0 ? 'warning' : 'complete'; + $trace[] = $this->trace( + 'Retrieval', + sprintf( + '%d sub-question(s) × hybrid + RRF + rerank → %d raw chunks → %d unique after dedupe.', + count($retrievalQueries), + count($rawPool), + count($merged) + ), + $retrievalStatus + ); + + // Cap pool to reranker top-K for synthesis + $synthesisPool = array_slice($merged, 0, $controls['reranker_top_k']); + $numberedSources = $this->numberSources($synthesisPool); + + // STEP 6: Synthesis + $stepStart = microtime(true); + $synthesis = $this->synthesise( + $seedDescription, + $interpretation['brief'], + $retrievalQueries, + $numberedSources, + $engine, + $language, + $controls['temperature'] + ); + $this->stepTimings['synthesis'] = $this->elapsedMs($stepStart); + $trace[] = $this->trace( + 'Synthesis', + sprintf('%s synthesised the brief using %d grounded source(s).', $synthesis['deploy_label'], count($numberedSources)), + 'complete' + ); + + // STEP 7: Confidence + $confidence = $this->citationConfidence($numberedSources); + $trace[] = $this->trace( + 'Citation confidence', + sprintf('%s confidence based on %d source(s) and reranker score distribution.', ucfirst($confidence), count($numberedSources)), + $confidence === 'low' ? 'warning' : 'complete' + ); + + // Stitch sub-question chunk_ids + $subQOut = []; + foreach ($retrievalQueries as $sq) { + $matchedChunks = array_values(array_filter( + $numberedSources, + fn(array $s) => in_array($sq['id'], $s['matched_sub_questions'] ?? [], true) + )); + $subQOut[] = [ + 'id' => $sq['id'], + 'question' => $sq['question'], + 'rationale' => $sq['rationale'] ?? '', + 'chunk_ids' => array_values(array_map(fn(array $s) => $s['chunk_id'], $matchedChunks)), + ]; + } + + return [ + 'tool' => 'deep_research', + 'language' => $language, + 'brief_markdown' => (string)($synthesis['json']['brief_markdown'] ?? $synthesis['json']['answer'] ?? ''), + 'sub_questions' => $subQOut, + 'sources' => $numberedSources, + 'what_we_found' => (string)($synthesis['json']['what_we_found'] ?? ''), + 'evidence_trail' => $numberedSources, + 'what_remains_uncertain' => $synthesis['json']['what_remains_uncertain'] ?? [], + 'next_practical_step' => (string)($synthesis['json']['next_practical_step'] ?? ''), + 'trace' => $trace, + 'trace_metadata' => [ + 'chunk_count' => count($merged), + 'source_count' => count($numberedSources), + 'sub_question_count' => count($retrievalQueries), + 'upload_chunk_count' => count($this->uploadVecs), + 'deployment' => $synthesis['deploy_label'], + 'engine_used' => $engine, + 'citation_confidence' => $confidence, + 'elapsed_ms_per_step' => $this->stepTimings, + 'slices_active' => array_keys(array_filter($sliceSelectionNormalized)), + ], + 'disclaimer' => dbnToolsDisclaimer($language), + ]; + } + + private function normalizeControls(array $controls): array + { + return [ + 'sub_q_count' => max(3, min(5, (int)($controls['sub_q_count'] ?? 4))), + 'chunk_limit' => max(4, min(10, (int)($controls['chunk_limit'] ?? 6))), + 'similarity_threshold' => max(0.2, min(0.6, (float)($controls['similarity_threshold'] ?? 0.30))), + 'reranker_top_k' => max(8, min(14, (int)($controls['reranker_top_k'] ?? 12))), + 'temperature' => max(0.05, min(0.4, (float)($controls['temperature'] ?? 0.15))), + ]; + } + + private function requireFamilyPackage(int $clientId): array + { + $package = dbnToolsFetchPackage('family-legal'); + if (!$package || empty($package['is_active'])) { + dbnToolsAbort('The family-legal corpus package is not active.', 503, 'package_unavailable'); + } + if (!dbnToolsHasActiveSubscription($clientId, (int)$package['id'])) { + dbnToolsAbort('Do Better Norge does not have an active family-legal subscription.', 503, 'subscription_missing'); + } + return $package; + } + + private function buildSeedDescription(string $seedQuery, string $pastedText, array $uploadedFiles): string + { + $parts = []; + if ($seedQuery !== '') { + $parts[] = "Question:\n" . mb_substr($seedQuery, 0, self::MAX_SEED_CHARS, 'UTF-8'); + } + if ($pastedText !== '') { + $parts[] = "Pasted text:\n" . mb_substr($pastedText, 0, self::MAX_SEED_CHARS, 'UTF-8'); + } + foreach ($uploadedFiles as $idx => $file) { + $filename = (string)($file['filename'] ?? sprintf('upload-%d', $idx + 1)); + $text = (string)($file['text'] ?? ''); + if ($text === '') { + continue; + } + $parts[] = sprintf("Uploaded file [%s]:\n%s", $filename, mb_substr($text, 0, self::MAX_UPLOAD_CHARS, 'UTF-8')); + } + return implode("\n\n", $parts); + } + + private function interpretSeed(string $seedDescription, string $language): array + { + $locale = $language === 'no' ? 'Norwegian' : 'English'; + $prompt = <<azure->chatText([ + ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'], + ['role' => 'user', 'content' => $prompt], + ], ['json' => true, 'temperature' => 0.1, 'max_tokens' => 400, 'timeout' => 30]); + $json = $this->azure->decodeJsonObject($raw); + if (is_array($json) && !empty($json['brief'])) { + $signals = $json['key_signals'] ?? []; + $signalText = is_array($signals) ? implode(', ', array_slice($signals, 0, 6)) : ''; + return [ + 'brief' => (string)$json['brief'], + 'detail' => sprintf('Research focus: %s%s', (string)$json['brief'], $signalText ? ' — signals: ' . $signalText : ''), + ]; + } + } catch (Throwable $e) { + error_log('DBN deep research interpretation failed: ' . $e->getMessage()); + } + + return [ + 'brief' => '', + 'detail' => 'Interpretation step skipped — proceeding with raw seed input.', + ]; + } + + private function expandQueries(string $seedDescription, string $brief, int $targetCount, string $language): array + { + $locale = $language === 'no' ? 'Norwegian' : 'English'; + $prompt = <<azure->chatText([ + ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'], + ['role' => 'user', 'content' => $prompt], + ], ['json' => true, 'temperature' => 0.2, 'max_tokens' => 700, 'timeout' => 35]); + $json = $this->azure->decodeJsonObject($raw); + $items = is_array($json['sub_questions'] ?? null) ? $json['sub_questions'] : []; + $normalized = []; + foreach ($items as $i => $item) { + if (!is_array($item) || empty($item['question'])) { + continue; + } + $normalized[] = [ + 'id' => 'q' . ($i + 1), + 'question' => trim((string)$item['question']), + 'rationale' => trim((string)($item['rationale'] ?? '')), + ]; + if (count($normalized) >= $targetCount) break; + } + if (count($normalized) >= 2) { + return ['questions' => $normalized, 'fallback' => false]; + } + } catch (Throwable $e) { + error_log('DBN deep research expansion failed: ' . $e->getMessage()); + } + + return ['questions' => [], 'fallback' => true]; + } + + private function splitIntoChunks(string $text, string $filename, int $fileIdx): array + { + $text = preg_replace('/\s+/u', ' ', trim($text)) ?? ''; + if ($text === '') { + return []; + } + $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY) ?: []; + if (!$words) { + return []; + } + + $chunks = []; + $i = 0; + $chunkIdx = 0; + $total = count($words); + while ($i < $total) { + $slice = array_slice($words, $i, self::CHUNK_WORDS); + if (count($slice) >= self::MIN_CHUNK_WORDS || $i === 0) { + $chunks[] = [ + 'chunk_id' => sprintf('upload:%d:%d', $fileIdx, $chunkIdx), + 'file_index' => $fileIdx, + 'chunk_index'=> $chunkIdx, + 'filename' => $filename, + 'text' => implode(' ', $slice), + ]; + $chunkIdx++; + } + $advance = self::CHUNK_WORDS - self::CHUNK_OVERLAP_WORDS; + if ($advance < 1) $advance = 1; + $i += $advance; + if (count($slice) < self::CHUNK_WORDS) { + break; + } + } + return $chunks; + } + + private function retrieveFromUploads(string $question, int $limitPerSubQ, float $threshold): array + { + if (empty($this->uploadVecs)) { + return []; + } + try { + $qVec = $this->ai->embed($question, 'nomic-embed-text'); + } catch (Throwable $e) { + error_log('DBN deep research sub-Q embed failed: ' . $e->getMessage()); + return []; + } + if (empty($qVec)) { + return []; + } + $scored = []; + foreach ($this->uploadVecs as $entry) { + $sim = $this->cosineSim($qVec, $entry['vec']); + if ($sim < $threshold) { + continue; + } + $scored[] = [ + 'chunk_id' => $entry['meta']['chunk_id'], + 'title' => 'uploaded: ' . $entry['meta']['filename'], + 'section' => null, + 'package_or_corpus' => 'Your upload', + 'excerpt' => dbnToolsExcerpt($entry['meta']['text'], 620), + 'chunk_text' => $entry['meta']['text'], + 'similarity' => round($sim, 4), + 'reranker_score' => null, + 'document_id' => null, + 'source_origin' => 'upload', + 'authority_type' => null, + 'jurisdiction' => null, + ]; + } + usort($scored, fn(array $a, array $b) => ($b['similarity'] <=> $a['similarity'])); + $keep = (int)ceil($limitPerSubQ / 2); + return array_slice($scored, 0, max(1, $keep)); + } + + private function cosineSim(array $a, array $b): float + { + $len = min(count($a), count($b)); + if ($len === 0) return 0.0; + $dot = 0.0; + $na = 0.0; + $nb = 0.0; + for ($i = 0; $i < $len; $i++) { + $x = (float)$a[$i]; + $y = (float)$b[$i]; + $dot += $x * $y; + $na += $x * $x; + $nb += $y * $y; + } + if ($na === 0.0 || $nb === 0.0) return 0.0; + return $dot / (sqrt($na) * sqrt($nb)); + } + + private function normalizeCorpusChunk(array $chunk, string $subQId): array + { + $similarity = isset($chunk['similarity']) ? round((float)$chunk['similarity'], 4) : null; + $rerankerScore = isset($chunk['reranker_score']) ? round((float)$chunk['reranker_score'], 4) : null; + return [ + 'chunk_id' => isset($chunk['id']) ? (int)$chunk['id'] : null, + 'title' => (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source'), + 'section' => $chunk['section_title'] ?? null, + 'package_or_corpus' => (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Norge'), + 'excerpt' => dbnToolsExcerpt((string)($chunk['content'] ?? ''), 620), + 'chunk_text' => (string)($chunk['content'] ?? ''), + 'similarity' => $similarity, + 'reranker_score' => $rerankerScore, + 'document_id' => isset($chunk['document_id']) ? (int)$chunk['document_id'] : null, + 'source_origin' => 'corpus', + 'authority_type' => $chunk['authority_type'] ?? null, + 'jurisdiction' => $chunk['jurisdiction'] ?? null, + 'matched_sub_questions' => [$subQId], + ]; + } + + private function mergeAndDedupe(array $rawPool, int $cap): array + { + $byKey = []; + foreach ($rawPool as $chunk) { + $key = ($chunk['source_origin'] ?? 'corpus') . ':' . ($chunk['chunk_id'] ?? bin2hex(random_bytes(4))); + if (!isset($byKey[$key])) { + $byKey[$key] = $chunk; + continue; + } + $existing = $byKey[$key]; + $existing['matched_sub_questions'] = array_values(array_unique(array_merge( + $existing['matched_sub_questions'] ?? [], + $chunk['matched_sub_questions'] ?? [] + ))); + // Keep the higher similarity score + if (($chunk['similarity'] ?? 0) > ($existing['similarity'] ?? 0)) { + $existing['similarity'] = $chunk['similarity']; + } + if (($chunk['reranker_score'] ?? 0) > ($existing['reranker_score'] ?? 0)) { + $existing['reranker_score'] = $chunk['reranker_score']; + } + $byKey[$key] = $existing; + } + $merged = array_values($byKey); + usort($merged, function (array $a, array $b): int { + $aScore = $a['reranker_score'] ?? $a['similarity'] ?? 0; + $bScore = $b['reranker_score'] ?? $b['similarity'] ?? 0; + return $bScore <=> $aScore; + }); + return array_slice($merged, 0, $cap); + } + + private function numberSources(array $chunks): array + { + $out = []; + foreach ($chunks as $i => $c) { + $c['n'] = $i + 1; + $out[] = $c; + } + return $out; + } + + private function synthesise( + string $seedDescription, + string $brief, + array $subQuestions, + array $numberedSources, + string $engine, + string $language, + float $temperature + ): array { + $locale = $language === 'no' ? 'Norwegian' : 'English'; + + if (empty($numberedSources)) { + return [ + 'json' => [ + 'brief_markdown' => $language === 'no' + ? 'Jeg fant ikke tilstrekkelig kildestøtte i korpuset til å gi et grunnlagsbasert svar.' + : 'I did not find enough source support in the corpus to give a grounded answer.', + 'what_we_found' => 'No retrieved sources passed the similarity threshold.', + 'what_remains_uncertain' => ['No corpus evidence retrieved for the given query and slice selection.'], + 'next_practical_step' => 'Try widening slice selection or rephrasing with more specific statutory or party terms.', + ], + 'deploy_label' => $engine === 'gpu' ? 'GPU (cuttlefish)' : ($engine === 'azure_full' ? 'gpt-4o' : $this->azure->chatDeployment()), + ]; + } + + $sourcesContext = []; + foreach ($numberedSources as $s) { + $sourcesContext[] = sprintf( + "[%d] (%s) %s%s\n Corpus: %s\n Excerpt: %s", + $s['n'], + $s['source_origin'] === 'upload' ? 'uploaded doc' : 'corpus', + $s['title'], + !empty($s['section']) ? ' — ' . $s['section'] : '', + $s['package_or_corpus'], + $s['excerpt'] + ); + } + $sourcesText = implode("\n\n", $sourcesContext); + + $subQText = ''; + if ($subQuestions) { + $lines = array_map( + fn(array $sq, int $i): string => sprintf('%d. (%s) %s', $i + 1, $sq['id'], $sq['question']), + $subQuestions, + array_keys($subQuestions) + ); + $subQText = "\nSub-questions explored:\n" . implode("\n", $lines); + } + + $prompt = << 'system', 'content' => 'You return valid JSON only. No markdown fences.'], + ['role' => 'user', 'content' => $prompt], + ]; + $opts = ['json' => true, 'temperature' => $temperature, 'max_tokens' => 2200, 'timeout' => 120]; + + try { + if ($engine === 'gpu') { + $response = dbnToolsCallGpuLlm($messages, $opts); + $deployLabel = 'GPU (cuttlefish)'; + $raw = (string)($response['choices'][0]['message']['content'] ?? ''); + } elseif ($engine === 'azure_full') { + $raw = $this->azure->withDeployment('gpt-4o')->chatText($messages, $opts); + $deployLabel = 'gpt-4o'; + } else { + $raw = $this->azure->chatText($messages, $opts); + $deployLabel = $this->azure->chatDeployment(); + } + } catch (Throwable $e) { + dbnToolsAbort('Synthesis LLM request failed: ' . $e->getMessage(), 502, 'llm_error'); + } + + $json = $this->azure->decodeJsonObject($raw); + if (!is_array($json) || empty($json['brief_markdown'])) { + // Salvage as plain markdown + $json = [ + 'brief_markdown' => $raw, + 'what_we_found' => 'Synthesis returned non-structured output; rendered as raw markdown.', + 'what_remains_uncertain' => ['Response format could not be validated as structured JSON.'], + 'next_practical_step' => 'Review the brief manually before relying on it.', + ]; + } + + return [ + 'json' => $json, + 'deploy_label' => $deployLabel, + ]; + } + + private function citationConfidence(array $sources): string + { + if (!$sources) { + return 'low'; + } + $scores = array_values(array_filter(array_map( + fn(array $s) => $s['reranker_score'] ?? $s['similarity'] ?? null, + $sources + ), 'is_numeric')); + $best = $scores ? max($scores) : 0; + if (count($sources) >= 6 && $best >= 0.5) { + return 'high'; + } + if (count($sources) >= 3 && $best >= 0.35) { + return 'medium'; + } + return 'low'; + } + + private function trace(string $label, string $detail, string $status = 'complete'): array + { + return [ + 'label' => $label, + 'detail' => $detail, + 'status' => $status, + ]; + } + + private function elapsedMs(float $start): int + { + return (int)round((microtime(true) - $start) * 1000); + } +} diff --git a/includes/bootstrap.php b/includes/bootstrap.php index 20668c2..6df79aa 100644 --- a/includes/bootstrap.php +++ b/includes/bootstrap.php @@ -487,3 +487,192 @@ function dbnToolsExcerpt(string $text, int $limit = 520): string } return rtrim(mb_substr($text, 0, $limit - 1, 'UTF-8')) . '…'; } + +const DBN_TOOLS_EXTRACT_MAX_BYTES = 4 * 1024 * 1024; +const DBN_TOOLS_EXTRACT_TEXT_LIMIT = 128000; +const DBN_TOOLS_EXTRACT_ALLOWED_EXTS = ['txt', 'pdf', 'docx']; + +function dbnToolsExtractUploadedFile(array $file): array +{ + $errCode = (int)($file['error'] ?? UPLOAD_ERR_NO_FILE); + if ($errCode !== UPLOAD_ERR_OK) { + $msg = match ($errCode) { + UPLOAD_ERR_INI_SIZE, UPLOAD_ERR_FORM_SIZE => 'The file exceeds the allowed size limit.', + UPLOAD_ERR_NO_TMP_DIR => 'No temporary directory is available.', + UPLOAD_ERR_CANT_WRITE => 'Unable to save the uploaded file.', + default => 'File upload failed.', + }; + dbnToolsAbort($msg, 422, 'upload_error'); + } + + $originalName = basename((string)($file['name'] ?? '')); + $tmpPath = (string)($file['tmp_name'] ?? ''); + $size = (int)($file['size'] ?? 0); + + if (!is_uploaded_file($tmpPath)) { + dbnToolsAbort('Invalid file upload.', 400, 'invalid_upload'); + } + if ($size === 0) { + dbnToolsAbort('The uploaded file is empty.', 422, 'file_empty'); + } + if ($size > DBN_TOOLS_EXTRACT_MAX_BYTES) { + dbnToolsAbort('File exceeds the 4 MB limit.', 413, 'file_too_large'); + } + + $ext = strtolower(pathinfo($originalName, PATHINFO_EXTENSION)); + if (!in_array($ext, DBN_TOOLS_EXTRACT_ALLOWED_EXTS, true)) { + dbnToolsAbort('Unsupported file type. Upload a .pdf, .docx, or .txt file.', 422, 'unsupported_type'); + } + + $text = match ($ext) { + 'txt' => dbnToolsExtractTxt($tmpPath), + 'pdf' => dbnToolsExtractPdf($tmpPath), + 'docx' => dbnToolsExtractDocx($tmpPath), + }; + + $text = trim($text); + if ($text === '') { + dbnToolsAbort('No text could be extracted from this file.', 422, 'no_text'); + } + + $truncated = false; + if (mb_strlen($text, 'UTF-8') > DBN_TOOLS_EXTRACT_TEXT_LIMIT) { + $text = mb_substr($text, 0, DBN_TOOLS_EXTRACT_TEXT_LIMIT, 'UTF-8'); + $truncated = true; + } + + return [ + 'ok' => true, + 'text' => $text, + 'filename' => $originalName, + 'chars' => mb_strlen($text, 'UTF-8'), + 'truncated' => $truncated, + ]; +} + +function dbnToolsExtractTxt(string $path): string +{ + $content = file_get_contents($path); + if ($content === false) { + throw new DbnToolsHttpException('Unable to read the file.', 500, 'read_error'); + } + return mb_convert_encoding($content, 'UTF-8', 'UTF-8, ISO-8859-1, Windows-1252'); +} + +function dbnToolsExtractPdf(string $path): string +{ + $cmd = 'pdftotext ' . escapeshellarg($path) . ' - 2>/dev/null'; + $output = shell_exec($cmd); + if ($output === null || $output === false || trim($output) === '') { + throw new DbnToolsHttpException( + 'PDF text extraction failed. The file may be image-only or encrypted.', + 422, + 'pdf_extract_failed' + ); + } + return $output; +} + +function dbnToolsExtractDocx(string $path): string +{ + $zip = new ZipArchive(); + $result = $zip->open($path); + if ($result !== true) { + throw new DbnToolsHttpException('Unable to open the .docx file.', 422, 'docx_open_failed'); + } + + $xml = $zip->getFromName('word/document.xml'); + $zip->close(); + + if ($xml === false) { + throw new DbnToolsHttpException('No document content found in this .docx file.', 422, 'docx_no_content'); + } + + $doc = new DOMDocument(); + libxml_use_internal_errors(true); + $doc->loadXML($xml); + libxml_clear_errors(); + + $xpath = new DOMXPath($doc); + $xpath->registerNamespace('w', 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'); + + $paragraphs = []; + foreach ($xpath->query('//w:p') as $para) { + $runs = []; + foreach ($xpath->query('.//w:t', $para) as $t) { + $runs[] = $t->textContent; + } + $paragraphs[] = implode('', $runs); + } + + return implode("\n", $paragraphs); +} + +function dbnToolsCallGpuLlm(array $messages, array $options = []): array +{ + $url = 'http://10.0.1.10:4000/v1/chat/completions'; + $apiKey = (string)(dbnToolsEnv('LITELLM_MASTER_KEY') ?: 'sk-bnl-litellm-26xR9mK4qvN3wL8sTj7pB2d'); + $model = (string)($options['model'] ?? 'qwen2.5:14b'); + $timeout = (int)($options['timeout'] ?? 90); + + $payload = [ + 'model' => $model, + 'messages' => $messages, + 'temperature' => $options['temperature'] ?? 0.1, + 'max_tokens' => $options['max_tokens'] ?? 8000, + ]; + if (!empty($options['json'])) { + $payload['response_format'] = ['type' => 'json_object']; + } + + $body = json_encode($payload, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES); + $headers = [ + 'Content-Type: application/json', + 'Authorization: Bearer ' . $apiKey, + ]; + + if (function_exists('curl_init')) { + $ch = curl_init($url); + curl_setopt_array($ch, [ + CURLOPT_RETURNTRANSFER => true, + CURLOPT_POST => true, + CURLOPT_POSTFIELDS => $body, + CURLOPT_HTTPHEADER => $headers, + CURLOPT_TIMEOUT => $timeout, + ]); + $response = curl_exec($ch); + $code = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE); + $err = curl_error($ch); + curl_close($ch); + + if ($response === false) { + throw new RuntimeException('GPU LiteLLM request failed: ' . $err); + } + } else { + $ctx = stream_context_create(['http' => [ + 'method' => 'POST', + 'header' => implode("\r\n", $headers), + 'content' => $body, + 'timeout' => $timeout, + 'ignore_errors' => true, + ]]); + $response = @file_get_contents($url, false, $ctx); + $code = 0; + if (isset($http_response_header[0]) && preg_match('/\s(\d{3})\s/', $http_response_header[0], $m)) { + $code = (int)$m[1]; + } + if ($response === false) { + throw new RuntimeException('GPU LiteLLM request failed.'); + } + } + + $decoded = json_decode($response, true); + if (!is_array($decoded)) { + throw new RuntimeException('GPU LiteLLM returned non-JSON response.'); + } + if ($code < 200 || $code >= 300) { + $msg = $decoded['error']['message'] ?? ('HTTP ' . $code); + throw new RuntimeException('GPU LiteLLM error: ' . $msg); + } + return $decoded; +} diff --git a/includes/layout.php b/includes/layout.php index 43f9921..af1bb54 100644 --- a/includes/layout.php +++ b/includes/layout.php @@ -9,12 +9,13 @@ if (!dbnToolsIsAuthenticated()) { } $navItems = [ - 'ask' => ['Ask', 'Source-grounded'], - 'search' => ['Search', 'Legal sources'], - 'summarize' => ['Summarize', 'Pasted text'], - 'timeline' => ['Timeline', 'Events'], - 'redact' => ['Redact', 'Privacy'], - 'transcribe' => ['Transcribe', 'Audio'], + 'ask' => ['Ask', 'Source-grounded'], + 'search' => ['Search', 'Legal sources'], + 'deep-research' => ['Deep research', 'Agent + RAG'], + 'summarize' => ['Summarize', 'Pasted text'], + 'timeline' => ['Timeline', 'Events'], + 'redact' => ['Redact', 'Privacy'], + 'transcribe' => ['Transcribe', 'Audio'], ]; $toolName = $toolName ?? 'ask'; $toolTitle = $toolTitle ?? 'Legal Tools'; diff --git a/includes/layout_footer.php b/includes/layout_footer.php index bc3e187..25450fa 100644 --- a/includes/layout_footer.php +++ b/includes/layout_footer.php @@ -18,5 +18,8 @@ + + + diff --git a/index.php b/index.php index f32ada6..e86af41 100644 --- a/index.php +++ b/index.php @@ -91,7 +91,7 @@ if (dbnToolsIsAuthenticated()) {
    -

    Six tools, one suite

    +

    Seven tools, one suite

    Ask @@ -103,6 +103,11 @@ if (dbnToolsIsAuthenticated()) {

    Search

    Retrieve up to seven relevant legal sources with titles, sections, and excerpts.

    +
    + Deep research +

    Deep research

    +

    Upload a case file or paste a question. An agent expands it into 3–5 angles, runs hybrid rank/rerank RAG across the corpus + your upload, and returns a cited brief.

    +
    Summarize

    Summarize