Add Deep Research tool — agent + rank/rerank RAG
New surface at /deep-research.php where the user pastes a question or uploads PDF/DOCX/TXT case files and an LLM-orchestrated agent researches the Do Better Norge legal corpus from 3-5 angles, with hybrid retrieval, cross-encoder rerank, and synthesis that emits an inline-[n]-cited markdown brief plus a numbered sources panel. Uploaded documents are chunked + embedded in memory only (nomic-embed-text via LiteLLM) and searched alongside the shared corpus during the same request — never persisted to disk, DB, or Qdrant. Reuses ClientRagPipeline::searchAll (hybrid + rerank), dbnV6 slice helpers, and the existing extract.php text-extraction logic via a new dbnToolsExtractUploadedFile() helper. Also adds dbnToolsCallGpuLlm() helper in bootstrap.php — fixes a latent bug where LegalTools.php was already calling that name with no definition. Search.php is unchanged.
This commit is contained in:
@@ -0,0 +1,727 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
require_once __DIR__ . '/bootstrap.php';
|
||||
require_once __DIR__ . '/AzureOpenAiGateway.php';
|
||||
|
||||
final class DbnDeepResearchAgent
|
||||
{
|
||||
private const MAX_SEED_CHARS = 16000;
|
||||
private const MAX_UPLOAD_CHARS = 64000;
|
||||
private const CHUNK_WORDS = 600;
|
||||
private const CHUNK_OVERLAP_WORDS = 75;
|
||||
private const MIN_CHUNK_WORDS = 50;
|
||||
private const POOL_CAP = 30;
|
||||
|
||||
private DbnAzureOpenAiGateway $azure;
|
||||
private ?AiGateway $ai = null;
|
||||
private array $uploadVecs = [];
|
||||
private array $stepTimings = [];
|
||||
|
||||
/**
 * Create the agent around an Azure OpenAI gateway.
 *
 * @param DbnAzureOpenAiGateway|null $azure Gateway to use; a new DbnAzureOpenAiGateway is built when omitted.
 */
public function __construct(?DbnAzureOpenAiGateway $azure = null)
{
    if (!$azure) {
        $azure = new DbnAzureOpenAiGateway();
    }

    $this->azure = $azure;
}
|
||||
|
||||
/**
 * Run the deep-research pipeline for one request and return the response payload.
 *
 * Steps, each recorded in the returned trace: query interpretation, query
 * expansion, slice resolution, in-memory upload indexing, per-sub-question
 * retrieval, synthesis, and citation confidence.
 *
 * @param string $seedQuery      The user's question (may be empty).
 * @param string $pastedText     Free text pasted by the user (may be empty).
 * @param array  $uploadedFiles  Upload entries; each is read through its 'filename' and 'text' keys.
 * @param array  $sliceSelection Raw slice selection, normalised via dbnV6NormalizeSliceSelection().
 * @param string $engine         'azure_mini', 'azure_full' or 'gpu'; any other value becomes 'azure_mini'.
 * @param string $language       'en' or 'no'; any other value becomes 'en'.
 * @param array  $controls       Tuning values, clamped by normalizeControls().
 *
 * @return array Brief, sub-questions, numbered sources, trace and trace metadata.
 */
public function run(
    string $seedQuery,
    string $pastedText,
    array $uploadedFiles,
    array $sliceSelection,
    string $engine,
    string $language,
    array $controls
): array {
    $seedQuery = trim($seedQuery);
    $pastedText = trim($pastedText);
    $engine = in_array($engine, ['azure_mini', 'azure_full', 'gpu'], true) ? $engine : 'azure_mini';
    $language = in_array($language, ['en', 'no'], true) ? $language : 'en';

    $controls = $this->normalizeControls($controls);

    if ($seedQuery === '' && $pastedText === '' && empty($uploadedFiles)) {
        dbnToolsAbort('Provide a question, paste text, or upload at least one file.', 422, 'missing_seed');
    }

    $client = dbnToolsRequireClient();
    $package = $this->requireFamilyPackage((int)$client['id']);

    dbnToolsBootCaveau();
    $aiPortalRoot = dbnToolsAiPortalRoot();
    require_once $aiPortalRoot . '/platform/includes/dbn_v6.php';
    require_once $aiPortalRoot . '/lib/ai/AiGateway.php';

    // Fail fast: reject an empty slice selection before the interpretation and
    // expansion LLM calls are made, instead of after them.
    $sliceSelectionNormalized = dbnV6NormalizeSliceSelection($sliceSelection);
    if (!array_filter($sliceSelectionNormalized)) {
        dbnToolsAbort('Enable at least one corpus slice before running deep research.', 422, 'no_slices');
    }

    // Reset per-run state so a reused agent instance never carries over uploads or timings.
    $this->ai = new AiGateway();
    $this->uploadVecs = [];
    $this->stepTimings = [];

    $trace = [];
    $seedDescription = $this->buildSeedDescription($seedQuery, $pastedText, $uploadedFiles);

    // STEP 1: Query interpretation — build research brief
    $stepStart = microtime(true);
    $interpretation = $this->interpretSeed($seedDescription, $language);
    $this->stepTimings['interpretation'] = $this->elapsedMs($stepStart);
    $trace[] = $this->trace(
        'Query interpretation',
        $interpretation['detail'],
        'complete'
    );

    // STEP 2: Query expansion
    $stepStart = microtime(true);
    $expansion = $this->expandQueries($seedDescription, $interpretation['brief'], $controls['sub_q_count'], $language);
    $this->stepTimings['expansion'] = $this->elapsedMs($stepStart);
    $subQuestions = $expansion['questions'];
    $expansionStatus = $expansion['fallback'] ? 'warning' : 'complete';
    $trace[] = $this->trace(
        'Query expansion',
        $expansion['fallback']
            ? 'Could not parse sub-questions; falling back to retrieving on the seed query alone.'
            : sprintf('Generated %d sub-questions to research the corpus from multiple angles.', count($subQuestions)),
        $expansionStatus
    );

    // STEP 3: Slice resolution (the selection itself was validated above)
    $stepStart = microtime(true);
    $ragDb = dbnToolsRagDb();
    try {
        $sharedDocIds = dbnV6ResolveSelectedDocIds($ragDb, $sliceSelectionNormalized);
        $sliceStatus = 'complete';
        $sliceDetail = sprintf(
            '%d slice(s) active → %d candidate documents constrain the corpus search.',
            count(array_filter($sliceSelectionNormalized)),
            count($sharedDocIds)
        );
    } catch (Throwable $e) {
        // Best-effort: a resolution failure is surfaced as a trace warning, not an abort.
        error_log('DBN deep research slice resolve failed: ' . $e->getMessage());
        $sharedDocIds = [];
        $sliceStatus = 'warning';
        $sliceDetail = 'Slice resolution failed; corpus search will run unconstrained.';
    }
    $this->stepTimings['slice_resolution'] = $this->elapsedMs($stepStart);
    $trace[] = $this->trace('Slice resolution', $sliceDetail, $sliceStatus);

    // STEP 4: Upload indexing (in-memory, ephemeral)
    $stepStart = microtime(true);
    $uploadChunks = [];
    foreach ($uploadedFiles as $idx => $file) {
        $filename = (string)($file['filename'] ?? sprintf('upload-%d', $idx + 1));
        $text = (string)($file['text'] ?? '');
        $uploadChunks = array_merge($uploadChunks, $this->splitIntoChunks($text, $filename, $idx));
    }
    $uploadStatus = 'complete';
    $uploadDetail = sprintf('%d upload file(s) → %d in-memory chunks indexed with nomic-embed-text.', count($uploadedFiles), count($uploadChunks));
    if ($uploadChunks) {
        try {
            $texts = array_map(fn(array $c) => $c['text'], $uploadChunks);
            $vecs = $this->ai->embedBatch($texts, 'nomic-embed-text');
            if (count($vecs) === count($uploadChunks)) {
                foreach ($uploadChunks as $i => $chunk) {
                    $this->uploadVecs[] = [
                        'meta' => $chunk,
                        'vec' => $vecs[$i],
                    ];
                }
            } else {
                $uploadStatus = 'warning';
                $uploadDetail = 'Upload embedding returned an unexpected count; uploaded chunks will not participate in retrieval.';
            }
        } catch (Throwable $e) {
            error_log('DBN deep research upload embed failed: ' . $e->getMessage());
            $uploadStatus = 'warning';
            $uploadDetail = 'Upload embedding gateway unreachable; uploaded chunks will not participate in retrieval.';
            $this->uploadVecs = [];
        }
    } elseif (empty($uploadedFiles)) {
        $uploadDetail = 'No files uploaded; agent will research the corpus only.';
    }
    $this->stepTimings['upload_indexing'] = $this->elapsedMs($stepStart);
    $trace[] = $this->trace('Upload indexing', $uploadDetail, $uploadStatus);

    // STEP 5: Retrieval (per sub-question)
    $stepStart = microtime(true);
    // When expansion fell back, retrieve on a single synthetic sub-question built from the seed.
    $retrievalQueries = $subQuestions ?: [[
        'id' => 'q1',
        'question' => $seedQuery !== '' ? $seedQuery : ($interpretation['brief'] ?: 'legal research'),
        'rationale' => 'Seed query (no sub-question expansion).',
    ]];

    try {
        $rag = new ClientRagPipeline((int)$client['id'], 'http://10.0.1.10:4000', 60);
    } catch (Throwable $e) {
        // Log the cause like every other failure path; the client only sees the generic message.
        error_log('DBN deep research RAG pipeline init failed: ' . $e->getMessage());
        dbnToolsAbort('Could not initialise the retrieval pipeline.', 503, 'rag_init_failed');
    }

    $rawPool = [];
    $retrievalWarnings = 0;
    foreach ($retrievalQueries as $sq) {
        try {
            $corpusChunks = $rag->searchAll(
                $sq['question'],
                $controls['chunk_limit'],
                null,
                [
                    'search_private' => false,
                    'search_shared' => true,
                    'package_ids' => [(int)$package['id']],
                    'shared_doc_ids' => $sharedDocIds,
                    'chunk_limit' => $controls['chunk_limit'],
                    'search_method' => 'hybrid',
                    'reranker_enabled' => true,
                ]
            );
        } catch (Throwable $e) {
            error_log('DBN deep research sub-Q retrieval failed: ' . $e->getMessage());
            $corpusChunks = [];
            $retrievalWarnings++;
        }
        foreach ($corpusChunks as $chunk) {
            $rawPool[] = $this->normalizeCorpusChunk($chunk, $sq['id']);
        }

        // Upload chunk retrieval via cosine sim
        if (!empty($this->uploadVecs)) {
            $uploadHits = $this->retrieveFromUploads($sq['question'], $controls['chunk_limit'], $controls['similarity_threshold']);
            foreach ($uploadHits as $hit) {
                $hit['matched_sub_questions'] = [$sq['id']];
                $rawPool[] = $hit;
            }
        }
    }

    $merged = $this->mergeAndDedupe($rawPool, self::POOL_CAP);
    $this->stepTimings['retrieval'] = $this->elapsedMs($stepStart);
    $retrievalStatus = $retrievalWarnings > 0 ? 'warning' : 'complete';
    $trace[] = $this->trace(
        'Retrieval',
        sprintf(
            '%d sub-question(s) × hybrid + RRF + rerank → %d raw chunks → %d unique after dedupe.',
            count($retrievalQueries),
            count($rawPool),
            count($merged)
        ),
        $retrievalStatus
    );

    // Cap pool to reranker top-K for synthesis
    $synthesisPool = array_slice($merged, 0, $controls['reranker_top_k']);
    $numberedSources = $this->numberSources($synthesisPool);

    // STEP 6: Synthesis
    $stepStart = microtime(true);
    $synthesis = $this->synthesise(
        $seedDescription,
        $interpretation['brief'],
        $retrievalQueries,
        $numberedSources,
        $engine,
        $language,
        $controls['temperature']
    );
    $this->stepTimings['synthesis'] = $this->elapsedMs($stepStart);
    $trace[] = $this->trace(
        'Synthesis',
        sprintf('%s synthesised the brief using %d grounded source(s).', $synthesis['deploy_label'], count($numberedSources)),
        'complete'
    );

    // STEP 7: Confidence
    $confidence = $this->citationConfidence($numberedSources);
    $trace[] = $this->trace(
        'Citation confidence',
        sprintf('%s confidence based on %d source(s) and reranker score distribution.', ucfirst($confidence), count($numberedSources)),
        $confidence === 'low' ? 'warning' : 'complete'
    );

    // Stitch sub-question chunk_ids
    $subQOut = [];
    foreach ($retrievalQueries as $sq) {
        $matchedChunks = array_values(array_filter(
            $numberedSources,
            fn(array $s) => in_array($sq['id'], $s['matched_sub_questions'] ?? [], true)
        ));
        $subQOut[] = [
            'id' => $sq['id'],
            'question' => $sq['question'],
            'rationale' => $sq['rationale'] ?? '',
            'chunk_ids' => array_values(array_map(fn(array $s) => $s['chunk_id'], $matchedChunks)),
        ];
    }

    return [
        'tool' => 'deep_research',
        'language' => $language,
        'brief_markdown' => (string)($synthesis['json']['brief_markdown'] ?? $synthesis['json']['answer'] ?? ''),
        'sub_questions' => $subQOut,
        'sources' => $numberedSources,
        'what_we_found' => (string)($synthesis['json']['what_we_found'] ?? ''),
        'evidence_trail' => $numberedSources,
        'what_remains_uncertain' => $synthesis['json']['what_remains_uncertain'] ?? [],
        'next_practical_step' => (string)($synthesis['json']['next_practical_step'] ?? ''),
        'trace' => $trace,
        'trace_metadata' => [
            'chunk_count' => count($merged),
            'source_count' => count($numberedSources),
            'sub_question_count' => count($retrievalQueries),
            'upload_chunk_count' => count($this->uploadVecs),
            'deployment' => $synthesis['deploy_label'],
            'engine_used' => $engine,
            'citation_confidence' => $confidence,
            'elapsed_ms_per_step' => $this->stepTimings,
            'slices_active' => array_keys(array_filter($sliceSelectionNormalized)),
        ],
        'disclaimer' => dbnToolsDisclaimer($language),
    ];
}
|
||||
|
||||
/**
 * Clamp the caller-supplied tuning controls into their supported ranges.
 *
 * Missing keys fall back to defaults (4 sub-questions, 6 chunks, 0.30
 * similarity threshold, reranker top-12, temperature 0.15).
 *
 * @param array $controls Raw control values keyed by control name.
 *
 * @return array The five controls, cast and clamped.
 */
private function normalizeControls(array $controls): array
{
    $subQuestionCount = (int)($controls['sub_q_count'] ?? 4);
    $chunkLimit = (int)($controls['chunk_limit'] ?? 6);
    $similarityFloor = (float)($controls['similarity_threshold'] ?? 0.30);
    $rerankerTopK = (int)($controls['reranker_top_k'] ?? 12);
    $temperature = (float)($controls['temperature'] ?? 0.15);

    $normalized = [];
    $normalized['sub_q_count'] = max(3, min(5, $subQuestionCount));
    $normalized['chunk_limit'] = max(4, min(10, $chunkLimit));
    $normalized['similarity_threshold'] = max(0.2, min(0.6, $similarityFloor));
    $normalized['reranker_top_k'] = max(8, min(14, $rerankerTopK));
    $normalized['temperature'] = max(0.05, min(0.4, $temperature));

    return $normalized;
}
|
||||
|
||||
/**
 * Fetch the 'family-legal' package and verify the client may use it.
 *
 * Aborts with a 503 when the package is missing/inactive or when the client
 * has no active subscription to it.
 *
 * @param int $clientId Client whose subscription is checked.
 *
 * @return array The package row returned by dbnToolsFetchPackage().
 */
private function requireFamilyPackage(int $clientId): array
{
    $familyPackage = dbnToolsFetchPackage('family-legal');

    $packageUsable = $familyPackage && !empty($familyPackage['is_active']);
    if (!$packageUsable) {
        dbnToolsAbort('The family-legal corpus package is not active.', 503, 'package_unavailable');
    }

    $packageId = (int)$familyPackage['id'];
    $subscribed = dbnToolsHasActiveSubscription($clientId, $packageId);
    if (!$subscribed) {
        dbnToolsAbort('Do Better Norge does not have an active family-legal subscription.', 503, 'subscription_missing');
    }

    return $familyPackage;
}
|
||||
|
||||
/**
 * Assemble the textual seed that is embedded into the LLM prompts.
 *
 * The question and pasted text are each clipped to MAX_SEED_CHARS, and every
 * non-empty upload to MAX_UPLOAD_CHARS. Sections are joined by blank lines.
 *
 * @param string $seedQuery     Trimmed user question.
 * @param string $pastedText    Trimmed pasted text.
 * @param array  $uploadedFiles Upload entries read via 'filename' and 'text'.
 *
 * @return string The combined description; empty when nothing was supplied.
 */
private function buildSeedDescription(string $seedQuery, string $pastedText, array $uploadedFiles): string
{
    $clip = static fn(string $value, int $maxChars): string => mb_substr($value, 0, $maxChars, 'UTF-8');

    $sections = [];

    $headedInputs = [
        "Question:\n" => $seedQuery,
        "Pasted text:\n" => $pastedText,
    ];
    foreach ($headedInputs as $heading => $body) {
        if ($body !== '') {
            $sections[] = $heading . $clip($body, self::MAX_SEED_CHARS);
        }
    }

    foreach ($uploadedFiles as $position => $upload) {
        // Uploads without a filename get a 1-based placeholder name.
        $label = (string)($upload['filename'] ?? sprintf('upload-%d', $position + 1));
        $body = (string)($upload['text'] ?? '');
        if ($body !== '') {
            $sections[] = sprintf("Uploaded file [%s]:\n%s", $label, $clip($body, self::MAX_UPLOAD_CHARS));
        }
    }

    return implode("\n\n", $sections);
}
|
||||
|
||||
/**
 * Ask the Azure gateway for a short research brief and retrieval signals.
 *
 * The step is best-effort: any gateway or parsing failure is logged and a
 * fallback result with an empty brief is returned, so the pipeline continues
 * on the raw seed input.
 *
 * @param string $seedDescription Combined seed text embedded in the prompt.
 * @param string $language        'no' selects Norwegian; anything else English.
 *
 * @return array{brief: string, detail: string} Brief plus a trace-ready detail line.
 */
private function interpretSeed(string $seedDescription, string $language): array
{
    $locale = $language === 'no' ? 'Norwegian' : 'English';
    $prompt = <<<PROMPT
    You are reviewing the input below to set up a deep legal research pass against the Do Better Norge family-law corpus.

    Input:
    {$seedDescription}

    In {$locale}, produce JSON with:
    {
    "brief": "1-3 sentence description of what the user is trying to research (≤ 220 chars)",
    "key_signals": ["short keywords or terms that should drive retrieval"]
    }
    PROMPT;

    try {
        $raw = $this->azure->chatText([
            ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
            ['role' => 'user', 'content' => $prompt],
        ], ['json' => true, 'temperature' => 0.1, 'max_tokens' => 400, 'timeout' => 30]);
        $json = $this->azure->decodeJsonObject($raw);

        // The model's JSON is untrusted: only accept a scalar brief, otherwise the
        // string cast would yield "Array" plus a conversion warning.
        $briefValue = is_array($json) ? ($json['brief'] ?? null) : null;
        $brief = is_scalar($briefValue) ? trim((string)$briefValue) : '';

        if ($brief !== '') {
            $signals = is_array($json['key_signals'] ?? null) ? $json['key_signals'] : [];
            // Drop nested arrays/objects so implode() only ever sees printable values.
            $signals = array_values(array_filter(
                $signals,
                static fn($signal): bool => is_string($signal) || is_int($signal) || is_float($signal)
            ));
            $signalText = implode(', ', array_slice($signals, 0, 6));

            return [
                'brief' => $brief,
                'detail' => sprintf('Research focus: %s%s', $brief, $signalText !== '' ? ' — signals: ' . $signalText : ''),
            ];
        }
    } catch (Throwable $e) {
        error_log('DBN deep research interpretation failed: ' . $e->getMessage());
    }

    return [
        'brief' => '',
        'detail' => 'Interpretation step skipped — proceeding with raw seed input.',
    ];
}
|
||||
|
||||
/**
 * Decompose the request into focused sub-questions via the Azure gateway.
 *
 * Returns at most $targetCount sub-questions with contiguous ids (q1, q2, ...).
 * When fewer than two usable questions come back, or the gateway call fails,
 * the result is an empty list flagged as a fallback.
 *
 * @param string $seedDescription Combined seed text embedded in the prompt.
 * @param string $brief           Research brief from the interpretation step (may be empty).
 * @param int    $targetCount     Number of sub-questions to request and keep.
 * @param string $language        'no' selects Norwegian; anything else English.
 *
 * @return array{questions: array, fallback: bool}
 */
private function expandQueries(string $seedDescription, string $brief, int $targetCount, string $language): array
{
    $locale = $language === 'no' ? 'Norwegian' : 'English';
    $prompt = <<<PROMPT
    You are decomposing a Do Better Norge legal-research request into {$targetCount} focused sub-questions that should each be answered by the legal corpus (Norwegian family law, child welfare, ECHR/Hague).

    Research brief:
    {$brief}

    Raw input:
    {$seedDescription}

    Return JSON only:
    {
    "sub_questions": [
    {"id":"q1","question":"... ({$locale})","rationale":"why this angle matters (≤ 140 chars)"}
    ]
    }

    Rules:
    - Exactly {$targetCount} sub-questions, no more, no fewer.
    - Each sub-question must be answerable with Norwegian family-law, child-welfare, or ECHR sources.
    - Each sub-question must explore a DIFFERENT angle (statute interpretation, procedural fairness, ECHR case law, evidence/factual frame, comparative authority).
    - Sub-questions must be self-contained — readable without seeing the seed text.
    - Write the questions in {$locale}.
    PROMPT;

    try {
        $raw = $this->azure->chatText([
            ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
            ['role' => 'user', 'content' => $prompt],
        ], ['json' => true, 'temperature' => 0.2, 'max_tokens' => 700, 'timeout' => 35]);
        $json = $this->azure->decodeJsonObject($raw);
        $items = is_array($json['sub_questions'] ?? null) ? $json['sub_questions'] : [];

        $normalized = [];
        foreach ($items as $item) {
            // Model output is untrusted: skip entries whose question is missing or not a scalar.
            if (!is_array($item) || !is_scalar($item['question'] ?? null)) {
                continue;
            }
            // Trim before the emptiness check so whitespace-only questions are rejected.
            $question = trim((string)$item['question']);
            if ($question === '') {
                continue;
            }
            $rationale = $item['rationale'] ?? '';

            $normalized[] = [
                // Number by accepted position, not by the raw key, so ids stay
                // contiguous after skips and never depend on the key type.
                'id' => 'q' . (count($normalized) + 1),
                'question' => $question,
                'rationale' => is_scalar($rationale) ? trim((string)$rationale) : '',
            ];
            if (count($normalized) >= $targetCount) {
                break;
            }
        }

        if (count($normalized) >= 2) {
            return ['questions' => $normalized, 'fallback' => false];
        }
    } catch (Throwable $e) {
        error_log('DBN deep research expansion failed: ' . $e->getMessage());
    }

    return ['questions' => [], 'fallback' => true];
}
|
||||
|
||||
/**
 * Split an uploaded document into overlapping word-window chunks.
 *
 * Whitespace is collapsed, then windows of CHUNK_WORDS words are taken with a
 * stride of CHUNK_WORDS - CHUNK_OVERLAP_WORDS. Windowing stops as soon as a
 * window reaches the end of the text, so no chunk consisting solely of
 * already-covered overlap words is produced.
 *
 * @param string $text     Raw document text.
 * @param string $filename Name recorded on every chunk.
 * @param int    $fileIdx  Index of the file, used in the chunk id.
 *
 * @return array List of chunks with keys chunk_id, file_index, chunk_index, filename, text.
 */
private function splitIntoChunks(string $text, string $filename, int $fileIdx): array
{
    // preg_replace() returns null on failure (e.g. invalid UTF-8 with /u); treat that as empty.
    $text = preg_replace('/\s+/u', ' ', trim($text)) ?? '';
    if ($text === '') {
        return [];
    }
    $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY) ?: [];
    if (!$words) {
        return [];
    }

    $total = count($words);
    $stride = max(1, self::CHUNK_WORDS - self::CHUNK_OVERLAP_WORDS);

    $chunks = [];
    for ($start = 0; $start < $total; $start += $stride) {
        $slice = array_slice($words, $start, self::CHUNK_WORDS);

        // The first window is always kept, however short; later ones must meet the minimum size.
        if ($start === 0 || count($slice) >= self::MIN_CHUNK_WORDS) {
            $chunkIdx = count($chunks);
            $chunks[] = [
                'chunk_id' => sprintf('upload:%d:%d', $fileIdx, $chunkIdx),
                'file_index' => $fileIdx,
                'chunk_index' => $chunkIdx,
                'filename' => $filename,
                'text' => implode(' ', $slice),
            ];
        }

        // This window already covers the last word: any further window would
        // contain nothing but overlap from this one.
        if ($start + count($slice) >= $total) {
            break;
        }
    }

    return $chunks;
}
|
||||
|
||||
/**
 * Rank the in-memory upload chunks against one sub-question by cosine similarity.
 *
 * Chunks scoring below $threshold are discarded. The best ceil($limitPerSubQ / 2)
 * hits (at least one slot) are returned. Embedding failures are logged and
 * yield an empty result.
 *
 * @param string $question     Sub-question text to embed.
 * @param int    $limitPerSubQ Per-sub-question chunk limit; uploads get half of it.
 * @param float  $threshold    Minimum similarity for a chunk to qualify.
 *
 * @return array List of source records with source_origin 'upload'.
 */
private function retrieveFromUploads(string $question, int $limitPerSubQ, float $threshold): array
{
    if (empty($this->uploadVecs)) {
        return [];
    }

    try {
        $queryVector = $this->ai->embed($question, 'nomic-embed-text');
    } catch (Throwable $e) {
        error_log('DBN deep research sub-Q embed failed: ' . $e->getMessage());
        return [];
    }
    if (empty($queryVector)) {
        return [];
    }

    $hits = [];
    foreach ($this->uploadVecs as $indexed) {
        $score = $this->cosineSim($queryVector, $indexed['vec']);
        if ($score < $threshold) {
            continue;
        }

        $meta = $indexed['meta'];
        $hits[] = [
            'chunk_id' => $meta['chunk_id'],
            'title' => 'uploaded: ' . $meta['filename'],
            'section' => null,
            'package_or_corpus' => 'Your upload',
            'excerpt' => dbnToolsExcerpt($meta['text'], 620),
            'chunk_text' => $meta['text'],
            'similarity' => round($score, 4),
            'reranker_score' => null,
            'document_id' => null,
            'source_origin' => 'upload',
            'authority_type' => null,
            'jurisdiction' => null,
        ];
    }

    // Highest similarity first.
    usort($hits, fn(array $left, array $right) => ($right['similarity'] <=> $left['similarity']));

    $quota = max(1, (int)ceil($limitPerSubQ / 2));

    return array_slice($hits, 0, $quota);
}
|
||||
|
||||
/**
 * Cosine similarity of two numeric vectors.
 *
 * Only the first min(count($a), count($b)) components are compared. Returns
 * 0.0 when either vector is empty or has zero magnitude over that range.
 *
 * @param array $a First vector (0-indexed list of numbers).
 * @param array $b Second vector (0-indexed list of numbers).
 *
 * @return float Similarity value.
 */
private function cosineSim(array $a, array $b): float
{
    $dims = min(count($a), count($b));
    if ($dims === 0) {
        return 0.0;
    }

    $dot = 0.0;
    $sumSqA = 0.0;
    $sumSqB = 0.0;
    $i = 0;
    while ($i < $dims) {
        $left = (float)$a[$i];
        $right = (float)$b[$i];
        $dot += $left * $right;
        $sumSqA += $left * $left;
        $sumSqB += $right * $right;
        $i++;
    }

    $bothNonZero = !($sumSqA === 0.0) && !($sumSqB === 0.0);
    if (!$bothNonZero) {
        return 0.0;
    }

    return $dot / (sqrt($sumSqA) * sqrt($sumSqB));
}
|
||||
|
||||
/**
 * Map a raw corpus search hit onto the shared source-record shape.
 *
 * @param array  $chunk  Raw hit as returned by the retrieval pipeline.
 * @param string $subQId Id of the sub-question that produced the hit.
 *
 * @return array Source record with source_origin 'corpus'.
 */
private function normalizeCorpusChunk(array $chunk, string $subQId): array
{
    // Scores are rounded to four decimals when present and stay null otherwise.
    $roundedOrNull = static fn($value) => $value === null ? null : round((float)$value, 4);
    $intOrNull = static fn($value) => $value === null ? null : (int)$value;

    $body = (string)($chunk['content'] ?? '');

    $record = [];
    $record['chunk_id'] = $intOrNull($chunk['id'] ?? null);
    $record['title'] = (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source');
    $record['section'] = $chunk['section_title'] ?? null;
    $record['package_or_corpus'] = (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Norge');
    $record['excerpt'] = dbnToolsExcerpt($body, 620);
    $record['chunk_text'] = $body;
    $record['similarity'] = $roundedOrNull($chunk['similarity'] ?? null);
    $record['reranker_score'] = $roundedOrNull($chunk['reranker_score'] ?? null);
    $record['document_id'] = $intOrNull($chunk['document_id'] ?? null);
    $record['source_origin'] = 'corpus';
    $record['authority_type'] = $chunk['authority_type'] ?? null;
    $record['jurisdiction'] = $chunk['jurisdiction'] ?? null;
    $record['matched_sub_questions'] = [$subQId];

    return $record;
}
|
||||
|
||||
/**
 * Collapse duplicate hits, rank the pool and cap its size.
 *
 * Hits are keyed by source origin plus chunk id; a hit without a chunk id gets
 * a random key and is therefore never merged. Duplicates union their matched
 * sub-questions and keep the higher similarity and reranker score. The pool is
 * then ordered by reranker score (falling back to similarity) descending.
 *
 * @param array $rawPool All hits collected across sub-questions.
 * @param int   $cap     Maximum number of hits to return.
 *
 * @return array Deduplicated, ranked list of at most $cap hits.
 */
private function mergeAndDedupe(array $rawPool, int $cap): array
{
    $pool = [];
    foreach ($rawPool as $candidate) {
        $origin = $candidate['source_origin'] ?? 'corpus';
        $identity = $candidate['chunk_id'] ?? bin2hex(random_bytes(4));
        $dedupeKey = $origin . ':' . $identity;

        if (isset($pool[$dedupeKey])) {
            $kept = $pool[$dedupeKey];

            $combinedIds = array_merge(
                $kept['matched_sub_questions'] ?? [],
                $candidate['matched_sub_questions'] ?? []
            );
            $kept['matched_sub_questions'] = array_values(array_unique($combinedIds));

            // Keep the higher value of each score.
            foreach (['similarity', 'reranker_score'] as $scoreField) {
                if (($candidate[$scoreField] ?? 0) > ($kept[$scoreField] ?? 0)) {
                    $kept[$scoreField] = $candidate[$scoreField];
                }
            }

            $pool[$dedupeKey] = $kept;
        } else {
            $pool[$dedupeKey] = $candidate;
        }
    }

    $ranked = array_values($pool);
    $rankingScore = static fn(array $hit) => $hit['reranker_score'] ?? $hit['similarity'] ?? 0;
    usort(
        $ranked,
        static fn(array $first, array $second): int => $rankingScore($second) <=> $rankingScore($first)
    );

    return array_slice($ranked, 0, $cap);
}
|
||||
|
||||
/**
 * Attach a 1-based citation number 'n' to every chunk.
 *
 * The number is derived from each chunk's key in $chunks plus one.
 *
 * @param array $chunks Chunks selected for synthesis.
 *
 * @return array The same chunks as a list, each with an 'n' entry.
 */
private function numberSources(array $chunks): array
{
    return array_map(
        static function ($chunk, $position) {
            $chunk['n'] = $position + 1;

            return $chunk;
        },
        $chunks,
        array_keys($chunks)
    );
}
|
||||
|
||||
/**
 * Produce the cited research brief from the numbered sources.
 *
 * With no sources, a canned "not enough support" result is returned without
 * calling any model. Otherwise the prompt is sent to the engine selected by
 * $engine; output that cannot be decoded into the expected JSON is salvaged
 * as raw markdown. A failed model request is logged and aborts with a 502.
 *
 * @param string $seedDescription Combined seed text embedded in the prompt.
 * @param string $brief           Research brief from the interpretation step.
 * @param array  $subQuestions    Sub-questions with 'id' and 'question' keys.
 * @param array  $numberedSources Sources carrying 'n', 'title', 'excerpt', etc.
 * @param string $engine          'gpu', 'azure_full', or anything else for the default Azure deployment.
 * @param string $language        'no' selects Norwegian; anything else English.
 * @param float  $temperature     Sampling temperature passed to the model.
 *
 * @return array{json: array, deploy_label: string}
 */
private function synthesise(
    string $seedDescription,
    string $brief,
    array $subQuestions,
    array $numberedSources,
    string $engine,
    string $language,
    float $temperature
): array {
    $locale = $language === 'no' ? 'Norwegian' : 'English';

    if (empty($numberedSources)) {
        return [
            'json' => [
                'brief_markdown' => $language === 'no'
                    ? 'Jeg fant ikke tilstrekkelig kildestøtte i korpuset til å gi et grunnlagsbasert svar.'
                    : 'I did not find enough source support in the corpus to give a grounded answer.',
                'what_we_found' => 'No retrieved sources passed the similarity threshold.',
                'what_remains_uncertain' => ['No corpus evidence retrieved for the given query and slice selection.'],
                'next_practical_step' => 'Try widening slice selection or rephrasing with more specific statutory or party terms.',
            ],
            'deploy_label' => $engine === 'gpu' ? 'GPU (cuttlefish)' : ($engine === 'azure_full' ? 'gpt-4o' : $this->azure->chatDeployment()),
        ];
    }

    // One context entry per source; only the excerpt (not the full chunk text) is sent to the model.
    $sourcesContext = [];
    foreach ($numberedSources as $s) {
        $sourcesContext[] = sprintf(
            "[%d] (%s) %s%s\n Corpus: %s\n Excerpt: %s",
            $s['n'],
            $s['source_origin'] === 'upload' ? 'uploaded doc' : 'corpus',
            $s['title'],
            !empty($s['section']) ? ' — ' . $s['section'] : '',
            $s['package_or_corpus'],
            $s['excerpt']
        );
    }
    $sourcesText = implode("\n\n", $sourcesContext);

    $subQText = '';
    if ($subQuestions) {
        $lines = array_map(
            fn(array $sq, int $i): string => sprintf('%d. (%s) %s', $i + 1, $sq['id'], $sq['question']),
            $subQuestions,
            array_keys($subQuestions)
        );
        $subQText = "\nSub-questions explored:\n" . implode("\n", $lines);
    }

    $prompt = <<<PROMPT
    You are Do Better Norge Legal Tools running a deep-research synthesis. You MUST ground every claim in the numbered sources below, using inline `[n]` citation markers that map to the source list. Do NOT cite a source you did not use. Do NOT invent statutes, paragraph numbers, case names, dates, or parties.

    User input:
    {$seedDescription}

    Research brief:
    {$brief}
    {$subQText}

    Sources (numbered):
    {$sourcesText}

    Return JSON only in {$locale}:
    {
    "brief_markdown": "Markdown legal brief, 250-700 words, with inline [n] citation markers keyed to the sources above. Use short paragraphs. End with a one-line caveat. Do NOT include headings above level 3 (###).",
    "what_we_found": "1-2 sentence plain-language summary of the grounded finding",
    "what_remains_uncertain": ["gaps or caveats — what the corpus did not cover or where confidence is limited"],
    "next_practical_step": "one concrete next action the user can take"
    }

    Rules:
    - Every factual claim in `brief_markdown` must end with one or more `[n]` markers.
    - If no source supports a point, omit the point.
    - Respond in {$locale}.
    - Output valid JSON only — no markdown fences around the JSON.
    PROMPT;

    $messages = [
        ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
        ['role' => 'user', 'content' => $prompt],
    ];
    $opts = ['json' => true, 'temperature' => $temperature, 'max_tokens' => 2200, 'timeout' => 120];

    try {
        if ($engine === 'gpu') {
            $response = dbnToolsCallGpuLlm($messages, $opts);
            $deployLabel = 'GPU (cuttlefish)';
            $raw = (string)($response['choices'][0]['message']['content'] ?? '');
        } elseif ($engine === 'azure_full') {
            $raw = $this->azure->withDeployment('gpt-4o')->chatText($messages, $opts);
            $deployLabel = 'gpt-4o';
        } else {
            $raw = $this->azure->chatText($messages, $opts);
            $deployLabel = $this->azure->chatDeployment();
        }
    } catch (Throwable $e) {
        // Record the failure server-side like every other LLM step in this class.
        error_log('DBN deep research synthesis failed: ' . $e->getMessage());
        // NOTE(review): the upstream exception text is forwarded in the abort message;
        // confirm that exposing it to the client is intended.
        dbnToolsAbort('Synthesis LLM request failed: ' . $e->getMessage(), 502, 'llm_error');
    }

    $json = $this->azure->decodeJsonObject($raw);
    if (!is_array($json) || empty($json['brief_markdown'])) {
        // Salvage as plain markdown
        $json = [
            'brief_markdown' => $raw,
            'what_we_found' => 'Synthesis returned non-structured output; rendered as raw markdown.',
            'what_remains_uncertain' => ['Response format could not be validated as structured JSON.'],
            'next_practical_step' => 'Review the brief manually before relying on it.',
        ];
    }

    return [
        'json' => $json,
        'deploy_label' => $deployLabel,
    ];
}
|
||||
|
||||
/**
 * Grade citation confidence from the source count and the best available score.
 *
 * Each source contributes its reranker score, or its similarity when no
 * reranker score is set. 'high' needs at least 6 sources and a best score of
 * 0.5 or more; 'medium' needs at least 3 sources and 0.35 or more; everything
 * else, including an empty source list, is 'low'.
 *
 * @param array $sources Numbered source records.
 *
 * @return string 'high', 'medium' or 'low'.
 */
private function citationConfidence(array $sources): string
{
    $sourceCount = count($sources);
    if ($sourceCount === 0) {
        return 'low';
    }

    $candidates = array_map(
        static fn(array $source) => $source['reranker_score'] ?? $source['similarity'] ?? null,
        $sources
    );
    $numericScores = array_filter($candidates, static fn($value): bool => is_numeric($value));
    $topScore = $numericScores ? max($numericScores) : 0;

    return match (true) {
        $sourceCount >= 6 && $topScore >= 0.5 => 'high',
        $sourceCount >= 3 && $topScore >= 0.35 => 'medium',
        default => 'low',
    };
}
|
||||
|
||||
/**
 * Build one trace entry for the response payload.
 *
 * @param string $label  Step name.
 * @param string $detail Human-readable description of what happened.
 * @param string $status Step status; defaults to 'complete'.
 *
 * @return array{label: string, detail: string, status: string}
 */
private function trace(string $label, string $detail, string $status = 'complete'): array
{
    return compact('label', 'detail', 'status');
}
|
||||
|
||||
/**
 * Milliseconds elapsed since $start, rounded to the nearest integer.
 *
 * @param float $start A timestamp previously taken with microtime(true).
 *
 * @return int Elapsed wall-clock time in milliseconds.
 */
private function elapsedMs(float $start): int
{
    $elapsedSeconds = microtime(true) - $start;
    $elapsedMillis = $elapsedSeconds * 1000;

    return (int)round($elapsedMillis);
}
|
||||
}
|
||||
@@ -487,3 +487,192 @@ function dbnToolsExcerpt(string $text, int $limit = 520): string
|
||||
}
|
||||
return rtrim(mb_substr($text, 0, $limit - 1, 'UTF-8')) . '…';
|
||||
}
|
||||
|
||||
const DBN_TOOLS_EXTRACT_MAX_BYTES = 4 * 1024 * 1024;
|
||||
const DBN_TOOLS_EXTRACT_TEXT_LIMIT = 128000;
|
||||
const DBN_TOOLS_EXTRACT_ALLOWED_EXTS = ['txt', 'pdf', 'docx'];
|
||||
|
||||
/**
 * Validate one uploaded-file entry and extract its text.
 *
 * Checks, in order: upload error code, that the temp path is a genuine upload,
 * non-zero size, the DBN_TOOLS_EXTRACT_MAX_BYTES limit, and the extension
 * allow-list. Extracted text is trimmed and cut to
 * DBN_TOOLS_EXTRACT_TEXT_LIMIT characters. Each failed check ends the request
 * through dbnToolsAbort().
 *
 * @param array $file Upload entry with 'error', 'name', 'tmp_name' and 'size' keys.
 *
 * @return array Keys: ok, text, filename, chars, truncated.
 */
function dbnToolsExtractUploadedFile(array $file): array
{
    $uploadStatus = (int)($file['error'] ?? UPLOAD_ERR_NO_FILE);
    if ($uploadStatus !== UPLOAD_ERR_OK) {
        $knownFailures = [
            UPLOAD_ERR_INI_SIZE => 'The file exceeds the allowed size limit.',
            UPLOAD_ERR_FORM_SIZE => 'The file exceeds the allowed size limit.',
            UPLOAD_ERR_NO_TMP_DIR => 'No temporary directory is available.',
            UPLOAD_ERR_CANT_WRITE => 'Unable to save the uploaded file.',
        ];
        dbnToolsAbort($knownFailures[$uploadStatus] ?? 'File upload failed.', 422, 'upload_error');
    }

    // basename() discards any directory components supplied in the client's filename.
    $clientName = basename((string)($file['name'] ?? ''));
    $tempFile = (string)($file['tmp_name'] ?? '');
    $byteCount = (int)($file['size'] ?? 0);

    if (!is_uploaded_file($tempFile)) {
        dbnToolsAbort('Invalid file upload.', 400, 'invalid_upload');
    }
    if ($byteCount === 0) {
        dbnToolsAbort('The uploaded file is empty.', 422, 'file_empty');
    }
    if ($byteCount > DBN_TOOLS_EXTRACT_MAX_BYTES) {
        dbnToolsAbort('File exceeds the 4 MB limit.', 413, 'file_too_large');
    }

    $extension = strtolower(pathinfo($clientName, PATHINFO_EXTENSION));
    $extensionAllowed = in_array($extension, DBN_TOOLS_EXTRACT_ALLOWED_EXTS, true);
    if (!$extensionAllowed) {
        dbnToolsAbort('Unsupported file type. Upload a .pdf, .docx, or .txt file.', 422, 'unsupported_type');
    }

    $extracted = trim(match ($extension) {
        'txt' => dbnToolsExtractTxt($tempFile),
        'pdf' => dbnToolsExtractPdf($tempFile),
        'docx' => dbnToolsExtractDocx($tempFile),
    });
    if ($extracted === '') {
        dbnToolsAbort('No text could be extracted from this file.', 422, 'no_text');
    }

    $wasTruncated = mb_strlen($extracted, 'UTF-8') > DBN_TOOLS_EXTRACT_TEXT_LIMIT;
    if ($wasTruncated) {
        $extracted = mb_substr($extracted, 0, DBN_TOOLS_EXTRACT_TEXT_LIMIT, 'UTF-8');
    }

    return [
        'ok' => true,
        'text' => $extracted,
        'filename' => $clientName,
        'chars' => mb_strlen($extracted, 'UTF-8'),
        'truncated' => $wasTruncated,
    ];
}
|
||||
|
||||
/**
 * Read a plain-text file and return its contents converted to UTF-8.
 *
 * The source encoding is chosen by mbstring from the candidate list
 * "UTF-8, ISO-8859-1, Windows-1252".
 *
 * @param string $path Filesystem path of the file to read.
 * @return string The file contents as UTF-8.
 * @throws DbnToolsHttpException With status 500 / code "read_error" when the file cannot be read.
 */
function dbnToolsExtractTxt(string $path): string
{
    $raw = file_get_contents($path);

    if ($raw !== false) {
        return mb_convert_encoding($raw, 'UTF-8', 'UTF-8, ISO-8859-1, Windows-1252');
    }

    throw new DbnToolsHttpException('Unable to read the file.', 500, 'read_error');
}
|
||||
|
||||
/**
 * Extract the text of a PDF by shelling out to the `pdftotext` command.
 *
 * The command is invoked with "-" as the output argument so the text arrives
 * on stdout; stderr is redirected to /dev/null.
 *
 * @param string $path Filesystem path of the PDF (shell-escaped before use).
 * @return string The raw text printed by pdftotext.
 * @throws DbnToolsHttpException With status 422 / code "pdf_extract_failed" when the command
 *                               yields no output or only whitespace.
 */
function dbnToolsExtractPdf(string $path): string
{
    $extracted = shell_exec('pdftotext ' . escapeshellarg($path) . ' - 2>/dev/null');

    // shell_exec() gives null/false when nothing usable came back; treat blank output the same way.
    $hasText = is_string($extracted) && trim($extracted) !== '';
    if ($hasText) {
        return $extracted;
    }

    throw new DbnToolsHttpException(
        'PDF text extraction failed. The file may be image-only or encrypted.',
        422,
        'pdf_extract_failed'
    );
}
|
||||
|
||||
/**
 * Extract the body text of a .docx file.
 *
 * Opens the file as a ZIP archive, parses `word/document.xml`, and returns one
 * line per `w:p` paragraph. Within a paragraph, `w:t` text is concatenated and
 * in-run `w:tab` / `w:br` / `w:cr` elements are rendered as a tab or newline so
 * that words separated only by those elements do not fuse together.
 *
 * @param string $path Filesystem path of the .docx file.
 * @return string Paragraph texts joined with "\n" (may be empty if the document has no text).
 * @throws DbnToolsHttpException With status 422 when the archive cannot be opened
 *                               ("docx_open_failed"), has no usable document part
 *                               ("docx_no_content"), or the part is not well-formed XML
 *                               ("docx_parse_failed").
 */
function dbnToolsExtractDocx(string $path): string
{
    $zip = new ZipArchive();
    if ($zip->open($path) !== true) {
        throw new DbnToolsHttpException('Unable to open the .docx file.', 422, 'docx_open_failed');
    }

    $xml = $zip->getFromName('word/document.xml');
    $zip->close();

    // A zero-byte part must be rejected here: DOMDocument::loadXML('') throws a ValueError on PHP 8.
    if ($xml === false || $xml === '') {
        throw new DbnToolsHttpException('No document content found in this .docx file.', 422, 'docx_no_content');
    }

    $doc = new DOMDocument();

    // Silence libxml warnings only for this parse, then put the process-wide setting back.
    $previousErrorMode = libxml_use_internal_errors(true);
    try {
        // LIBXML_NONET: the XML comes from an uploaded file, so never allow network fetches while parsing.
        $loaded = $doc->loadXML($xml, LIBXML_NONET);
    } finally {
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);
    }

    if ($loaded === false) {
        throw new DbnToolsHttpException('Unable to parse the .docx file.', 422, 'docx_parse_failed');
    }

    $xpath = new DOMXPath($doc);
    $xpath->registerNamespace('w', 'http://schemas.openxmlformats.org/wordprocessingml/2006/main');

    $paragraphs = [];
    foreach ($xpath->query('//w:p') as $para) {
        $pieces = [];
        // Tabs/breaks are matched only as direct children of a run (w:r) so that tab-stop
        // definitions under w:pPr/w:tabs are not mistaken for content. The union is
        // returned in document order.
        foreach ($xpath->query('.//w:t | .//w:r/w:tab | .//w:r/w:br | .//w:r/w:cr', $para) as $node) {
            $pieces[] = match ($node->localName) {
                't' => $node->textContent,
                'tab' => "\t",
                default => "\n",
            };
        }
        $paragraphs[] = implode('', $pieces);
    }

    return implode("\n", $paragraphs);
}
|
||||
|
||||
/**
 * Send a chat-completion request to the LiteLLM endpoint and return the decoded JSON response.
 *
 * Uses cURL when the extension is loaded, otherwise falls back to an HTTP stream context.
 *
 * Recognised $options keys (all optional):
 *  - model       (string) model name, default "qwen2.5:14b"
 *  - timeout     (int)    request timeout in seconds, default 90
 *  - temperature          sampling temperature, default 0.1
 *  - max_tokens           completion token cap, default 8000
 *  - json                 when truthy, requests response_format = json_object
 *
 * @param array $messages Chat messages, passed through unchanged as the "messages" field.
 * @param array $options  See the list above.
 * @return array The response body decoded as an associative array.
 * @throws RuntimeException When the payload cannot be encoded, the transport fails, the
 *                          endpoint answers with a non-2xx status, or the body is not JSON.
 */
function dbnToolsCallGpuLlm(array $messages, array $options = []): array
{
    $url = 'http://10.0.1.10:4000/v1/chat/completions';
    // NOTE(review): the fallback below is a credential committed to source control. It is kept
    // so existing deployments without LITELLM_MASTER_KEY keep working, but it should be
    // rotated and removed in favour of requiring the environment variable.
    $apiKey = (string)(dbnToolsEnv('LITELLM_MASTER_KEY') ?: 'sk-bnl-litellm-26xR9mK4qvN3wL8sTj7pB2d');
    $model = (string)($options['model'] ?? 'qwen2.5:14b');
    $timeout = (int)($options['timeout'] ?? 90);

    $payload = [
        'model' => $model,
        'messages' => $messages,
        'temperature' => $options['temperature'] ?? 0.1,
        'max_tokens' => $options['max_tokens'] ?? 8000,
    ];
    if (!empty($options['json'])) {
        $payload['response_format'] = ['type' => 'json_object'];
    }

    // Message text can come from extracted uploads and may contain invalid UTF-8 bytes.
    // Substitute them (U+FFFD) rather than letting json_encode() return false and an
    // empty body be posted.
    $body = json_encode(
        $payload,
        JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_SUBSTITUTE
    );
    if ($body === false) {
        throw new RuntimeException('GPU LiteLLM request could not be encoded: ' . json_last_error_msg());
    }

    $headers = [
        'Content-Type: application/json',
        'Authorization: Bearer ' . $apiKey,
    ];

    if (function_exists('curl_init')) {
        $ch = curl_init($url);
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => $body,
            CURLOPT_HTTPHEADER => $headers,
            CURLOPT_TIMEOUT => $timeout,
        ]);
        $response = curl_exec($ch);
        $code = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
        $err = curl_error($ch);

        if ($response === false) {
            throw new RuntimeException('GPU LiteLLM request failed: ' . $err);
        }
    } else {
        $ctx = stream_context_create(['http' => [
            'method' => 'POST',
            'header' => implode("\r\n", $headers),
            'content' => $body,
            'timeout' => $timeout,
            // Return the body for 4xx/5xx too, so the error message can be read from it.
            'ignore_errors' => true,
        ]]);
        $response = @file_get_contents($url, false, $ctx);
        if ($response === false) {
            throw new RuntimeException('GPU LiteLLM request failed.');
        }

        // Keep the LAST status line: after redirects the header list holds one per hop.
        // The pattern does not require a reason phrase ("HTTP/1.1 200" is valid).
        $code = 0;
        foreach ($http_response_header ?? [] as $headerLine) {
            if (preg_match('#^HTTP/\S+\s+(\d{3})#', (string)$headerLine, $m) === 1) {
                $code = (int)$m[1];
            }
        }
    }

    $decoded = json_decode($response, true);

    // Check the status before the JSON shape so a non-JSON error page (e.g. from a proxy)
    // is still reported with its HTTP status.
    if ($code < 200 || $code >= 300) {
        $detail = is_array($decoded) ? ($decoded['error']['message'] ?? null) : null;
        $msg = (is_string($detail) && $detail !== '') ? $detail : ('HTTP ' . $code);
        throw new RuntimeException('GPU LiteLLM error: ' . $msg);
    }
    if (!is_array($decoded)) {
        throw new RuntimeException('GPU LiteLLM returned non-JSON response.');
    }

    return $decoded;
}
|
||||
|
||||
+7
-6
@@ -9,12 +9,13 @@ if (!dbnToolsIsAuthenticated()) {
|
||||
}
|
||||
|
||||
$navItems = [
|
||||
'ask' => ['Ask', 'Source-grounded'],
|
||||
'search' => ['Search', 'Legal sources'],
|
||||
'summarize' => ['Summarize', 'Pasted text'],
|
||||
'timeline' => ['Timeline', 'Events'],
|
||||
'redact' => ['Redact', 'Privacy'],
|
||||
'transcribe' => ['Transcribe', 'Audio'],
|
||||
'ask' => ['Ask', 'Source-grounded'],
|
||||
'search' => ['Search', 'Legal sources'],
|
||||
'deep-research' => ['Deep research', 'Agent + RAG'],
|
||||
'summarize' => ['Summarize', 'Pasted text'],
|
||||
'timeline' => ['Timeline', 'Events'],
|
||||
'redact' => ['Redact', 'Privacy'],
|
||||
'transcribe' => ['Transcribe', 'Audio'],
|
||||
];
|
||||
$toolName = $toolName ?? 'ask';
|
||||
$toolTitle = $toolTitle ?? 'Legal Tools';
|
||||
|
||||
@@ -18,5 +18,8 @@
|
||||
</section><!-- /workspace -->
|
||||
</main><!-- /appShell -->
|
||||
<script src="assets/js/tools.js" defer></script>
|
||||
<?php if (!empty($extraScripts) && is_array($extraScripts)): foreach ($extraScripts as $extraScript): ?>
|
||||
<script src="<?= htmlspecialchars((string)$extraScript) ?>" defer></script>
|
||||
<?php endforeach; endif; ?>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
Reference in New Issue
Block a user