Add sub-question branching + document summary modals

- Source modal now shows an LLM-generated document summary (lazily generated and cached in documents.summary) instead of raw chunk text; a toggle reveals the matched chunk; a "View all chunks" button fetches every chunk of the document via the new api/document-chunks.php endpoint
- Each sub-question card gets a "Branch ↓" button that pre-fills the query with that sub-question and shows a context panel with the prior brief summary; prior_context + branch_notes are injected into interpretSeed() and synthesise() so the LLM knows where the research is coming from
- Upload document summaries are generated at synthesis time and attached to upload sources alongside corpus summaries
- DB: documents.summary TEXT column added to bnl_corpus on chloe

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-15 19:44:27 +02:00
parent 0ff4eb6d31
commit 343b19d0b4
8 changed files with 566 additions and 28 deletions
@@ -23,15 +23,17 @@ final class DbnDeepResearchAgent
    }

    public function run(
-        string $seedQuery,
-        string $pastedText,
-        array  $uploadedFiles,
-        array  $sliceSelection,
-        string $engine,
-        string $language,
-        array  $controls,
+        string   $seedQuery,
+        string   $pastedText,
+        array    $uploadedFiles,
+        array    $sliceSelection,
+        string   $engine,
+        string   $language,
+        array    $controls,
        ?callable $emit = null,
-        string $advocateRole = ''
+        string   $advocateRole = '',
+        ?array   $priorContext = null,
+        string   $branchNotes = ''
    ): array {
        $seedQuery   = trim($seedQuery);
        $pastedText  = trim($pastedText);
@@ -82,7 +84,7 @@ final class DbnDeepResearchAgent
        // STEP 1: Query interpretation
        $emitRunning('interpretation', 'Query interpretation', 'Summarising the seed input…');
        $stepStart = microtime(true);
-        $interpretation = $this->interpretSeed($seedDescription, $language, $advocateRole);
+        $interpretation = $this->interpretSeed($seedDescription, $language, $advocateRole, $priorContext, $branchNotes);
        $this->stepTimings['interpretation'] = $this->elapsedMs($stepStart);
        $emitStep('interpretation', 'Query interpretation', $interpretation['detail'], 'complete');

@@ -284,6 +286,33 @@ final class DbnDeepResearchAgent
        $synthesisEngineLabel = $engine === 'azure_full' ? 'Azure gpt-4o' : ($engine === 'gpu' ? 'GPU qwen2.5:14b' : 'Azure gpt-4o-mini');
        $emitRunning('synthesis', 'Synthesis', sprintf('Synthesising cited brief with %s — this is the slowest step…', $synthesisEngineLabel));
        $stepStart = microtime(true);
+        // Attach upload summaries (generated lazily) to numbered sources
+        if (!empty($uploadedFiles) && !empty($numberedSources)) {
+            $uploadSummaries = [];
+            foreach ($uploadedFiles as $idx => $file) {
+                $text = mb_substr((string)($file['text'] ?? ''), 0, 4000, 'UTF-8');
+                $filename = (string)($file['filename'] ?? "file-{$idx}");
+                if ($text === '') continue;
+                try {
+                    $raw = $this->azure->chatText([
+                        ['role' => 'system', 'content' => 'Return only a concise 3-4 sentence summary. No preamble.'],
+                        ['role' => 'user', 'content' => "Summarise this document for a legal researcher.\n\nFilename: {$filename}\n\nContent:\n{$text}"],
+                    ], ['temperature' => 0.1, 'max_tokens' => 200, 'timeout' => 20]);
+                    $uploadSummaries[$idx] = trim($raw);
+                } catch (Throwable $e) {
+                    error_log('DBN upload summary gen failed for file ' . $idx . ': ' . $e->getMessage());
+                    $uploadSummaries[$idx] = null;
+                }
+            }
+            foreach ($numberedSources as &$src) {
+                if (($src['source_origin'] ?? '') !== 'upload') continue;
+                if (preg_match('/^upload:(\d+):/', (string)($src['chunk_id'] ?? ''), $m)) {
+                    $src['summary'] = $uploadSummaries[(int)$m[1]] ?? null;
+                }
+            }
+            unset($src);
+        }
+
        $synthesis = $this->synthesise(
            $seedDescription,
            $interpretation['brief'],
@@ -292,7 +321,9 @@ final class DbnDeepResearchAgent
            $engine,
            $language,
            $controls['temperature'],
-            $advocateRole
+            $advocateRole,
+            $priorContext,
+            $branchNotes
        );
        $this->stepTimings['synthesis'] = $this->elapsedMs($stepStart);
        $emitStep(
@@ -411,14 +442,30 @@ final class DbnDeepResearchAgent
        return implode("\n\n", $parts);
    }

-    private function interpretSeed(string $seedDescription, string $language, string $advocateRole = ''): array
+    private function interpretSeed(string $seedDescription, string $language, string $advocateRole = '', ?array $priorContext = null, string $branchNotes = ''): array
    {
        $locale = $language === 'no' ? 'Norwegian' : 'English';
        $rolePrefix = $advocateRole !== ''
            ? "You are preparing a case-research brief for: {$advocateRole}. Frame your interpretation to identify the strongest legal angles for this party.\n\n"
            : '';
+
+        $priorContextBlock = '';
+        if (!empty($priorContext)) {
+            $parts = ['Prior research context:'];
+            if (!empty($priorContext['original_query'])) {
+                $parts[] = 'Original question: ' . mb_substr((string)$priorContext['original_query'], 0, 300, 'UTF-8');
+            }
+            if (!empty($priorContext['what_we_found'])) {
+                $parts[] = 'Key findings: ' . mb_substr((string)$priorContext['what_we_found'], 0, 400, 'UTF-8');
+            }
+            if ($branchNotes !== '') {
+                $parts[] = 'Researcher notes: ' . mb_substr($branchNotes, 0, 300, 'UTF-8');
+            }
+            $priorContextBlock = implode("\n", $parts) . "\n\nNow investigate this branch:\n";
+        }
+
        $prompt = <<<PROMPT
-{$rolePrefix}You are reviewing the input below to set up a deep legal research pass against the Do Better Norge family-law corpus.
+{$rolePrefix}{$priorContextBlock}You are reviewing the input below to set up a deep legal research pass against the Do Better Norge family-law corpus.

 Input:
 {$seedDescription}
@@ -741,7 +788,8 @@ PROMPT;

            $stmt = $ragDb->prepare("
                SELECT d.id, d.title, d.source_url, d.authority_type,
-                       d.publication_date, d.source_id, d.jurisdiction
+                       d.publication_date, d.source_id, d.jurisdiction,
+                       d.summary, LEFT(d.content, 4000) AS content_excerpt
                FROM documents d
                WHERE d.id IN ({$ph})
            ");
@@ -759,9 +807,30 @@ PROMPT;
                    'publication_date' => $row['publication_date'] ?? null,
                    'corpus_source_name' => 'Do Better Legal',
                    'source_id'        => $sid,
+                    'summary'          => $row['summary'] ?? null,
+                    'content_excerpt'  => (string)($row['content_excerpt'] ?? ''),
+                    'title'            => (string)($row['title'] ?? ''),
                ];
            }

+            // Lazily generate summaries for documents that don't have one yet
+            $unsummarized = array_filter($docMeta, fn($m) => $m['summary'] === null && $m['content_excerpt'] !== '');
+            foreach ($unsummarized as $dId => $m) {
+                try {
+                    $raw = $this->azure->chatText([
+                        ['role' => 'system', 'content' => 'Return only a concise 3-4 sentence summary. No preamble.'],
+                        ['role' => 'user', 'content' => "Summarise this Norwegian family law document for a legal researcher.\nFocus on: which legal provisions it covers, its authority type, and what questions it helps answer.\n\nTitle: {$m['title']}\n\nContent:\n{$m['content_excerpt']}"],
+                    ], ['temperature' => 0.1, 'max_tokens' => 200, 'timeout' => 25]);
+                    $summary = trim($raw);
+                    if ($summary !== '') {
+                        $ragDb->prepare("UPDATE documents SET summary = ? WHERE id = ?")->execute([$summary, $dId]);
+                        $docMeta[$dId]['summary'] = $summary;
+                    }
+                } catch (Throwable $e) {
+                    error_log('DBN hydrateSourceUrls summary gen failed for doc ' . $dId . ': ' . $e->getMessage());
+                }
+            }
+
            // Enrich with corpus source name from bnl_admin.corpus_sources
            if (!empty($sourceIds)) {
                $uSids = array_values(array_unique($sourceIds));
@@ -795,6 +864,7 @@ PROMPT;
            $chunk['authority_label']    = $m['authority_label'] ?? $chunk['authority_label'];
            $chunk['corpus_source_name'] = $m['corpus_source_name'] ?? null;
            $chunk['publication_date']   = $m['publication_date'] ?? null;
+            $chunk['summary']            = $m['summary'] ?? null;
        }
        unset($chunk);
    }
@@ -861,14 +931,16 @@ PROMPT;
    }

    private function synthesise(
-        string $seedDescription,
-        string $brief,
-        array  $subQuestions,
-        array  $numberedSources,
-        string $engine,
-        string $language,
-        float  $temperature,
-        string $advocateRole = ''
+        string  $seedDescription,
+        string  $brief,
+        array   $subQuestions,
+        array   $numberedSources,
+        string  $engine,
+        string  $language,
+        float   $temperature,
+        string  $advocateRole = '',
+        ?array  $priorContext = null,
+        string  $branchNotes = ''
    ): array {
        $locale = $language === 'no' ? 'Norwegian' : 'English';

@@ -891,6 +963,23 @@ PROMPT;
            ];
        }

+        $priorContextSection = '';
+        if (!empty($priorContext)) {
+            $prior = [];
+            if (!empty($priorContext['original_query'])) {
+                $prior[] = 'Original research question: ' . mb_substr((string)$priorContext['original_query'], 0, 300, 'UTF-8');
+            }
+            if (!empty($priorContext['brief_summary'])) {
+                $prior[] = "Key findings from prior research:\n" . mb_substr((string)$priorContext['brief_summary'], 0, 600, 'UTF-8');
+            }
+            if ($branchNotes !== '') {
+                $prior[] = 'Researcher notes: ' . mb_substr($branchNotes, 0, 300, 'UTF-8');
+            }
+            if ($prior) {
+                $priorContextSection = "\nBackground from prior research:\n" . implode("\n", $prior) . "\n";
+            }
+        }
+
        $sourcesContext = [];
        foreach ($numberedSources as $s) {
            $sourcesContext[] = sprintf(
@@ -926,7 +1015,7 @@ PROMPT;
            $prompt = <<<PROMPT
 You are Do Better Norge Legal Tools producing a legal preparation brief in {$locale}.
 Your client: {$advocateRole}
-
+{$priorContextSection}
 You MUST ground every claim in the numbered sources below using inline `[n]` citation markers. Do NOT invent statutes, paragraph numbers, case names, dates, or parties.

 User input:
@@ -961,7 +1050,7 @@ PROMPT;
        } else {
            $prompt = <<<PROMPT
 You are Do Better Norge Legal Tools running a deep-research synthesis. You MUST ground every claim in the numbered sources below, using inline `[n]` citation markers that map to the source list. Do NOT cite a source you did not use. Do NOT invent statutes, paragraph numbers, case names, dates, or parties.
-
+{$priorContextSection}
 User input:
 {$seedDescription}