From e130db8119cf3cefeba4a6e89c813301c2f6a31e Mon Sep 17 00:00:00 2001
From: davegilligan <davegilligan73@gmail.com>
Date: Fri, 15 May 2026 11:12:13 +0200
Subject: [PATCH] Deep Research v2: exclude marketing site, deep-link sources,
 per-agent reports
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three user-flagged issues after the first real run with a 920KB sakkyndig PDF:

1. dobetternorge.no marketing-website chunks leaked into the retrieval pool.
   ClientRagPipeline::searchAll defaults include_beta_website=true; we now
   pass false for both website flags, AND defensively drop any returned
   chunk whose source_name contains "website", or whose title contains
   "dobetternorge.no" or starts with a generic marketing-page word
   (homepage, landing, about, contact), before it can pollute synthesis.

2. Brief returned was "just a paragraph". Bumped synthesis max_tokens
   2200→3200, raised timeout 120→180s, and rewrote the prompt to require
   400-900 words with min 4 paragraphs when source_count>=3, covering EACH
   sub-question in its own paragraph. Now also passes authority + jurisdiction
   into the sources block so the model can pinpoint statutes correctly.

3. No way to see what each "sub-question agent" researched or click through
   to the source articles. Restructured the results panel so per-sub-question
   report cards now render ABOVE the synthesised brief. Each report shows the
   question, the rationale, and the top 3 retrieved sources for that sub-Q
   with title→deep link + 1-line excerpt. Brief follows. Consolidated
   numbered sources list at the bottom, with titles as deep links too.

Deep-link construction: source_url is hydrated via dbnV6QueryDocumentMeta
in a single batched call, made after the merged pool has been capped to
reranker_top_k, so only sources that reach synthesis are hydrated. For
Lovdata sources with a section_title containing §<n>, the section is
appended to the URL as a path segment (e.g. /§43). For other hosts (HUDOC,
Regjeringen, Bufdir, etc.) we link to the document root URL.

Telemetry: trace_metadata now carries retrieval_counts {raw_corpus,
filtered_website, post_filter_corpus, raw_upload, after_dedupe, after_topk}
so future regressions are diagnosable from the metadata.jsonl log alone.
The completion status pill surfaces the corpus/website/upload split.
---
 assets/css/tools.css           | 124 +++++++++++++++++++++++
 assets/js/deep-research.js     |  91 ++++++++++++++---
 includes/DeepResearchAgent.php | 175 ++++++++++++++++++++++++++++-----
 3 files changed, 351 insertions(+), 39 deletions(-)
diff --git a/assets/css/tools.css b/assets/css/tools.css
index cc1902f..3513758 100644
--- a/assets/css/tools.css
+++ b/assets/css/tools.css
@@ -2176,3 +2176,127 @@ p {
   .dr-source-card { grid-template-columns: 32px 1fr; }
   .dr-source-aside { display: none; }
 }
+
+/* Per-sub-question agent report cards (v2) */
+.dr-subq-list {
+  display: grid;
+  gap: 10px;
+}
+
+.dr-subq-report {
+  border: 1px solid var(--line);
+  border-radius: 8px;
+  padding: 12px 13px;
+  background: #fbfcfe;
+}
+
+.dr-subq-report__head {
+  display: grid;
+  grid-template-columns: auto 1fr;
+  gap: 10px;
+  align-items: start;
+  margin-bottom: 10px;
+}
+
+.dr-subq-report__index {
+  display: inline-flex;
+  align-items: center;
+  justify-content: center;
+  min-width: 30px;
+  height: 24px;
+  padding: 0 8px;
+  border-radius: 999px;
+  background: var(--soft-teal);
+  color: var(--teal-dark);
+  font-weight: 800;
+  font-variant-numeric: tabular-nums;
+  font-size: 0.78rem;
+  letter-spacing: 0.04em;
+  text-transform: uppercase;
+}
+
+.dr-subq-report__question {
+  font-weight: 700;
+  color: var(--ink);
+  line-height: 1.4;
+}
+
+.dr-subq-report__rationale {
+  margin-top: 4px;
+  color: var(--muted);
+  font-size: 0.86rem;
+  line-height: 1.45;
+}
+
+.dr-mini-source-list {
+  list-style: none;
+  padding: 0;
+  margin: 0;
+  display: grid;
+  gap: 6px;
+}
+
+.dr-mini-source {
+  display: grid;
+  grid-template-columns: 32px 1fr;
+  gap: 8px;
+  align-items: start;
+  padding: 8px 10px;
+  background: #fff;
+  border: 1px solid var(--line);
+  border-radius: 6px;
+}
+
+.dr-mini-source--empty {
+  display: block;
+  color: var(--muted);
+  padding: 8px 10px;
+}
+
+.dr-mini-source__n {
+  font-variant-numeric: tabular-nums;
+  color: var(--coral);
+  font-weight: 800;
+  font-size: 0.85rem;
+}
+
+.dr-mini-source__title {
+  display: inline-block;
+  font-weight: 700;
+  color: var(--ink);
+  text-decoration: none;
+  line-height: 1.35;
+}
+
+a.dr-mini-source__title:hover { color: var(--teal-dark); text-decoration: underline; }
+
+.dr-mini-source__meta {
+  color: var(--muted);
+  font-size: 0.78rem;
+  margin-top: 3px;
+}
+
+.dr-mini-source__excerpt {
+  color: var(--muted);
+  font-size: 0.86rem;
+  line-height: 1.45;
+  margin-top: 5px;
+}
+
+.dr-external-link {
+  display: inline-block;
+  color: var(--teal);
+  font-size: 0.8em;
+  margin-left: 3px;
+  vertical-align: 1px;
+}
+
+a.dr-source-title-link {
+  color: var(--ink);
+  text-decoration: none;
+}
+
+a.dr-source-title-link:hover {
+  color: var(--teal-dark);
+  text-decoration: underline;
+}
diff --git a/assets/js/deep-research.js b/assets/js/deep-research.js
index 39b51f6..16abde6 100644
--- a/assets/js/deep-research.js
+++ b/assets/js/deep-research.js
@@ -346,8 +346,12 @@
 
     lastResult = finalResult;
     const meta = finalResult.trace_metadata || {};
+    const rc = meta.retrieval_counts || {};
+    const countSummary = (rc.post_filter_corpus != null)
+      ? `${rc.post_filter_corpus} corpus${rc.filtered_website ? ` (${rc.filtered_website} website filtered)` : ''}${rc.raw_upload ? ` + ${rc.raw_upload} upload` : ''}`
+      : `${meta.source_count || 0} sources`;
     setStatus(
-      `Done in ${Math.round((finalResult.latency_ms || 0) / 1000)} s · ${meta.source_count || 0} sources · confidence ${meta.citation_confidence || '?'}`,
+      `Done in ${Math.round((finalResult.latency_ms || 0) / 1000)} s · ${countSummary} · confidence ${meta.citation_confidence || '?'}`,
       'ok'
     );
     els.runButton.disabled = false;
@@ -425,19 +429,23 @@
 
     const briefHtml = renderBrief(data.brief_markdown || '', sources);
 
-    const subQHtml = subs.length ? `
+    // Per-sub-question report cards — the "what each agent researched" view
+    const subQReportsHtml = subs.length ? `
       <div class="dr-result-block">
-        <h3 style="margin:0 0 8px;font-size:1rem">Angles the agent explored</h3>
-        <ol style="padding-left:1.2em;margin:0;color:var(--muted);line-height:1.55">
-          ${subs.map((sq) => `<li><strong style="color:var(--ink)">${escapeHtml(sq.question)}</strong>${sq.rationale ? `<br><small>${escapeHtml(sq.rationale)}</small>` : ''}</li>`).join('')}
-        </ol>
+        <div class="dr-sources-head">
+          <h3>What each sub-question agent researched</h3>
+          <small>${subs.length} sub-question${subs.length === 1 ? '' : 's'}, top 3 sources each</small>
+        </div>
+        <div class="dr-subq-list">
+          ${subs.map((sq, i) => renderSubQReport(sq, i)).join('')}
+        </div>
       </div>` : '';
 
     const sourcesHtml = `
       <div class="dr-result-block">
         <div class="dr-sources-head">
-          <h3>Sources (${sources.length})</h3>
-          <small>Click a card to see the full chunk + scores</small>
+          <h3>All sources (${sources.length})</h3>
+          <small>Click a card to see the full chunk + scores · external link opens the original article</small>
         </div>
         <div class="dr-source-list">
           ${sources.map((s) => renderSourceCard(s)).join('')}
@@ -459,18 +467,20 @@
       </div>` : '';
 
     els.results.innerHTML = `
+      ${subQReportsHtml}
       <div class="dr-result-block">
+        <h3 style="margin:0 0 10px;font-size:1rem">Synthesised brief</h3>
         <div class="dr-brief">${briefHtml}</div>
       </div>
-      ${subQHtml}
       ${sourcesHtml}
       ${uncertHtml}
       ${nextHtml}
     `;
 
-    // Bind source-card click handlers + citation marker click handlers
-    els.results.querySelectorAll('[data-source-n]').forEach((node) => {
-      node.addEventListener('click', () => {
+    // Bind source-card click handlers (open modal) — but ignore clicks on inner <a>
+    els.results.querySelectorAll('.dr-source-card[data-source-n]').forEach((node) => {
+      node.addEventListener('click', (e) => {
+        if (e.target.closest('a')) return;  // let anchor handle its own click
         const n = parseInt(node.dataset.sourceN, 10);
         const src = sources.find((s) => s.n === n);
         if (src) {
@@ -479,6 +489,52 @@
         }
       });
     });
+    // Bind inline citation markers in brief → flash + open modal
+    els.results.querySelectorAll('.dr-cite[data-source-n]').forEach((node) => {
+      node.addEventListener('click', (e) => {
+        if (e.target.closest('a')) return;
+        const n = parseInt(node.dataset.sourceN, 10);
+        const src = sources.find((s) => s.n === n);
+        if (src) {
+          flashSource(n);
+        }
+      });
+    });
+  }
+
+  function renderSubQReport(sq, idx) {
+    const top = sq.top_sources || [];
+    const sourceItems = top.length
+      ? top.map((s) => {
+          const link = s.deep_link || s.source_url;
+          const titleHtml = link
+            ? `<a href="${escapeHtml(link)}" target="_blank" rel="noopener" class="dr-mini-source__title">${escapeHtml(s.title || 'Untitled')} <span class="dr-external-link" aria-hidden="true">↗</span></a>`
+            : `<span class="dr-mini-source__title">${escapeHtml(s.title || 'Untitled')}</span>`;
+          const meta = [];
+          if (s.section) meta.push(escapeHtml(s.section));
+          if (s.authority_label) meta.push(escapeHtml(s.authority_label));
+          if (s.source_origin === 'upload') meta.push('your upload');
+          return `<li class="dr-mini-source">
+            <span class="dr-mini-source__n">[${s.n ?? '?'}]</span>
+            <div class="dr-mini-source__body">
+              ${titleHtml}
+              ${meta.length ? `<div class="dr-mini-source__meta">${meta.join(' · ')}</div>` : ''}
+              <div class="dr-mini-source__excerpt">${escapeHtml(truncate(s.excerpt || '', 180))}</div>
+            </div>
+          </li>`;
+        }).join('')
+      : `<li class="dr-mini-source dr-mini-source--empty"><em>No sources retrieved for this sub-question.</em></li>`;
+
+    return `<div class="dr-subq-report">
+      <div class="dr-subq-report__head">
+        <span class="dr-subq-report__index">${escapeHtml(sq.id || ('q' + (idx + 1)))}</span>
+        <div class="dr-subq-report__body">
+          <div class="dr-subq-report__question">${escapeHtml(sq.question || '')}</div>
+          ${sq.rationale ? `<div class="dr-subq-report__rationale">${escapeHtml(sq.rationale)}</div>` : ''}
+        </div>
+      </div>
+      <ul class="dr-mini-source-list">${sourceItems}</ul>
+    </div>`;
   }
 
   function flashSource(n) {
@@ -495,13 +551,18 @@
     const score = s.reranker_score != null ? s.reranker_score : s.similarity;
     const originTagClass = s.source_origin === 'upload' ? 'dr-source-tag dr-source-tag--upload' : 'dr-source-tag';
     const originLabel = s.source_origin === 'upload' ? 'upload' : 'corpus';
-    return `<button type="button" class="dr-source-card" data-source-n="${s.n}">
+    const link = s.deep_link || s.source_url;
+    const titleHtml = link
+      ? `<a href="${escapeHtml(link)}" target="_blank" rel="noopener" class="dr-source-title-link">${escapeHtml(s.title || 'Untitled')} <span class="dr-external-link" aria-hidden="true">↗</span></a>`
+      : `${escapeHtml(s.title || 'Untitled')}`;
+    return `<div class="dr-source-card" data-source-n="${s.n}" role="button" tabindex="0">
       <span class="dr-source-number">${s.n}</span>
       <div class="dr-source-body">
-        <div class="dr-source-title">${escapeHtml(s.title || 'Untitled')}</div>
+        <div class="dr-source-title">${titleHtml}</div>
         ${s.section ? `<div class="dr-source-meta"><span class="dr-source-tag">${escapeHtml(s.section)}</span></div>` : ''}
         <div class="dr-source-meta">
           <span class="${originTagClass}">${originLabel}</span>
+          ${s.authority_label ? `<span class="dr-source-tag">${escapeHtml(s.authority_label)}</span>` : ''}
           <span class="dr-source-tag dr-source-tag--score">${escapeHtml(s.package_or_corpus || '—')}</span>
           ${(s.matched_sub_questions || []).map((q) => `<span class="dr-source-tag">${escapeHtml(q)}</span>`).join('')}
         </div>
@@ -511,7 +572,7 @@
         <span>score<br><b>${score != null ? Number(score).toFixed(2) : '—'}</b></span>
         ${s.reranker_score != null && s.similarity != null ? `<span>sim<br><b>${Number(s.similarity).toFixed(2)}</b></span>` : ''}
       </div>
-    </button>`;
+    </div>`;
   }
 
   // Markdown renderer — minimal: paragraphs, bold/italic, code, [n] citation badges
diff --git a/includes/DeepResearchAgent.php b/includes/DeepResearchAgent.php
index 610c50b..2a1ed99 100644
--- a/includes/DeepResearchAgent.php
+++ b/includes/DeepResearchAgent.php
@@ -182,6 +182,9 @@ final class DbnDeepResearchAgent
 
         $rawPool = [];
         $retrievalWarnings = 0;
+        $rawCorpusCount = 0;
+        $rawUploadCount = 0;
+        $filteredOutCount = 0;
         foreach ($retrievalQueries as $idx => $sq) {
             if ($emit) {
                 $emit('subq', [
@@ -197,13 +200,15 @@ final class DbnDeepResearchAgent
                     $controls['chunk_limit'],
                     null,
                     [
-                        'search_private'   => false,
-                        'search_shared'    => true,
-                        'package_ids'      => [(int)$package['id']],
-                        'shared_doc_ids'   => $sharedDocIds,
-                        'chunk_limit'      => $controls['chunk_limit'],
-                        'search_method'    => 'hybrid',
-                        'reranker_enabled' => true,
+                        'search_private'         => false,
+                        'search_shared'          => true,
+                        'package_ids'            => [(int)$package['id']],
+                        'shared_doc_ids'         => $sharedDocIds,
+                        'chunk_limit'            => $controls['chunk_limit'],
+                        'search_method'          => 'hybrid',
+                        'reranker_enabled'       => true,
+                        'include_beta_website'   => false,
+                        'include_primary_website'=> false,
                     ]
                 );
             } catch (Throwable $e) {
@@ -211,13 +216,19 @@ final class DbnDeepResearchAgent
                 $corpusChunks = [];
                 $retrievalWarnings++;
             }
+            $rawCorpusCount += count($corpusChunks);
             foreach ($corpusChunks as $chunk) {
+                if ($this->isWebsiteChunk($chunk)) {
+                    $filteredOutCount++;
+                    continue;
+                }
                 $rawPool[] = $this->normalizeCorpusChunk($chunk, $sq['id']);
             }
 
             // Upload chunk retrieval via cosine sim
             if (!empty($this->uploadVecs)) {
                 $uploadHits = $this->retrieveFromUploads($sq['question'], $controls['chunk_limit'], $controls['similarity_threshold']);
+                $rawUploadCount += count($uploadHits);
                 foreach ($uploadHits as $hit) {
                     $hit['matched_sub_questions'] = [$sq['id']];
                     $rawPool[] = $hit;
@@ -229,17 +240,32 @@ final class DbnDeepResearchAgent
         $this->stepTimings['retrieval'] = $this->elapsedMs($stepStart);
         $retrievalStatus = $retrievalWarnings > 0 ? 'warning' : 'complete';
         $retrievalDetail = sprintf(
-            '%d sub-question(s) × hybrid + RRF + rerank → %d raw chunks → %d unique after dedupe.',
+            '%d sub-question(s) × hybrid + RRF + rerank → %d corpus chunks (%d filtered) + %d upload hits → %d unique after dedupe.',
             count($retrievalQueries),
-            count($rawPool),
+            $rawCorpusCount,
+            $filteredOutCount,
+            $rawUploadCount,
             count($merged)
         );
         $emitStep('retrieval', 'Retrieval', $retrievalDetail, $retrievalStatus);
 
         // Cap pool to reranker top-K for synthesis
         $synthesisPool = array_slice($merged, 0, $controls['reranker_top_k']);
+
+        // Hydrate corpus sources with source_url + authority_label via batched dbn_v6 query
+        $this->hydrateSourceUrls($synthesisPool);
+
         $numberedSources = $this->numberSources($synthesisPool);
 
+        $retrievalCounts = [
+            'raw_corpus'         => $rawCorpusCount,
+            'filtered_website'   => $filteredOutCount,
+            'post_filter_corpus' => $rawCorpusCount - $filteredOutCount,
+            'raw_upload'         => $rawUploadCount,
+            'after_dedupe'       => count($merged),
+            'after_topk'         => count($numberedSources),
+        ];
+
         // STEP 6: Synthesis
         $synthesisEngineLabel = $engine === 'azure_full' ? 'Azure gpt-4o' : ($engine === 'gpu' ? 'GPU qwen2.5:14b' : 'Azure gpt-4o-mini');
         $emitRunning('synthesis', 'Synthesis', sprintf('Synthesising cited brief with %s — this is the slowest step…', $synthesisEngineLabel));
@@ -270,18 +296,29 @@ final class DbnDeepResearchAgent
             $confidence === 'low' ? 'warning' : 'complete'
         );
 
-        // Stitch sub-question chunk_ids
+        // Stitch sub-question chunk_ids + top_sources (top 3 sources matched by each sub-Q)
         $subQOut = [];
         foreach ($retrievalQueries as $sq) {
             $matchedChunks = array_values(array_filter(
                 $numberedSources,
                 fn(array $s) => in_array($sq['id'], $s['matched_sub_questions'] ?? [], true)
             ));
+            $topSources = array_slice($matchedChunks, 0, 3);
             $subQOut[] = [
-                'id'        => $sq['id'],
-                'question'  => $sq['question'],
-                'rationale' => $sq['rationale'] ?? '',
-                'chunk_ids' => array_values(array_map(fn(array $s) => $s['chunk_id'], $matchedChunks)),
+                'id'          => $sq['id'],
+                'question'    => $sq['question'],
+                'rationale'   => $sq['rationale'] ?? '',
+                'chunk_ids'   => array_values(array_map(fn(array $s) => $s['chunk_id'], $matchedChunks)),
+                'top_sources' => array_map(fn(array $s) => [
+                    'n'              => $s['n'] ?? null,
+                    'title'          => $s['title'] ?? '',
+                    'section'        => $s['section'] ?? null,
+                    'deep_link'      => $s['deep_link'] ?? $s['source_url'] ?? null,
+                    'source_url'     => $s['source_url'] ?? null,
+                    'source_origin'  => $s['source_origin'] ?? 'corpus',
+                    'authority_label'=> $s['authority_label'] ?? null,
+                    'excerpt'        => $s['excerpt'] ?? '',
+                ], $topSources),
             ];
         }
 
@@ -305,6 +342,7 @@ final class DbnDeepResearchAgent
                 'engine_used'         => $engine,
                 'citation_confidence' => $confidence,
                 'elapsed_ms_per_step' => $this->stepTimings,
+                'retrieval_counts'    => $retrievalCounts,
                 'slices_active'       => array_keys(array_filter($sliceSelectionNormalized)),
             ],
             'disclaimer' => dbnToolsDisclaimer($language),
@@ -553,7 +591,7 @@ PROMPT;
             'chunk_id'          => isset($chunk['id']) ? (int)$chunk['id'] : null,
             'title'             => (string)($chunk['document_title'] ?? $chunk['title'] ?? 'Untitled source'),
             'section'           => $chunk['section_title'] ?? null,
-            'package_or_corpus' => (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Norge'),
+            'package_or_corpus' => (string)($chunk['source_name'] ?? $chunk['source_type'] ?? 'Do Better Legal'),
             'excerpt'           => dbnToolsExcerpt((string)($chunk['content'] ?? ''), 620),
             'chunk_text'        => (string)($chunk['content'] ?? ''),
             'similarity'        => $similarity,
@@ -562,10 +600,90 @@ PROMPT;
             'source_origin'     => 'corpus',
             'authority_type'    => $chunk['authority_type'] ?? null,
             'jurisdiction'      => $chunk['jurisdiction'] ?? null,
+            'publication_year'  => $chunk['publication_year'] ?? null,
+            // Filled in later by hydrateSourceUrls()
+            'source_url'        => null,
+            'deep_link'         => null,
+            'authority_label'   => null,
+            'corpus_source_name'=> null,
+            'publication_date'  => null,
             'matched_sub_questions' => [$subQId],
         ];
     }
 
+    /**
+     * Defensive post-filter: true when a chunk looks like a marketing-website hit.
+     * Upstream labels such pages with source_group 'website-primary'/'website-beta',
+     * but the chunk payload only carries `source_name`, so match on name and title.
+     */
+    private function isWebsiteChunk(array $chunk): bool
+    {
+        $name = strtolower((string)($chunk['source_name'] ?? ''));
+        $title = strtolower((string)($chunk['document_title'] ?? $chunk['title'] ?? ''));
+        // A missing source_name must not skip the title checks below: they are
+        // independent signals, so a nameless dobetternorge.no page is still dropped.
+        if (str_contains($name, 'website')) return true;
+        if (str_contains($title, 'dobetternorge.no')) return true;
+        // Generic marketing-page title prefixes ("about " / "contact " need the trailing space).
+        if (preg_match('/^(homepage|landing|about |contact )/i', $title)) return true;
+        return false;
+    }
+
+    /**
+     * Hydrate $pool in place with source_url/deep_link/authority_label/etc. via one batched
+     * dbnV6QueryDocumentMeta call; non-corpus chunks are skipped, and a failed query is logged and leaves $pool as-is.
+     */
+    private function hydrateSourceUrls(array &$pool): void
+    {
+        $docIds = [];
+        foreach ($pool as $chunk) {
+            if (($chunk['source_origin'] ?? 'corpus') !== 'corpus') continue;
+            $docId = (int)($chunk['document_id'] ?? 0);
+            if ($docId > 0) $docIds[$docId] = true;
+        }
+        if (empty($docIds)) return;
+
+        try {
+            $meta = dbnV6QueryDocumentMeta(dbnToolsDb(), dbnToolsRagDb(), array_keys($docIds));
+        } catch (Throwable $e) {
+            error_log('DBN deep research hydrateSourceUrls failed: ' . $e->getMessage());
+            return;
+        }
+
+        foreach ($pool as &$chunk) {
+            if (($chunk['source_origin'] ?? 'corpus') !== 'corpus') continue;
+            $docId = (int)($chunk['document_id'] ?? 0);
+            if (!$docId || !isset($meta[$docId])) continue;
+            $m = $meta[$docId];
+            $sourceUrl = $m['source_url'] ?? null;
+            $chunk['source_url']         = $sourceUrl;
+            $chunk['deep_link']          = $this->buildDeepLink($sourceUrl, $chunk['section'] ?? null);
+            $chunk['authority_label']    = $m['authority_label'] ?? $chunk['authority_label'];
+            $chunk['corpus_source_name'] = $m['corpus_source_name'] ?? null;
+            $chunk['publication_date']   = $m['publication_date'] ?? null;
+        }
+        unset($chunk);
+    }
+
+    /**
+     * Build a clickable URL for a source. For lovdata.no URLs whose section title
+     * names a paragraph (§43, §43a, or the chapter form § 4-12) the paragraph is
+     * appended as a path segment (e.g. /§4-12); otherwise the trimmed URL is returned.
+     */
+    private function buildDeepLink(?string $sourceUrl, ?string $sectionTitle): ?string
+    {
+        if (!$sourceUrl) return null;
+        $sourceUrl = trim($sourceUrl);
+        if ($sourceUrl === '') return null;
+        // Capture the whole paragraph number: "4-12" must not be cut to "4-", which yields a dead link.
+        if (preg_match('~^https?://(?:www\.)?lovdata\.no/~i', $sourceUrl)
+            && $sectionTitle
+            && preg_match('/§\s*(\d+(?:-\d+)?[A-Za-z]?)/u', $sectionTitle, $m)) {
+            return rtrim($sourceUrl, '/') . '/§' . $m[1];
+        }
+        return $sourceUrl;
+    }
+
     private function mergeAndDedupe(array $rawPool, int $cap): array
     {
         $byKey = [];
@@ -636,12 +754,14 @@ PROMPT;
         $sourcesContext = [];
         foreach ($numberedSources as $s) {
             $sourcesContext[] = sprintf(
-                "[%d] (%s) %s%s\n    Corpus: %s\n    Excerpt: %s",
+                "[%d] (%s) %s%s\n    Corpus: %s\n    Authority: %s | Jurisdiction: %s\n    Excerpt: %s",
                 $s['n'],
                 $s['source_origin'] === 'upload' ? 'uploaded doc' : 'corpus',
                 $s['title'],
                 !empty($s['section']) ? ' — ' . $s['section'] : '',
                 $s['package_or_corpus'],
+                $s['authority_label'] ?? ($s['authority_type'] ?? 'n/a'),
+                $s['jurisdiction'] ?? 'n/a',
                 $s['excerpt']
             );
         }
@@ -657,6 +777,11 @@ PROMPT;
             $subQText = "\nSub-questions explored:\n" . implode("\n", $lines);
         }
 
+        $sourceCount = count($numberedSources);
+        $lengthGuidance = $sourceCount >= 3
+            ? '400-900 words, minimum 4 paragraphs, with clear paragraph breaks. Cover EACH sub-question above in its own paragraph.'
+            : '250-450 words, 2-3 short paragraphs. Note when evidence is thin.';
+
         $prompt = <<<PROMPT
 You are Do Better Norge Legal Tools running a deep-research synthesis. You MUST ground every claim in the numbered sources below, using inline `[n]` citation markers that map to the source list. Do NOT cite a source you did not use. Do NOT invent statutes, paragraph numbers, case names, dates, or parties.
 
@@ -667,29 +792,31 @@ Research brief:
 {$brief}
 {$subQText}
 
-Sources (numbered):
+Sources ({$sourceCount} numbered):
 {$sourcesText}
 
 Return JSON only in {$locale}:
 {
-  "brief_markdown": "Markdown legal brief, 250-700 words, with inline [n] citation markers keyed to the sources above. Use short paragraphs. End with a one-line caveat. Do NOT include headings above level 3 (###).",
-  "what_we_found": "1-2 sentence plain-language summary of the grounded finding",
-  "what_remains_uncertain": ["gaps or caveats — what the corpus did not cover or where confidence is limited"],
-  "next_practical_step": "one concrete next action the user can take"
+  "brief_markdown": "Markdown legal brief. {$lengthGuidance} Every factual claim ends with one or more inline [n] markers keyed to the sources above. Use level-3 headings (###) sparingly to separate paragraphs by theme when helpful. End with a one-line caveat that this is research support, not legal advice.",
+  "what_we_found": "2-4 sentence plain-language summary of the grounded finding",
+  "what_remains_uncertain": ["specific gaps — what the corpus did not cover, conflicting authority, or where confidence is limited (3-6 items when sources >= 3)"],
+  "next_practical_step": "one concrete next action the user can take to strengthen the case or close a gap"
 }
 
 Rules:
 - Every factual claim in `brief_markdown` must end with one or more `[n]` markers.
-- If no source supports a point, omit the point.
+- If no source supports a point, omit the point — DO NOT speculate.
+- Prefer pinpointing statute sections (e.g. "Barneloven §43") and case names verbatim from the source excerpts.
+- When multiple sources support the same point, cite all of them (e.g. `[2,4]`).
 - Respond in {$locale}.
-- Output valid JSON only — no markdown fences around the JSON.
+- Output valid JSON only — no markdown fences around the JSON object itself.
 PROMPT;
 
         $messages = [
             ['role' => 'system', 'content' => 'You return valid JSON only. No markdown fences.'],
             ['role' => 'user',   'content' => $prompt],
         ];
-        $opts = ['json' => true, 'temperature' => $temperature, 'max_tokens' => 2200, 'timeout' => 120];
+        $opts = ['json' => true, 'temperature' => $temperature, 'max_tokens' => 3200, 'timeout' => 180];
 
         try {
             if ($engine === 'gpu') {