From 785de04f05a86a038a968e40f13a13fc068a1a68 Mon Sep 17 00:00:00 2001
From: davegilligan <davegilligan73@gmail.com>
Date: Fri, 15 May 2026 11:42:38 +0200
Subject: [PATCH] fix: batch embed 5 chunks at a time with flush between; fix
 hydrateSourceUrls SQL

Embed timeout: bnl_corpus Ollama embeds ~49 chunks sequentially in CPU mode,
easily exceeding the 60s cURL timeout. Now truncates upload text to
MAX_UPLOAD_CHARS before chunking (~21 chunks max) and embeds in batches of 5
with a progress flush between batches to keep the stream alive.

SQL error: bnl_corpus.documents lacks the temporal columns added in migration
136 (valid_from, valid_until, etc.). dbnV6QueryDocumentMeta uses IFNULL, which
only substitutes a default for NULL values; it does not protect against
columns that are missing from the table, so the query still fails. Replaced
with a direct query using only the columns confirmed to exist on this instance.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 includes/DeepResearchAgent.php | 74 ++++++++++++++++++++++++++++++----
 1 file changed, 67 insertions(+), 7 deletions(-)

diff --git a/includes/DeepResearchAgent.php b/includes/DeepResearchAgent.php
index 44022f9..5a96a49 100644
--- a/includes/DeepResearchAgent.php
+++ b/includes/DeepResearchAgent.php
@@ -130,15 +130,31 @@ final class DbnDeepResearchAgent
         $uploadChunks = [];
         foreach ($uploadedFiles as $idx => $file) {
             $filename = (string)($file['filename'] ?? sprintf('upload-%d', $idx + 1));
-            $text = (string)($file['text'] ?? '');
+            // Truncate to MAX_UPLOAD_CHARS before chunking to cap the embedding batch size
+            $text = mb_substr((string)($file['text'] ?? ''), 0, self::MAX_UPLOAD_CHARS, 'UTF-8');
             $uploadChunks = array_merge($uploadChunks, $this->splitIntoChunks($text, $filename, $idx));
         }
         $uploadStatus = 'complete';
         $uploadDetail = sprintf('%d upload file(s) → %d in-memory chunks indexed with nomic-embed-text.', count($uploadedFiles), count($uploadChunks));
         if ($uploadChunks) {
             try {
+                // Embed in small batches of 5, emitting progress between each so the stream
+                // stays alive during slow CPU Ollama inference (nomic-embed-text on chloe).
                 $texts = array_map(fn(array $c) => $c['text'], $uploadChunks);
-                $vecs = dbnToolsLiteLLMEmbedBatch($texts);
+                $allVecs = [];
+                $batchSize = 5;
+                for ($b = 0; $b < count($texts); $b += $batchSize) {
+                    $batch = array_slice($texts, $b, $batchSize);
+                    if ($emit) {
+                        $emit('progress', ['detail' => sprintf(
+                            'Embedding chunks %d–%d of %d…',
+                            $b + 1, $b + count($batch), count($texts)
+                        )]);
+                    }
+                    $batchVecs = dbnToolsLiteLLMEmbedBatch($batch);
+                    $allVecs = array_merge($allVecs, $batchVecs);
+                }
+                $vecs = $allVecs;
                 if (count($vecs) === count($uploadChunks)) {
                     foreach ($uploadChunks as $i => $chunk) {
                         $this->uploadVecs[] = [
@@ -153,7 +169,7 @@ final class DbnDeepResearchAgent
             } catch (Throwable $e) {
                 error_log('DBN deep research upload embed failed: ' . $e->getMessage());
                 $uploadStatus = 'warning';
-                $uploadDetail = 'Upload embedding gateway unreachable; uploaded chunks will not participate in retrieval.';
+                $uploadDetail = 'Upload embedding timed out; uploaded chunks will not participate in retrieval.';
                 $this->uploadVecs = [];
             }
         } elseif (empty($uploadedFiles)) {
@@ -628,7 +644,8 @@ PROMPT;
 
     /**
      * Hydrate the synthesisPool in place with source_url/deep_link/authority_label/etc.
-     * One batched dbn_v6 query for all unique document_ids.
+     * Uses a direct query against bnl_corpus.documents (only columns that exist there —
+     * the temporal columns added in migration 136 are absent on this instance).
      */
     private function hydrateSourceUrls(array &$pool): void
     {
@@ -641,7 +658,50 @@ PROMPT;
         if (empty($docIds)) return;
 
         try {
-            $meta = dbnV6QueryDocumentMeta(dbnToolsDb(), dbnToolsRagDb(), array_keys($docIds));
+            $ragDb = dbnToolsRagDb();
+            $ids   = array_keys($docIds);
+            $ph    = implode(',', array_fill(0, count($ids), '?'));
+
+            $stmt = $ragDb->prepare("
+                SELECT d.id, d.title, d.source_url, d.authority_type,
+                       d.publication_date, d.source_id, d.jurisdiction
+                FROM documents d
+                WHERE d.id IN ({$ph})
+            ");
+            $stmt->execute($ids);
+
+            $docMeta  = [];
+            $sourceIds = [];
+            foreach ($stmt as $row) {
+                $dId = (int)$row['id'];
+                $sid = isset($row['source_id']) ? (int)$row['source_id'] : null;
+                if ($sid) $sourceIds[] = $sid;
+                $docMeta[$dId] = [
+                    'source_url'       => $row['source_url']       ?? null,
+                    'authority_label'  => dbnV6AuthorityLabel($row['authority_type'] ?? null),
+                    'publication_date' => $row['publication_date'] ?? null,
+                    'corpus_source_name' => 'Do Better Legal',
+                    'source_id'        => $sid,
+                ];
+            }
+
+            // Enrich with corpus source name from bnl_admin.corpus_sources
+            if (!empty($sourceIds)) {
+                $uSids = array_values(array_unique($sourceIds));
+                $sPh   = implode(',', array_fill(0, count($uSids), '?'));
+                $sStmt = dbnToolsDb()->prepare("SELECT id, name FROM corpus_sources WHERE id IN ({$sPh})");
+                $sStmt->execute($uSids);
+                $srcNames = [];
+                foreach ($sStmt as $row) {
+                    $srcNames[(int)$row['id']] = dbnV6RepairText((string)($row['name'] ?? 'Do Better Legal'));
+                }
+                foreach ($docMeta as &$m) {
+                    if ($m['source_id'] && isset($srcNames[$m['source_id']])) {
+                        $m['corpus_source_name'] = $srcNames[$m['source_id']];
+                    }
+                }
+                unset($m);
+            }
         } catch (Throwable $e) {
             error_log('DBN deep research hydrateSourceUrls failed: ' . $e->getMessage());
             return;
@@ -650,8 +710,8 @@ PROMPT;
         foreach ($pool as &$chunk) {
             if (($chunk['source_origin'] ?? 'corpus') !== 'corpus') continue;
             $docId = (int)($chunk['document_id'] ?? 0);
-            if (!$docId || !isset($meta[$docId])) continue;
-            $m = $meta[$docId];
+            if (!$docId || !isset($docMeta[$docId])) continue;
+            $m = $docMeta[$docId];
             $sourceUrl = $m['source_url'] ?? null;
             $chunk['source_url']         = $sourceUrl;
             $chunk['deep_link']          = $this->buildDeepLink($sourceUrl, $chunk['section'] ?? null);