From 785de04f05a86a038a968e40f13a13fc068a1a68 Mon Sep 17 00:00:00 2001 From: davegilligan Date: Fri, 15 May 2026 11:42:38 +0200 Subject: [PATCH] fix: batch embed 5 chunks at a time with flush between; fix hydrateSourceUrls SQL Embed timeout: bnl_corpus Ollama embeds ~49 chunks sequentially in CPU mode, easily exceeding the 60s cURL timeout. Now truncates upload text to MAX_UPLOAD_CHARS before chunking (~21 chunks max) and embeds in batches of 5 with a progress flush between batches to keep the stream alive. SQL error: bnl_corpus.documents lacks the temporal columns added in migration 136 (valid_from, valid_until, etc.). dbnV6QueryDocumentMeta uses IFNULL which doesn't protect against missing columns. Replaced with a direct query using only the columns confirmed to exist on this instance. Co-Authored-By: Claude Opus 4.7 --- includes/DeepResearchAgent.php | 74 ++++++++++++++++++++++++++++++---- 1 file changed, 67 insertions(+), 7 deletions(-) diff --git a/includes/DeepResearchAgent.php b/includes/DeepResearchAgent.php index 44022f9..5a96a49 100644 --- a/includes/DeepResearchAgent.php +++ b/includes/DeepResearchAgent.php @@ -130,15 +130,31 @@ final class DbnDeepResearchAgent $uploadChunks = []; foreach ($uploadedFiles as $idx => $file) { $filename = (string)($file['filename'] ?? sprintf('upload-%d', $idx + 1)); - $text = (string)($file['text'] ?? ''); + // Truncate to MAX_UPLOAD_CHARS before chunking to cap the embedding batch size + $text = mb_substr((string)($file['text'] ?? ''), 0, self::MAX_UPLOAD_CHARS, 'UTF-8'); $uploadChunks = array_merge($uploadChunks, $this->splitIntoChunks($text, $filename, $idx)); } $uploadStatus = 'complete'; $uploadDetail = sprintf('%d upload file(s) → %d in-memory chunks indexed with nomic-embed-text.', count($uploadedFiles), count($uploadChunks)); if ($uploadChunks) { try { + // Embed in small batches of 5, emitting progress between each so the stream + // stays alive during slow CPU Ollama inference (nomic-embed-text on chloe). $texts = array_map(fn(array $c) => $c['text'], $uploadChunks); - $vecs = dbnToolsLiteLLMEmbedBatch($texts); + $allVecs = []; + $batchSize = 5; + for ($b = 0; $b < count($texts); $b += $batchSize) { + $batch = array_slice($texts, $b, $batchSize); + if ($emit) { + $emit('progress', ['detail' => sprintf( + 'Embedding chunks %d–%d of %d…', + $b + 1, $b + count($batch), count($texts) + )]); + } + $batchVecs = dbnToolsLiteLLMEmbedBatch($batch); + $allVecs = array_merge($allVecs, $batchVecs); + } + $vecs = $allVecs; if (count($vecs) === count($uploadChunks)) { foreach ($uploadChunks as $i => $chunk) { $this->uploadVecs[] = [ @@ -153,7 +169,7 @@ final class DbnDeepResearchAgent } catch (Throwable $e) { error_log('DBN deep research upload embed failed: ' . $e->getMessage()); $uploadStatus = 'warning'; - $uploadDetail = 'Upload embedding gateway unreachable; uploaded chunks will not participate in retrieval.'; + $uploadDetail = 'Upload embedding timed out; uploaded chunks will not participate in retrieval.'; $this->uploadVecs = []; } } elseif (empty($uploadedFiles)) { @@ -628,7 +644,8 @@ PROMPT; /** * Hydrate the synthesisPool in place with source_url/deep_link/authority_label/etc. - * One batched dbn_v6 query for all unique document_ids. + * Uses a direct query against bnl_corpus.documents (only columns that exist there — + * the temporal columns added in migration 136 are absent on this instance). */ private function hydrateSourceUrls(array &$pool): void { @@ -641,7 +658,50 @@ PROMPT; if (empty($docIds)) return; try { - $meta = dbnV6QueryDocumentMeta(dbnToolsDb(), dbnToolsRagDb(), array_keys($docIds)); + $ragDb = dbnToolsRagDb(); + $ids = array_keys($docIds); + $ph = implode(',', array_fill(0, count($ids), '?')); + + $stmt = $ragDb->prepare(" + SELECT d.id, d.title, d.source_url, d.authority_type, + d.publication_date, d.source_id, d.jurisdiction + FROM documents d + WHERE d.id IN ({$ph}) + "); + $stmt->execute($ids); + + $docMeta = []; + $sourceIds = []; + foreach ($stmt as $row) { + $dId = (int)$row['id']; + $sid = isset($row['source_id']) ? (int)$row['source_id'] : null; + if ($sid) $sourceIds[] = $sid; + $docMeta[$dId] = [ + 'source_url' => $row['source_url'] ?? null, + 'authority_label' => dbnV6AuthorityLabel($row['authority_type'] ?? null), + 'publication_date' => $row['publication_date'] ?? null, + 'corpus_source_name' => 'Do Better Legal', + 'source_id' => $sid, + ]; + } + + // Enrich with corpus source name from bnl_admin.corpus_sources + if (!empty($sourceIds)) { + $uSids = array_values(array_unique($sourceIds)); + $sPh = implode(',', array_fill(0, count($uSids), '?')); + $sStmt = dbnToolsDb()->prepare("SELECT id, name FROM corpus_sources WHERE id IN ({$sPh})"); + $sStmt->execute($uSids); + $srcNames = []; + foreach ($sStmt as $row) { + $srcNames[(int)$row['id']] = dbnV6RepairText((string)($row['name'] ?? 'Do Better Legal')); + } + foreach ($docMeta as &$m) { + if ($m['source_id'] && isset($srcNames[$m['source_id']])) { + $m['corpus_source_name'] = $srcNames[$m['source_id']]; + } + } + unset($m); + } } catch (Throwable $e) { error_log('DBN deep research hydrateSourceUrls failed: ' . $e->getMessage()); return; @@ -650,8 +710,8 @@ PROMPT; foreach ($pool as &$chunk) { if (($chunk['source_origin'] ?? 'corpus') !== 'corpus') continue; $docId = (int)($chunk['document_id'] ?? 0); - if (!$docId || !isset($meta[$docId])) continue; - $m = $meta[$docId]; + if (!$docId || !isset($docMeta[$docId])) continue; + $m = $docMeta[$docId]; $sourceUrl = $m['source_url'] ?? null; $chunk['source_url'] = $sourceUrl; $chunk['deep_link'] = $this->buildDeepLink($sourceUrl, $chunk['section'] ?? null);