fix: batch embed 5 chunks at a time with flush between; fix hydrateSourceUrls SQL

Embed timeout: bnl_corpus Ollama embeds ~49 chunks sequentially in CPU mode, easily exceeding the 60s cURL timeout. The agent now truncates upload text to MAX_UPLOAD_CHARS before chunking (~21 chunks max) and embeds in batches of 5, emitting a progress event before each batch to keep the stream alive. SQL error: bnl_corpus.documents lacks the temporal columns added in migration 136 (valid_from, valid_until, etc.). dbnV6QueryDocumentMeta uses IFNULL, which doesn't protect against missing columns. hydrateSourceUrls now replaces that call with a direct query using only the columns confirmed to exist on this instance, plus a follow-up lookup of corpus source names. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-15 11:42:38 +02:00
parent d2f9831472
commit 785de04f05
1 changed files with 67 additions and 7 deletions
@@ -130,15 +130,31 @@ final class DbnDeepResearchAgent
        $uploadChunks = [];
        foreach ($uploadedFiles as $idx => $file) {
            $filename = (string)($file['filename'] ?? sprintf('upload-%d', $idx + 1));
-            $text = (string)($file['text'] ?? '');
+            // Truncate to MAX_UPLOAD_CHARS before chunking to cap the embedding batch size
+            $text = mb_substr((string)($file['text'] ?? ''), 0, self::MAX_UPLOAD_CHARS, 'UTF-8');
            $uploadChunks = array_merge($uploadChunks, $this->splitIntoChunks($text, $filename, $idx));
        }
        $uploadStatus = 'complete';
        $uploadDetail = sprintf('%d upload file(s) → %d in-memory chunks indexed with nomic-embed-text.', count($uploadedFiles), count($uploadChunks));
        if ($uploadChunks) {
            try {
+                // Embed in small batches of 5, emitting progress between each so the stream
+                // stays alive during slow CPU Ollama inference (nomic-embed-text on chloe).
                $texts = array_map(fn(array $c) => $c['text'], $uploadChunks);
-                $vecs = dbnToolsLiteLLMEmbedBatch($texts);
+                $allVecs = [];
+                $batchSize = 5;
+                for ($b = 0; $b < count($texts); $b += $batchSize) {
+                    $batch = array_slice($texts, $b, $batchSize);
+                    if ($emit) {
+                        $emit('progress', ['detail' => sprintf(
+                            'Embedding chunks %d–%d of %d…',
+                            $b + 1, $b + count($batch), count($texts)
+                        )]);
+                    }
+                    $batchVecs = dbnToolsLiteLLMEmbedBatch($batch);
+                    $allVecs = array_merge($allVecs, $batchVecs);
+                }
+                $vecs = $allVecs;
                if (count($vecs) === count($uploadChunks)) {
                    foreach ($uploadChunks as $i => $chunk) {
                        $this->uploadVecs[] = [
@@ -153,7 +169,7 @@ final class DbnDeepResearchAgent
            } catch (Throwable $e) {
                error_log('DBN deep research upload embed failed: ' . $e->getMessage());
                $uploadStatus = 'warning';
-                $uploadDetail = 'Upload embedding gateway unreachable; uploaded chunks will not participate in retrieval.';
+                $uploadDetail = 'Upload embedding timed out; uploaded chunks will not participate in retrieval.';
                $this->uploadVecs = [];
            }
        } elseif (empty($uploadedFiles)) {
@@ -628,7 +644,8 @@ PROMPT;

    /**
     * Hydrate the synthesisPool in place with source_url/deep_link/authority_label/etc.
-     * One batched dbn_v6 query for all unique document_ids.
+     * Queries bnl_corpus.documents directly, selecting only columns that exist there (the temporal
+     * columns added in migration 136 are absent on this instance), then looks up corpus_sources names.
     */
    private function hydrateSourceUrls(array &$pool): void
    {
@@ -641,7 +658,50 @@ PROMPT;
        if (empty($docIds)) return;

        try {
-            $meta = dbnV6QueryDocumentMeta(dbnToolsDb(), dbnToolsRagDb(), array_keys($docIds));
+            $ragDb = dbnToolsRagDb();
+            $ids   = array_keys($docIds);
+            $ph    = implode(',', array_fill(0, count($ids), '?'));
+
+            $stmt = $ragDb->prepare("
+                SELECT d.id, d.title, d.source_url, d.authority_type,
+                       d.publication_date, d.source_id, d.jurisdiction
+                FROM documents d
+                WHERE d.id IN ({$ph})
+            ");
+            $stmt->execute($ids);
+
+            $docMeta  = [];
+            $sourceIds = [];
+            foreach ($stmt as $row) {
+                $dId = (int)$row['id'];
+                $sid = isset($row['source_id']) ? (int)$row['source_id'] : null;
+                if ($sid) $sourceIds[] = $sid;
+                $docMeta[$dId] = [
+                    'source_url'       => $row['source_url']       ?? null,
+                    'authority_label'  => dbnV6AuthorityLabel($row['authority_type'] ?? null),
+                    'publication_date' => $row['publication_date'] ?? null,
+                    'corpus_source_name' => 'Do Better Legal',
+                    'source_id'        => $sid,
+                ];
+            }
+
+            // Enrich with corpus source name from bnl_admin.corpus_sources
+            if (!empty($sourceIds)) {
+                $uSids = array_values(array_unique($sourceIds));
+                $sPh   = implode(',', array_fill(0, count($uSids), '?'));
+                $sStmt = dbnToolsDb()->prepare("SELECT id, name FROM corpus_sources WHERE id IN ({$sPh})");
+                $sStmt->execute($uSids);
+                $srcNames = [];
+                foreach ($sStmt as $row) {
+                    $srcNames[(int)$row['id']] = dbnV6RepairText((string)($row['name'] ?? 'Do Better Legal'));
+                }
+                foreach ($docMeta as &$m) {
+                    if ($m['source_id'] && isset($srcNames[$m['source_id']])) {
+                        $m['corpus_source_name'] = $srcNames[$m['source_id']];
+                    }
+                }
+                unset($m);
+            }
        } catch (Throwable $e) {
            error_log('DBN deep research hydrateSourceUrls failed: ' . $e->getMessage());
            return;
@@ -650,8 +710,8 @@ PROMPT;
        foreach ($pool as &$chunk) {
            if (($chunk['source_origin'] ?? 'corpus') !== 'corpus') continue;
            $docId = (int)($chunk['document_id'] ?? 0);
-            if (!$docId || !isset($meta[$docId])) continue;
-            $m = $meta[$docId];
+            if (!$docId || !isset($docMeta[$docId])) continue;
+            $m = $docMeta[$docId];
            $sourceUrl = $m['source_url'] ?? null;
            $chunk['source_url']         = $sourceUrl;
            $chunk['deep_link']          = $this->buildDeepLink($sourceUrl, $chunk['section'] ?? null);