fix: batch embed 5 chunks at a time with flush between; fix hydrateSourceUrls SQL

Embed timeout: bnl_corpus Ollama embeds ~49 chunks sequentially in CPU mode,
easily exceeding the 60s cURL timeout. Each uploaded file's text is now
truncated to MAX_UPLOAD_CHARS before chunking (~21 chunks max), and the chunks
are embedded in batches of 5, with a progress event emitted before each batch
to keep the stream alive.

SQL error: bnl_corpus.documents lacks the temporal columns added in migration
136 (valid_from, valid_until, etc.). dbnV6QueryDocumentMeta uses IFNULL, which
guards against NULL values but not against missing columns. hydrateSourceUrls
now runs a direct query that selects only the columns confirmed to exist on
this instance.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-15 11:42:38 +02:00
parent d2f9831472
commit 785de04f05
+67 -7
View File
@@ -130,15 +130,31 @@ final class DbnDeepResearchAgent
$uploadChunks = [];
foreach ($uploadedFiles as $idx => $file) {
$filename = (string)($file['filename'] ?? sprintf('upload-%d', $idx + 1));
$text = (string)($file['text'] ?? '');
// Truncate to MAX_UPLOAD_CHARS before chunking to cap the embedding batch size
$text = mb_substr((string)($file['text'] ?? ''), 0, self::MAX_UPLOAD_CHARS, 'UTF-8');
$uploadChunks = array_merge($uploadChunks, $this->splitIntoChunks($text, $filename, $idx));
}
$uploadStatus = 'complete';
$uploadDetail = sprintf('%d upload file(s) → %d in-memory chunks indexed with nomic-embed-text.', count($uploadedFiles), count($uploadChunks));
if ($uploadChunks) {
try {
// Embed in small batches of 5, emitting progress between each so the stream
// stays alive during slow CPU Ollama inference (nomic-embed-text on chloe).
$texts = array_map(fn(array $c) => $c['text'], $uploadChunks);
$vecs = dbnToolsLiteLLMEmbedBatch($texts);
$allVecs = [];
$batchSize = 5;
for ($b = 0; $b < count($texts); $b += $batchSize) {
$batch = array_slice($texts, $b, $batchSize);
if ($emit) {
$emit('progress', ['detail' => sprintf(
'Embedding chunks %d%d of %d…',
$b + 1, $b + count($batch), count($texts)
)]);
}
$batchVecs = dbnToolsLiteLLMEmbedBatch($batch);
$allVecs = array_merge($allVecs, $batchVecs);
}
$vecs = $allVecs;
if (count($vecs) === count($uploadChunks)) {
foreach ($uploadChunks as $i => $chunk) {
$this->uploadVecs[] = [
@@ -153,7 +169,7 @@ final class DbnDeepResearchAgent
} catch (Throwable $e) {
error_log('DBN deep research upload embed failed: ' . $e->getMessage());
$uploadStatus = 'warning';
$uploadDetail = 'Upload embedding gateway unreachable; uploaded chunks will not participate in retrieval.';
$uploadDetail = 'Upload embedding timed out; uploaded chunks will not participate in retrieval.';
$this->uploadVecs = [];
}
} elseif (empty($uploadedFiles)) {
@@ -628,7 +644,8 @@ PROMPT;
/**
* Hydrate the synthesisPool in place with source_url/deep_link/authority_label/etc.
* One batched dbn_v6 query for all unique document_ids.
* Uses a direct query against bnl_corpus.documents (only columns that exist there —
* the temporal columns added in migration 136 are absent on this instance).
*/
private function hydrateSourceUrls(array &$pool): void
{
@@ -641,7 +658,50 @@ PROMPT;
if (empty($docIds)) return;
try {
$meta = dbnV6QueryDocumentMeta(dbnToolsDb(), dbnToolsRagDb(), array_keys($docIds));
$ragDb = dbnToolsRagDb();
$ids = array_keys($docIds);
$ph = implode(',', array_fill(0, count($ids), '?'));
$stmt = $ragDb->prepare("
SELECT d.id, d.title, d.source_url, d.authority_type,
d.publication_date, d.source_id, d.jurisdiction
FROM documents d
WHERE d.id IN ({$ph})
");
$stmt->execute($ids);
$docMeta = [];
$sourceIds = [];
foreach ($stmt as $row) {
$dId = (int)$row['id'];
$sid = isset($row['source_id']) ? (int)$row['source_id'] : null;
if ($sid) $sourceIds[] = $sid;
$docMeta[$dId] = [
'source_url' => $row['source_url'] ?? null,
'authority_label' => dbnV6AuthorityLabel($row['authority_type'] ?? null),
'publication_date' => $row['publication_date'] ?? null,
'corpus_source_name' => 'Do Better Legal',
'source_id' => $sid,
];
}
// Enrich with corpus source name from bnl_admin.corpus_sources
if (!empty($sourceIds)) {
$uSids = array_values(array_unique($sourceIds));
$sPh = implode(',', array_fill(0, count($uSids), '?'));
$sStmt = dbnToolsDb()->prepare("SELECT id, name FROM corpus_sources WHERE id IN ({$sPh})");
$sStmt->execute($uSids);
$srcNames = [];
foreach ($sStmt as $row) {
$srcNames[(int)$row['id']] = dbnV6RepairText((string)($row['name'] ?? 'Do Better Legal'));
}
foreach ($docMeta as &$m) {
if ($m['source_id'] && isset($srcNames[$m['source_id']])) {
$m['corpus_source_name'] = $srcNames[$m['source_id']];
}
}
unset($m);
}
} catch (Throwable $e) {
error_log('DBN deep research hydrateSourceUrls failed: ' . $e->getMessage());
return;
@@ -650,8 +710,8 @@ PROMPT;
foreach ($pool as &$chunk) {
if (($chunk['source_origin'] ?? 'corpus') !== 'corpus') continue;
$docId = (int)($chunk['document_id'] ?? 0);
if (!$docId || !isset($meta[$docId])) continue;
$m = $meta[$docId];
if (!$docId || !isset($docMeta[$docId])) continue;
$m = $docMeta[$docId];
$sourceUrl = $m['source_url'] ?? null;
$chunk['source_url'] = $sourceUrl;
$chunk['deep_link'] = $this->buildDeepLink($sourceUrl, $chunk['section'] ?? null);