fix: batch embed 5 chunks at a time with flush between; fix hydrateSourceUrls SQL
Embed timeout: bnl_corpus Ollama embeds ~49 chunks sequentially in CPU mode, which easily exceeds the 60s cURL timeout. Upload text is now truncated to MAX_UPLOAD_CHARS before chunking (~21 chunks max) and embedded in batches of 5, with a progress flush between batches to keep the stream alive. SQL error: bnl_corpus.documents lacks the temporal columns added in migration 136 (valid_from, valid_until, etc.), and the IFNULL used by dbnV6QueryDocumentMeta does not protect against missing columns. The call is replaced with a direct query that uses only the columns confirmed to exist on this instance. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -130,15 +130,31 @@ final class DbnDeepResearchAgent
|
||||
$uploadChunks = [];
|
||||
foreach ($uploadedFiles as $idx => $file) {
|
||||
$filename = (string)($file['filename'] ?? sprintf('upload-%d', $idx + 1));
|
||||
$text = (string)($file['text'] ?? '');
|
||||
// Truncate to MAX_UPLOAD_CHARS before chunking to cap the embedding batch size
|
||||
$text = mb_substr((string)($file['text'] ?? ''), 0, self::MAX_UPLOAD_CHARS, 'UTF-8');
|
||||
$uploadChunks = array_merge($uploadChunks, $this->splitIntoChunks($text, $filename, $idx));
|
||||
}
|
||||
$uploadStatus = 'complete';
|
||||
$uploadDetail = sprintf('%d upload file(s) → %d in-memory chunks indexed with nomic-embed-text.', count($uploadedFiles), count($uploadChunks));
|
||||
if ($uploadChunks) {
|
||||
try {
|
||||
// Embed in small batches of 5, emitting progress between each so the stream
|
||||
// stays alive during slow CPU Ollama inference (nomic-embed-text on chloe).
|
||||
$texts = array_map(fn(array $c) => $c['text'], $uploadChunks);
|
||||
$vecs = dbnToolsLiteLLMEmbedBatch($texts);
|
||||
$allVecs = [];
|
||||
$batchSize = 5;
|
||||
for ($b = 0; $b < count($texts); $b += $batchSize) {
|
||||
$batch = array_slice($texts, $b, $batchSize);
|
||||
if ($emit) {
|
||||
$emit('progress', ['detail' => sprintf(
|
||||
'Embedding chunks %d–%d of %d…',
|
||||
$b + 1, $b + count($batch), count($texts)
|
||||
)]);
|
||||
}
|
||||
$batchVecs = dbnToolsLiteLLMEmbedBatch($batch);
|
||||
$allVecs = array_merge($allVecs, $batchVecs);
|
||||
}
|
||||
$vecs = $allVecs;
|
||||
if (count($vecs) === count($uploadChunks)) {
|
||||
foreach ($uploadChunks as $i => $chunk) {
|
||||
$this->uploadVecs[] = [
|
||||
@@ -153,7 +169,7 @@ final class DbnDeepResearchAgent
|
||||
} catch (Throwable $e) {
|
||||
error_log('DBN deep research upload embed failed: ' . $e->getMessage());
|
||||
$uploadStatus = 'warning';
|
||||
$uploadDetail = 'Upload embedding gateway unreachable; uploaded chunks will not participate in retrieval.';
|
||||
$uploadDetail = 'Upload embedding timed out; uploaded chunks will not participate in retrieval.';
|
||||
$this->uploadVecs = [];
|
||||
}
|
||||
} elseif (empty($uploadedFiles)) {
|
||||
@@ -628,7 +644,8 @@ PROMPT;
|
||||
|
||||
/**
|
||||
* Hydrate the synthesisPool in place with source_url/deep_link/authority_label/etc.
|
||||
* One batched documents query for all unique document_ids, plus one batched corpus_sources lookup for source names.
|
||||
* Uses a direct query against bnl_corpus.documents (only columns that exist there —
|
||||
* the temporal columns added in migration 136 are absent on this instance).
|
||||
*/
|
||||
private function hydrateSourceUrls(array &$pool): void
|
||||
{
|
||||
@@ -641,7 +658,50 @@ PROMPT;
|
||||
if (empty($docIds)) return;
|
||||
|
||||
try {
|
||||
$meta = dbnV6QueryDocumentMeta(dbnToolsDb(), dbnToolsRagDb(), array_keys($docIds));
|
||||
$ragDb = dbnToolsRagDb();
|
||||
$ids = array_keys($docIds);
|
||||
$ph = implode(',', array_fill(0, count($ids), '?'));
|
||||
|
||||
$stmt = $ragDb->prepare("
|
||||
SELECT d.id, d.title, d.source_url, d.authority_type,
|
||||
d.publication_date, d.source_id, d.jurisdiction
|
||||
FROM documents d
|
||||
WHERE d.id IN ({$ph})
|
||||
");
|
||||
$stmt->execute($ids);
|
||||
|
||||
$docMeta = [];
|
||||
$sourceIds = [];
|
||||
foreach ($stmt as $row) {
|
||||
$dId = (int)$row['id'];
|
||||
$sid = isset($row['source_id']) ? (int)$row['source_id'] : null;
|
||||
if ($sid) $sourceIds[] = $sid;
|
||||
$docMeta[$dId] = [
|
||||
'source_url' => $row['source_url'] ?? null,
|
||||
'authority_label' => dbnV6AuthorityLabel($row['authority_type'] ?? null),
|
||||
'publication_date' => $row['publication_date'] ?? null,
|
||||
'corpus_source_name' => 'Do Better Legal',
|
||||
'source_id' => $sid,
|
||||
];
|
||||
}
|
||||
|
||||
// Enrich with corpus source name from bnl_admin.corpus_sources
|
||||
if (!empty($sourceIds)) {
|
||||
$uSids = array_values(array_unique($sourceIds));
|
||||
$sPh = implode(',', array_fill(0, count($uSids), '?'));
|
||||
$sStmt = dbnToolsDb()->prepare("SELECT id, name FROM corpus_sources WHERE id IN ({$sPh})");
|
||||
$sStmt->execute($uSids);
|
||||
$srcNames = [];
|
||||
foreach ($sStmt as $row) {
|
||||
$srcNames[(int)$row['id']] = dbnV6RepairText((string)($row['name'] ?? 'Do Better Legal'));
|
||||
}
|
||||
foreach ($docMeta as &$m) {
|
||||
if ($m['source_id'] && isset($srcNames[$m['source_id']])) {
|
||||
$m['corpus_source_name'] = $srcNames[$m['source_id']];
|
||||
}
|
||||
}
|
||||
unset($m);
|
||||
}
|
||||
} catch (Throwable $e) {
|
||||
error_log('DBN deep research hydrateSourceUrls failed: ' . $e->getMessage());
|
||||
return;
|
||||
@@ -650,8 +710,8 @@ PROMPT;
|
||||
foreach ($pool as &$chunk) {
|
||||
if (($chunk['source_origin'] ?? 'corpus') !== 'corpus') continue;
|
||||
$docId = (int)($chunk['document_id'] ?? 0);
|
||||
if (!$docId || !isset($meta[$docId])) continue;
|
||||
$m = $meta[$docId];
|
||||
if (!$docId || !isset($docMeta[$docId])) continue;
|
||||
$m = $docMeta[$docId];
|
||||
$sourceUrl = $m['source_url'] ?? null;
|
||||
$chunk['source_url'] = $sourceUrl;
|
||||
$chunk['deep_link'] = $this->buildDeepLink($sourceUrl, $chunk['section'] ?? null);
|
||||
|
||||
Reference in New Issue
Block a user