Add chunked timeline routing

This commit is contained in:
2026-05-25 12:34:41 +02:00
parent 75b19f1dcf
commit 17ad54cf36
7 changed files with 521 additions and 31 deletions
+277 -3
View File
@@ -7,6 +7,7 @@ require_once __DIR__ . '/AzureOpenAiGateway.php';
final class DbnLegalToolsService
{
private const MAX_PASTE_CHARS = 128000;
private const MAX_TIMELINE_CHARS = 600000;
private DbnAzureOpenAiGateway $azure;
@@ -353,7 +354,7 @@ PROMPT;
string $userNotes = '',
?callable $onProgress = null
): array {
$text = $this->requirePasteText($text);
$text = $this->requirePasteText($text, self::MAX_TIMELINE_CHARS);
$engine = in_array($engine, ['nova_lite', 'azure_mini', 'azure_full'], true) ? $engine : 'azure_mini';
$focus = in_array($focus, ['all', 'deadlines', 'hearings', 'cps'], true) ? $focus : 'all';
@@ -382,6 +383,23 @@ PROMPT;
? "\n\nUser-provided context notes (use these to resolve ambiguities, not as source events):\n---\n" . $userNotes . "\n---"
: '';
$charCount = mb_strlen($text, 'UTF-8');
$singlePassLimit = $this->timelineSinglePassLimit($engine);
if ($charCount > $singlePassLimit) {
return $this->timelineChunked(
$text,
$language,
$engine,
$focus,
$confidenceFilter,
$includeRelative,
$includeBackground,
$userNotes,
$onProgress,
$inputDateHintCount
);
}
$prompt = <<<PROMPT
Build a chronological timeline from the pasted text in {$locale}.{$userNotesBlock}
@@ -589,6 +607,261 @@ PROMPT;
];
}
/**
 * Builds a timeline for a long paste by extracting each chunk separately and merging the results.
 *
 * The text is split with timelineTextChunks() (engine-specific size, 900 characters of overlap),
 * every chunk is sent through timeline(), and the per-chunk events are tagged with their chunk
 * number and source offset before being de-duplicated by mergeTimelineEvents(). A failing chunk
 * never aborts the run: it is counted, and the deterministic fallback extractor is tried instead.
 *
 * @param string        $text                Full pasted text.
 * @param string        $language            Output language, echoed back in the result.
 * @param string        $engine              Engine key ('nova_lite', 'azure_mini', 'azure_full').
 * @param string        $focus               Event focus ('all', 'deadlines', 'hearings', 'cps').
 * @param string        $confidenceFilter    'high_medium' drops low-confidence events after merging.
 * @param bool          $includeRelative     When false, only events with date_type 'absolute' are kept.
 * @param bool          $includeBackground   Forwarded unchanged to timeline().
 * @param string        $userNotes           Forwarded unchanged to timeline().
 * @param callable|null $onProgress          Optional callback receiving human-readable progress messages.
 * @param int           $inputDateHintCount  Date-hint count of the whole input, reported in trace_metadata.
 *
 * @return array Result payload with the merged events, summary, trace and trace_metadata.
 */
private function timelineChunked(
    string $text,
    string $language,
    string $engine,
    string $focus,
    string $confidenceFilter,
    bool $includeRelative,
    bool $includeBackground,
    string $userNotes,
    ?callable $onProgress,
    int $inputDateHintCount
): array {
    $engineLabels = ['azure_full' => 'gpt-4o', 'nova_lite' => 'nova-lite'];
    $engineLabel = $engineLabels[$engine] ?? 'gpt-4o-mini';

    $segments = $this->timelineTextChunks($text, $this->timelineChunkSize($engine), 900);
    $chunkCount = count($segments);
    $collected = [];
    $chunkFailures = 0;
    $usedFallbackExtractor = false;

    if ($onProgress) {
        $onProgress('Splitting timeline into ' . $chunkCount . " chunk(s)\u{2026}");
    }

    foreach ($segments as $offset => $segment) {
        $chunkNo = $offset + 1;
        $chunkText = trim((string)$segment['text']);
        // Chunks this short are skipped entirely rather than sent for extraction.
        if (mb_strlen($chunkText, 'UTF-8') < 20) {
            continue;
        }

        if ($onProgress) {
            $onProgress("Extracting timeline chunk {$chunkNo}/{$chunkCount}\u{2026}");
        }

        try {
            // Per-chunk progress is suppressed (null callback); this method reports progress itself.
            $result = $this->timeline(
                $chunkText,
                $language,
                $engine,
                $focus,
                $confidenceFilter,
                $includeRelative,
                $includeBackground,
                $userNotes,
                null
            );
            $found = is_array($result['events'] ?? null) ? $result['events'] : [];
            if (!empty($result['trace_metadata']['used_fallback_extractor'])) {
                $usedFallbackExtractor = true;
            }
        } catch (DbnToolsHttpException $httpError) {
            $chunkFailures++;
            // Only attempt the deterministic extractor when the chunk visibly contains date hints.
            $found = $this->timelineDateHintCount($chunkText) > 0
                ? $this->fallbackTimelineEvents($chunkText)
                : [];
            if ($found) {
                $usedFallbackExtractor = true;
            } elseif ($httpError->status >= 500) {
                error_log('timeline chunk failed: ' . $httpError->errorCode . ' ' . $httpError->getMessage());
            }
        } catch (Throwable $error) {
            $chunkFailures++;
            $found = $this->fallbackTimelineEvents($chunkText);
            if ($found) {
                $usedFallbackExtractor = true;
            }
            error_log('timeline chunk throwable: ' . $error->getMessage());
        }

        foreach ($found as $event) {
            if (is_array($event)) {
                $event['chunk_index'] = $chunkNo;
                $event['source_position'] = (int)$segment['start'];
                $collected[] = $event;
            }
        }
    }

    $events = $this->mergeTimelineEvents($collected);

    // The filters are applied again after merging, on the combined event list.
    if ($confidenceFilter === 'high_medium') {
        $events = array_values(array_filter(
            $events,
            static fn($ev) => ($ev['confidence'] ?? 'low') !== 'low'
        ));
    }
    if (!$includeRelative) {
        $events = array_values(array_filter(
            $events,
            static fn($ev) => ($ev['date_type'] ?? 'absolute') === 'absolute'
        ));
    }
    $eventCount = count($events);

    $focusLabels = [
        'deadlines' => 'legal deadlines',
        'hearings' => 'court hearings',
        'cps' => 'CPS milestones',
    ];
    $focusLabel = $focusLabels[$focus] ?? 'all events';

    // Date range for the summary, taken from strictly ISO-formatted (YYYY-MM-DD) dates only.
    $isoDates = [];
    foreach ($events as $ev) {
        $candidate = (string)($ev['date'] ?? '');
        if (preg_match('/^\d{4}-\d{2}-\d{2}$/', $candidate)) {
            $isoDates[] = $candidate;
        }
    }
    sort($isoDates);
    $range = '';
    if ($isoDates) {
        $range = ' from ' . $isoDates[0] . ' to ' . $isoDates[count($isoDates) - 1];
    }

    $actorNames = array_map(static fn($ev) => (string)($ev['actor'] ?? ''), $events);
    $namedActors = array_filter($actorNames, static fn($name) => $name !== '' && $name !== 'unknown');
    $actors = array_values(array_unique($namedActors));

    $summary = $eventCount . " event(s) extracted from {$chunkCount} chunk(s){$range}.";
    if ($actors) {
        $summary .= ' Main actors: ' . implode(', ', array_slice($actors, 0, 8)) . '.';
    }

    $uncertain = [];
    if ($chunkFailures > 0) {
        $uncertain[] = "{$chunkFailures} chunk(s) needed fallback extraction or could not be fully parsed; review medium-confidence entries against the source.";
    }
    if ($usedFallbackExtractor) {
        $uncertain[] = 'Deterministic fallback extraction was used for at least one chunk.';
    }

    $nextStep = 'Review the merged timeline against the original uploaded document, especially duplicated or medium-confidence entries.';

    $trace = [
        $this->trace('Query interpretation', "Extract {$focusLabel} from pasted text. Engine: {$engineLabel}. Chunked timeline mode; without saving the text or output.", 'complete'),
        $this->trace('Search tools used', 'No external corpus search; source is the user-pasted text split into local chunks.', 'complete'),
        $this->trace('Evidence found', $eventCount . " event(s) identified across {$chunkCount} chunk(s).", $eventCount ? 'complete' : 'warning'),
        $this->trace('Citation confidence', 'Confidence is per event and based only on the pasted text. Overlapping chunks were de-duplicated.', 'complete'),
        $this->trace('Uncertainty / missing evidence', $this->uncertaintySummary($uncertain), $uncertain ? 'warning' : 'complete'),
        $this->trace('Next practical step', $nextStep, 'complete'),
    ];

    return [
        'tool' => 'timeline',
        'language' => $language,
        'what_we_found' => $summary,
        'events' => $events,
        'evidence_trail' => [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; split into local timeline chunks; not stored.']],
        'what_remains_uncertain' => $uncertain,
        'next_practical_step' => $nextStep,
        'trace' => $trace,
        'trace_metadata' => [
            'chunk_count' => $eventCount,
            'source_count' => $chunkCount,
            'deployment' => $engineLabel,
            'input_date_hint_count' => $inputDateHintCount,
            'used_fallback_extractor' => $usedFallbackExtractor,
            'chunked_timeline' => true,
            'timeline_chunk_count' => $chunkCount,
            'chunk_failures' => $chunkFailures,
        ],
        'disclaimer' => dbnToolsDisclaimer($language),
    ];
}
/**
 * Returns the largest input, in characters, that the given engine handles in a single pass.
 *
 * Inputs longer than this are routed to the chunked timeline path.
 *
 * @param string $engine Engine key; anything other than 'nova_lite' or 'azure_mini' gets the largest limit.
 */
private function timelineSinglePassLimit(string $engine): int
{
    if ($engine === 'nova_lite') {
        return 25000;
    }
    if ($engine === 'azure_mini') {
        return 55000;
    }

    return 128000;
}
/**
 * Returns the target chunk length, in characters, used when splitting text for the given engine.
 *
 * @param string $engine Engine key; anything other than 'nova_lite' or 'azure_mini' gets the largest size.
 */
private function timelineChunkSize(string $engine): int
{
    $sizes = [
        'nova_lite' => 10000,
        'azure_mini' => 16000,
    ];

    return $sizes[$engine] ?? 30000;
}
/**
 * Splits text into overlapping chunks, preferring to cut at paragraph or line boundaries.
 *
 * Each chunk targets $chunkSize characters. When more text follows, the cut is moved back to the
 * last blank line in the window, or failing that the last newline, provided the break lies beyond
 * roughly 45% of the chunk size; otherwise the chunk is cut hard at the target length. The next
 * chunk starts $overlap characters before the previous cut so events straddling a boundary are
 * seen in full by at least one chunk.
 *
 * @param string $text      Text to split (lengths and offsets are in UTF-8 characters, not bytes).
 * @param int    $chunkSize Target chunk length; values below 1 are treated as 1.
 * @param int    $overlap   Characters shared between consecutive chunks; negative values are treated as 0.
 *
 * @return array List of ['start' => int character offset, 'text' => string trimmed chunk].
 */
private function timelineTextChunks(string $text, int $chunkSize, int $overlap): array
{
    // A non-positive chunk size would never advance the cursor (endless loop), and a negative
    // overlap would push the next start past the cut and silently drop text.
    $chunkSize = max(1, $chunkSize);
    $overlap = max(0, $overlap);

    $len = mb_strlen($text, 'UTF-8');
    $preferredBreak = (int)($chunkSize * 0.55);
    $minimumBreak = (int)($chunkSize * 0.45);
    $chunks = [];
    $start = 0;

    while ($start < $len) {
        $targetEnd = min($len, $start + $chunkSize);
        $window = mb_substr($text, $start, $targetEnd - $start, 'UTF-8');
        $end = $targetEnd;

        if ($targetEnd < $len) {
            // Prefer a paragraph break; fall back to any newline when it is missing or too early.
            $breakAt = mb_strrpos($window, "\n\n", 0, 'UTF-8');
            if ($breakAt === false || $breakAt < $preferredBreak) {
                $breakAt = mb_strrpos($window, "\n", 0, 'UTF-8');
            }
            if ($breakAt !== false && $breakAt > $minimumBreak) {
                $end = $start + $breakAt;
            }
        }

        // Cut from the window already in hand instead of re-scanning the full text.
        $chunkText = trim(
            $end === $targetEnd ? $window : mb_substr($window, 0, $end - $start, 'UTF-8')
        );
        if ($chunkText !== '') {
            $chunks[] = ['start' => $start, 'text' => $chunkText];
        }

        if ($end >= $len) {
            break;
        }

        // Step back by the overlap, but always make forward progress.
        $nextStart = $end - $overlap;
        if ($nextStart <= $start) {
            $nextStart = $end;
        }
        $start = $nextStart;
    }

    return $chunks;
}
/**
 * De-duplicates events and returns them in chronological order.
 *
 * Events sharing a timelineEventSignature() are collapsed into one. The higher-confidence copy
 * wins; the other copy's source excerpt is appended (separated by ' / ') while the kept excerpt
 * is still shorter than 260 characters. Events with an ISO date (YYYY-MM-DD) are ordered by date
 * and then by time; all other events are placed after them.
 *
 * @param array $events Raw events; non-array entries are ignored.
 *
 * @return array Merged, re-indexed and sorted list of events.
 */
private function mergeTimelineEvents(array $events): array
{
    $merged = [];
    foreach ($events as $event) {
        if (!is_array($event)) {
            continue;
        }

        $key = $this->timelineEventSignature($event);
        if (!isset($merged[$key])) {
            $merged[$key] = $event;
            continue;
        }

        $existing = $merged[$key];
        $additionalExcerpt = (string)($event['source_excerpt'] ?? '');
        $candidateRank = $this->timelineConfidenceRank((string)($event['confidence'] ?? 'medium'));
        $existingRank = $this->timelineConfidenceRank((string)($existing['confidence'] ?? 'medium'));
        if ($candidateRank > $existingRank) {
            // The more confident copy replaces the stored one; keep the old excerpt as the extra.
            $merged[$key] = $event;
            $additionalExcerpt = (string)($existing['source_excerpt'] ?? '');
        }

        $keptExcerpt = (string)($merged[$key]['source_excerpt'] ?? '');
        // Skip excerpts already present in the kept text: a plain inequality check would append
        // "a / b / a" when the same event recurs, or repeat an excerpt that is a substring.
        if (
            $additionalExcerpt !== ''
            && $keptExcerpt !== ''
            && !str_contains($keptExcerpt, $additionalExcerpt)
            && mb_strlen($keptExcerpt, 'UTF-8') < 260
        ) {
            $merged[$key]['source_excerpt'] = $keptExcerpt . ' / ' . $additionalExcerpt;
        }
    }

    $events = array_values($merged);

    // Non-ISO or missing dates sort after every real date.
    $sortableDate = static function (array $event): string {
        $date = (string)($event['date'] ?? '');

        return preg_match('/^\d{4}-\d{2}-\d{2}$/', $date) ? $date : '9999-99-99';
    };
    usort($events, static function (array $a, array $b) use ($sortableDate): int {
        $byDate = strcmp($sortableDate($a), $sortableDate($b));
        if ($byDate !== 0) {
            return $byDate;
        }

        return strcmp((string)($a['time'] ?? ''), (string)($b['time'] ?? ''));
    });

    return $events;
}
/**
 * Builds the de-duplication key for an event.
 *
 * The key is "date|time|actor|description", each part trimmed and lower-cased. In the description
 * every run of characters that are not letters or digits becomes a single space, and only the
 * first 96 characters are used. A missing actor counts as 'unknown'.
 *
 * @param array $event Event with optional 'date', 'time', 'actor' and 'event' entries.
 */
private function timelineEventSignature(array $event): string
{
    $normalise = static fn($value): string => mb_strtolower(trim((string)$value), 'UTF-8');

    $description = $normalise($event['event'] ?? '');
    $description = (string)preg_replace('/[^\p{L}\p{N}]+/u', ' ', $description);
    $description = trim((string)preg_replace('/\s+/u', ' ', $description));

    return implode('|', [
        $normalise($event['date'] ?? ''),
        $normalise($event['time'] ?? ''),
        $normalise($event['actor'] ?? 'unknown'),
        mb_substr($description, 0, 96, 'UTF-8'),
    ]);
}
/**
 * Maps a confidence label to a comparable rank: 'high' is 3, 'medium' is 2, anything else is 1.
 *
 * @param string $confidence Confidence label, compared case-sensitively.
 */
private function timelineConfidenceRank(string $confidence): int
{
    if ($confidence === 'high') {
        return 3;
    }
    if ($confidence === 'medium') {
        return 2;
    }

    return 1;
}
private function timelineDateHintCount(string $text): int
{
preg_match_all('/(?<!\d)\d{1,2}\.\d{1,2}\.(?:\d{2,4})?(?!\d)/u', $text, $numeric);
@@ -1106,13 +1379,14 @@ PROMPT;
return array_slice(array_values(array_unique($terms)), 0, 6);
}
private function requirePasteText(string $text): string
private function requirePasteText(string $text, ?int $maxChars = null): string
{
$text = trim($text);
if (mb_strlen($text, 'UTF-8') < 20) {
dbnToolsAbort('Paste at least 20 characters of text.', 422, 'text_too_short');
}
if (mb_strlen($text, 'UTF-8') > self::MAX_PASTE_CHARS) {
$maxChars ??= self::MAX_PASTE_CHARS;
if (mb_strlen($text, 'UTF-8') > $maxChars) {
dbnToolsAbort('Pasted text is too long for the MVP limit.', 422, 'text_too_long');
}
return $text;