Add chunked timeline routing
This commit is contained in:
+277
-3
@@ -7,6 +7,7 @@ require_once __DIR__ . '/AzureOpenAiGateway.php';
|
||||
final class DbnLegalToolsService
|
||||
{
|
||||
private const MAX_PASTE_CHARS = 128000;
|
||||
private const MAX_TIMELINE_CHARS = 600000;
|
||||
|
||||
private DbnAzureOpenAiGateway $azure;
|
||||
|
||||
@@ -353,7 +354,7 @@ PROMPT;
|
||||
string $userNotes = '',
|
||||
?callable $onProgress = null
|
||||
): array {
|
||||
$text = $this->requirePasteText($text);
|
||||
$text = $this->requirePasteText($text, self::MAX_TIMELINE_CHARS);
|
||||
$engine = in_array($engine, ['nova_lite', 'azure_mini', 'azure_full'], true) ? $engine : 'azure_mini';
|
||||
$focus = in_array($focus, ['all', 'deadlines', 'hearings', 'cps'], true) ? $focus : 'all';
|
||||
|
||||
@@ -382,6 +383,23 @@ PROMPT;
|
||||
? "\n\nUser-provided context notes (use these to resolve ambiguities, not as source events):\n---\n" . $userNotes . "\n---"
|
||||
: '';
|
||||
|
||||
$charCount = mb_strlen($text, 'UTF-8');
|
||||
$singlePassLimit = $this->timelineSinglePassLimit($engine);
|
||||
if ($charCount > $singlePassLimit) {
|
||||
return $this->timelineChunked(
|
||||
$text,
|
||||
$language,
|
||||
$engine,
|
||||
$focus,
|
||||
$confidenceFilter,
|
||||
$includeRelative,
|
||||
$includeBackground,
|
||||
$userNotes,
|
||||
$onProgress,
|
||||
$inputDateHintCount
|
||||
);
|
||||
}
|
||||
|
||||
$prompt = <<<PROMPT
|
||||
Build a chronological timeline from the pasted text in {$locale}.{$userNotesBlock}
|
||||
|
||||
@@ -589,6 +607,261 @@ PROMPT;
|
||||
];
|
||||
}
|
||||
|
||||
/**
 * Builds a timeline for pasted text that exceeds the single-pass limit of the engine.
 *
 * The text is split with timelineTextChunks() (900 characters of overlap), each chunk
 * of at least 20 characters is sent through timeline(), and the per-chunk events are
 * tagged with their chunk number and chunk start offset before being merged with
 * mergeTimelineEvents(). A chunk whose extraction throws is counted as a failure and,
 * where possible, handled by fallbackTimelineEvents() instead of aborting the run.
 *
 * @param string        $text               Full pasted text.
 * @param string        $language           Passed to timeline() and echoed in the result.
 * @param string        $engine             'nova_lite', 'azure_mini' or 'azure_full'.
 * @param string        $focus              'all', 'deadlines', 'hearings' or 'cps'.
 * @param string        $confidenceFilter   'high_medium' drops low-confidence events after merging.
 * @param bool          $includeRelative    When false, only events with date_type 'absolute' are kept.
 * @param bool          $includeBackground  Forwarded unchanged to timeline().
 * @param string        $userNotes          Forwarded unchanged to timeline().
 * @param callable|null $onProgress         Optional callback receiving progress messages.
 * @param int           $inputDateHintCount Reported back in trace_metadata.
 *
 * @return array Response array with the merged events, summary, trace and trace_metadata.
 */
private function timelineChunked(
    string $text,
    string $language,
    string $engine,
    string $focus,
    string $confidenceFilter,
    bool $includeRelative,
    bool $includeBackground,
    string $userNotes,
    ?callable $onProgress,
    int $inputDateHintCount
): array {
    $engineLabel = match ($engine) {
        'azure_full' => 'gpt-4o',
        'nova_lite' => 'nova-lite',
        default => 'gpt-4o-mini',
    };
    $segments = $this->timelineTextChunks($text, $this->timelineChunkSize($engine), 900);
    $chunkCount = count($segments);
    $collected = [];
    $chunkFailures = 0;
    $usedFallbackExtractor = false;

    // Progress reporting is optional; this keeps the call sites free of null checks.
    $notify = static function (string $message) use ($onProgress): void {
        if ($onProgress) {
            $onProgress($message);
        }
    };

    $notify("Splitting timeline into {$chunkCount} chunk(s)\u{2026}");

    foreach ($segments as $index => $segment) {
        $chunkNo = $index + 1;
        $chunkText = trim((string)$segment['text']);
        // Skip fragments shorter than 20 characters instead of sending them to timeline().
        if (mb_strlen($chunkText, 'UTF-8') < 20) {
            continue;
        }

        $notify("Extracting timeline chunk {$chunkNo}/{$chunkCount}\u{2026}");

        try {
            // Per-chunk progress is suppressed (null callback); this method reports progress itself.
            $result = $this->timeline(
                $chunkText,
                $language,
                $engine,
                $focus,
                $confidenceFilter,
                $includeRelative,
                $includeBackground,
                $userNotes,
                null
            );
            $chunkEvents = is_array($result['events'] ?? null) ? $result['events'] : [];
            if (!empty($result['trace_metadata']['used_fallback_extractor'])) {
                $usedFallbackExtractor = true;
            }
        } catch (DbnToolsHttpException $e) {
            $chunkFailures++;
            // Only try the deterministic extractor when the chunk contains date-like text.
            $chunkEvents = $this->timelineDateHintCount($chunkText) > 0
                ? $this->fallbackTimelineEvents($chunkText)
                : [];
            $usedFallbackExtractor = $usedFallbackExtractor || (bool)$chunkEvents;
            // Log only 5xx failures that also produced no fallback events.
            if (!$chunkEvents && $e->status >= 500) {
                error_log('timeline chunk failed: ' . $e->errorCode . ' ' . $e->getMessage());
            }
        } catch (Throwable $e) {
            $chunkFailures++;
            $chunkEvents = $this->fallbackTimelineEvents($chunkText);
            $usedFallbackExtractor = $usedFallbackExtractor || (bool)$chunkEvents;
            error_log('timeline chunk throwable: ' . $e->getMessage());
        }

        $provenance = [
            'chunk_index' => $chunkNo,
            'source_position' => (int)$segment['start'],
        ];
        foreach ($chunkEvents as $event) {
            if (is_array($event)) {
                // array_replace overwrites existing keys in place and appends missing ones.
                $collected[] = array_replace($event, $provenance);
            }
        }
    }

    $events = $this->mergeTimelineEvents($collected);

    // Filters are re-applied after merging because fallback events bypass timeline().
    if ($confidenceFilter === 'high_medium') {
        $isNotLow = static fn ($ev): bool => ($ev['confidence'] ?? 'low') !== 'low';
        $events = array_values(array_filter($events, $isNotLow));
    }
    if (!$includeRelative) {
        $isAbsolute = static fn ($ev): bool => ($ev['date_type'] ?? 'absolute') === 'absolute';
        $events = array_values(array_filter($events, $isAbsolute));
    }
    $eventCount = count($events);

    $focusLabels = [
        'deadlines' => 'legal deadlines',
        'hearings' => 'court hearings',
        'cps' => 'CPS milestones',
    ];
    $focusLabel = $focusLabels[$focus] ?? 'all events';

    // Collect the ISO dates (for the date range) and the named actors (for the summary).
    $isoDates = [];
    $actors = [];
    foreach ($events as $ev) {
        $date = (string)($ev['date'] ?? '');
        if (preg_match('/^\d{4}-\d{2}-\d{2}$/', $date)) {
            $isoDates[] = $date;
        }
        $actor = (string)($ev['actor'] ?? '');
        if ($actor !== '' && $actor !== 'unknown') {
            $actors[] = $actor;
        }
    }
    sort($isoDates);
    $actors = array_values(array_unique($actors));

    $range = '';
    if ($isoDates) {
        $range = ' from ' . $isoDates[0] . ' to ' . $isoDates[count($isoDates) - 1];
    }

    $summary = $eventCount . " event(s) extracted from {$chunkCount} chunk(s){$range}.";
    if ($actors) {
        $summary .= ' Main actors: ' . implode(', ', array_slice($actors, 0, 8)) . '.';
    }

    $uncertain = [];
    if ($chunkFailures > 0) {
        $uncertain[] = "{$chunkFailures} chunk(s) needed fallback extraction or could not be fully parsed; review medium-confidence entries against the source.";
    }
    if ($usedFallbackExtractor) {
        $uncertain[] = 'Deterministic fallback extraction was used for at least one chunk.';
    }

    $nextStep = 'Review the merged timeline against the original uploaded document, especially duplicated or medium-confidence entries.';

    $trace = [
        $this->trace('Query interpretation', "Extract {$focusLabel} from pasted text. Engine: {$engineLabel}. Chunked timeline mode; without saving the text or output.", 'complete'),
        $this->trace('Search tools used', 'No external corpus search; source is the user-pasted text split into local chunks.', 'complete'),
        $this->trace('Evidence found', $eventCount . " event(s) identified across {$chunkCount} chunk(s).", $eventCount ? 'complete' : 'warning'),
        $this->trace('Citation confidence', 'Confidence is per event and based only on the pasted text. Overlapping chunks were de-duplicated.', 'complete'),
        $this->trace('Uncertainty / missing evidence', $this->uncertaintySummary($uncertain), $uncertain ? 'warning' : 'complete'),
        $this->trace('Next practical step', $nextStep, 'complete'),
    ];

    return [
        'tool' => 'timeline',
        'language' => $language,
        'what_we_found' => $summary,
        'events' => $events,
        'evidence_trail' => [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; split into local timeline chunks; not stored.']],
        'what_remains_uncertain' => $uncertain,
        'next_practical_step' => $nextStep,
        'trace' => $trace,
        'trace_metadata' => [
            'chunk_count' => $eventCount,
            'source_count' => $chunkCount,
            'deployment' => $engineLabel,
            'input_date_hint_count' => $inputDateHintCount,
            'used_fallback_extractor' => $usedFallbackExtractor,
            'chunked_timeline' => true,
            'timeline_chunk_count' => $chunkCount,
            'chunk_failures' => $chunkFailures,
        ],
        'disclaimer' => dbnToolsDisclaimer($language),
    ];
}
|
||||
|
||||
/**
 * Returns the character count above which timeline() switches to chunked mode.
 *
 * @param string $engine Engine key; any value other than 'nova_lite' or 'azure_mini'
 *                       gets the largest limit.
 */
private function timelineSinglePassLimit(string $engine): int
{
    if ($engine === 'nova_lite') {
        return 25000;
    }
    if ($engine === 'azure_mini') {
        return 55000;
    }

    return 128000;
}
|
||||
|
||||
/**
 * Returns the target chunk length, in characters, used when splitting text for an engine.
 *
 * @param string $engine Engine key; any value other than 'nova_lite' or 'azure_mini'
 *                       gets the largest chunk size.
 */
private function timelineChunkSize(string $engine): int
{
    if ($engine === 'nova_lite') {
        return 10000;
    }
    if ($engine === 'azure_mini') {
        return 16000;
    }

    return 30000;
}
|
||||
|
||||
/**
 * Splits text into consecutive, overlapping chunks, preferring to cut at a
 * paragraph break ("\n\n") and otherwise at a line break ("\n").
 *
 * All offsets and lengths are counted in UTF-8 characters, not bytes. A break is
 * only used when it lies past 45% of the chunk size, so chunks never become
 * disproportionately short; otherwise the chunk is cut at the full chunk size.
 *
 * @param string $text      Text to split.
 * @param int    $chunkSize Target maximum chunk length; values below 1 are treated as 1.
 * @param int    $overlap   Characters from the end of one chunk repeated at the start of
 *                          the next; negative values are treated as 0. When the overlap
 *                          would not move the start forward, no overlap is applied.
 *
 * @return list<array{start: int, text: string}> Trimmed, non-empty chunks with the
 *                                               character offset at which each was cut.
 */
private function timelineTextChunks(string $text, int $chunkSize, int $overlap): array
{
    // A non-positive chunk size would never advance $start (endless loop, unbounded
    // memory), and a negative overlap would jump past text that was never chunked.
    $chunkSize = max(1, $chunkSize);
    $overlap = max(0, $overlap);

    $len = mb_strlen($text, 'UTF-8');
    $paragraphBreakFloor = (int)($chunkSize * 0.55);
    $anyBreakFloor = (int)($chunkSize * 0.45);
    $chunks = [];
    $start = 0;

    while ($start < $len) {
        $targetEnd = min($len, $start + $chunkSize);
        $window = mb_substr($text, $start, $targetEnd - $start, 'UTF-8');
        $end = $targetEnd;

        // Only look for a natural break when more text follows this window.
        if ($targetEnd < $len) {
            $breakAt = mb_strrpos($window, "\n\n", 0, 'UTF-8');
            // A paragraph break too early in the window is not worth the lost capacity;
            // fall back to the last single line break.
            if ($breakAt === false || $breakAt < $paragraphBreakFloor) {
                $breakAt = mb_strrpos($window, "\n", 0, 'UTF-8');
            }
            if ($breakAt !== false && $breakAt > $anyBreakFloor) {
                $end = $start + $breakAt;
            }
        }

        // $end > $start always holds here, so the chunk can be cut from the window
        // instead of re-slicing the full text.
        $chunkText = trim(
            $end === $targetEnd ? $window : mb_substr($window, 0, $end - $start, 'UTF-8')
        );
        if ($chunkText !== '') {
            $chunks[] = ['start' => $start, 'text' => $chunkText];
        }

        if ($end >= $len) {
            break;
        }

        // Step back by the overlap, but never to or before the current start.
        $nextStart = $end - $overlap;
        if ($nextStart <= $start) {
            $nextStart = $end;
        }
        $start = $nextStart;
    }

    return $chunks;
}
|
||||
|
||||
/**
 * De-duplicates events that share a timelineEventSignature() and sorts the result.
 *
 * For duplicates the event with the higher confidence rank is kept (the first one
 * wins on a tie), and the other event's source excerpt is appended after ' / ' as
 * long as the kept excerpt is still shorter than 260 characters. Events are ordered
 * by ISO date, then by time; events without a YYYY-MM-DD date sort last.
 *
 * @param array $events Raw events; non-array entries are ignored.
 *
 * @return array Merged events as a list.
 */
private function mergeTimelineEvents(array $events): array
{
    $bySignature = [];

    foreach ($events as $candidate) {
        if (!is_array($candidate)) {
            continue;
        }

        $signature = $this->timelineEventSignature($candidate);
        if (!isset($bySignature[$signature])) {
            $bySignature[$signature] = $candidate;
            continue;
        }

        $kept = $bySignature[$signature];
        $extraExcerpt = (string)($candidate['source_excerpt'] ?? '');
        $candidateRank = $this->timelineConfidenceRank((string)($candidate['confidence'] ?? 'medium'));
        $keptRank = $this->timelineConfidenceRank((string)($kept['confidence'] ?? 'medium'));

        // A strictly more confident duplicate replaces the stored event; the excerpt
        // of the displaced event then becomes the one to append.
        if ($candidateRank > $keptRank) {
            $bySignature[$signature] = $candidate;
            $extraExcerpt = (string)($kept['source_excerpt'] ?? '');
        }

        $baseExcerpt = (string)($bySignature[$signature]['source_excerpt'] ?? '');
        $shouldAppend = $extraExcerpt !== ''
            && $baseExcerpt !== ''
            && $extraExcerpt !== $baseExcerpt
            && mb_strlen($baseExcerpt, 'UTF-8') < 260;
        if ($shouldAppend) {
            $bySignature[$signature]['source_excerpt'] = $baseExcerpt . ' / ' . $extraExcerpt;
        }
    }

    $ordered = array_values($bySignature);

    // Non-ISO dates get a sentinel that compares after every real date.
    $dateKey = static function (array $event): string {
        $date = (string)($event['date'] ?? '');

        return preg_match('/^\d{4}-\d{2}-\d{2}$/', $date) ? $date : '9999-99-99';
    };

    usort($ordered, static function (array $a, array $b) use ($dateKey): int {
        // strcmp keeps the comparison byte-wise; fall through to the time on equal dates.
        return strcmp($dateKey($a), $dateKey($b))
            ?: strcmp((string)($a['time'] ?? ''), (string)($b['time'] ?? ''));
    });

    return $ordered;
}
|
||||
|
||||
/**
 * Builds the de-duplication key for an event: "date|time|actor|body".
 *
 * Every part is trimmed and lower-cased. The body is the event text with each run of
 * non-letter, non-digit characters collapsed to a single space, cut to its first 96
 * characters, so punctuation and spacing differences do not defeat de-duplication.
 *
 * A blank actor is normalised to 'unknown', the same value used when the key is
 * missing. Otherwise the same event reported with an empty actor by one chunk and
 * 'unknown' (or no actor) by another would get two different keys.
 *
 * @param array $event Event with optional 'date', 'time', 'actor' and 'event' entries.
 */
private function timelineEventSignature(array $event): string
{
    $normalize = static fn (mixed $value): string => mb_strtolower(trim((string)$value), 'UTF-8');

    $date = $normalize($event['date'] ?? '');
    $time = $normalize($event['time'] ?? '');

    $actor = $normalize($event['actor'] ?? 'unknown');
    if ($actor === '') {
        $actor = 'unknown';
    }

    $body = $normalize($event['event'] ?? '');
    $body = (string)preg_replace('/[^\p{L}\p{N}]+/u', ' ', $body);
    $body = trim((string)preg_replace('/\s+/u', ' ', $body));

    return $date . '|' . $time . '|' . $actor . '|' . mb_substr($body, 0, 96, 'UTF-8');
}
|
||||
|
||||
/**
 * Maps a confidence label to a comparable rank: 'high' is 3, 'medium' is 2 and
 * every other value (including 'low') is 1. The comparison is exact and case-sensitive.
 */
private function timelineConfidenceRank(string $confidence): int
{
    $ranks = [
        'high' => 3,
        'medium' => 2,
    ];

    return $ranks[$confidence] ?? 1;
}
|
||||
|
||||
private function timelineDateHintCount(string $text): int
|
||||
{
|
||||
preg_match_all('/(?<!\d)\d{1,2}\.\d{1,2}\.(?:\d{2,4})?(?!\d)/u', $text, $numeric);
|
||||
@@ -1106,13 +1379,14 @@ PROMPT;
|
||||
return array_slice(array_values(array_unique($terms)), 0, 6);
|
||||
}
|
||||
|
||||
private function requirePasteText(string $text): string
|
||||
private function requirePasteText(string $text, ?int $maxChars = null): string
|
||||
{
|
||||
$text = trim($text);
|
||||
if (mb_strlen($text, 'UTF-8') < 20) {
|
||||
dbnToolsAbort('Paste at least 20 characters of text.', 422, 'text_too_short');
|
||||
}
|
||||
if (mb_strlen($text, 'UTF-8') > self::MAX_PASTE_CHARS) {
|
||||
$maxChars ??= self::MAX_PASTE_CHARS;
|
||||
if (mb_strlen($text, 'UTF-8') > $maxChars) {
|
||||
dbnToolsAbort('Pasted text is too long for the MVP limit.', 422, 'text_too_long');
|
||||
}
|
||||
return $text;
|
||||
|
||||
Reference in New Issue
Block a user