Add chunked timeline routing

This commit is contained in:
2026-05-25 12:34:41 +02:00
parent 75b19f1dcf
commit 17ad54cf36
7 changed files with 521 additions and 31 deletions
+3 -1
View File
@@ -12,7 +12,9 @@ try {
dbnToolsError('No file was uploaded.', 422, 'missing_file'); dbnToolsError('No file was uploaded.', 422, 'missing_file');
} }
$result = dbnToolsExtractUploadedFile($_FILES['file']); $tool = (string)($_POST['tool'] ?? '');
$limit = $tool === 'timeline' ? DBN_TOOLS_TIMELINE_EXTRACT_TEXT_LIMIT : DBN_TOOLS_EXTRACT_TEXT_LIMIT;
$result = dbnToolsExtractUploadedFile($_FILES['file'], $limit);
$ftRemaining = dbnToolsFreeTierDeduct($ftUid, 'extract'); $ftRemaining = dbnToolsFreeTierDeduct($ftUid, 'extract');
if ($ftRemaining >= 0) { if ($ftRemaining >= 0) {
header('X-Credits-Remaining: ' . $ftRemaining); header('X-Credits-Remaining: ' . $ftRemaining);
+12 -2
View File
@@ -9,7 +9,7 @@ dbnToolsRequireAuth();
// Parse input and run credit pre-check BEFORE emitting SSE headers so that // Parse input and run credit pre-check BEFORE emitting SSE headers so that
// auth/credit errors can still return JSON (dbnToolsError / dbnToolsAbort). // auth/credit errors can still return JSON (dbnToolsError / dbnToolsAbort).
$input = dbnToolsJsonInput(400000); $input = dbnToolsJsonInput(1500000);
$language = dbnToolsNormalizeLanguage($input['language'] ?? 'en'); $language = dbnToolsNormalizeLanguage($input['language'] ?? 'en');
$_validEngines = ['nova_lite', 'azure_mini', 'azure_full']; $_validEngines = ['nova_lite', 'azure_mini', 'azure_full'];
@@ -17,7 +17,7 @@ $_requestedEngine = in_array((string)($input['engine'] ?? ''), $_validEngines, t
? (string)$input['engine'] : 'azure_mini'; ? (string)$input['engine'] : 'azure_mini';
try { try {
$text = dbnToolsInjectDocContent($input, dbnToolsString($input, 'text', 128000, false)); $text = dbnToolsInjectDocContent($input, dbnToolsString($input, 'text', ToolModels::TIMELINE_DEEP_MAX_CHARS, false));
if (mb_strlen(trim($text), 'UTF-8') < 10) { if (mb_strlen(trim($text), 'UTF-8') < 10) {
dbnToolsError('Paste text, upload a file, or select a document before running.', 422, 'empty_text'); dbnToolsError('Paste text, upload a file, or select a document before running.', 422, 'empty_text');
} }
@@ -33,6 +33,7 @@ try {
} }
$timelineRoute = ToolModels::timelineRoute($ftUid, $_requestedEngine, $text); $timelineRoute = ToolModels::timelineRoute($ftUid, $_requestedEngine, $text);
ToolModels::assertTimelineQuoteAccepted($timelineRoute, $input);
$ftUid = dbnToolsFreeTierCheckAmount('timeline', (int)$timelineRoute['credits']); $ftUid = dbnToolsFreeTierCheckAmount('timeline', (int)$timelineRoute['credits']);
} catch (DbnToolsHttpException $e) { } catch (DbnToolsHttpException $e) {
dbnToolsError($e->getMessage(), $e->status, $e->errorCode, $e->extra); dbnToolsError($e->getMessage(), $e->status, $e->errorCode, $e->extra);
@@ -69,6 +70,11 @@ try {
'msg' => 'This input is ' . number_format((int)$timelineRoute['input_char_count']) . " characters, so Timeline is using {$label} for reliability.", 'msg' => 'This input is ' . number_format((int)$timelineRoute['input_char_count']) . " characters, so Timeline is using {$label} for reliability.",
]); ]);
} }
if (!empty($timelineRoute['chunked_timeline'])) {
sseEmit('status', [
'msg' => 'Processing ' . (int)$timelineRoute['timeline_chunk_count'] . ' timeline chunk(s).',
]);
}
$validFocus = ['all', 'deadlines', 'hearings', 'cps']; $validFocus = ['all', 'deadlines', 'hearings', 'cps'];
$focus = in_array((string)($input['focus'] ?? ''), $validFocus, true) $focus = in_array((string)($input['focus'] ?? ''), $validFocus, true)
@@ -110,6 +116,10 @@ try {
'auto_upgraded_engine' => $timelineRoute['auto_upgraded_engine'], 'auto_upgraded_engine' => $timelineRoute['auto_upgraded_engine'],
'input_char_count' => $timelineRoute['input_char_count'], 'input_char_count' => $timelineRoute['input_char_count'],
'engine_limit_chars' => $timelineRoute['engine_limit_chars'], 'engine_limit_chars' => $timelineRoute['engine_limit_chars'],
'max_char_limit' => $timelineRoute['max_char_limit'],
'chunked_timeline' => $timelineRoute['chunked_timeline'],
'timeline_chunk_count' => $timelineRoute['timeline_chunk_count'],
'estimated_credits' => $timelineRoute['estimated_credits'],
'credits_charged' => $timelineRoute['credits'], 'credits_charged' => $timelineRoute['credits'],
]); ]);
+7 -2
View File
@@ -6,7 +6,7 @@ require_once __DIR__ . '/../includes/ToolModels.php';
dbnToolsRequireMethod('POST'); dbnToolsRequireMethod('POST');
dbnToolsRequireAuth(); dbnToolsRequireAuth();
$input = dbnToolsJsonInput(400000); $input = dbnToolsJsonInput(1500000);
$language = dbnToolsNormalizeLanguage($input['language'] ?? 'en'); $language = dbnToolsNormalizeLanguage($input['language'] ?? 'en');
$_validEngines = ['nova_lite', 'azure_mini', 'azure_full']; $_validEngines = ['nova_lite', 'azure_mini', 'azure_full'];
$_requestedEngine = in_array((string)($input['engine'] ?? ''), $_validEngines, true) $_requestedEngine = in_array((string)($input['engine'] ?? ''), $_validEngines, true)
@@ -15,7 +15,7 @@ $_requestedEngine = in_array((string)($input['engine'] ?? ''), $_validEngines, t
$start = microtime(true); $start = microtime(true);
try { try {
$text = dbnToolsInjectDocContent($input, dbnToolsString($input, 'text', 128000, false)); $text = dbnToolsInjectDocContent($input, dbnToolsString($input, 'text', ToolModels::TIMELINE_DEEP_MAX_CHARS, false));
if (mb_strlen(trim($text), 'UTF-8') < 10) { if (mb_strlen(trim($text), 'UTF-8') < 10) {
dbnToolsAbort('Paste text, upload a file, or select a document before running.', 422, 'empty_text'); dbnToolsAbort('Paste text, upload a file, or select a document before running.', 422, 'empty_text');
} }
@@ -33,6 +33,7 @@ try {
} }
$timelineRoute = ToolModels::timelineRoute($ftUid, $_requestedEngine, $text); $timelineRoute = ToolModels::timelineRoute($ftUid, $_requestedEngine, $text);
ToolModels::assertTimelineQuoteAccepted($timelineRoute, $input);
$ftUid = dbnToolsFreeTierCheckAmount('timeline', (int)$timelineRoute['credits']); $ftUid = dbnToolsFreeTierCheckAmount('timeline', (int)$timelineRoute['credits']);
$validFocus = ['all', 'deadlines', 'hearings', 'cps']; $validFocus = ['all', 'deadlines', 'hearings', 'cps'];
@@ -71,6 +72,10 @@ try {
'auto_upgraded_engine' => $timelineRoute['auto_upgraded_engine'], 'auto_upgraded_engine' => $timelineRoute['auto_upgraded_engine'],
'input_char_count' => $timelineRoute['input_char_count'], 'input_char_count' => $timelineRoute['input_char_count'],
'engine_limit_chars' => $timelineRoute['engine_limit_chars'], 'engine_limit_chars' => $timelineRoute['engine_limit_chars'],
'max_char_limit' => $timelineRoute['max_char_limit'],
'chunked_timeline' => $timelineRoute['chunked_timeline'],
'timeline_chunk_count' => $timelineRoute['timeline_chunk_count'],
'estimated_credits' => $timelineRoute['estimated_credits'],
'credits_charged' => $timelineRoute['credits'], 'credits_charged' => $timelineRoute['credits'],
]); ]);
+108 -12
View File
@@ -400,6 +400,7 @@ let lastOriginalText = '';
let lastRedactPayload = null; let lastRedactPayload = null;
let lastRunEngine = null; let lastRunEngine = null;
let lastToolPayload = null; let lastToolPayload = null;
let pendingTimelineQuote = null;
const VOCAB_PRESETS = { const VOCAB_PRESETS = {
barnerett: 'Barnevernet, Fylkesnemnda, barnevernloven, barneloven, barnets beste, samvær, foreldreansvar, omsorgsovertakelse, sakkyndig, advokat, prosessfullmektig, dommer, vitne, tolk, bistandsadvokat, fosterforeldre, fosterhjem, akuttvedtak, statsforvalter, Bufetat, saksbehandler, rettslig medhold, begjæring, samtykke, tilsynsfører', barnerett: 'Barnevernet, Fylkesnemnda, barnevernloven, barneloven, barnets beste, samvær, foreldreansvar, omsorgsovertakelse, sakkyndig, advokat, prosessfullmektig, dommer, vitne, tolk, bistandsadvokat, fosterforeldre, fosterhjem, akuttvedtak, statsforvalter, Bufetat, saksbehandler, rettslig medhold, begjæring, samtykke, tilsynsfører',
@@ -764,10 +765,60 @@ function timelineEngineLabel(engine) {
} }
function timelineClientRoute(engine, charCount) { function timelineClientRoute(engine, charCount) {
let effective = engine; return timelineClientQuote(engine, charCount);
if (charCount > 55000) effective = 'azure_full'; }
else if (charCount > 25000 && effective === 'nova_lite') effective = 'azure_mini';
return { effective, upgraded: effective !== engine }; function timelineClientQuote(engine, charCount) {
const valid = ['nova_lite', 'azure_mini', 'azure_full'];
const requested = valid.includes(engine) ? engine : 'azure_mini';
const singleLimits = { nova_lite: 25000, azure_mini: 55000, azure_full: 128000 };
const maxLimits = { nova_lite: 100000, azure_mini: 300000, azure_full: 600000 };
const chunkSizes = { nova_lite: 10000, azure_mini: 16000, azure_full: 30000 };
const ranks = { nova_lite: 1, azure_mini: 2, azure_full: 3 };
const baseCredits = requested === 'azure_full' ? 2 : 1;
let effective = requested;
if (charCount > 600000) {
return {
error: true,
message: `This timeline input is ${charCount.toLocaleString()} characters. Split the file or use fewer selected documents; the current maximum is 600,000 characters.`,
};
}
if (charCount > maxLimits[effective]) {
effective = charCount <= maxLimits.azure_mini ? 'azure_mini' : 'azure_full';
}
if (charCount > maxLimits[effective]) effective = 'azure_full';
let credits = 1;
if (effective === 'nova_lite') {
credits = charCount <= singleLimits.nova_lite ? 1 : 2;
} else if (effective === 'azure_mini') {
credits = charCount <= singleLimits.azure_mini ? 1 : (charCount <= 180000 ? 2 : 3);
} else {
credits = charCount <= singleLimits.azure_full ? 2 : (charCount <= 350000 ? 4 : 6);
}
const chunked = charCount > singleLimits[effective];
return {
requested,
effective,
upgraded: ranks[effective] > ranks[requested],
charCount,
credits,
baseCredits,
chunked,
chunkCount: chunked ? Math.ceil(charCount / chunkSizes[effective]) : 1,
requiresConfirmation: credits > baseCredits || ranks[effective] > ranks[requested],
};
}
function timelineQuoteMessage(quote) {
return [
`Timeline will use ${timelineEngineLabel(quote.effective)} for ${Number(quote.charCount || 0).toLocaleString()} characters.`,
quote.chunked ? `It will process about ${quote.chunkCount} chunks.` : 'It can run in a single pass.',
`Cost: ${quote.credits} credit${quote.credits === 1 ? '' : 's'}.`,
'Continue?'
].join('\n');
} }
function currentTimelineFocus() { function currentTimelineFocus() {
@@ -1122,15 +1173,36 @@ async function runTool(event) {
let timelineRouteNotice = ''; let timelineRouteNotice = '';
if (state.activeTool === 'timeline') { if (state.activeTool === 'timeline') {
payload.engine = currentTimelineEngine(); payload.engine = currentTimelineEngine();
const clientRoute = timelineClientRoute(payload.engine, text.length); const clientRoute = timelineClientQuote(payload.engine, text.length);
if (clientRoute.error) {
els.status.textContent = clientRoute.message;
return;
}
const pendingQuoteApplies = pendingTimelineQuote
&& pendingTimelineQuote.text === text
&& pendingTimelineQuote.requested === payload.engine;
if (pendingQuoteApplies) {
payload.accepted_timeline_quote = true;
payload.accepted_credits = pendingTimelineQuote.credits;
payload.accepted_effective_engine = pendingTimelineQuote.effective;
pendingTimelineQuote = null;
} else if (clientRoute.requiresConfirmation) {
if (!window.confirm(timelineQuoteMessage(clientRoute))) {
els.status.textContent = 'Timeline run cancelled before any credits were charged.';
return;
}
payload.accepted_timeline_quote = true;
payload.accepted_credits = clientRoute.credits;
payload.accepted_effective_engine = clientRoute.effective;
}
payload.focus = currentTimelineFocus(); payload.focus = currentTimelineFocus();
payload.confidence_filter = currentConfidenceFilter(); payload.confidence_filter = currentConfidenceFilter();
payload.include_relative = currentIncludeRelative(); payload.include_relative = currentIncludeRelative();
payload.include_background = currentIncludeBackground(); payload.include_background = currentIncludeBackground();
payload.user_notes = (document.getElementById('timelineNotes')?.value || '').trim(); payload.user_notes = (document.getElementById('timelineNotes')?.value || '').trim();
payload.use_my_case = (typeof window.dbnGetUseMyCase === 'function') ? window.dbnGetUseMyCase() : false; payload.use_my_case = (typeof window.dbnGetUseMyCase === 'function') ? window.dbnGetUseMyCase() : false;
timelineRouteNotice = clientRoute.upgraded timelineRouteNotice = clientRoute.upgraded || clientRoute.chunked
? `This input is ${text.length.toLocaleString()} characters, so Timeline will use ${timelineEngineLabel(clientRoute.effective)} for reliability.` ? `This input is ${text.length.toLocaleString()} characters, so Timeline will use ${timelineEngineLabel(clientRoute.effective)}${clientRoute.chunked ? ` across about ${clientRoute.chunkCount} chunks` : ''}.`
: ''; : '';
} }
@@ -1157,7 +1229,30 @@ async function runTool(event) {
headers: { 'Content-Type': 'application/json' }, headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(payload), body: JSON.stringify(payload),
}); });
if (!resp.ok) throw new Error(`HTTP ${resp.status}`); if (!resp.ok) {
const errData = await resp.json().catch(() => ({}));
const quote = errData.timeline_quote;
if (errData.error?.code === 'timeline_quote_required' && quote) {
const confirmQuote = {
effective: quote.effective_engine,
charCount: quote.input_char_count,
credits: quote.credits || quote.estimated_credits,
chunked: Boolean(quote.chunked_timeline),
chunkCount: quote.timeline_chunk_count || 1,
};
if (window.confirm(timelineQuoteMessage(confirmQuote))) {
pendingTimelineQuote = {
text,
requested: payload.engine,
effective: confirmQuote.effective,
credits: Number(confirmQuote.credits || 0),
};
return runTool(event);
}
throw new Error('Timeline run cancelled before any credits were charged.');
}
throw new Error(errData.error?.message || `HTTP ${resp.status}`);
}
const reader = resp.body.getReader(); const reader = resp.body.getReader();
const dec = new TextDecoder(); const dec = new TextDecoder();
let buf = '', event = ''; let buf = '', event = '';
@@ -1194,8 +1289,8 @@ async function runTool(event) {
renderResults(data); renderResults(data);
renderTrace(data.trace || []); renderTrace(data.trace || []);
const routeMeta = data.trace_metadata || {}; const routeMeta = data.trace_metadata || {};
const serverRouteNotice = state.activeTool === 'timeline' && routeMeta.auto_upgraded_engine const serverRouteNotice = state.activeTool === 'timeline' && (routeMeta.auto_upgraded_engine || routeMeta.chunked_timeline || routeMeta.credits_charged)
? ` Used ${timelineEngineLabel(routeMeta.effective_engine)} for ${Number(routeMeta.input_char_count || 0).toLocaleString()} characters.` ? ` Used ${timelineEngineLabel(routeMeta.effective_engine)} for ${Number(routeMeta.input_char_count || 0).toLocaleString()} characters${routeMeta.chunked_timeline ? ` across ${routeMeta.timeline_chunk_count || 1} chunks` : ''}; charged ${routeMeta.credits_charged || routeMeta.estimated_credits || 1} credit(s).`
: ''; : '';
els.status.textContent = `Done in ${data.latency_ms || 0} ms.${serverRouteNotice}`; els.status.textContent = `Done in ${data.latency_ms || 0} ms.${serverRouteNotice}`;
if (['ask', 'redact', 'timeline'].includes(state.activeTool)) { if (['ask', 'redact', 'timeline'].includes(state.activeTool)) {
@@ -1299,6 +1394,7 @@ async function handleFiles(fileList) {
for (const file of files) { for (const file of files) {
const formData = new FormData(); const formData = new FormData();
formData.append('file', file); formData.append('file', file);
formData.append('tool', state.activeTool);
const resp = await fetch('api/extract.php', { const resp = await fetch('api/extract.php', {
method: 'POST', method: 'POST',
@@ -1318,7 +1414,7 @@ async function handleFiles(fileList) {
const combined = parts[0].text; const combined = parts[0].text;
const MAX_COMBINED = 128000; const MAX_COMBINED = state.activeTool === 'timeline' ? 600000 : 128000;
const combinedTruncated = combined.length > MAX_COMBINED; const combinedTruncated = combined.length > MAX_COMBINED;
els.input.value = combinedTruncated ? combined.slice(0, MAX_COMBINED) : combined; els.input.value = combinedTruncated ? combined.slice(0, MAX_COMBINED) : combined;
@@ -1328,7 +1424,7 @@ async function handleFiles(fileList) {
els.uploadPrompt.classList.add('is-hidden'); els.uploadPrompt.classList.add('is-hidden');
els.uploadFileInfo.classList.remove('is-hidden'); els.uploadFileInfo.classList.remove('is-hidden');
const truncNote = (anyTruncated || combinedTruncated) ? ' — truncated to 128000 char limit' : ''; const truncNote = (anyTruncated || combinedTruncated) ? ` - truncated to ${MAX_COMBINED.toLocaleString()} char limit` : '';
els.status.textContent = `Extracted ${totalChars.toLocaleString()} chars from ${parts[0].filename}${truncNote}.`; els.status.textContent = `Extracted ${totalChars.toLocaleString()} chars from ${parts[0].filename}${truncNote}.`;
} catch (err) { } catch (err) {
els.status.textContent = err.message; els.status.textContent = err.message;
+277 -3
View File
@@ -7,6 +7,7 @@ require_once __DIR__ . '/AzureOpenAiGateway.php';
final class DbnLegalToolsService final class DbnLegalToolsService
{ {
private const MAX_PASTE_CHARS = 128000; private const MAX_PASTE_CHARS = 128000;
private const MAX_TIMELINE_CHARS = 600000;
private DbnAzureOpenAiGateway $azure; private DbnAzureOpenAiGateway $azure;
@@ -353,7 +354,7 @@ PROMPT;
string $userNotes = '', string $userNotes = '',
?callable $onProgress = null ?callable $onProgress = null
): array { ): array {
$text = $this->requirePasteText($text); $text = $this->requirePasteText($text, self::MAX_TIMELINE_CHARS);
$engine = in_array($engine, ['nova_lite', 'azure_mini', 'azure_full'], true) ? $engine : 'azure_mini'; $engine = in_array($engine, ['nova_lite', 'azure_mini', 'azure_full'], true) ? $engine : 'azure_mini';
$focus = in_array($focus, ['all', 'deadlines', 'hearings', 'cps'], true) ? $focus : 'all'; $focus = in_array($focus, ['all', 'deadlines', 'hearings', 'cps'], true) ? $focus : 'all';
@@ -382,6 +383,23 @@ PROMPT;
? "\n\nUser-provided context notes (use these to resolve ambiguities, not as source events):\n---\n" . $userNotes . "\n---" ? "\n\nUser-provided context notes (use these to resolve ambiguities, not as source events):\n---\n" . $userNotes . "\n---"
: ''; : '';
$charCount = mb_strlen($text, 'UTF-8');
$singlePassLimit = $this->timelineSinglePassLimit($engine);
if ($charCount > $singlePassLimit) {
return $this->timelineChunked(
$text,
$language,
$engine,
$focus,
$confidenceFilter,
$includeRelative,
$includeBackground,
$userNotes,
$onProgress,
$inputDateHintCount
);
}
$prompt = <<<PROMPT $prompt = <<<PROMPT
Build a chronological timeline from the pasted text in {$locale}.{$userNotesBlock} Build a chronological timeline from the pasted text in {$locale}.{$userNotesBlock}
@@ -589,6 +607,261 @@ PROMPT;
]; ];
} }
/**
 * Build a timeline for text that exceeds the engine's single-pass limit.
 *
 * The text is split into overlapping chunks, each chunk is run through
 * timeline() on its own, and the per-chunk events are merged, de-duplicated
 * and filtered into one result array with the 'timeline' tool shape.
 *
 * @param string        $text               Full input text to split.
 * @param string        $language           Passed to timeline() and echoed in the result.
 * @param string        $engine             'nova_lite', 'azure_mini' or 'azure_full'; selects chunk size and label.
 * @param string        $focus              'deadlines', 'hearings', 'cps' or anything else for all events.
 * @param string        $confidenceFilter   'high_medium' drops low-confidence events after merging.
 * @param bool          $includeRelative    When false, only events with date_type 'absolute' are kept.
 * @param bool          $includeBackground  Forwarded unchanged to timeline() for every chunk.
 * @param string        $userNotes          Forwarded unchanged to timeline() for every chunk.
 * @param callable|null $onProgress         Optional callback invoked with a status message string.
 * @param int           $inputDateHintCount Reported as-is in trace_metadata['input_date_hint_count'].
 *
 * @return array Result with events, summary, uncertainty notes, trace and trace_metadata.
 */
private function timelineChunked(
string $text,
string $language,
string $engine,
string $focus,
string $confidenceFilter,
bool $includeRelative,
bool $includeBackground,
string $userNotes,
?callable $onProgress,
int $inputDateHintCount
): array {
$engineLabel = match ($engine) { 'azure_full' => 'gpt-4o', 'nova_lite' => 'nova-lite', default => 'gpt-4o-mini' };
$chunkSize = $this->timelineChunkSize($engine);
// Consecutive chunks overlap by up to 900 characters.
$chunks = $this->timelineTextChunks($text, $chunkSize, 900);
$chunkCount = count($chunks);
$events = [];
$chunkFailures = 0;
$usedFallbackExtractor = false;
$onProgress && $onProgress('Splitting timeline into ' . $chunkCount . " chunk(s)\u{2026}");
foreach ($chunks as $idx => $chunk) {
// Chunk numbers shown to the user and stored on events are 1-based.
$chunkNo = $idx + 1;
$chunkText = trim((string)$chunk['text']);
// Chunks under 20 characters are skipped without counting as a failure.
if (mb_strlen($chunkText, 'UTF-8') < 20) {
continue;
}
$onProgress && $onProgress("Extracting timeline chunk {$chunkNo}/{$chunkCount}\u{2026}");
try {
// No progress callback is passed down, so only this method reports progress.
$result = $this->timeline(
$chunkText,
$language,
$engine,
$focus,
$confidenceFilter,
$includeRelative,
$includeBackground,
$userNotes,
null
);
$chunkEvents = is_array($result['events'] ?? null) ? $result['events'] : [];
if (!empty($result['trace_metadata']['used_fallback_extractor'])) {
$usedFallbackExtractor = true;
}
} catch (DbnToolsHttpException $e) {
// A failed chunk does not abort the run; the failure is counted and
// the fallback extractor is tried only when the chunk has date hints.
$chunkFailures++;
$chunkEvents = [];
if ($this->timelineDateHintCount($chunkText) > 0) {
$chunkEvents = $this->fallbackTimelineEvents($chunkText);
if ($chunkEvents) {
$usedFallbackExtractor = true;
}
}
// Logged only when nothing was recovered and the status is 5xx.
if (!$chunkEvents && $e->status >= 500) {
error_log('timeline chunk failed: ' . $e->errorCode . ' ' . $e->getMessage());
}
} catch (Throwable $e) {
// Any other error: always try the fallback extractor and always log.
$chunkFailures++;
$chunkEvents = $this->fallbackTimelineEvents($chunkText);
if ($chunkEvents) {
$usedFallbackExtractor = true;
}
error_log('timeline chunk throwable: ' . $e->getMessage());
}
foreach ($chunkEvents as $event) {
if (!is_array($event)) {
continue;
}
// Tag each event with the chunk it came from and that chunk's start offset.
$event['chunk_index'] = $chunkNo;
$event['source_position'] = (int)$chunk['start'];
$events[] = $event;
}
}
// Merge duplicates first, then apply the caller's filters to the merged list.
$events = $this->mergeTimelineEvents($events);
if ($confidenceFilter === 'high_medium') {
$events = array_values(array_filter($events, fn($ev) => ($ev['confidence'] ?? 'low') !== 'low'));
}
if (!$includeRelative) {
$events = array_values(array_filter($events, fn($ev) => ($ev['date_type'] ?? 'absolute') === 'absolute'));
}
$focusLabel = match ($focus) {
'deadlines' => 'legal deadlines',
'hearings' => 'court hearings',
'cps' => 'CPS milestones',
default => 'all events',
};
// Only strict YYYY-MM-DD dates contribute to the "from ... to ..." range in the summary.
$isoDates = array_values(array_filter(array_map(fn($ev) => (string)($ev['date'] ?? ''), $events), fn($d) => preg_match('/^\d{4}-\d{2}-\d{2}$/', $d)));
sort($isoDates);
$range = $isoDates ? (' from ' . $isoDates[0] . ' to ' . $isoDates[count($isoDates) - 1]) : '';
// Distinct actors, ignoring empty values and the 'unknown' placeholder.
$actors = array_values(array_unique(array_filter(array_map(fn($ev) => (string)($ev['actor'] ?? ''), $events), fn($a) => $a !== '' && $a !== 'unknown')));
$summary = count($events) . " event(s) extracted from {$chunkCount} chunk(s){$range}.";
if ($actors) {
// At most eight actors are listed.
$summary .= ' Main actors: ' . implode(', ', array_slice($actors, 0, 8)) . '.';
}
$uncertain = [];
if ($chunkFailures > 0) {
$uncertain[] = "{$chunkFailures} chunk(s) needed fallback extraction or could not be fully parsed; review medium-confidence entries against the source.";
}
if ($usedFallbackExtractor) {
$uncertain[] = 'Deterministic fallback extraction was used for at least one chunk.';
}
$trace = [
$this->trace('Query interpretation', "Extract {$focusLabel} from pasted text. Engine: {$engineLabel}. Chunked timeline mode; without saving the text or output.", 'complete'),
$this->trace('Search tools used', 'No external corpus search; source is the user-pasted text split into local chunks.', 'complete'),
$this->trace('Evidence found', count($events) . " event(s) identified across {$chunkCount} chunk(s).", count($events) ? 'complete' : 'warning'),
$this->trace('Citation confidence', 'Confidence is per event and based only on the pasted text. Overlapping chunks were de-duplicated.', 'complete'),
$this->trace('Uncertainty / missing evidence', $this->uncertaintySummary($uncertain), $uncertain ? 'warning' : 'complete'),
$this->trace('Next practical step', 'Review the merged timeline against the original uploaded document, especially duplicated or medium-confidence entries.', 'complete'),
];
return [
'tool' => 'timeline',
'language' => $language,
'what_we_found' => $summary,
'events' => $events,
'evidence_trail' => [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; split into local timeline chunks; not stored.']],
'what_remains_uncertain' => $uncertain,
'next_practical_step' => 'Review the merged timeline against the original uploaded document, especially duplicated or medium-confidence entries.',
'trace' => $trace,
'trace_metadata' => [
// NOTE(review): 'chunk_count' holds the number of merged events, while the number of
// text chunks is in 'source_count' and 'timeline_chunk_count' — confirm this is intended.
'chunk_count' => count($events),
'source_count' => $chunkCount,
'deployment' => $engineLabel,
'input_date_hint_count' => $inputDateHintCount,
'used_fallback_extractor' => $usedFallbackExtractor,
'chunked_timeline' => true,
'timeline_chunk_count' => $chunkCount,
'chunk_failures' => $chunkFailures,
],
'disclaimer' => dbnToolsDisclaimer($language),
];
}
/**
 * Character count above which timeline() switches to chunked processing.
 *
 * @param string $engine Engine key; anything other than 'nova_lite' or
 *                       'azure_mini' gets the largest limit.
 *
 * @return int Single-pass limit in characters.
 */
private function timelineSinglePassLimit(string $engine): int
{
    $limitsByEngine = [
        'nova_lite' => 25000,
        'azure_mini' => 55000,
    ];

    return $limitsByEngine[$engine] ?? 128000;
}
/**
 * Target chunk length, in characters, used when splitting text for an engine.
 *
 * @param string $engine Engine key; anything other than 'nova_lite' or
 *                       'azure_mini' gets the largest chunk size.
 *
 * @return int Chunk size in characters.
 */
private function timelineChunkSize(string $engine): int
{
    if ($engine === 'nova_lite') {
        return 10000;
    }
    if ($engine === 'azure_mini') {
        return 16000;
    }

    return 30000;
}
/**
 * Split text into overlapping chunks, preferring paragraph or line breaks.
 *
 * Each chunk is at most $chunkSize characters. When more text follows, the
 * chunk is cut at the last blank line in the window, or at the last newline
 * if there is no blank line or it falls before 55% of the chunk size. The cut
 * is only used when it lies beyond 45% of the chunk size; otherwise the
 * chunk is cut at the full size. The next chunk starts $overlap characters
 * before the previous cut so events straddling a boundary are seen whole.
 *
 * @param string $text      Text to split (measured in UTF-8 characters).
 * @param int    $chunkSize Maximum chunk length; values below 1 are treated as 1.
 * @param int    $overlap   Characters shared between neighbours; negatives are treated as 0.
 *
 * @return list<array{start: int, text: string}> Trimmed, non-empty chunks with
 *         the character offset at which each chunk's untrimmed window began.
 */
private function timelineTextChunks(string $text, int $chunkSize, int $overlap): array
{
    // A non-positive chunk size would never advance $start (infinite loop) and
    // a negative overlap would jump past text, so clamp both.
    $chunkSize = max(1, $chunkSize);
    $overlap = max(0, $overlap);

    $len = mb_strlen($text, 'UTF-8');
    $chunks = [];
    $start = 0;

    while ($start < $len) {
        $targetEnd = min($len, $start + $chunkSize);
        $end = $targetEnd;

        if ($targetEnd < $len) {
            $window = mb_substr($text, $start, $targetEnd - $start, 'UTF-8');
            $breakAt = mb_strrpos($window, "\n\n", 0, 'UTF-8');
            if ($breakAt === false || $breakAt < (int)($chunkSize * 0.55)) {
                $breakAt = mb_strrpos($window, "\n", 0, 'UTF-8');
            }
            // Ignore breaks too close to the start; they would produce tiny chunks.
            if ($breakAt !== false && $breakAt > (int)($chunkSize * 0.45)) {
                $end = $start + $breakAt;
            }
        }

        $chunkText = trim(mb_substr($text, $start, $end - $start, 'UTF-8'));
        if ($chunkText !== '') {
            $chunks[] = ['start' => $start, 'text' => $chunkText];
        }
        if ($end >= $len) {
            break;
        }

        // Step back by the overlap, but always move forward overall.
        $nextStart = $end - $overlap;
        $start = $nextStart > $start ? $nextStart : $end;
    }

    return $chunks;
}
/**
 * De-duplicate events by signature and return them in chronological order.
 *
 * Events sharing a timelineEventSignature() are collapsed into one. The
 * event with the higher confidence rank is kept (the first one seen wins
 * ties), and the other event's source excerpt is folded into it:
 *  - if the kept event has no excerpt, the other excerpt is adopted;
 *  - otherwise it is appended with ' / ' unless the kept excerpt already
 *    contains it or is already 260 characters or longer.
 *
 * The result is sorted by date, with dates that are not strict YYYY-MM-DD
 * placed last, and then by the time string.
 *
 * @param array $events Raw events; non-array entries are ignored.
 *
 * @return array List of merged events.
 */
private function mergeTimelineEvents(array $events): array
{
    $merged = [];
    foreach ($events as $event) {
        if (!is_array($event)) {
            continue;
        }
        $key = $this->timelineEventSignature($event);
        if (!isset($merged[$key])) {
            $merged[$key] = $event;
            continue;
        }

        $kept = $merged[$key];
        $extraExcerpt = (string)($event['source_excerpt'] ?? '');
        $candidateRank = $this->timelineConfidenceRank((string)($event['confidence'] ?? 'medium'));
        $keptRank = $this->timelineConfidenceRank((string)($kept['confidence'] ?? 'medium'));
        if ($candidateRank > $keptRank) {
            // The newcomer wins; the previous winner's excerpt becomes the extra one.
            $extraExcerpt = (string)($kept['source_excerpt'] ?? '');
            $kept = $event;
        }

        $keptExcerpt = (string)($kept['source_excerpt'] ?? '');
        if ($extraExcerpt !== '') {
            if ($keptExcerpt === '') {
                // Do not lose the only available source excerpt.
                $kept['source_excerpt'] = $extraExcerpt;
            } elseif (!str_contains($keptExcerpt, $extraExcerpt) && mb_strlen($keptExcerpt, 'UTF-8') < 260) {
                // Skip excerpts already present so repeats do not produce "A / B / A".
                $kept['source_excerpt'] = $keptExcerpt . ' / ' . $extraExcerpt;
            }
        }
        $merged[$key] = $kept;
    }

    $events = array_values($merged);
    usort($events, static function (array $a, array $b): int {
        $ad = (string)($a['date'] ?? '');
        $bd = (string)($b['date'] ?? '');
        // Non-ISO dates get a sentinel that sorts after every real date.
        $ai = preg_match('/^\d{4}-\d{2}-\d{2}$/', $ad) ? $ad : '9999-99-99';
        $bi = preg_match('/^\d{4}-\d{2}-\d{2}$/', $bd) ? $bd : '9999-99-99';
        $cmp = strcmp($ai, $bi);
        if ($cmp !== 0) {
            return $cmp;
        }
        return strcmp((string)($a['time'] ?? ''), (string)($b['time'] ?? ''));
    });

    return $events;
}
/**
 * Build the de-duplication key for a timeline event.
 *
 * The key joins the trimmed, lower-cased date, time and actor (actor
 * defaults to 'unknown') with the first 96 characters of the event text
 * after lower-casing it and collapsing every run of characters that are not
 * letters or digits into a single space.
 *
 * @param array $event Event with optional 'date', 'time', 'actor' and 'event' keys.
 *
 * @return string Signature in the form "date|time|actor|normalised text".
 */
private function timelineEventSignature(array $event): string
{
    $normalised = static fn (string $key, string $fallback = ''): string
        => mb_strtolower(trim((string)($event[$key] ?? $fallback)), 'UTF-8');

    $words = (string)preg_replace('/[^\p{L}\p{N}]+/u', ' ', $normalised('event'));
    $words = trim((string)preg_replace('/\s+/u', ' ', $words));

    return implode('|', [
        $normalised('date'),
        $normalised('time'),
        $normalised('actor', 'unknown'),
        mb_substr($words, 0, 96, 'UTF-8'),
    ]);
}
/**
 * Map a confidence label to a comparable rank.
 *
 * @param string $confidence 'high' or 'medium'; any other value ranks lowest.
 *
 * @return int 3 for 'high', 2 for 'medium', otherwise 1.
 */
private function timelineConfidenceRank(string $confidence): int
{
    $ranks = ['high' => 3, 'medium' => 2];

    return $ranks[$confidence] ?? 1;
}
private function timelineDateHintCount(string $text): int private function timelineDateHintCount(string $text): int
{ {
preg_match_all('/(?<!\d)\d{1,2}\.\d{1,2}\.(?:\d{2,4})?(?!\d)/u', $text, $numeric); preg_match_all('/(?<!\d)\d{1,2}\.\d{1,2}\.(?:\d{2,4})?(?!\d)/u', $text, $numeric);
@@ -1106,13 +1379,14 @@ PROMPT;
return array_slice(array_values(array_unique($terms)), 0, 6); return array_slice(array_values(array_unique($terms)), 0, 6);
} }
private function requirePasteText(string $text): string private function requirePasteText(string $text, ?int $maxChars = null): string
{ {
$text = trim($text); $text = trim($text);
if (mb_strlen($text, 'UTF-8') < 20) { if (mb_strlen($text, 'UTF-8') < 20) {
dbnToolsAbort('Paste at least 20 characters of text.', 422, 'text_too_short'); dbnToolsAbort('Paste at least 20 characters of text.', 422, 'text_too_short');
} }
if (mb_strlen($text, 'UTF-8') > self::MAX_PASTE_CHARS) { $maxChars ??= self::MAX_PASTE_CHARS;
if (mb_strlen($text, 'UTF-8') > $maxChars) {
dbnToolsAbort('Pasted text is too long for the MVP limit.', 422, 'text_too_long'); dbnToolsAbort('Pasted text is too long for the MVP limit.', 422, 'text_too_long');
} }
return $text; return $text;
+108 -8
View File
@@ -1,6 +1,7 @@
<?php <?php
declare(strict_types=1); declare(strict_types=1);
require_once __DIR__ . '/bootstrap.php';
require_once __DIR__ . '/FreeTier.php'; require_once __DIR__ . '/FreeTier.php';
/** /**
@@ -14,6 +15,9 @@ final class ToolModels
public const TIMELINE_QUICK_CHAR_LIMIT = 25000; public const TIMELINE_QUICK_CHAR_LIMIT = 25000;
public const TIMELINE_STANDARD_CHAR_LIMIT = 55000; public const TIMELINE_STANDARD_CHAR_LIMIT = 55000;
public const TIMELINE_DEEP_CHAR_LIMIT = 128000; public const TIMELINE_DEEP_CHAR_LIMIT = 128000;
public const TIMELINE_QUICK_MAX_CHARS = 100000;
public const TIMELINE_STANDARD_MAX_CHARS = 300000;
public const TIMELINE_DEEP_MAX_CHARS = 600000;
public static function engineForUser(int $userId, string $requestedEngine): string public static function engineForUser(int $userId, string $requestedEngine): string
{ {
@@ -38,22 +42,32 @@ final class ToolModels
$tierEngine = self::engineForUser($userId, $requestedEngine); $tierEngine = self::engineForUser($userId, $requestedEngine);
$charCount = mb_strlen($text, 'UTF-8'); $charCount = mb_strlen($text, 'UTF-8');
if ($charCount > self::TIMELINE_DEEP_CHAR_LIMIT) { if ($charCount > self::TIMELINE_DEEP_MAX_CHARS) {
throw new DbnToolsHttpException( throw new DbnToolsHttpException(
'This timeline input is too large after selected documents or My Case context were added. Split the file or use fewer selected documents.', 'This timeline input is too large after selected documents or My Case context were added. Split the file or use fewer selected documents.',
413, 413,
'timeline_input_too_large', 'timeline_input_too_large',
['input_char_count' => $charCount, 'max_chars' => self::TIMELINE_DEEP_CHAR_LIMIT] ['input_char_count' => $charCount, 'max_chars' => self::TIMELINE_DEEP_MAX_CHARS]
); );
} }
$effectiveEngine = $tierEngine; $effectiveEngine = $tierEngine;
if ($charCount > self::TIMELINE_STANDARD_CHAR_LIMIT) { if ($charCount > self::timelineEngineMaxChars($effectiveEngine)) {
$effectiveEngine = 'azure_full'; $effectiveEngine = $charCount <= self::TIMELINE_STANDARD_MAX_CHARS ? 'azure_mini' : 'azure_full';
} elseif ($charCount > self::TIMELINE_QUICK_CHAR_LIMIT && $effectiveEngine === 'nova_lite') { } elseif ($charCount > self::TIMELINE_STANDARD_CHAR_LIMIT && $effectiveEngine === 'nova_lite') {
$effectiveEngine = 'azure_mini'; $effectiveEngine = $charCount <= self::TIMELINE_QUICK_MAX_CHARS ? 'nova_lite' : 'azure_mini';
} }
if ($charCount > self::timelineEngineMaxChars($effectiveEngine)) {
$effectiveEngine = 'azure_full';
}
$credits = self::timelineCreditsForSize($effectiveEngine, $charCount);
$baseCredits = self::timelineAdvertisedCredits($requestedEngine);
$requiresConfirmation = $credits > $baseCredits
|| self::timelineEngineRank($effectiveEngine) > self::timelineEngineRank($requestedEngine);
$chunked = $charCount > self::timelineEngineLimit($effectiveEngine);
return [ return [
'requested_engine' => $requestedEngine, 'requested_engine' => $requestedEngine,
'tier_engine' => $tierEngine, 'tier_engine' => $tierEngine,
@@ -61,13 +75,48 @@ final class ToolModels
'auto_upgraded_engine' => $effectiveEngine !== $tierEngine, 'auto_upgraded_engine' => $effectiveEngine !== $tierEngine,
'input_char_count' => $charCount, 'input_char_count' => $charCount,
'engine_limit_chars' => self::timelineEngineLimit($effectiveEngine), 'engine_limit_chars' => self::timelineEngineLimit($effectiveEngine),
'credits' => self::timelineCredits($effectiveEngine), 'max_char_limit' => self::timelineEngineMaxChars($effectiveEngine),
'chunked_timeline' => $chunked,
'timeline_chunk_count' => $chunked ? (int)ceil($charCount / self::timelineChunkSize($effectiveEngine)) : 1,
'estimated_credits' => $credits,
'credits' => $credits,
'base_credits' => $baseCredits,
'requires_confirmation' => $requiresConfirmation,
]; ];
} }
/**
 * Refuse to run a timeline whose route needs confirmation unless the caller accepted the quote.
 *
 * A route with an empty 'requires_confirmation' entry passes immediately. Otherwise the
 * input must carry a truthy 'accepted_timeline_quote' flag, an 'accepted_credits' value
 * equal (as int) to the route's 'credits', and an 'accepted_effective_engine' value equal
 * (as string) to the route's 'effective_engine'.
 *
 * @param array $route Routing result; reads 'requires_confirmation', 'credits',
 *                     'effective_engine', 'input_char_count' and 'timeline_chunk_count'.
 * @param array $input Request payload holding the acceptance fields.
 *
 * @throws DbnToolsHttpException With status 409 and code 'timeline_quote_required'; the
 *                               route plus a label and a readable message are attached
 *                               under the 'timeline_quote' key.
 */
public static function assertTimelineQuoteAccepted(array $route, array $input): void
{
    if (empty($route['requires_confirmation'])) {
        return;
    }

    // Each check only runs when the previous one passed, mirroring a short-circuit chain.
    $quoteFlagged = !empty($input['accepted_timeline_quote']);
    $creditsAgree = $quoteFlagged
        && (int)($input['accepted_credits'] ?? 0) === (int)$route['credits'];
    $engineAgrees = $creditsAgree
        && (string)($input['accepted_effective_engine'] ?? '') === (string)$route['effective_engine'];
    if ($engineAgrees) {
        return;
    }

    $label = self::timelineEngineLabel((string)$route['effective_engine']);
    $formattedChars = number_format((int)$route['input_char_count']);
    $summary = 'Timeline will use ' . $label . ' for '
        . $formattedChars
        . ' characters across about ' . (int)$route['timeline_chunk_count']
        . ' chunk(s), costing ' . (int)$route['credits'] . ' credit(s).';

    $quote = array_merge($route, [
        'effective_engine_label' => $label,
        'message' => $summary,
    ]);

    throw new DbnToolsHttpException(
        'This timeline is larger than the selected engine can handle at the advertised price. Confirm the quoted engine and credits before running.',
        409,
        'timeline_quote_required',
        ['timeline_quote' => $quote]
    );
}
public static function timelineCredits(string $engine): int public static function timelineCredits(string $engine): int
{ {
return $engine === 'azure_full' ? 2 : 1; return self::timelineAdvertisedCredits($engine);
} }
public static function timelineEngineLimit(string $engine): int public static function timelineEngineLimit(string $engine): int
@@ -78,4 +127,55 @@ final class ToolModels
default => self::TIMELINE_DEEP_CHAR_LIMIT, default => self::TIMELINE_DEEP_CHAR_LIMIT,
}; };
} }
/**
 * Chunk size, in characters, used to split timeline input for the given engine.
 *
 * @param string $engine Engine identifier ('nova_lite', 'azure_mini', or anything else).
 *
 * @return int 10000 for 'nova_lite', 16000 for 'azure_mini', 30000 for every other value.
 */
public static function timelineChunkSize(string $engine): int
{
    $sizesByEngine = [
        'nova_lite' => 10000,
        'azure_mini' => 16000,
    ];

    // Any engine not listed above falls back to the largest chunk size.
    return $sizesByEngine[$engine] ?? 30000;
}
/**
 * Maximum character count associated with the given engine.
 *
 * @param string $engine Engine identifier.
 *
 * @return int TIMELINE_QUICK_MAX_CHARS for 'nova_lite', TIMELINE_STANDARD_MAX_CHARS for
 *             'azure_mini', and TIMELINE_DEEP_MAX_CHARS for any other value.
 */
public static function timelineEngineMaxChars(string $engine): int
{
    if ($engine === 'nova_lite') {
        return self::TIMELINE_QUICK_MAX_CHARS;
    }

    if ($engine === 'azure_mini') {
        return self::TIMELINE_STANDARD_MAX_CHARS;
    }

    // Every remaining engine value is treated as the deep tier.
    return self::TIMELINE_DEEP_MAX_CHARS;
}
/**
 * Credit cost of a timeline run for the given engine and input size.
 *
 * Pricing steps, by engine:
 *  - 'nova_lite':  1 credit up to TIMELINE_QUICK_CHAR_LIMIT, otherwise 2.
 *  - 'azure_mini': 1 credit up to TIMELINE_STANDARD_CHAR_LIMIT, 2 up to 180000, otherwise 3.
 *  - anything else: 2 credits up to TIMELINE_DEEP_CHAR_LIMIT, 4 up to 350000, otherwise 6.
 *
 * @param string $engine    Engine identifier.
 * @param int    $charCount Input length in characters.
 *
 * @return int Number of credits to charge.
 */
public static function timelineCreditsForSize(string $engine, int $charCount): int
{
    if ($engine === 'nova_lite') {
        return $charCount <= self::TIMELINE_QUICK_CHAR_LIMIT ? 1 : 2;
    }

    if ($engine === 'azure_mini') {
        if ($charCount <= self::TIMELINE_STANDARD_CHAR_LIMIT) {
            return 1;
        }

        return $charCount <= 180000 ? 2 : 3;
    }

    // Unrecognised engines are priced like the deep tier.
    if ($charCount <= self::TIMELINE_DEEP_CHAR_LIMIT) {
        return 2;
    }

    return $charCount <= 350000 ? 4 : 6;
}
/**
 * Base credit price for an engine, independent of input size.
 *
 * @param string $engine Engine identifier.
 *
 * @return int 2 for 'azure_full', 1 for every other value.
 */
public static function timelineAdvertisedCredits(string $engine): int
{
    if ($engine === 'azure_full') {
        return 2;
    }

    return 1;
}
/**
 * Human-readable tier name for an engine identifier.
 *
 * @param string $engine Engine identifier.
 *
 * @return string 'Quick' for 'nova_lite', 'Deep' for 'azure_full', 'Standard' otherwise.
 */
public static function timelineEngineLabel(string $engine): string
{
    $labelsByEngine = [
        'nova_lite' => 'Quick',
        'azure_full' => 'Deep',
    ];

    // 'azure_mini' and any unrecognised engine share the 'Standard' label.
    return $labelsByEngine[$engine] ?? 'Standard';
}
/**
 * Ordinal position of an engine, used to tell whether one engine is a step up from another.
 *
 * @param string $engine Engine identifier.
 *
 * @return int 1 for 'nova_lite', 2 for 'azure_mini', 3 for 'azure_full', 0 for anything else.
 */
private static function timelineEngineRank(string $engine): int
{
    $rankByEngine = [
        'nova_lite' => 1,
        'azure_mini' => 2,
        'azure_full' => 3,
    ];

    // Unknown engines rank below every known one.
    return $rankByEngine[$engine] ?? 0;
}
} }
+6 -3
View File
@@ -890,9 +890,10 @@ function dbnToolsExcerpt(string $text, int $limit = 520): string
const DBN_TOOLS_EXTRACT_MAX_BYTES = 8 * 1024 * 1024; const DBN_TOOLS_EXTRACT_MAX_BYTES = 8 * 1024 * 1024;
const DBN_TOOLS_EXTRACT_TEXT_LIMIT = 128000; const DBN_TOOLS_EXTRACT_TEXT_LIMIT = 128000;
const DBN_TOOLS_TIMELINE_EXTRACT_TEXT_LIMIT = 600000;
const DBN_TOOLS_EXTRACT_ALLOWED_EXTS = ['txt', 'pdf', 'docx']; const DBN_TOOLS_EXTRACT_ALLOWED_EXTS = ['txt', 'pdf', 'docx'];
function dbnToolsExtractUploadedFile(array $file): array function dbnToolsExtractUploadedFile(array $file, int $textLimit = DBN_TOOLS_EXTRACT_TEXT_LIMIT): array
{ {
$errCode = (int)($file['error'] ?? UPLOAD_ERR_NO_FILE); $errCode = (int)($file['error'] ?? UPLOAD_ERR_NO_FILE);
if ($errCode !== UPLOAD_ERR_OK) { if ($errCode !== UPLOAD_ERR_OK) {
@@ -936,8 +937,9 @@ function dbnToolsExtractUploadedFile(array $file): array
} }
$truncated = false; $truncated = false;
if (mb_strlen($text, 'UTF-8') > DBN_TOOLS_EXTRACT_TEXT_LIMIT) { $textLimit = max(1000, min($textLimit, DBN_TOOLS_TIMELINE_EXTRACT_TEXT_LIMIT));
$text = mb_substr($text, 0, DBN_TOOLS_EXTRACT_TEXT_LIMIT, 'UTF-8'); if (mb_strlen($text, 'UTF-8') > $textLimit) {
$text = mb_substr($text, 0, $textLimit, 'UTF-8');
$truncated = true; $truncated = true;
} }
@@ -947,6 +949,7 @@ function dbnToolsExtractUploadedFile(array $file): array
'filename' => $originalName, 'filename' => $originalName,
'chars' => mb_strlen($text, 'UTF-8'), 'chars' => mb_strlen($text, 'UTF-8'),
'truncated' => $truncated, 'truncated' => $truncated,
'limit' => $textLimit,
]; ];
} }