From 17ad54cf36ae1f3254986c614b50628faf407e63 Mon Sep 17 00:00:00 2001 From: davegilligan Date: Mon, 25 May 2026 12:34:41 +0200 Subject: [PATCH] Add chunked timeline routing --- api/extract.php | 4 +- api/timeline-stream.php | 14 +- api/timeline.php | 9 +- assets/js/tools.js | 120 +++++++++++++++-- includes/LegalTools.php | 280 +++++++++++++++++++++++++++++++++++++++- includes/ToolModels.php | 116 +++++++++++++++-- includes/bootstrap.php | 9 +- 7 files changed, 521 insertions(+), 31 deletions(-) diff --git a/api/extract.php b/api/extract.php index 598751c..e39c1de 100644 --- a/api/extract.php +++ b/api/extract.php @@ -12,7 +12,9 @@ try { dbnToolsError('No file was uploaded.', 422, 'missing_file'); } - $result = dbnToolsExtractUploadedFile($_FILES['file']); + $tool = (string)($_POST['tool'] ?? ''); + $limit = $tool === 'timeline' ? DBN_TOOLS_TIMELINE_EXTRACT_TEXT_LIMIT : DBN_TOOLS_EXTRACT_TEXT_LIMIT; + $result = dbnToolsExtractUploadedFile($_FILES['file'], $limit); $ftRemaining = dbnToolsFreeTierDeduct($ftUid, 'extract'); if ($ftRemaining >= 0) { header('X-Credits-Remaining: ' . $ftRemaining); diff --git a/api/timeline-stream.php b/api/timeline-stream.php index 49325de..29e8183 100644 --- a/api/timeline-stream.php +++ b/api/timeline-stream.php @@ -9,7 +9,7 @@ dbnToolsRequireAuth(); // Parse input and run credit pre-check BEFORE emitting SSE headers so that // auth/credit errors can still return JSON (dbnToolsError / dbnToolsAbort). -$input = dbnToolsJsonInput(400000); +$input = dbnToolsJsonInput(1500000); $language = dbnToolsNormalizeLanguage($input['language'] ?? 'en'); $_validEngines = ['nova_lite', 'azure_mini', 'azure_full']; @@ -17,7 +17,7 @@ $_requestedEngine = in_array((string)($input['engine'] ?? ''), $_validEngines, t ? (string)$input['engine'] : 'azure_mini'; try { - $text = dbnToolsInjectDocContent($input, dbnToolsString($input, 'text', 128000, false)); + $text = dbnToolsInjectDocContent($input, dbnToolsString($input, 'text', ToolModels::TIMELINE_DEEP_MAX_CHARS, false)); if (mb_strlen(trim($text), 'UTF-8') < 10) { dbnToolsError('Paste text, upload a file, or select a document before running.', 422, 'empty_text'); } @@ -33,6 +33,7 @@ try { } $timelineRoute = ToolModels::timelineRoute($ftUid, $_requestedEngine, $text); + ToolModels::assertTimelineQuoteAccepted($timelineRoute, $input); $ftUid = dbnToolsFreeTierCheckAmount('timeline', (int)$timelineRoute['credits']); } catch (DbnToolsHttpException $e) { dbnToolsError($e->getMessage(), $e->status, $e->errorCode, $e->extra); @@ -69,6 +70,11 @@ try { 'msg' => 'This input is ' . number_format((int)$timelineRoute['input_char_count']) . " characters, so Timeline is using {$label} for reliability.", ]); } + if (!empty($timelineRoute['chunked_timeline'])) { + sseEmit('status', [ + 'msg' => 'Processing ' . (int)$timelineRoute['timeline_chunk_count'] . ' timeline chunk(s).', + ]); + } $validFocus = ['all', 'deadlines', 'hearings', 'cps']; $focus = in_array((string)($input['focus'] ?? ''), $validFocus, true) @@ -110,6 +116,10 @@ try { 'auto_upgraded_engine' => $timelineRoute['auto_upgraded_engine'], 'input_char_count' => $timelineRoute['input_char_count'], 'engine_limit_chars' => $timelineRoute['engine_limit_chars'], + 'max_char_limit' => $timelineRoute['max_char_limit'], + 'chunked_timeline' => $timelineRoute['chunked_timeline'], + 'timeline_chunk_count' => $timelineRoute['timeline_chunk_count'], + 'estimated_credits' => $timelineRoute['estimated_credits'], 'credits_charged' => $timelineRoute['credits'], ]); diff --git a/api/timeline.php b/api/timeline.php index 6ef027c..03f36e0 100644 --- a/api/timeline.php +++ b/api/timeline.php @@ -6,7 +6,7 @@ require_once __DIR__ . '/../includes/ToolModels.php'; dbnToolsRequireMethod('POST'); dbnToolsRequireAuth(); -$input = dbnToolsJsonInput(400000); +$input = dbnToolsJsonInput(1500000); $language = dbnToolsNormalizeLanguage($input['language'] ?? 'en'); $_validEngines = ['nova_lite', 'azure_mini', 'azure_full']; $_requestedEngine = in_array((string)($input['engine'] ?? ''), $_validEngines, true) @@ -15,7 +15,7 @@ $_requestedEngine = in_array((string)($input['engine'] ?? ''), $_validEngines, t $start = microtime(true); try { - $text = dbnToolsInjectDocContent($input, dbnToolsString($input, 'text', 128000, false)); + $text = dbnToolsInjectDocContent($input, dbnToolsString($input, 'text', ToolModels::TIMELINE_DEEP_MAX_CHARS, false)); if (mb_strlen(trim($text), 'UTF-8') < 10) { dbnToolsAbort('Paste text, upload a file, or select a document before running.', 422, 'empty_text'); } @@ -33,6 +33,7 @@ try { } $timelineRoute = ToolModels::timelineRoute($ftUid, $_requestedEngine, $text); + ToolModels::assertTimelineQuoteAccepted($timelineRoute, $input); $ftUid = dbnToolsFreeTierCheckAmount('timeline', (int)$timelineRoute['credits']); $validFocus = ['all', 'deadlines', 'hearings', 'cps']; @@ -71,6 +72,10 @@ try { 'auto_upgraded_engine' => $timelineRoute['auto_upgraded_engine'], 'input_char_count' => $timelineRoute['input_char_count'], 'engine_limit_chars' => $timelineRoute['engine_limit_chars'], + 'max_char_limit' => $timelineRoute['max_char_limit'], + 'chunked_timeline' => $timelineRoute['chunked_timeline'], + 'timeline_chunk_count' => $timelineRoute['timeline_chunk_count'], + 'estimated_credits' => $timelineRoute['estimated_credits'], 'credits_charged' => $timelineRoute['credits'], ]); diff --git a/assets/js/tools.js b/assets/js/tools.js index f5ffe2b..50cfff1 100644 --- a/assets/js/tools.js +++ b/assets/js/tools.js @@ -400,6 +400,7 @@ let lastOriginalText = ''; let lastRedactPayload = null; let lastRunEngine = null; let lastToolPayload = null; +let pendingTimelineQuote = null; const VOCAB_PRESETS = { barnerett: 'Barnevernet, Fylkesnemnda, barnevernloven, barneloven, barnets beste, samvær, foreldreansvar, omsorgsovertakelse, sakkyndig, advokat, prosessfullmektig, dommer, vitne, tolk, bistandsadvokat, fosterforeldre, fosterhjem, akuttvedtak, statsforvalter, Bufetat, saksbehandler, rettslig medhold, begjæring, samtykke, tilsynsfører', @@ -764,10 +765,60 @@ function timelineEngineLabel(engine) { } function timelineClientRoute(engine, charCount) { - let effective = engine; - if (charCount > 55000) effective = 'azure_full'; - else if (charCount > 25000 && effective === 'nova_lite') effective = 'azure_mini'; - return { effective, upgraded: effective !== engine }; + return timelineClientQuote(engine, charCount); +} + +function timelineClientQuote(engine, charCount) { + const valid = ['nova_lite', 'azure_mini', 'azure_full']; + const requested = valid.includes(engine) ? engine : 'azure_mini'; + const singleLimits = { nova_lite: 25000, azure_mini: 55000, azure_full: 128000 }; + const maxLimits = { nova_lite: 100000, azure_mini: 300000, azure_full: 600000 }; + const chunkSizes = { nova_lite: 10000, azure_mini: 16000, azure_full: 30000 }; + const ranks = { nova_lite: 1, azure_mini: 2, azure_full: 3 }; + const baseCredits = requested === 'azure_full' ? 2 : 1; + let effective = requested; + + if (charCount > 600000) { + return { + error: true, + message: `This timeline input is ${charCount.toLocaleString()} characters. Split the file or use fewer selected documents; the current maximum is 600,000 characters.`, + }; + } + if (charCount > maxLimits[effective]) { + effective = charCount <= maxLimits.azure_mini ? 'azure_mini' : 'azure_full'; + } + if (charCount > maxLimits[effective]) effective = 'azure_full'; + + let credits = 1; + if (effective === 'nova_lite') { + credits = charCount <= singleLimits.nova_lite ? 1 : 2; + } else if (effective === 'azure_mini') { + credits = charCount <= singleLimits.azure_mini ? 1 : (charCount <= 180000 ? 2 : 3); + } else { + credits = charCount <= singleLimits.azure_full ? 2 : (charCount <= 350000 ? 4 : 6); + } + + const chunked = charCount > singleLimits[effective]; + return { + requested, + effective, + upgraded: ranks[effective] > ranks[requested], + charCount, + credits, + baseCredits, + chunked, + chunkCount: chunked ? Math.ceil(charCount / chunkSizes[effective]) : 1, + requiresConfirmation: credits > baseCredits || ranks[effective] > ranks[requested], + }; +} + +function timelineQuoteMessage(quote) { + return [ + `Timeline will use ${timelineEngineLabel(quote.effective)} for ${Number(quote.charCount || 0).toLocaleString()} characters.`, + quote.chunked ? `It will process about ${quote.chunkCount} chunks.` : 'It can run in a single pass.', + `Cost: ${quote.credits} credit${quote.credits === 1 ? '' : 's'}.`, + 'Continue?' + ].join('\n'); } function currentTimelineFocus() { @@ -1122,15 +1173,36 @@ async function runTool(event) { let timelineRouteNotice = ''; if (state.activeTool === 'timeline') { payload.engine = currentTimelineEngine(); - const clientRoute = timelineClientRoute(payload.engine, text.length); + const clientRoute = timelineClientQuote(payload.engine, text.length); + if (clientRoute.error) { + els.status.textContent = clientRoute.message; + return; + } + const pendingQuoteApplies = pendingTimelineQuote + && pendingTimelineQuote.text === text + && pendingTimelineQuote.requested === payload.engine; + if (pendingQuoteApplies) { + payload.accepted_timeline_quote = true; + payload.accepted_credits = pendingTimelineQuote.credits; + payload.accepted_effective_engine = pendingTimelineQuote.effective; + pendingTimelineQuote = null; + } else if (clientRoute.requiresConfirmation) { + if (!window.confirm(timelineQuoteMessage(clientRoute))) { + els.status.textContent = 'Timeline run cancelled before any credits were charged.'; + return; + } + payload.accepted_timeline_quote = true; + payload.accepted_credits = clientRoute.credits; + payload.accepted_effective_engine = clientRoute.effective; + } payload.focus = currentTimelineFocus(); payload.confidence_filter = currentConfidenceFilter(); payload.include_relative = currentIncludeRelative(); payload.include_background = currentIncludeBackground(); payload.user_notes = (document.getElementById('timelineNotes')?.value || '').trim(); payload.use_my_case = (typeof window.dbnGetUseMyCase === 'function') ? window.dbnGetUseMyCase() : false; - timelineRouteNotice = clientRoute.upgraded - ? `This input is ${text.length.toLocaleString()} characters, so Timeline will use ${timelineEngineLabel(clientRoute.effective)} for reliability.` + timelineRouteNotice = clientRoute.upgraded || clientRoute.chunked + ? `This input is ${text.length.toLocaleString()} characters, so Timeline will use ${timelineEngineLabel(clientRoute.effective)}${clientRoute.chunked ? ` across about ${clientRoute.chunkCount} chunks` : ''}.` : ''; } @@ -1157,7 +1229,30 @@ async function runTool(event) { headers: { 'Content-Type': 'application/json' }, body: JSON.stringify(payload), }); - if (!resp.ok) throw new Error(`HTTP ${resp.status}`); + if (!resp.ok) { + const errData = await resp.json().catch(() => ({})); + const quote = errData.timeline_quote; + if (errData.error?.code === 'timeline_quote_required' && quote) { + const confirmQuote = { + effective: quote.effective_engine, + charCount: quote.input_char_count, + credits: quote.credits || quote.estimated_credits, + chunked: Boolean(quote.chunked_timeline), + chunkCount: quote.timeline_chunk_count || 1, + }; + if (window.confirm(timelineQuoteMessage(confirmQuote))) { + pendingTimelineQuote = { + text, + requested: payload.engine, + effective: confirmQuote.effective, + credits: Number(confirmQuote.credits || 0), + }; + return runTool(event); + } + throw new Error('Timeline run cancelled before any credits were charged.'); + } + throw new Error(errData.error?.message || `HTTP ${resp.status}`); + } const reader = resp.body.getReader(); const dec = new TextDecoder(); let buf = '', event = ''; @@ -1194,8 +1289,8 @@ async function runTool(event) { renderResults(data); renderTrace(data.trace || []); const routeMeta = data.trace_metadata || {}; - const serverRouteNotice = state.activeTool === 'timeline' && routeMeta.auto_upgraded_engine - ? ` Used ${timelineEngineLabel(routeMeta.effective_engine)} for ${Number(routeMeta.input_char_count || 0).toLocaleString()} characters.` + const serverRouteNotice = state.activeTool === 'timeline' && (routeMeta.auto_upgraded_engine || routeMeta.chunked_timeline || routeMeta.credits_charged) + ? ` Used ${timelineEngineLabel(routeMeta.effective_engine)} for ${Number(routeMeta.input_char_count || 0).toLocaleString()} characters${routeMeta.chunked_timeline ? ` across ${routeMeta.timeline_chunk_count || 1} chunks` : ''}; charged ${routeMeta.credits_charged || routeMeta.estimated_credits || 1} credit(s).` : ''; els.status.textContent = `Done in ${data.latency_ms || 0} ms.${serverRouteNotice}`; if (['ask', 'redact', 'timeline'].includes(state.activeTool)) { @@ -1299,6 +1394,7 @@ async function handleFiles(fileList) { for (const file of files) { const formData = new FormData(); formData.append('file', file); + formData.append('tool', state.activeTool); const resp = await fetch('api/extract.php', { method: 'POST', @@ -1318,7 +1414,7 @@ async function handleFiles(fileList) { const combined = parts[0].text; - const MAX_COMBINED = 128000; + const MAX_COMBINED = state.activeTool === 'timeline' ? 600000 : 128000; const combinedTruncated = combined.length > MAX_COMBINED; els.input.value = combinedTruncated ? combined.slice(0, MAX_COMBINED) : combined; @@ -1328,7 +1424,7 @@ async function handleFiles(fileList) { els.uploadPrompt.classList.add('is-hidden'); els.uploadFileInfo.classList.remove('is-hidden'); - const truncNote = (anyTruncated || combinedTruncated) ? ' — truncated to 128 000 char limit' : ''; + const truncNote = (anyTruncated || combinedTruncated) ? ` - truncated to ${MAX_COMBINED.toLocaleString()} char limit` : ''; els.status.textContent = `Extracted ${totalChars.toLocaleString()} chars from ${parts[0].filename}${truncNote}.`; } catch (err) { els.status.textContent = err.message; diff --git a/includes/LegalTools.php b/includes/LegalTools.php index 2408d94..387afb1 100644 --- a/includes/LegalTools.php +++ b/includes/LegalTools.php @@ -7,6 +7,7 @@ require_once __DIR__ . '/AzureOpenAiGateway.php'; final class DbnLegalToolsService { private const MAX_PASTE_CHARS = 128000; + private const MAX_TIMELINE_CHARS = 600000; private DbnAzureOpenAiGateway $azure; @@ -353,7 +354,7 @@ PROMPT; string $userNotes = '', ?callable $onProgress = null ): array { - $text = $this->requirePasteText($text); + $text = $this->requirePasteText($text, self::MAX_TIMELINE_CHARS); $engine = in_array($engine, ['nova_lite', 'azure_mini', 'azure_full'], true) ? $engine : 'azure_mini'; $focus = in_array($focus, ['all', 'deadlines', 'hearings', 'cps'], true) ? $focus : 'all'; @@ -382,6 +383,23 @@ PROMPT; ? "\n\nUser-provided context notes (use these to resolve ambiguities, not as source events):\n---\n" . $userNotes . "\n---" : ''; + $charCount = mb_strlen($text, 'UTF-8'); + $singlePassLimit = $this->timelineSinglePassLimit($engine); + if ($charCount > $singlePassLimit) { + return $this->timelineChunked( + $text, + $language, + $engine, + $focus, + $confidenceFilter, + $includeRelative, + $includeBackground, + $userNotes, + $onProgress, + $inputDateHintCount + ); + } + $prompt = << 'gpt-4o', 'nova_lite' => 'nova-lite', default => 'gpt-4o-mini' }; + $chunkSize = $this->timelineChunkSize($engine); + $chunks = $this->timelineTextChunks($text, $chunkSize, 900); + $chunkCount = count($chunks); + $events = []; + $chunkFailures = 0; + $usedFallbackExtractor = false; + + $onProgress && $onProgress('Splitting timeline into ' . $chunkCount . " chunk(s)\u{2026}"); + + foreach ($chunks as $idx => $chunk) { + $chunkNo = $idx + 1; + $chunkText = trim((string)$chunk['text']); + if (mb_strlen($chunkText, 'UTF-8') < 20) { + continue; + } + + $onProgress && $onProgress("Extracting timeline chunk {$chunkNo}/{$chunkCount}\u{2026}"); + try { + $result = $this->timeline( + $chunkText, + $language, + $engine, + $focus, + $confidenceFilter, + $includeRelative, + $includeBackground, + $userNotes, + null + ); + $chunkEvents = is_array($result['events'] ?? null) ? $result['events'] : []; + if (!empty($result['trace_metadata']['used_fallback_extractor'])) { + $usedFallbackExtractor = true; + } + } catch (DbnToolsHttpException $e) { + $chunkFailures++; + $chunkEvents = []; + if ($this->timelineDateHintCount($chunkText) > 0) { + $chunkEvents = $this->fallbackTimelineEvents($chunkText); + if ($chunkEvents) { + $usedFallbackExtractor = true; + } + } + if (!$chunkEvents && $e->status >= 500) { + error_log('timeline chunk failed: ' . $e->errorCode . ' ' . $e->getMessage()); + } + } catch (Throwable $e) { + $chunkFailures++; + $chunkEvents = $this->fallbackTimelineEvents($chunkText); + if ($chunkEvents) { + $usedFallbackExtractor = true; + } + error_log('timeline chunk throwable: ' . $e->getMessage()); + } + + foreach ($chunkEvents as $event) { + if (!is_array($event)) { + continue; + } + $event['chunk_index'] = $chunkNo; + $event['source_position'] = (int)$chunk['start']; + $events[] = $event; + } + } + + $events = $this->mergeTimelineEvents($events); + if ($confidenceFilter === 'high_medium') { + $events = array_values(array_filter($events, fn($ev) => ($ev['confidence'] ?? 'low') !== 'low')); + } + if (!$includeRelative) { + $events = array_values(array_filter($events, fn($ev) => ($ev['date_type'] ?? 'absolute') === 'absolute')); + } + + $focusLabel = match ($focus) { + 'deadlines' => 'legal deadlines', + 'hearings' => 'court hearings', + 'cps' => 'CPS milestones', + default => 'all events', + }; + $isoDates = array_values(array_filter(array_map(fn($ev) => (string)($ev['date'] ?? ''), $events), fn($d) => preg_match('/^\d{4}-\d{2}-\d{2}$/', $d))); + sort($isoDates); + $range = $isoDates ? (' from ' . $isoDates[0] . ' to ' . $isoDates[count($isoDates) - 1]) : ''; + $actors = array_values(array_unique(array_filter(array_map(fn($ev) => (string)($ev['actor'] ?? ''), $events), fn($a) => $a !== '' && $a !== 'unknown'))); + $summary = count($events) . " event(s) extracted from {$chunkCount} chunk(s){$range}."; + if ($actors) { + $summary .= ' Main actors: ' . implode(', ', array_slice($actors, 0, 8)) . '.'; + } + + $uncertain = []; + if ($chunkFailures > 0) { + $uncertain[] = "{$chunkFailures} chunk(s) needed fallback extraction or could not be fully parsed; review medium-confidence entries against the source."; + } + if ($usedFallbackExtractor) { + $uncertain[] = 'Deterministic fallback extraction was used for at least one chunk.'; + } + + $trace = [ + $this->trace('Query interpretation', "Extract {$focusLabel} from pasted text. Engine: {$engineLabel}. Chunked timeline mode; without saving the text or output.", 'complete'), + $this->trace('Search tools used', 'No external corpus search; source is the user-pasted text split into local chunks.', 'complete'), + $this->trace('Evidence found', count($events) . " event(s) identified across {$chunkCount} chunk(s).", count($events) ? 'complete' : 'warning'), + $this->trace('Citation confidence', 'Confidence is per event and based only on the pasted text. Overlapping chunks were de-duplicated.', 'complete'), + $this->trace('Uncertainty / missing evidence', $this->uncertaintySummary($uncertain), $uncertain ? 'warning' : 'complete'), + $this->trace('Next practical step', 'Review the merged timeline against the original uploaded document, especially duplicated or medium-confidence entries.', 'complete'), + ]; + + return [ + 'tool' => 'timeline', + 'language' => $language, + 'what_we_found' => $summary, + 'events' => $events, + 'evidence_trail' => [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; split into local timeline chunks; not stored.']], + 'what_remains_uncertain' => $uncertain, + 'next_practical_step' => 'Review the merged timeline against the original uploaded document, especially duplicated or medium-confidence entries.', + 'trace' => $trace, + 'trace_metadata' => [ + 'chunk_count' => count($events), + 'source_count' => $chunkCount, + 'deployment' => $engineLabel, + 'input_date_hint_count' => $inputDateHintCount, + 'used_fallback_extractor' => $usedFallbackExtractor, + 'chunked_timeline' => true, + 'timeline_chunk_count' => $chunkCount, + 'chunk_failures' => $chunkFailures, + ], + 'disclaimer' => dbnToolsDisclaimer($language), + ]; + } + + private function timelineSinglePassLimit(string $engine): int + { + return match ($engine) { + 'nova_lite' => 25000, + 'azure_mini' => 55000, + default => 128000, + }; + } + + private function timelineChunkSize(string $engine): int + { + return match ($engine) { + 'nova_lite' => 10000, + 'azure_mini' => 16000, + default => 30000, + }; + } + + private function timelineTextChunks(string $text, int $chunkSize, int $overlap): array + { + $len = mb_strlen($text, 'UTF-8'); + $chunks = []; + $start = 0; + while ($start < $len) { + $targetEnd = min($len, $start + $chunkSize); + $window = mb_substr($text, $start, $targetEnd - $start, 'UTF-8'); + $end = $targetEnd; + if ($targetEnd < $len) { + $breakAt = mb_strrpos($window, "\n\n", 0, 'UTF-8'); + if ($breakAt === false || $breakAt < (int)($chunkSize * 0.55)) { + $breakAt = mb_strrpos($window, "\n", 0, 'UTF-8'); + } + if ($breakAt !== false && $breakAt > (int)($chunkSize * 0.45)) { + $end = $start + $breakAt; + } + } + + $chunkText = trim(mb_substr($text, $start, max(1, $end - $start), 'UTF-8')); + if ($chunkText !== '') { + $chunks[] = ['start' => $start, 'text' => $chunkText]; + } + if ($end >= $len) { + break; + } + $nextStart = max(0, $end - $overlap); + if ($nextStart <= $start) { + $nextStart = $end; + } + $start = $nextStart; + } + return $chunks; + } + + private function mergeTimelineEvents(array $events): array + { + $merged = []; + foreach ($events as $event) { + if (!is_array($event)) { + continue; + } + $key = $this->timelineEventSignature($event); + if (!isset($merged[$key])) { + $merged[$key] = $event; + continue; + } + $existing = $merged[$key]; + $candidateExcerpt = (string)($event['source_excerpt'] ?? ''); + $existingExcerpt = (string)($existing['source_excerpt'] ?? ''); + $additionalExcerpt = $candidateExcerpt; + if ($this->timelineConfidenceRank((string)($event['confidence'] ?? 'medium')) > $this->timelineConfidenceRank((string)($existing['confidence'] ?? 'medium'))) { + $merged[$key] = $event; + $additionalExcerpt = $existingExcerpt; + } + $oldExcerpt = (string)($merged[$key]['source_excerpt'] ?? ''); + $newExcerpt = $additionalExcerpt; + if ($newExcerpt !== '' && $oldExcerpt !== '' && $newExcerpt !== $oldExcerpt && mb_strlen($oldExcerpt, 'UTF-8') < 260) { + $merged[$key]['source_excerpt'] = $oldExcerpt . ' / ' . $newExcerpt; + } + } + + $events = array_values($merged); + usort($events, static function (array $a, array $b): int { + $ad = (string)($a['date'] ?? ''); + $bd = (string)($b['date'] ?? ''); + $ai = preg_match('/^\d{4}-\d{2}-\d{2}$/', $ad) ? $ad : '9999-99-99'; + $bi = preg_match('/^\d{4}-\d{2}-\d{2}$/', $bd) ? $bd : '9999-99-99'; + $cmp = strcmp($ai, $bi); + if ($cmp !== 0) { + return $cmp; + } + return strcmp((string)($a['time'] ?? ''), (string)($b['time'] ?? '')); + }); + return $events; + } + + private function timelineEventSignature(array $event): string + { + $date = mb_strtolower(trim((string)($event['date'] ?? '')), 'UTF-8'); + $time = mb_strtolower(trim((string)($event['time'] ?? '')), 'UTF-8'); + $actor = mb_strtolower(trim((string)($event['actor'] ?? 'unknown')), 'UTF-8'); + $body = mb_strtolower(trim((string)($event['event'] ?? '')), 'UTF-8'); + $body = (string)preg_replace('/[^\p{L}\p{N}]+/u', ' ', $body); + $body = trim((string)preg_replace('/\s+/u', ' ', $body)); + return $date . '|' . $time . '|' . $actor . '|' . mb_substr($body, 0, 96, 'UTF-8'); + } + + private function timelineConfidenceRank(string $confidence): int + { + return match ($confidence) { + 'high' => 3, + 'medium' => 2, + default => 1, + }; + } + private function timelineDateHintCount(string $text): int { preg_match_all('/(? self::MAX_PASTE_CHARS) { + $maxChars ??= self::MAX_PASTE_CHARS; + if (mb_strlen($text, 'UTF-8') > $maxChars) { dbnToolsAbort('Pasted text is too long for the MVP limit.', 422, 'text_too_long'); } return $text; diff --git a/includes/ToolModels.php b/includes/ToolModels.php index 1312cba..0621314 100644 --- a/includes/ToolModels.php +++ b/includes/ToolModels.php @@ -1,6 +1,7 @@ self::TIMELINE_DEEP_CHAR_LIMIT) { + if ($charCount > self::TIMELINE_DEEP_MAX_CHARS) { throw new DbnToolsHttpException( 'This timeline input is too large after selected documents or My Case context were added. Split the file or use fewer selected documents.', 413, 'timeline_input_too_large', - ['input_char_count' => $charCount, 'max_chars' => self::TIMELINE_DEEP_CHAR_LIMIT] + ['input_char_count' => $charCount, 'max_chars' => self::TIMELINE_DEEP_MAX_CHARS] ); } $effectiveEngine = $tierEngine; - if ($charCount > self::TIMELINE_STANDARD_CHAR_LIMIT) { - $effectiveEngine = 'azure_full'; - } elseif ($charCount > self::TIMELINE_QUICK_CHAR_LIMIT && $effectiveEngine === 'nova_lite') { - $effectiveEngine = 'azure_mini'; + if ($charCount > self::timelineEngineMaxChars($effectiveEngine)) { + $effectiveEngine = $charCount <= self::TIMELINE_STANDARD_MAX_CHARS ? 'azure_mini' : 'azure_full'; + } elseif ($charCount > self::TIMELINE_STANDARD_CHAR_LIMIT && $effectiveEngine === 'nova_lite') { + $effectiveEngine = $charCount <= self::TIMELINE_QUICK_MAX_CHARS ? 'nova_lite' : 'azure_mini'; } + if ($charCount > self::timelineEngineMaxChars($effectiveEngine)) { + $effectiveEngine = 'azure_full'; + } + + $credits = self::timelineCreditsForSize($effectiveEngine, $charCount); + $baseCredits = self::timelineAdvertisedCredits($requestedEngine); + $requiresConfirmation = $credits > $baseCredits + || self::timelineEngineRank($effectiveEngine) > self::timelineEngineRank($requestedEngine); + $chunked = $charCount > self::timelineEngineLimit($effectiveEngine); + return [ 'requested_engine' => $requestedEngine, 'tier_engine' => $tierEngine, @@ -61,13 +75,48 @@ final class ToolModels 'auto_upgraded_engine' => $effectiveEngine !== $tierEngine, 'input_char_count' => $charCount, 'engine_limit_chars' => self::timelineEngineLimit($effectiveEngine), - 'credits' => self::timelineCredits($effectiveEngine), + 'max_char_limit' => self::timelineEngineMaxChars($effectiveEngine), + 'chunked_timeline' => $chunked, + 'timeline_chunk_count' => $chunked ? (int)ceil($charCount / self::timelineChunkSize($effectiveEngine)) : 1, + 'estimated_credits' => $credits, + 'credits' => $credits, + 'base_credits' => $baseCredits, + 'requires_confirmation' => $requiresConfirmation, ]; } + public static function assertTimelineQuoteAccepted(array $route, array $input): void + { + if (empty($route['requires_confirmation'])) { + return; + } + + $accepted = !empty($input['accepted_timeline_quote']) + && (int)($input['accepted_credits'] ?? 0) === (int)$route['credits'] + && (string)($input['accepted_effective_engine'] ?? '') === (string)$route['effective_engine']; + + if ($accepted) { + return; + } + + $engineLabel = self::timelineEngineLabel((string)$route['effective_engine']); + throw new DbnToolsHttpException( + 'This timeline is larger than the selected engine can handle at the advertised price. Confirm the quoted engine and credits before running.', + 409, + 'timeline_quote_required', + ['timeline_quote' => array_merge($route, [ + 'effective_engine_label' => $engineLabel, + 'message' => 'Timeline will use ' . $engineLabel . ' for ' + . number_format((int)$route['input_char_count']) + . ' characters across about ' . (int)$route['timeline_chunk_count'] + . ' chunk(s), costing ' . (int)$route['credits'] . ' credit(s).', + ])] + ); + } + public static function timelineCredits(string $engine): int { - return $engine === 'azure_full' ? 2 : 1; + return self::timelineAdvertisedCredits($engine); } public static function timelineEngineLimit(string $engine): int @@ -78,4 +127,55 @@ final class ToolModels default => self::TIMELINE_DEEP_CHAR_LIMIT, }; } + + public static function timelineChunkSize(string $engine): int + { + return match ($engine) { + 'nova_lite' => 10000, + 'azure_mini' => 16000, + default => 30000, + }; + } + + public static function timelineEngineMaxChars(string $engine): int + { + return match ($engine) { + 'nova_lite' => self::TIMELINE_QUICK_MAX_CHARS, + 'azure_mini' => self::TIMELINE_STANDARD_MAX_CHARS, + default => self::TIMELINE_DEEP_MAX_CHARS, + }; + } + + public static function timelineCreditsForSize(string $engine, int $charCount): int + { + return match ($engine) { + 'nova_lite' => $charCount <= self::TIMELINE_QUICK_CHAR_LIMIT ? 1 : 2, + 'azure_mini' => $charCount <= self::TIMELINE_STANDARD_CHAR_LIMIT ? 1 : ($charCount <= 180000 ? 2 : 3), + default => $charCount <= self::TIMELINE_DEEP_CHAR_LIMIT ? 2 : ($charCount <= 350000 ? 4 : 6), + }; + } + + public static function timelineAdvertisedCredits(string $engine): int + { + return $engine === 'azure_full' ? 2 : 1; + } + + public static function timelineEngineLabel(string $engine): string + { + return match ($engine) { + 'nova_lite' => 'Quick', + 'azure_full' => 'Deep', + default => 'Standard', + }; + } + + private static function timelineEngineRank(string $engine): int + { + return match ($engine) { + 'nova_lite' => 1, + 'azure_mini' => 2, + 'azure_full' => 3, + default => 0, + }; + } } diff --git a/includes/bootstrap.php b/includes/bootstrap.php index a2e957d..c43bed4 100644 --- a/includes/bootstrap.php +++ b/includes/bootstrap.php @@ -890,9 +890,10 @@ function dbnToolsExcerpt(string $text, int $limit = 520): string const DBN_TOOLS_EXTRACT_MAX_BYTES = 8 * 1024 * 1024; const DBN_TOOLS_EXTRACT_TEXT_LIMIT = 128000; +const DBN_TOOLS_TIMELINE_EXTRACT_TEXT_LIMIT = 600000; const DBN_TOOLS_EXTRACT_ALLOWED_EXTS = ['txt', 'pdf', 'docx']; -function dbnToolsExtractUploadedFile(array $file): array +function dbnToolsExtractUploadedFile(array $file, int $textLimit = DBN_TOOLS_EXTRACT_TEXT_LIMIT): array { $errCode = (int)($file['error'] ?? UPLOAD_ERR_NO_FILE); if ($errCode !== UPLOAD_ERR_OK) { @@ -936,8 +937,9 @@ function dbnToolsExtractUploadedFile(array $file): array } $truncated = false; - if (mb_strlen($text, 'UTF-8') > DBN_TOOLS_EXTRACT_TEXT_LIMIT) { - $text = mb_substr($text, 0, DBN_TOOLS_EXTRACT_TEXT_LIMIT, 'UTF-8'); + $textLimit = max(1000, min($textLimit, DBN_TOOLS_TIMELINE_EXTRACT_TEXT_LIMIT)); + if (mb_strlen($text, 'UTF-8') > $textLimit) { + $text = mb_substr($text, 0, $textLimit, 'UTF-8'); $truncated = true; } @@ -947,6 +949,7 @@ function dbnToolsExtractUploadedFile(array $file): array 'filename' => $originalName, 'chars' => mb_strlen($text, 'UTF-8'), 'truncated' => $truncated, + 'limit' => $textLimit, ]; }