Add chunked timeline routing
This commit is contained in:
+3
-1
@@ -12,7 +12,9 @@ try {
|
||||
dbnToolsError('No file was uploaded.', 422, 'missing_file');
|
||||
}
|
||||
|
||||
$result = dbnToolsExtractUploadedFile($_FILES['file']);
|
||||
$tool = (string)($_POST['tool'] ?? '');
|
||||
$limit = $tool === 'timeline' ? DBN_TOOLS_TIMELINE_EXTRACT_TEXT_LIMIT : DBN_TOOLS_EXTRACT_TEXT_LIMIT;
|
||||
$result = dbnToolsExtractUploadedFile($_FILES['file'], $limit);
|
||||
$ftRemaining = dbnToolsFreeTierDeduct($ftUid, 'extract');
|
||||
if ($ftRemaining >= 0) {
|
||||
header('X-Credits-Remaining: ' . $ftRemaining);
|
||||
|
||||
+12
-2
@@ -9,7 +9,7 @@ dbnToolsRequireAuth();
|
||||
|
||||
// Parse input and run credit pre-check BEFORE emitting SSE headers so that
|
||||
// auth/credit errors can still return JSON (dbnToolsError / dbnToolsAbort).
|
||||
$input = dbnToolsJsonInput(400000);
|
||||
$input = dbnToolsJsonInput(1500000);
|
||||
$language = dbnToolsNormalizeLanguage($input['language'] ?? 'en');
|
||||
|
||||
$_validEngines = ['nova_lite', 'azure_mini', 'azure_full'];
|
||||
@@ -17,7 +17,7 @@ $_requestedEngine = in_array((string)($input['engine'] ?? ''), $_validEngines, t
|
||||
? (string)$input['engine'] : 'azure_mini';
|
||||
|
||||
try {
|
||||
$text = dbnToolsInjectDocContent($input, dbnToolsString($input, 'text', 128000, false));
|
||||
$text = dbnToolsInjectDocContent($input, dbnToolsString($input, 'text', ToolModels::TIMELINE_DEEP_MAX_CHARS, false));
|
||||
if (mb_strlen(trim($text), 'UTF-8') < 10) {
|
||||
dbnToolsError('Paste text, upload a file, or select a document before running.', 422, 'empty_text');
|
||||
}
|
||||
@@ -33,6 +33,7 @@ try {
|
||||
}
|
||||
|
||||
$timelineRoute = ToolModels::timelineRoute($ftUid, $_requestedEngine, $text);
|
||||
ToolModels::assertTimelineQuoteAccepted($timelineRoute, $input);
|
||||
$ftUid = dbnToolsFreeTierCheckAmount('timeline', (int)$timelineRoute['credits']);
|
||||
} catch (DbnToolsHttpException $e) {
|
||||
dbnToolsError($e->getMessage(), $e->status, $e->errorCode, $e->extra);
|
||||
@@ -69,6 +70,11 @@ try {
|
||||
'msg' => 'This input is ' . number_format((int)$timelineRoute['input_char_count']) . " characters, so Timeline is using {$label} for reliability.",
|
||||
]);
|
||||
}
|
||||
if (!empty($timelineRoute['chunked_timeline'])) {
|
||||
sseEmit('status', [
|
||||
'msg' => 'Processing ' . (int)$timelineRoute['timeline_chunk_count'] . ' timeline chunk(s).',
|
||||
]);
|
||||
}
|
||||
|
||||
$validFocus = ['all', 'deadlines', 'hearings', 'cps'];
|
||||
$focus = in_array((string)($input['focus'] ?? ''), $validFocus, true)
|
||||
@@ -110,6 +116,10 @@ try {
|
||||
'auto_upgraded_engine' => $timelineRoute['auto_upgraded_engine'],
|
||||
'input_char_count' => $timelineRoute['input_char_count'],
|
||||
'engine_limit_chars' => $timelineRoute['engine_limit_chars'],
|
||||
'max_char_limit' => $timelineRoute['max_char_limit'],
|
||||
'chunked_timeline' => $timelineRoute['chunked_timeline'],
|
||||
'timeline_chunk_count' => $timelineRoute['timeline_chunk_count'],
|
||||
'estimated_credits' => $timelineRoute['estimated_credits'],
|
||||
'credits_charged' => $timelineRoute['credits'],
|
||||
]);
|
||||
|
||||
|
||||
+7
-2
@@ -6,7 +6,7 @@ require_once __DIR__ . '/../includes/ToolModels.php';
|
||||
|
||||
dbnToolsRequireMethod('POST');
|
||||
dbnToolsRequireAuth();
|
||||
$input = dbnToolsJsonInput(400000);
|
||||
$input = dbnToolsJsonInput(1500000);
|
||||
$language = dbnToolsNormalizeLanguage($input['language'] ?? 'en');
|
||||
$_validEngines = ['nova_lite', 'azure_mini', 'azure_full'];
|
||||
$_requestedEngine = in_array((string)($input['engine'] ?? ''), $_validEngines, true)
|
||||
@@ -15,7 +15,7 @@ $_requestedEngine = in_array((string)($input['engine'] ?? ''), $_validEngines, t
|
||||
$start = microtime(true);
|
||||
|
||||
try {
|
||||
$text = dbnToolsInjectDocContent($input, dbnToolsString($input, 'text', 128000, false));
|
||||
$text = dbnToolsInjectDocContent($input, dbnToolsString($input, 'text', ToolModels::TIMELINE_DEEP_MAX_CHARS, false));
|
||||
if (mb_strlen(trim($text), 'UTF-8') < 10) {
|
||||
dbnToolsAbort('Paste text, upload a file, or select a document before running.', 422, 'empty_text');
|
||||
}
|
||||
@@ -33,6 +33,7 @@ try {
|
||||
}
|
||||
|
||||
$timelineRoute = ToolModels::timelineRoute($ftUid, $_requestedEngine, $text);
|
||||
ToolModels::assertTimelineQuoteAccepted($timelineRoute, $input);
|
||||
$ftUid = dbnToolsFreeTierCheckAmount('timeline', (int)$timelineRoute['credits']);
|
||||
|
||||
$validFocus = ['all', 'deadlines', 'hearings', 'cps'];
|
||||
@@ -71,6 +72,10 @@ try {
|
||||
'auto_upgraded_engine' => $timelineRoute['auto_upgraded_engine'],
|
||||
'input_char_count' => $timelineRoute['input_char_count'],
|
||||
'engine_limit_chars' => $timelineRoute['engine_limit_chars'],
|
||||
'max_char_limit' => $timelineRoute['max_char_limit'],
|
||||
'chunked_timeline' => $timelineRoute['chunked_timeline'],
|
||||
'timeline_chunk_count' => $timelineRoute['timeline_chunk_count'],
|
||||
'estimated_credits' => $timelineRoute['estimated_credits'],
|
||||
'credits_charged' => $timelineRoute['credits'],
|
||||
]);
|
||||
|
||||
|
||||
+108
-12
@@ -400,6 +400,7 @@ let lastOriginalText = '';
|
||||
let lastRedactPayload = null;
|
||||
let lastRunEngine = null;
|
||||
let lastToolPayload = null;
|
||||
let pendingTimelineQuote = null;
|
||||
|
||||
const VOCAB_PRESETS = {
|
||||
barnerett: 'Barnevernet, Fylkesnemnda, barnevernloven, barneloven, barnets beste, samvær, foreldreansvar, omsorgsovertakelse, sakkyndig, advokat, prosessfullmektig, dommer, vitne, tolk, bistandsadvokat, fosterforeldre, fosterhjem, akuttvedtak, statsforvalter, Bufetat, saksbehandler, rettslig medhold, begjæring, samtykke, tilsynsfører',
|
||||
@@ -764,10 +765,60 @@ function timelineEngineLabel(engine) {
|
||||
}
|
||||
|
||||
/**
 * Backward-compatible entry point for client-side timeline routing.
 *
 * Delegates to timelineClientQuote(). The earlier inline routing
 * (upgrade above 25,000 / 55,000 characters) was left in front of the
 * delegating return, which made the delegation unreachable; it is removed so
 * that there is a single source of routing rules on the client.
 *
 * @param {string} engine    Requested engine key.
 * @param {number} charCount Length of the input text in characters.
 * @returns {object} Whatever timelineClientQuote() returns for these inputs.
 */
function timelineClientRoute(engine, charCount) {
  return timelineClientQuote(engine, charCount);
}
|
||||
|
||||
/**
 * Computes the client-side estimate ("quote") for a timeline run.
 *
 * Unknown engine keys are treated as 'azure_mini'. Inputs above 600,000
 * characters produce `{ error: true, message }` instead of a quote. Otherwise
 * the requested engine is kept unless the input exceeds that engine's maximum,
 * in which case the smallest of azure_mini / azure_full that fits is used.
 *
 * @param {string} engine    Requested engine key.
 * @param {number} charCount Length of the input text in characters.
 * @returns {object} Either an error object, or a quote with the fields
 *   requested, effective, upgraded, charCount, credits, baseCredits, chunked,
 *   chunkCount and requiresConfirmation.
 */
function timelineClientQuote(engine, charCount) {
  // Per-engine figures: rank orders the engines, singlePass is the largest
  // input handled without chunking, max is the largest input accepted at all.
  const profiles = {
    nova_lite: { rank: 1, singlePass: 25000, max: 100000, chunkSize: 10000 },
    azure_mini: { rank: 2, singlePass: 55000, max: 300000, chunkSize: 16000 },
    azure_full: { rank: 3, singlePass: 128000, max: 600000, chunkSize: 30000 },
  };

  const isKnownEngine = engine === 'nova_lite' || engine === 'azure_mini' || engine === 'azure_full';
  const requested = isKnownEngine ? engine : 'azure_mini';

  if (charCount > 600000) {
    return {
      error: true,
      message: `This timeline input is ${charCount.toLocaleString()} characters. Split the file or use fewer selected documents; the current maximum is 600,000 characters.`,
    };
  }

  // Move to a larger engine only when the requested one cannot take the input.
  let effective = requested;
  if (charCount > profiles[effective].max) {
    effective = charCount <= profiles.azure_mini.max ? 'azure_mini' : 'azure_full';
  }
  if (charCount > profiles[effective].max) {
    effective = 'azure_full';
  }

  const chosen = profiles[effective];
  const fitsSinglePass = charCount <= chosen.singlePass;

  // Credit tiers per effective engine.
  let credits;
  switch (effective) {
    case 'nova_lite':
      credits = fitsSinglePass ? 1 : 2;
      break;
    case 'azure_mini':
      if (fitsSinglePass) {
        credits = 1;
      } else {
        credits = charCount <= 180000 ? 2 : 3;
      }
      break;
    default:
      if (fitsSinglePass) {
        credits = 2;
      } else {
        credits = charCount <= 350000 ? 4 : 6;
      }
  }

  const baseCredits = requested === 'azure_full' ? 2 : 1;
  const upgraded = chosen.rank > profiles[requested].rank;
  const chunked = charCount > chosen.singlePass;

  return {
    requested,
    effective,
    upgraded,
    charCount,
    credits,
    baseCredits,
    chunked,
    chunkCount: chunked ? Math.ceil(charCount / chosen.chunkSize) : 1,
    requiresConfirmation: credits > baseCredits || upgraded,
  };
}
|
||||
|
||||
/**
 * Formats a timeline quote as the multi-line text shown in the confirm dialog.
 *
 * @param {object} quote Quote with effective, charCount, chunked, chunkCount
 *   and credits fields.
 * @returns {string} Four lines joined by "\n": engine and size, chunking,
 *   cost, and the "Continue?" prompt.
 */
function timelineQuoteMessage(quote) {
  const engineName = timelineEngineLabel(quote.effective);
  const sizeText = Number(quote.charCount || 0).toLocaleString();
  const passLine = quote.chunked
    ? `It will process about ${quote.chunkCount} chunks.`
    : 'It can run in a single pass.';
  const pluralSuffix = quote.credits === 1 ? '' : 's';

  return `Timeline will use ${engineName} for ${sizeText} characters.\n`
    + `${passLine}\n`
    + `Cost: ${quote.credits} credit${pluralSuffix}.\n`
    + 'Continue?';
}
|
||||
|
||||
function currentTimelineFocus() {
|
||||
@@ -1122,15 +1173,36 @@ async function runTool(event) {
|
||||
let timelineRouteNotice = '';
|
||||
if (state.activeTool === 'timeline') {
|
||||
payload.engine = currentTimelineEngine();
|
||||
const clientRoute = timelineClientRoute(payload.engine, text.length);
|
||||
const clientRoute = timelineClientQuote(payload.engine, text.length);
|
||||
if (clientRoute.error) {
|
||||
els.status.textContent = clientRoute.message;
|
||||
return;
|
||||
}
|
||||
const pendingQuoteApplies = pendingTimelineQuote
|
||||
&& pendingTimelineQuote.text === text
|
||||
&& pendingTimelineQuote.requested === payload.engine;
|
||||
if (pendingQuoteApplies) {
|
||||
payload.accepted_timeline_quote = true;
|
||||
payload.accepted_credits = pendingTimelineQuote.credits;
|
||||
payload.accepted_effective_engine = pendingTimelineQuote.effective;
|
||||
pendingTimelineQuote = null;
|
||||
} else if (clientRoute.requiresConfirmation) {
|
||||
if (!window.confirm(timelineQuoteMessage(clientRoute))) {
|
||||
els.status.textContent = 'Timeline run cancelled before any credits were charged.';
|
||||
return;
|
||||
}
|
||||
payload.accepted_timeline_quote = true;
|
||||
payload.accepted_credits = clientRoute.credits;
|
||||
payload.accepted_effective_engine = clientRoute.effective;
|
||||
}
|
||||
payload.focus = currentTimelineFocus();
|
||||
payload.confidence_filter = currentConfidenceFilter();
|
||||
payload.include_relative = currentIncludeRelative();
|
||||
payload.include_background = currentIncludeBackground();
|
||||
payload.user_notes = (document.getElementById('timelineNotes')?.value || '').trim();
|
||||
payload.use_my_case = (typeof window.dbnGetUseMyCase === 'function') ? window.dbnGetUseMyCase() : false;
|
||||
timelineRouteNotice = clientRoute.upgraded
|
||||
? `This input is ${text.length.toLocaleString()} characters, so Timeline will use ${timelineEngineLabel(clientRoute.effective)} for reliability.`
|
||||
timelineRouteNotice = clientRoute.upgraded || clientRoute.chunked
|
||||
? `This input is ${text.length.toLocaleString()} characters, so Timeline will use ${timelineEngineLabel(clientRoute.effective)}${clientRoute.chunked ? ` across about ${clientRoute.chunkCount} chunks` : ''}.`
|
||||
: '';
|
||||
}
|
||||
|
||||
@@ -1157,7 +1229,30 @@ async function runTool(event) {
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify(payload),
|
||||
});
|
||||
if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
|
||||
if (!resp.ok) {
|
||||
const errData = await resp.json().catch(() => ({}));
|
||||
const quote = errData.timeline_quote;
|
||||
if (errData.error?.code === 'timeline_quote_required' && quote) {
|
||||
const confirmQuote = {
|
||||
effective: quote.effective_engine,
|
||||
charCount: quote.input_char_count,
|
||||
credits: quote.credits || quote.estimated_credits,
|
||||
chunked: Boolean(quote.chunked_timeline),
|
||||
chunkCount: quote.timeline_chunk_count || 1,
|
||||
};
|
||||
if (window.confirm(timelineQuoteMessage(confirmQuote))) {
|
||||
pendingTimelineQuote = {
|
||||
text,
|
||||
requested: payload.engine,
|
||||
effective: confirmQuote.effective,
|
||||
credits: Number(confirmQuote.credits || 0),
|
||||
};
|
||||
return runTool(event);
|
||||
}
|
||||
throw new Error('Timeline run cancelled before any credits were charged.');
|
||||
}
|
||||
throw new Error(errData.error?.message || `HTTP ${resp.status}`);
|
||||
}
|
||||
const reader = resp.body.getReader();
|
||||
const dec = new TextDecoder();
|
||||
let buf = '', event = '';
|
||||
@@ -1194,8 +1289,8 @@ async function runTool(event) {
|
||||
renderResults(data);
|
||||
renderTrace(data.trace || []);
|
||||
const routeMeta = data.trace_metadata || {};
|
||||
const serverRouteNotice = state.activeTool === 'timeline' && routeMeta.auto_upgraded_engine
|
||||
? ` Used ${timelineEngineLabel(routeMeta.effective_engine)} for ${Number(routeMeta.input_char_count || 0).toLocaleString()} characters.`
|
||||
const serverRouteNotice = state.activeTool === 'timeline' && (routeMeta.auto_upgraded_engine || routeMeta.chunked_timeline || routeMeta.credits_charged)
|
||||
? ` Used ${timelineEngineLabel(routeMeta.effective_engine)} for ${Number(routeMeta.input_char_count || 0).toLocaleString()} characters${routeMeta.chunked_timeline ? ` across ${routeMeta.timeline_chunk_count || 1} chunks` : ''}; charged ${routeMeta.credits_charged || routeMeta.estimated_credits || 1} credit(s).`
|
||||
: '';
|
||||
els.status.textContent = `Done in ${data.latency_ms || 0} ms.${serverRouteNotice}`;
|
||||
if (['ask', 'redact', 'timeline'].includes(state.activeTool)) {
|
||||
@@ -1299,6 +1394,7 @@ async function handleFiles(fileList) {
|
||||
for (const file of files) {
|
||||
const formData = new FormData();
|
||||
formData.append('file', file);
|
||||
formData.append('tool', state.activeTool);
|
||||
|
||||
const resp = await fetch('api/extract.php', {
|
||||
method: 'POST',
|
||||
@@ -1318,7 +1414,7 @@ async function handleFiles(fileList) {
|
||||
|
||||
const combined = parts[0].text;
|
||||
|
||||
const MAX_COMBINED = 128000;
|
||||
const MAX_COMBINED = state.activeTool === 'timeline' ? 600000 : 128000;
|
||||
const combinedTruncated = combined.length > MAX_COMBINED;
|
||||
els.input.value = combinedTruncated ? combined.slice(0, MAX_COMBINED) : combined;
|
||||
|
||||
@@ -1328,7 +1424,7 @@ async function handleFiles(fileList) {
|
||||
els.uploadPrompt.classList.add('is-hidden');
|
||||
els.uploadFileInfo.classList.remove('is-hidden');
|
||||
|
||||
const truncNote = (anyTruncated || combinedTruncated) ? ' — truncated to 128 000 char limit' : '';
|
||||
const truncNote = (anyTruncated || combinedTruncated) ? ` - truncated to ${MAX_COMBINED.toLocaleString()} char limit` : '';
|
||||
els.status.textContent = `Extracted ${totalChars.toLocaleString()} chars from ${parts[0].filename}${truncNote}.`;
|
||||
} catch (err) {
|
||||
els.status.textContent = err.message;
|
||||
|
||||
+277
-3
@@ -7,6 +7,7 @@ require_once __DIR__ . '/AzureOpenAiGateway.php';
|
||||
final class DbnLegalToolsService
|
||||
{
|
||||
private const MAX_PASTE_CHARS = 128000;
|
||||
private const MAX_TIMELINE_CHARS = 600000;
|
||||
|
||||
private DbnAzureOpenAiGateway $azure;
|
||||
|
||||
@@ -353,7 +354,7 @@ PROMPT;
|
||||
string $userNotes = '',
|
||||
?callable $onProgress = null
|
||||
): array {
|
||||
$text = $this->requirePasteText($text);
|
||||
$text = $this->requirePasteText($text, self::MAX_TIMELINE_CHARS);
|
||||
$engine = in_array($engine, ['nova_lite', 'azure_mini', 'azure_full'], true) ? $engine : 'azure_mini';
|
||||
$focus = in_array($focus, ['all', 'deadlines', 'hearings', 'cps'], true) ? $focus : 'all';
|
||||
|
||||
@@ -382,6 +383,23 @@ PROMPT;
|
||||
? "\n\nUser-provided context notes (use these to resolve ambiguities, not as source events):\n---\n" . $userNotes . "\n---"
|
||||
: '';
|
||||
|
||||
$charCount = mb_strlen($text, 'UTF-8');
|
||||
$singlePassLimit = $this->timelineSinglePassLimit($engine);
|
||||
if ($charCount > $singlePassLimit) {
|
||||
return $this->timelineChunked(
|
||||
$text,
|
||||
$language,
|
||||
$engine,
|
||||
$focus,
|
||||
$confidenceFilter,
|
||||
$includeRelative,
|
||||
$includeBackground,
|
||||
$userNotes,
|
||||
$onProgress,
|
||||
$inputDateHintCount
|
||||
);
|
||||
}
|
||||
|
||||
$prompt = <<<PROMPT
|
||||
Build a chronological timeline from the pasted text in {$locale}.{$userNotesBlock}
|
||||
|
||||
@@ -589,6 +607,261 @@ PROMPT;
|
||||
];
|
||||
}
|
||||
|
||||
/**
 * Builds a timeline for input that is too long for a single pass by splitting
 * it into overlapping chunks, running timeline() on every chunk and merging
 * the per-chunk events into one result payload.
 *
 * A chunk whose extraction throws does not abort the run: it is counted in
 * `chunk_failures` and the deterministic fallback extractor is tried instead.
 * Chunks shorter than 20 characters after trimming are skipped.
 *
 * @param string        $text               Full input text.
 * @param string        $language           Passed to timeline() and the disclaimer.
 * @param string        $engine             Engine key ('nova_lite', 'azure_mini', 'azure_full').
 * @param string        $focus              Focus key ('all', 'deadlines', 'hearings', 'cps').
 * @param string        $confidenceFilter   'high_medium' drops low-confidence events after merging.
 * @param bool          $includeRelative    When false, only events with date_type 'absolute' are kept.
 * @param bool          $includeBackground  Passed through to timeline().
 * @param string        $userNotes          Passed through to timeline().
 * @param callable|null $onProgress         Optional callback receiving status strings.
 * @param int           $inputDateHintCount Reported unchanged in trace_metadata.
 * @return array Result payload with tool, events, trace, trace_metadata and related keys.
 */
private function timelineChunked(
    string $text,
    string $language,
    string $engine,
    string $focus,
    string $confidenceFilter,
    bool $includeRelative,
    bool $includeBackground,
    string $userNotes,
    ?callable $onProgress,
    int $inputDateHintCount
): array {
    $engineLabel = match ($engine) { 'azure_full' => 'gpt-4o', 'nova_lite' => 'nova-lite', default => 'gpt-4o-mini' };
    $chunkSize = $this->timelineChunkSize($engine);
    $chunks = $this->timelineTextChunks($text, $chunkSize, 900);
    $chunkCount = count($chunks);
    $events = [];
    $chunkFailures = 0;
    $usedFallbackExtractor = false;

    $onProgress && $onProgress('Splitting timeline into ' . $chunkCount . " chunk(s)\u{2026}");

    foreach ($chunks as $idx => $chunk) {
        $chunkNo = $idx + 1;
        $chunkText = trim((string)$chunk['text']);
        if (mb_strlen($chunkText, 'UTF-8') < 20) {
            continue;
        }

        $onProgress && $onProgress("Extracting timeline chunk {$chunkNo}/{$chunkCount}\u{2026}");
        $outcome = $this->timelineChunkedExtractChunk(
            $chunkText,
            $language,
            $engine,
            $focus,
            $confidenceFilter,
            $includeRelative,
            $includeBackground,
            $userNotes
        );
        if ($outcome['failed']) {
            $chunkFailures++;
        }
        if ($outcome['used_fallback']) {
            $usedFallbackExtractor = true;
        }

        // Tag every event with the chunk it came from before merging.
        foreach ($outcome['events'] as $event) {
            if (!is_array($event)) {
                continue;
            }
            $event['chunk_index'] = $chunkNo;
            $event['source_position'] = (int)$chunk['start'];
            $events[] = $event;
        }
    }

    // Chunks overlap, so the same event can be reported twice; merge first,
    // then apply the caller's filters to the merged list.
    $events = $this->mergeTimelineEvents($events);
    if ($confidenceFilter === 'high_medium') {
        $events = array_values(array_filter($events, fn($ev) => ($ev['confidence'] ?? 'low') !== 'low'));
    }
    if (!$includeRelative) {
        $events = array_values(array_filter($events, fn($ev) => ($ev['date_type'] ?? 'absolute') === 'absolute'));
    }

    $focusLabel = match ($focus) {
        'deadlines' => 'legal deadlines',
        'hearings' => 'court hearings',
        'cps' => 'CPS milestones',
        default => 'all events',
    };
    $summary = $this->timelineChunkedSummary($events, $chunkCount);

    $uncertain = [];
    if ($chunkFailures > 0) {
        $uncertain[] = "{$chunkFailures} chunk(s) needed fallback extraction or could not be fully parsed; review medium-confidence entries against the source.";
    }
    if ($usedFallbackExtractor) {
        $uncertain[] = 'Deterministic fallback extraction was used for at least one chunk.';
    }

    $trace = [
        $this->trace('Query interpretation', "Extract {$focusLabel} from pasted text. Engine: {$engineLabel}. Chunked timeline mode; without saving the text or output.", 'complete'),
        $this->trace('Search tools used', 'No external corpus search; source is the user-pasted text split into local chunks.', 'complete'),
        $this->trace('Evidence found', count($events) . " event(s) identified across {$chunkCount} chunk(s).", count($events) ? 'complete' : 'warning'),
        $this->trace('Citation confidence', 'Confidence is per event and based only on the pasted text. Overlapping chunks were de-duplicated.', 'complete'),
        $this->trace('Uncertainty / missing evidence', $this->uncertaintySummary($uncertain), $uncertain ? 'warning' : 'complete'),
        $this->trace('Next practical step', 'Review the merged timeline against the original uploaded document, especially duplicated or medium-confidence entries.', 'complete'),
    ];

    return [
        'tool' => 'timeline',
        'language' => $language,
        'what_we_found' => $summary,
        'events' => $events,
        'evidence_trail' => [['title' => 'Pasted text', 'excerpt' => 'Processed in-memory only; split into local timeline chunks; not stored.']],
        'what_remains_uncertain' => $uncertain,
        'next_practical_step' => 'Review the merged timeline against the original uploaded document, especially duplicated or medium-confidence entries.',
        'trace' => $trace,
        'trace_metadata' => [
            'chunk_count' => count($events),
            'source_count' => $chunkCount,
            'deployment' => $engineLabel,
            'input_date_hint_count' => $inputDateHintCount,
            'used_fallback_extractor' => $usedFallbackExtractor,
            'chunked_timeline' => true,
            'timeline_chunk_count' => $chunkCount,
            'chunk_failures' => $chunkFailures,
        ],
        'disclaimer' => dbnToolsDisclaimer($language),
    ];
}

/**
 * Extracts the events of a single chunk, falling back to the deterministic
 * extractor when timeline() throws.
 *
 * @return array{events: mixed, failed: bool, used_fallback: bool}
 */
private function timelineChunkedExtractChunk(
    string $chunkText,
    string $language,
    string $engine,
    string $focus,
    string $confidenceFilter,
    bool $includeRelative,
    bool $includeBackground,
    string $userNotes
): array {
    try {
        $result = $this->timeline(
            $chunkText,
            $language,
            $engine,
            $focus,
            $confidenceFilter,
            $includeRelative,
            $includeBackground,
            $userNotes,
            null
        );

        return [
            'events' => is_array($result['events'] ?? null) ? $result['events'] : [],
            'failed' => false,
            'used_fallback' => !empty($result['trace_metadata']['used_fallback_extractor']),
        ];
    } catch (DbnToolsHttpException $e) {
        // Only attempt the fallback when the chunk contains date hints.
        $fallbackEvents = [];
        if ($this->timelineDateHintCount($chunkText) > 0) {
            $fallbackEvents = $this->fallbackTimelineEvents($chunkText);
        }
        if (!$fallbackEvents && $e->status >= 500) {
            error_log('timeline chunk failed: ' . $e->errorCode . ' ' . $e->getMessage());
        }

        return ['events' => $fallbackEvents, 'failed' => true, 'used_fallback' => (bool)$fallbackEvents];
    } catch (Throwable $e) {
        $fallbackEvents = $this->fallbackTimelineEvents($chunkText);
        error_log('timeline chunk throwable: ' . $e->getMessage());

        return ['events' => $fallbackEvents, 'failed' => true, 'used_fallback' => (bool)$fallbackEvents];
    }
}

/**
 * Builds the one-paragraph summary: event count, chunk count, ISO date range
 * (when any event has a YYYY-MM-DD date) and up to eight distinct actors.
 */
private function timelineChunkedSummary(array $events, int $chunkCount): string
{
    $isoDates = array_values(array_filter(array_map(fn($ev) => (string)($ev['date'] ?? ''), $events), fn($d) => preg_match('/^\d{4}-\d{2}-\d{2}$/', $d)));
    sort($isoDates);
    $range = $isoDates ? (' from ' . $isoDates[0] . ' to ' . $isoDates[count($isoDates) - 1]) : '';
    $actors = array_values(array_unique(array_filter(array_map(fn($ev) => (string)($ev['actor'] ?? ''), $events), fn($a) => $a !== '' && $a !== 'unknown')));

    $summary = count($events) . " event(s) extracted from {$chunkCount} chunk(s){$range}.";
    if ($actors) {
        $summary .= ' Main actors: ' . implode(', ', array_slice($actors, 0, 8)) . '.';
    }

    return $summary;
}
|
||||
|
||||
/**
 * Returns the largest input size, in characters, handled in a single pass for
 * the given engine.
 *
 * @param string $engine Engine key; anything other than 'nova_lite' or
 *                       'azure_mini' gets the 128000 default.
 */
private function timelineSinglePassLimit(string $engine): int
{
    if ($engine === 'nova_lite') {
        return 25000;
    }
    if ($engine === 'azure_mini') {
        return 55000;
    }

    return 128000;
}
|
||||
|
||||
/**
 * Returns the target chunk length, in characters, used when an input is split
 * for the given engine.
 *
 * @param string $engine Engine key; anything other than 'nova_lite' or
 *                       'azure_mini' gets the 30000 default.
 */
private function timelineChunkSize(string $engine): int
{
    if ($engine === 'nova_lite') {
        return 10000;
    }
    if ($engine === 'azure_mini') {
        return 16000;
    }

    return 30000;
}
|
||||
|
||||
/**
 * Splits text into overlapping chunks, preferring to cut at line breaks.
 *
 * Each window is at most $chunkSize characters. Unless the window reaches the
 * end of the text, the cut is moved back to the last blank line ("\n\n"); if
 * there is none, or it lies before 55% of $chunkSize, the last single "\n" is
 * used instead. A break is only accepted when it lies beyond 45% of
 * $chunkSize; otherwise the window is cut at its full length. The next chunk
 * starts $overlap characters before the previous cut, or at the cut itself
 * when that would not move forward.
 *
 * @param string $text      Text to split (lengths and offsets are in UTF-8 characters).
 * @param int    $chunkSize Target chunk length; values below 1 are treated as 1.
 * @param int    $overlap   Characters shared between consecutive chunks; negative values are treated as 0.
 * @return array<int, array{start: int, text: string}> Non-empty trimmed chunks;
 *         'start' is the offset at which the untrimmed chunk began.
 */
private function timelineTextChunks(string $text, int $chunkSize, int $overlap): array
{
    // A non-positive chunk size would never advance $start (endless loop), and
    // a negative overlap would skip text between chunks, so clamp both.
    $chunkSize = max(1, $chunkSize);
    $overlap = max(0, $overlap);

    $len = mb_strlen($text, 'UTF-8');
    $chunks = [];
    $start = 0;
    while ($start < $len) {
        $targetEnd = min($len, $start + $chunkSize);
        $window = mb_substr($text, $start, $targetEnd - $start, 'UTF-8');
        $end = $targetEnd;
        if ($targetEnd < $len) {
            $breakAt = mb_strrpos($window, "\n\n", 0, 'UTF-8');
            if ($breakAt === false || $breakAt < (int)($chunkSize * 0.55)) {
                $breakAt = mb_strrpos($window, "\n", 0, 'UTF-8');
            }
            if ($breakAt !== false && $breakAt > (int)($chunkSize * 0.45)) {
                $end = $start + $breakAt;
            }
        }

        $chunkText = trim(mb_substr($text, $start, max(1, $end - $start), 'UTF-8'));
        if ($chunkText !== '') {
            $chunks[] = ['start' => $start, 'text' => $chunkText];
        }
        if ($end >= $len) {
            break;
        }

        // Step back by the overlap, but always make forward progress.
        $nextStart = max(0, $end - $overlap);
        if ($nextStart <= $start) {
            $nextStart = $end;
        }
        $start = $nextStart;
    }

    return $chunks;
}
|
||||
|
||||
/**
 * De-duplicates events by signature and returns them in chronological order.
 *
 * When two events share a signature, the one with the higher confidence rank
 * is kept (the earlier one wins ties). The other event's source excerpt is
 * appended after " / " when both excerpts are non-empty and different and the
 * kept excerpt is shorter than 260 characters.
 *
 * Events with a YYYY-MM-DD date sort by that date; all others sort last. Ties
 * are broken by the 'time' string.
 *
 * @param array $events Event arrays; non-array entries are ignored.
 * @return array Merged, re-indexed and sorted events.
 */
private function mergeTimelineEvents(array $events): array
{
    $bySignature = [];
    foreach ($events as $candidate) {
        if (!is_array($candidate)) {
            continue;
        }

        $signature = $this->timelineEventSignature($candidate);
        if (!isset($bySignature[$signature])) {
            $bySignature[$signature] = $candidate;
            continue;
        }

        // Decide which duplicate survives and which only donates its excerpt.
        $kept = $bySignature[$signature];
        $donor = $candidate;
        $candidateRank = $this->timelineConfidenceRank((string)($candidate['confidence'] ?? 'medium'));
        $keptRank = $this->timelineConfidenceRank((string)($kept['confidence'] ?? 'medium'));
        if ($candidateRank > $keptRank) {
            $donor = $kept;
            $kept = $candidate;
        }

        $keptExcerpt = (string)($kept['source_excerpt'] ?? '');
        $donorExcerpt = (string)($donor['source_excerpt'] ?? '');
        $canAppend = $donorExcerpt !== ''
            && $keptExcerpt !== ''
            && $donorExcerpt !== $keptExcerpt
            && mb_strlen($keptExcerpt, 'UTF-8') < 260;
        if ($canAppend) {
            $kept['source_excerpt'] = $keptExcerpt . ' / ' . $donorExcerpt;
        }

        $bySignature[$signature] = $kept;
    }

    // Non-ISO dates get a sentinel that sorts after every real date.
    $dateKey = static function (array $ev): string {
        $date = (string)($ev['date'] ?? '');

        return preg_match('/^\d{4}-\d{2}-\d{2}$/', $date) ? $date : '9999-99-99';
    };

    $sorted = array_values($bySignature);
    usort($sorted, static function (array $a, array $b) use ($dateKey): int {
        $byDate = strcmp($dateKey($a), $dateKey($b));
        if ($byDate !== 0) {
            return $byDate;
        }

        return strcmp((string)($a['time'] ?? ''), (string)($b['time'] ?? ''));
    });

    return $sorted;
}
|
||||
|
||||
/**
 * Builds the de-duplication key for an event.
 *
 * The key is "date|time|actor|body": each part is trimmed and lower-cased, a
 * missing actor becomes "unknown", and the event text has every run of
 * non-letter, non-digit characters collapsed to one space before being cut to
 * its first 96 characters.
 *
 * @param array $event Event with optional 'date', 'time', 'actor', 'event' keys.
 */
private function timelineEventSignature(array $event): string
{
    $normalize = static fn($value): string => mb_strtolower(trim((string)$value), 'UTF-8');

    $datePart = $normalize($event['date'] ?? '');
    $timePart = $normalize($event['time'] ?? '');
    $actorPart = $normalize($event['actor'] ?? 'unknown');

    $words = (string)preg_replace('/[^\p{L}\p{N}]+/u', ' ', $normalize($event['event'] ?? ''));
    $words = trim((string)preg_replace('/\s+/u', ' ', $words));
    $bodyPart = mb_substr($words, 0, 96, 'UTF-8');

    return $datePart . '|' . $timePart . '|' . $actorPart . '|' . $bodyPart;
}
|
||||
|
||||
/**
 * Converts a confidence label into a comparable rank.
 *
 * @param string $confidence 'high' ranks 3 and 'medium' ranks 2; every other
 *                           value ranks 1.
 */
private function timelineConfidenceRank(string $confidence): int
{
    if ($confidence === 'high') {
        return 3;
    }

    return $confidence === 'medium' ? 2 : 1;
}
|
||||
|
||||
private function timelineDateHintCount(string $text): int
|
||||
{
|
||||
preg_match_all('/(?<!\d)\d{1,2}\.\d{1,2}\.(?:\d{2,4})?(?!\d)/u', $text, $numeric);
|
||||
@@ -1106,13 +1379,14 @@ PROMPT;
|
||||
return array_slice(array_values(array_unique($terms)), 0, 6);
|
||||
}
|
||||
|
||||
private function requirePasteText(string $text): string
|
||||
private function requirePasteText(string $text, ?int $maxChars = null): string
|
||||
{
|
||||
$text = trim($text);
|
||||
if (mb_strlen($text, 'UTF-8') < 20) {
|
||||
dbnToolsAbort('Paste at least 20 characters of text.', 422, 'text_too_short');
|
||||
}
|
||||
if (mb_strlen($text, 'UTF-8') > self::MAX_PASTE_CHARS) {
|
||||
$maxChars ??= self::MAX_PASTE_CHARS;
|
||||
if (mb_strlen($text, 'UTF-8') > $maxChars) {
|
||||
dbnToolsAbort('Pasted text is too long for the MVP limit.', 422, 'text_too_long');
|
||||
}
|
||||
return $text;
|
||||
|
||||
+108
-8
@@ -1,6 +1,7 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
require_once __DIR__ . '/bootstrap.php';
|
||||
require_once __DIR__ . '/FreeTier.php';
|
||||
|
||||
/**
|
||||
@@ -14,6 +15,9 @@ final class ToolModels
|
||||
public const TIMELINE_QUICK_CHAR_LIMIT = 25000;
|
||||
public const TIMELINE_STANDARD_CHAR_LIMIT = 55000;
|
||||
public const TIMELINE_DEEP_CHAR_LIMIT = 128000;
|
||||
public const TIMELINE_QUICK_MAX_CHARS = 100000;
|
||||
public const TIMELINE_STANDARD_MAX_CHARS = 300000;
|
||||
public const TIMELINE_DEEP_MAX_CHARS = 600000;
|
||||
|
||||
public static function engineForUser(int $userId, string $requestedEngine): string
|
||||
{
|
||||
@@ -38,22 +42,32 @@ final class ToolModels
|
||||
$tierEngine = self::engineForUser($userId, $requestedEngine);
|
||||
$charCount = mb_strlen($text, 'UTF-8');
|
||||
|
||||
if ($charCount > self::TIMELINE_DEEP_CHAR_LIMIT) {
|
||||
if ($charCount > self::TIMELINE_DEEP_MAX_CHARS) {
|
||||
throw new DbnToolsHttpException(
|
||||
'This timeline input is too large after selected documents or My Case context were added. Split the file or use fewer selected documents.',
|
||||
413,
|
||||
'timeline_input_too_large',
|
||||
['input_char_count' => $charCount, 'max_chars' => self::TIMELINE_DEEP_CHAR_LIMIT]
|
||||
['input_char_count' => $charCount, 'max_chars' => self::TIMELINE_DEEP_MAX_CHARS]
|
||||
);
|
||||
}
|
||||
|
||||
$effectiveEngine = $tierEngine;
|
||||
if ($charCount > self::TIMELINE_STANDARD_CHAR_LIMIT) {
|
||||
$effectiveEngine = 'azure_full';
|
||||
} elseif ($charCount > self::TIMELINE_QUICK_CHAR_LIMIT && $effectiveEngine === 'nova_lite') {
|
||||
$effectiveEngine = 'azure_mini';
|
||||
if ($charCount > self::timelineEngineMaxChars($effectiveEngine)) {
|
||||
$effectiveEngine = $charCount <= self::TIMELINE_STANDARD_MAX_CHARS ? 'azure_mini' : 'azure_full';
|
||||
} elseif ($charCount > self::TIMELINE_STANDARD_CHAR_LIMIT && $effectiveEngine === 'nova_lite') {
|
||||
$effectiveEngine = $charCount <= self::TIMELINE_QUICK_MAX_CHARS ? 'nova_lite' : 'azure_mini';
|
||||
}
|
||||
|
||||
if ($charCount > self::timelineEngineMaxChars($effectiveEngine)) {
|
||||
$effectiveEngine = 'azure_full';
|
||||
}
|
||||
|
||||
$credits = self::timelineCreditsForSize($effectiveEngine, $charCount);
|
||||
$baseCredits = self::timelineAdvertisedCredits($requestedEngine);
|
||||
$requiresConfirmation = $credits > $baseCredits
|
||||
|| self::timelineEngineRank($effectiveEngine) > self::timelineEngineRank($requestedEngine);
|
||||
$chunked = $charCount > self::timelineEngineLimit($effectiveEngine);
|
||||
|
||||
return [
|
||||
'requested_engine' => $requestedEngine,
|
||||
'tier_engine' => $tierEngine,
|
||||
@@ -61,13 +75,48 @@ final class ToolModels
|
||||
'auto_upgraded_engine' => $effectiveEngine !== $tierEngine,
|
||||
'input_char_count' => $charCount,
|
||||
'engine_limit_chars' => self::timelineEngineLimit($effectiveEngine),
|
||||
'credits' => self::timelineCredits($effectiveEngine),
|
||||
'max_char_limit' => self::timelineEngineMaxChars($effectiveEngine),
|
||||
'chunked_timeline' => $chunked,
|
||||
'timeline_chunk_count' => $chunked ? (int)ceil($charCount / self::timelineChunkSize($effectiveEngine)) : 1,
|
||||
'estimated_credits' => $credits,
|
||||
'credits' => $credits,
|
||||
'base_credits' => $baseCredits,
|
||||
'requires_confirmation' => $requiresConfirmation,
|
||||
];
|
||||
}
|
||||
|
||||
/**
 * Ensures the client has accepted the quote for a route that needs confirmation.
 *
 * Returns silently when the route does not require confirmation, or when the
 * request carries `accepted_timeline_quote` together with `accepted_credits`
 * and `accepted_effective_engine` values matching the route.
 *
 * @param array $route Route data; reads requires_confirmation, credits,
 *                     effective_engine, input_char_count and timeline_chunk_count.
 * @param array $input Decoded request payload.
 * @throws DbnToolsHttpException 409 'timeline_quote_required', carrying the
 *         route plus a label and message under 'timeline_quote'.
 */
public static function assertTimelineQuoteAccepted(array $route, array $input): void
{
    if (empty($route['requires_confirmation'])) {
        return;
    }

    $quotedCredits = (int)$route['credits'];
    $quotedEngine = (string)$route['effective_engine'];

    // The acceptance must match the quote exactly, not merely be present.
    $flagged = !empty($input['accepted_timeline_quote']);
    $creditsMatch = (int)($input['accepted_credits'] ?? 0) === $quotedCredits;
    $engineMatches = (string)($input['accepted_effective_engine'] ?? '') === $quotedEngine;
    if ($flagged && $creditsMatch && $engineMatches) {
        return;
    }

    $engineLabel = self::timelineEngineLabel($quotedEngine);
    $quoteMessage = sprintf(
        'Timeline will use %s for %s characters across about %d chunk(s), costing %d credit(s).',
        $engineLabel,
        number_format((int)$route['input_char_count']),
        (int)$route['timeline_chunk_count'],
        $quotedCredits
    );
    $quote = array_merge($route, [
        'effective_engine_label' => $engineLabel,
        'message' => $quoteMessage,
    ]);

    throw new DbnToolsHttpException(
        'This timeline is larger than the selected engine can handle at the advertised price. Confirm the quoted engine and credits before running.',
        409,
        'timeline_quote_required',
        ['timeline_quote' => $quote]
    );
}
|
||||
|
||||
/**
 * Returns the advertised credit cost of a timeline run on the given engine.
 *
 * Kept as a thin alias of timelineAdvertisedCredits(). The previous inline
 * `return` was left in front of the delegation and made it unreachable; it is
 * removed so the value is defined in one place only.
 *
 * @param string $engine Engine key.
 */
public static function timelineCredits(string $engine): int
{
    return self::timelineAdvertisedCredits($engine);
}
|
||||
|
||||
public static function timelineEngineLimit(string $engine): int
|
||||
@@ -78,4 +127,55 @@ final class ToolModels
|
||||
default => self::TIMELINE_DEEP_CHAR_LIMIT,
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Returns the chunk length, in characters, assumed for the given engine when
 * estimating how many chunks an input needs.
 *
 * @param string $engine Engine key; anything other than 'nova_lite' or
 *                       'azure_mini' gets the 30000 default.
 */
public static function timelineChunkSize(string $engine): int
{
    if ($engine === 'nova_lite') {
        return 10000;
    }
    if ($engine === 'azure_mini') {
        return 16000;
    }

    return 30000;
}
|
||||
|
||||
/**
 * Returns the maximum input size, in characters, accepted for the given engine.
 *
 * @param string $engine Engine key; anything other than 'nova_lite' or
 *                       'azure_mini' gets TIMELINE_DEEP_MAX_CHARS.
 */
public static function timelineEngineMaxChars(string $engine): int
{
    if ($engine === 'nova_lite') {
        return self::TIMELINE_QUICK_MAX_CHARS;
    }
    if ($engine === 'azure_mini') {
        return self::TIMELINE_STANDARD_MAX_CHARS;
    }

    return self::TIMELINE_DEEP_MAX_CHARS;
}
|
||||
|
||||
/**
 * Returns the credit cost of a timeline run for an engine and input size.
 *
 * - 'nova_lite': 1 up to TIMELINE_QUICK_CHAR_LIMIT, otherwise 2.
 * - 'azure_mini': 1 up to TIMELINE_STANDARD_CHAR_LIMIT, 2 up to 180000, otherwise 3.
 * - any other engine: 2 up to TIMELINE_DEEP_CHAR_LIMIT, 4 up to 350000, otherwise 6.
 *
 * @param string $engine    Engine key.
 * @param int    $charCount Input length in characters.
 */
public static function timelineCreditsForSize(string $engine, int $charCount): int
{
    if ($engine === 'nova_lite') {
        return $charCount <= self::TIMELINE_QUICK_CHAR_LIMIT ? 1 : 2;
    }

    if ($engine === 'azure_mini') {
        if ($charCount <= self::TIMELINE_STANDARD_CHAR_LIMIT) {
            return 1;
        }

        return $charCount <= 180000 ? 2 : 3;
    }

    if ($charCount <= self::TIMELINE_DEEP_CHAR_LIMIT) {
        return 2;
    }

    return $charCount <= 350000 ? 4 : 6;
}
|
||||
|
||||
/**
 * Returns the base credit price of an engine: 2 for 'azure_full', 1 for any
 * other engine key.
 *
 * @param string $engine Engine key.
 */
public static function timelineAdvertisedCredits(string $engine): int
{
    if ($engine === 'azure_full') {
        return 2;
    }

    return 1;
}
|
||||
|
||||
/**
 * Returns the display label for an engine key: 'Quick' for 'nova_lite',
 * 'Deep' for 'azure_full', and 'Standard' for everything else.
 *
 * @param string $engine Engine key.
 */
public static function timelineEngineLabel(string $engine): string
{
    if ($engine === 'nova_lite') {
        return 'Quick';
    }
    if ($engine === 'azure_full') {
        return 'Deep';
    }

    return 'Standard';
}
|
||||
|
||||
/**
 * Returns the ordering rank of an engine: 1 for 'nova_lite', 2 for
 * 'azure_mini', 3 for 'azure_full', and 0 for an unrecognised key.
 *
 * @param string $engine Engine key.
 */
private static function timelineEngineRank(string $engine): int
{
    $ranks = [
        'nova_lite' => 1,
        'azure_mini' => 2,
        'azure_full' => 3,
    ];

    return $ranks[$engine] ?? 0;
}
|
||||
}
|
||||
|
||||
@@ -890,9 +890,10 @@ function dbnToolsExcerpt(string $text, int $limit = 520): string
|
||||
|
||||
const DBN_TOOLS_EXTRACT_MAX_BYTES = 8 * 1024 * 1024;
|
||||
const DBN_TOOLS_EXTRACT_TEXT_LIMIT = 128000;
|
||||
const DBN_TOOLS_TIMELINE_EXTRACT_TEXT_LIMIT = 600000;
|
||||
const DBN_TOOLS_EXTRACT_ALLOWED_EXTS = ['txt', 'pdf', 'docx'];
|
||||
|
||||
function dbnToolsExtractUploadedFile(array $file): array
|
||||
function dbnToolsExtractUploadedFile(array $file, int $textLimit = DBN_TOOLS_EXTRACT_TEXT_LIMIT): array
|
||||
{
|
||||
$errCode = (int)($file['error'] ?? UPLOAD_ERR_NO_FILE);
|
||||
if ($errCode !== UPLOAD_ERR_OK) {
|
||||
@@ -936,8 +937,9 @@ function dbnToolsExtractUploadedFile(array $file): array
|
||||
}
|
||||
|
||||
$truncated = false;
|
||||
if (mb_strlen($text, 'UTF-8') > DBN_TOOLS_EXTRACT_TEXT_LIMIT) {
|
||||
$text = mb_substr($text, 0, DBN_TOOLS_EXTRACT_TEXT_LIMIT, 'UTF-8');
|
||||
$textLimit = max(1000, min($textLimit, DBN_TOOLS_TIMELINE_EXTRACT_TEXT_LIMIT));
|
||||
if (mb_strlen($text, 'UTF-8') > $textLimit) {
|
||||
$text = mb_substr($text, 0, $textLimit, 'UTF-8');
|
||||
$truncated = true;
|
||||
}
|
||||
|
||||
@@ -947,6 +949,7 @@ function dbnToolsExtractUploadedFile(array $file): array
|
||||
'filename' => $originalName,
|
||||
'chars' => mb_strlen($text, 'UTF-8'),
|
||||
'truncated' => $truncated,
|
||||
'limit' => $textLimit,
|
||||
];
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user