mirror of
https://github.com/exo-explore/exo.git
synced 2026-02-19 15:27:02 -05:00
Compare commits
2 Commits
feat/mac-s
...
meta-insta
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
21c363e997 | ||
|
|
b1c0e3116d |
@@ -12,7 +12,6 @@
|
||||
} from "$lib/stores/app.svelte";
|
||||
import type { MessageAttachment } from "$lib/stores/app.svelte";
|
||||
import MarkdownContent from "./MarkdownContent.svelte";
|
||||
import PrefillProgressBar from "./PrefillProgressBar.svelte";
|
||||
import TokenHeatmap from "./TokenHeatmap.svelte";
|
||||
import PrefillProgressBar from "./PrefillProgressBar.svelte";
|
||||
import ImageLightbox from "./ImageLightbox.svelte";
|
||||
@@ -626,9 +625,7 @@
|
||||
<MarkdownContent
|
||||
content={message.content || (loading ? response : "")}
|
||||
/>
|
||||
{#if loading && !message.content && prefill}
|
||||
<PrefillProgressBar progress={prefill} class="mt-2" />
|
||||
{:else if loading && !message.content}
|
||||
{#if loading && !message.content}
|
||||
<span
|
||||
class="inline-block w-2 h-4 bg-exo-yellow/70 ml-1 cursor-blink"
|
||||
></span>
|
||||
|
||||
@@ -14,21 +14,6 @@
|
||||
: 0,
|
||||
);
|
||||
|
||||
const etaText = $derived.by(() => {
|
||||
if (progress.processed <= 0 || progress.total <= 0) return null;
|
||||
const elapsedMs = performance.now() - progress.startedAt;
|
||||
if (elapsedMs < 200) return null; // need a minimum sample window
|
||||
const tokensPerMs = progress.processed / elapsedMs;
|
||||
const remainingTokens = progress.total - progress.processed;
|
||||
const remainingMs = remainingTokens / tokensPerMs;
|
||||
const remainingSec = Math.ceil(remainingMs / 1000);
|
||||
if (remainingSec <= 0) return null;
|
||||
if (remainingSec < 60) return `~${remainingSec}s remaining`;
|
||||
const mins = Math.floor(remainingSec / 60);
|
||||
const secs = remainingSec % 60;
|
||||
return `~${mins}m ${secs}s remaining`;
|
||||
});
|
||||
|
||||
function formatTokenCount(count: number | undefined): string {
|
||||
if (count == null) return "0";
|
||||
if (count >= 1000) {
|
||||
@@ -55,11 +40,8 @@
|
||||
style="width: {percentage}%"
|
||||
></div>
|
||||
</div>
|
||||
<div
|
||||
class="flex items-center justify-between text-xs text-exo-light-gray/70 mt-0.5 font-mono"
|
||||
>
|
||||
<span>{etaText ?? ""}</span>
|
||||
<span>{percentage}%</span>
|
||||
<div class="text-right text-xs text-exo-light-gray/70 mt-0.5 font-mono">
|
||||
{percentage}%
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
@@ -276,8 +276,6 @@ export interface TokenData {
|
||||
export interface PrefillProgress {
|
||||
processed: number;
|
||||
total: number;
|
||||
/** Timestamp (performance.now()) when prefill started. */
|
||||
startedAt: number;
|
||||
}
|
||||
|
||||
export interface Message {
|
||||
@@ -522,12 +520,12 @@ class AppStore {
|
||||
messages = $state<Message[]>([]);
|
||||
currentResponse = $state("");
|
||||
isLoading = $state(false);
|
||||
prefillProgress = $state<PrefillProgress | null>(null);
|
||||
|
||||
// Performance metrics
|
||||
ttftMs = $state<number | null>(null); // Time to first token in ms
|
||||
tps = $state<number | null>(null); // Tokens per second
|
||||
totalTokens = $state<number>(0); // Total tokens in current response
|
||||
prefillProgress = $state<PrefillProgress | null>(null);
|
||||
|
||||
// Abort controller for stopping generation
|
||||
private currentAbortController: AbortController | null = null;
|
||||
@@ -2020,7 +2018,6 @@ class AppStore {
|
||||
): Promise<void> {
|
||||
const decoder = new TextDecoder();
|
||||
let buffer = "";
|
||||
let currentEventType = "";
|
||||
|
||||
while (true) {
|
||||
const { done, value } = await reader.read();
|
||||
@@ -2036,15 +2033,7 @@ class AppStore {
|
||||
|
||||
for (const line of lines) {
|
||||
const trimmed = line.trim();
|
||||
if (!trimmed) {
|
||||
currentEventType = "";
|
||||
continue;
|
||||
}
|
||||
|
||||
if (trimmed.startsWith("event: ")) {
|
||||
currentEventType = trimmed.slice(7);
|
||||
continue;
|
||||
}
|
||||
if (!trimmed) continue;
|
||||
|
||||
// Handle SSE comments (": key json") for prefill progress etc.
|
||||
if (trimmed.startsWith(": ") && onEvent) {
|
||||
@@ -2066,22 +2055,14 @@ class AppStore {
|
||||
|
||||
if (trimmed.startsWith("data: ")) {
|
||||
const data = trimmed.slice(6);
|
||||
if (data === "[DONE]") {
|
||||
currentEventType = "";
|
||||
continue;
|
||||
}
|
||||
if (data === "[DONE]") continue;
|
||||
|
||||
try {
|
||||
const parsed = JSON.parse(data);
|
||||
if (currentEventType && onEvent?.[currentEventType]) {
|
||||
onEvent[currentEventType](parsed);
|
||||
} else {
|
||||
onChunk(parsed as T);
|
||||
}
|
||||
const parsed = JSON.parse(data) as T;
|
||||
onChunk(parsed);
|
||||
} catch {
|
||||
// Skip malformed JSON
|
||||
}
|
||||
currentEventType = "";
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2182,7 +2163,6 @@ class AppStore {
|
||||
|
||||
this.isLoading = true;
|
||||
this.currentResponse = "";
|
||||
this.prefillProgress = null;
|
||||
this.ttftMs = null;
|
||||
this.tps = null;
|
||||
this.totalTokens = 0;
|
||||
@@ -2387,11 +2367,6 @@ class AppStore {
|
||||
}
|
||||
|
||||
if (tokenContent) {
|
||||
// Clear prefill progress once tokens start arriving
|
||||
if (this.prefillProgress !== null) {
|
||||
this.prefillProgress = null;
|
||||
}
|
||||
|
||||
// Track first token for TTFT
|
||||
if (firstTokenTime === null) {
|
||||
firstTokenTime = performance.now();
|
||||
@@ -2445,7 +2420,6 @@ class AppStore {
|
||||
this.prefillProgress = {
|
||||
processed: inner.processed_tokens,
|
||||
total: inner.total_tokens,
|
||||
startedAt: this.prefillProgress?.startedAt ?? performance.now(),
|
||||
};
|
||||
},
|
||||
},
|
||||
@@ -2500,7 +2474,6 @@ class AppStore {
|
||||
this.prefillProgress = null;
|
||||
this.isLoading = false;
|
||||
this.currentResponse = "";
|
||||
this.prefillProgress = null;
|
||||
this.saveConversationsToStorage();
|
||||
}
|
||||
}
|
||||
@@ -3133,10 +3106,10 @@ export const hasStartedChat = () => appStore.hasStartedChat;
|
||||
export const messages = () => appStore.messages;
|
||||
export const currentResponse = () => appStore.currentResponse;
|
||||
export const isLoading = () => appStore.isLoading;
|
||||
export const prefillProgress = () => appStore.prefillProgress;
|
||||
export const ttftMs = () => appStore.ttftMs;
|
||||
export const tps = () => appStore.tps;
|
||||
export const totalTokens = () => appStore.totalTokens;
|
||||
export const prefillProgress = () => appStore.prefillProgress;
|
||||
export const topologyData = () => appStore.topologyData;
|
||||
export const instances = () => appStore.instances;
|
||||
export const runners = () => appStore.runners;
|
||||
|
||||
@@ -338,7 +338,17 @@ class DownloadCoordinator:
|
||||
),
|
||||
)
|
||||
elif progress.status in ["in_progress", "not_started"]:
|
||||
if progress.downloaded_bytes_this_session.in_bytes == 0:
|
||||
if (
|
||||
progress.downloaded_bytes.in_bytes
|
||||
>= progress.total_bytes.in_bytes
|
||||
> 0
|
||||
):
|
||||
status = DownloadCompleted(
|
||||
node_id=self.node_id,
|
||||
shard_metadata=progress.shard,
|
||||
total_bytes=progress.total_bytes,
|
||||
)
|
||||
elif progress.downloaded_bytes_this_session.in_bytes == 0:
|
||||
status = DownloadPending(
|
||||
node_id=self.node_id,
|
||||
shard_metadata=progress.shard,
|
||||
|
||||
@@ -258,7 +258,7 @@ def main():
|
||||
target = min(max(soft, 65535), hard)
|
||||
resource.setrlimit(resource.RLIMIT_NOFILE, (target, hard))
|
||||
|
||||
mp.set_start_method("spawn")
|
||||
mp.set_start_method("spawn", force=True)
|
||||
# TODO: Refactor the current verbosity system
|
||||
logger_setup(EXO_LOG, args.verbosity)
|
||||
logger.info("Starting EXO")
|
||||
|
||||
@@ -31,7 +31,6 @@ from exo.shared.types.openai_responses import (
|
||||
ResponseOutputText,
|
||||
ResponsesRequest,
|
||||
ResponsesResponse,
|
||||
ResponsesStreamEvent,
|
||||
ResponseTextDeltaEvent,
|
||||
ResponseTextDoneEvent,
|
||||
ResponseUsage,
|
||||
@@ -39,11 +38,6 @@ from exo.shared.types.openai_responses import (
|
||||
from exo.shared.types.text_generation import InputMessage, TextGenerationTaskParams
|
||||
|
||||
|
||||
def _format_sse(event: ResponsesStreamEvent) -> str:
|
||||
"""Format a streaming event as an SSE message."""
|
||||
return f"event: {event.type}\ndata: {event.model_dump_json()}\n\n"
|
||||
|
||||
|
||||
def _extract_content(content: str | list[ResponseContentPart]) -> str:
|
||||
"""Extract plain text from a content field that may be a string or list of parts."""
|
||||
if isinstance(content, str):
|
||||
@@ -225,13 +219,13 @@ async def generate_responses_stream(
|
||||
created_event = ResponseCreatedEvent(
|
||||
sequence_number=next(seq), response=initial_response
|
||||
)
|
||||
yield _format_sse(created_event)
|
||||
yield f"event: response.created\ndata: {created_event.model_dump_json()}\n\n"
|
||||
|
||||
# response.in_progress
|
||||
in_progress_event = ResponseInProgressEvent(
|
||||
sequence_number=next(seq), response=initial_response
|
||||
)
|
||||
yield _format_sse(in_progress_event)
|
||||
yield f"event: response.in_progress\ndata: {in_progress_event.model_dump_json()}\n\n"
|
||||
|
||||
# response.output_item.added
|
||||
initial_item = ResponseMessageItem(
|
||||
@@ -242,7 +236,7 @@ async def generate_responses_stream(
|
||||
item_added = ResponseOutputItemAddedEvent(
|
||||
sequence_number=next(seq), output_index=0, item=initial_item
|
||||
)
|
||||
yield _format_sse(item_added)
|
||||
yield f"event: response.output_item.added\ndata: {item_added.model_dump_json()}\n\n"
|
||||
|
||||
# response.content_part.added
|
||||
initial_part = ResponseOutputText(text="")
|
||||
@@ -253,7 +247,7 @@ async def generate_responses_stream(
|
||||
content_index=0,
|
||||
part=initial_part,
|
||||
)
|
||||
yield _format_sse(part_added)
|
||||
yield f"event: response.content_part.added\ndata: {part_added.model_dump_json()}\n\n"
|
||||
|
||||
accumulated_text = ""
|
||||
function_call_items: list[ResponseFunctionCallItem] = []
|
||||
@@ -287,7 +281,7 @@ async def generate_responses_stream(
|
||||
output_index=next_output_index,
|
||||
item=fc_item,
|
||||
)
|
||||
yield _format_sse(fc_added)
|
||||
yield f"event: response.output_item.added\ndata: {fc_added.model_dump_json()}\n\n"
|
||||
|
||||
# response.function_call_arguments.delta
|
||||
args_delta = ResponseFunctionCallArgumentsDeltaEvent(
|
||||
@@ -296,7 +290,7 @@ async def generate_responses_stream(
|
||||
output_index=next_output_index,
|
||||
delta=tool.arguments,
|
||||
)
|
||||
yield _format_sse(args_delta)
|
||||
yield f"event: response.function_call_arguments.delta\ndata: {args_delta.model_dump_json()}\n\n"
|
||||
|
||||
# response.function_call_arguments.done
|
||||
args_done = ResponseFunctionCallArgumentsDoneEvent(
|
||||
@@ -306,7 +300,7 @@ async def generate_responses_stream(
|
||||
name=tool.name,
|
||||
arguments=tool.arguments,
|
||||
)
|
||||
yield _format_sse(args_done)
|
||||
yield f"event: response.function_call_arguments.done\ndata: {args_done.model_dump_json()}\n\n"
|
||||
|
||||
# response.output_item.done
|
||||
fc_done_item = ResponseFunctionCallItem(
|
||||
@@ -321,7 +315,7 @@ async def generate_responses_stream(
|
||||
output_index=next_output_index,
|
||||
item=fc_done_item,
|
||||
)
|
||||
yield _format_sse(fc_item_done)
|
||||
yield f"event: response.output_item.done\ndata: {fc_item_done.model_dump_json()}\n\n"
|
||||
|
||||
function_call_items.append(fc_done_item)
|
||||
next_output_index += 1
|
||||
@@ -337,7 +331,7 @@ async def generate_responses_stream(
|
||||
content_index=0,
|
||||
delta=chunk.text,
|
||||
)
|
||||
yield _format_sse(delta_event)
|
||||
yield f"event: response.output_text.delta\ndata: {delta_event.model_dump_json()}\n\n"
|
||||
|
||||
# response.output_text.done
|
||||
text_done = ResponseTextDoneEvent(
|
||||
@@ -347,7 +341,7 @@ async def generate_responses_stream(
|
||||
content_index=0,
|
||||
text=accumulated_text,
|
||||
)
|
||||
yield _format_sse(text_done)
|
||||
yield f"event: response.output_text.done\ndata: {text_done.model_dump_json()}\n\n"
|
||||
|
||||
# response.content_part.done
|
||||
final_part = ResponseOutputText(text=accumulated_text)
|
||||
@@ -358,7 +352,7 @@ async def generate_responses_stream(
|
||||
content_index=0,
|
||||
part=final_part,
|
||||
)
|
||||
yield _format_sse(part_done)
|
||||
yield f"event: response.content_part.done\ndata: {part_done.model_dump_json()}\n\n"
|
||||
|
||||
# response.output_item.done
|
||||
final_message_item = ResponseMessageItem(
|
||||
@@ -369,7 +363,7 @@ async def generate_responses_stream(
|
||||
item_done = ResponseOutputItemDoneEvent(
|
||||
sequence_number=next(seq), output_index=0, item=final_message_item
|
||||
)
|
||||
yield _format_sse(item_done)
|
||||
yield f"event: response.output_item.done\ndata: {item_done.model_dump_json()}\n\n"
|
||||
|
||||
# Create usage from usage data if available
|
||||
usage = None
|
||||
@@ -394,4 +388,4 @@ async def generate_responses_stream(
|
||||
completed_event = ResponseCompletedEvent(
|
||||
sequence_number=next(seq), response=final_response
|
||||
)
|
||||
yield _format_sse(completed_event)
|
||||
yield f"event: response.completed\ndata: {completed_event.model_dump_json()}\n\n"
|
||||
|
||||
@@ -562,8 +562,6 @@ class API:
|
||||
if command_id in self._text_generation_queues:
|
||||
del self._text_generation_queues[command_id]
|
||||
|
||||
|
||||
|
||||
async def _collect_text_generation_with_stats(
|
||||
self, command_id: CommandId
|
||||
) -> BenchChatCompletionResponse:
|
||||
|
||||
@@ -90,8 +90,6 @@ def prefill(
|
||||
)
|
||||
if has_ssm:
|
||||
snapshots.append(snapshot_ssm_states(cache))
|
||||
if on_prefill_progress is not None:
|
||||
on_prefill_progress(processed, total)
|
||||
|
||||
if on_prefill_progress is not None:
|
||||
on_prefill_progress(processed, total)
|
||||
|
||||
@@ -98,11 +98,16 @@ class RunnerSupervisor:
|
||||
|
||||
def shutdown(self):
|
||||
logger.info("Runner supervisor shutting down")
|
||||
self._ev_recv.close()
|
||||
self._task_sender.close()
|
||||
self._event_sender.close()
|
||||
self._cancel_sender.send(TaskId("CANCEL_CURRENT_TASK"))
|
||||
self._cancel_sender.close()
|
||||
with contextlib.suppress(ClosedResourceError):
|
||||
self._ev_recv.close()
|
||||
with contextlib.suppress(ClosedResourceError):
|
||||
self._task_sender.close()
|
||||
with contextlib.suppress(ClosedResourceError):
|
||||
self._event_sender.close()
|
||||
with contextlib.suppress(ClosedResourceError):
|
||||
self._cancel_sender.send(TaskId("CANCEL_CURRENT_TASK"))
|
||||
with contextlib.suppress(ClosedResourceError):
|
||||
self._cancel_sender.close()
|
||||
self.runner_process.join(5)
|
||||
if not self.runner_process.is_alive():
|
||||
logger.info("Runner process succesfully terminated")
|
||||
|
||||
Reference in New Issue
Block a user