Compare commits

..

1 Commit

Author SHA1 Message Date
Ryuichi Leo Takashige
0955966b2a Claude-generated settings, no idea if it works 2026-02-26 13:49:05 +00:00
10 changed files with 470 additions and 31 deletions

View File

@@ -170,5 +170,30 @@
{/if}
Downloads
</a>
<a
href="/#/settings"
class="text-sm text-white/70 hover:text-exo-yellow transition-colors tracking-wider uppercase flex items-center gap-2 cursor-pointer"
title="Settings"
>
<svg
class="w-4 h-4"
fill="none"
viewBox="0 0 24 24"
stroke="currentColor"
stroke-width="2"
>
<path
stroke-linecap="round"
stroke-linejoin="round"
d="M10.325 4.317c.426-1.756 2.924-1.756 3.35 0a1.724 1.724 0 002.573 1.066c1.543-.94 3.31.826 2.37 2.37a1.724 1.724 0 001.065 2.572c1.756.426 1.756 2.924 0 3.35a1.724 1.724 0 00-1.066 2.573c.94 1.543-.826 3.31-2.37 2.37a1.724 1.724 0 00-2.572 1.065c-.426 1.756-2.924 1.756-3.35 0a1.724 1.724 0 00-2.573-1.066c-1.543.94-3.31-.826-2.37-2.37a1.724 1.724 0 00-1.065-2.572c-1.756-.426-1.756-2.924 0-3.35a1.724 1.724 0 001.066-2.573c-.94-1.543.826-3.31 2.37-2.37.996.608 2.296.07 2.572-1.065z"
/>
<path
stroke-linecap="round"
stroke-linejoin="round"
d="M15 12a3 3 0 11-6 0 3 3 0 016 0z"
/>
</svg>
Settings
</a>
</nav>
</header>

View File

@@ -0,0 +1,87 @@
/**
* SettingsStore - Manages exo runtime settings via the /settings API.
*/
/** Memory-safety portion of the settings object exchanged with the `/settings` API. */
export interface MemorySettings {
/** Whether OOM prevention is switched on. */
oom_prevention: boolean;
/** Memory-usage fraction (e.g. `0.8`); the server accepts values between 0 and 1. */
memory_threshold: number;
/** Memory floor, expressed in gigabytes. */
memory_floor_gb: number;
}
/** Generation/performance portion of the settings object exchanged with the `/settings` API. */
export interface GenerationSettings {
/** Prefill step size, in tokens. */
prefill_step_size: number;
/** Maximum number of tokens to generate. */
max_tokens: number;
/** KV cache quantization width in bits, or `null` for no quantization. */
kv_cache_bits: 4 | 8 | null;
}
/** Full settings object as sent to and received from the `/settings` endpoint. */
export interface ExoSettings {
/** Memory / safety settings. */
memory: MemorySettings;
/** Generation / performance settings. */
generation: GenerationSettings;
}
/**
 * Builds the client-side fallback settings.
 *
 * Every call produces a brand-new object graph, so the result can be mutated
 * without affecting later callers.
 */
function defaultSettings(): ExoSettings {
  const memory = {
    oom_prevention: false,
    memory_threshold: 0.8,
    memory_floor_gb: 5.0,
  };
  const generation = {
    prefill_step_size: 4096,
    max_tokens: 32168,
    kv_cache_bits: null,
  };
  return { memory, generation };
}
/** Narrows an arbitrary decoded JSON value to a non-null, non-array object. */
function isPlainObject(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/**
 * Runtime check that a decoded `/settings` payload has the shape the UI relies on.
 *
 * The settings page calls number methods on these fields while rendering, so a
 * partial or malformed response must be rejected before it reaches the store.
 */
function isExoSettings(value: unknown): value is ExoSettings {
  if (!isPlainObject(value)) return false;
  const { memory, generation } = value;
  if (!isPlainObject(memory) || !isPlainObject(generation)) return false;
  const bits = generation.kv_cache_bits;
  return (
    typeof memory.oom_prevention === "boolean" &&
    typeof memory.memory_threshold === "number" &&
    typeof memory.memory_floor_gb === "number" &&
    typeof generation.prefill_step_size === "number" &&
    typeof generation.max_tokens === "number" &&
    (bits === null || bits === 4 || bits === 8)
  );
}

/**
 * Reactive store for exo runtime settings, backed by the `/settings` HTTP API.
 *
 * `settings` holds the last values accepted from the server (client defaults
 * until the first successful load), `loading` is true while a request is in
 * flight, and `error` carries the message of the most recent failure or `null`.
 */
class SettingsStore {
  settings = $state<ExoSettings>(defaultSettings());
  loading = $state(false);
  error = $state<string | null>(null);

  /**
   * Fetches the current settings from the server into `settings`.
   * Never rejects: failures are logged and recorded in `error`.
   */
  async load(): Promise<void> {
    await this.exchange("load", "Failed to fetch settings", () =>
      fetch("/settings"),
    );
  }

  /**
   * Posts `updated` to the server and stores the settings it echoes back.
   *
   * @param updated - Complete settings object to persist.
   * @returns `true` on success; `false` on failure, with the reason in `error`.
   */
  async save(updated: ExoSettings): Promise<boolean> {
    return this.exchange("save", "Failed to save settings", () =>
      fetch("/settings", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify(updated),
      }),
    );
  }

  /** Returns a fresh copy of the client-side defaults without touching the store. */
  resetToDefaults(): ExoSettings {
    return defaultSettings();
  }

  /**
   * Shared request lifecycle for `load` and `save`: toggles `loading`, clears
   * and records `error`, and only replaces `settings` with a validated payload.
   *
   * @param action - Verb used in the console log ("load" or "save").
   * @param httpFailure - Message prefix used when the response status is not OK.
   * @param send - Performs the actual HTTP request.
   * @returns Whether the exchange succeeded.
   */
  private async exchange(
    action: "load" | "save",
    httpFailure: string,
    send: () => Promise<Response>,
  ): Promise<boolean> {
    this.loading = true;
    this.error = null;
    try {
      const response = await send();
      if (!response.ok) {
        throw new Error(`${httpFailure}: ${response.status}`);
      }
      const payload: unknown = await response.json();
      if (!isExoSettings(payload)) {
        throw new Error("Server returned malformed settings");
      }
      this.settings = payload;
      return true;
    } catch (err) {
      console.error(`Failed to ${action} settings:`, err);
      this.error = err instanceof Error ? err.message : "Unknown error";
      return false;
    } finally {
      this.loading = false;
    }
  }
}
/** Shared module-level instance of the settings store. */
export const settingsStore = new SettingsStore();

View File

@@ -0,0 +1,193 @@
<script lang="ts">
import { onMount } from "svelte";
import { fade } from "svelte/transition";
import HeaderNav from "$lib/components/HeaderNav.svelte";
import { settingsStore, type ExoSettings } from "$lib/stores/settings.svelte";
import { addToast } from "$lib/stores/toast.svelte";
let draft = $state<ExoSettings | null>(null);
const loading = $derived(settingsStore.loading);
// Load the persisted settings once on mount, then seed the editable draft with a detached copy.
onMount(async () => {
  await settingsStore.load();
  const loadError = settingsStore.error;
  if (loadError !== null) {
    // load() never rejects; it only records the failure, so surface it to the user here.
    addToast({ type: "error", message: loadError });
  }
  // `settingsStore.settings` is a deeply reactive $state proxy, which structuredClone()
  // cannot clone; $state.snapshot() yields a plain deep copy instead.
  draft = $state.snapshot(settingsStore.settings);
});
/** Persists the current draft and reports the outcome through a toast. */
async function handleSave() {
  if (draft === null) return;
  const saved = await settingsStore.save(draft);
  addToast(
    saved
      ? { type: "success", message: "Settings saved" }
      : { type: "error", message: settingsStore.error ?? "Failed to save settings" },
  );
}
/** Replaces the draft with the store's defaults; nothing is saved until the user presses Save. */
function handleReset() {
  const defaults = settingsStore.resetToDefaults();
  draft = defaults;
}
// Choices offered by the "KV Cache Quantization" <select>; `value` is bound straight to
// `draft.generation.kv_cache_bits`, with `null` standing for an unquantized cache.
const KV_OPTIONS: { label: string; value: 4 | 8 | null }[] = [
{ label: "None (full precision)", value: null },
{ label: "4-bit", value: 4 },
{ label: "8-bit", value: 8 },
];
</script>
<HeaderNav showHome={true} />
{#if draft}
<div class="min-h-screen bg-background text-foreground" in:fade={{ duration: 200 }}>
<div class="max-w-2xl mx-auto px-6 py-8">
<h1 class="text-2xl font-bold text-exo-yellow tracking-wider uppercase mb-8">Settings</h1>
<!-- Memory / Safety -->
<section class="mb-10">
<h2 class="text-sm font-semibold text-white/50 tracking-widest uppercase mb-4">Memory / Safety</h2>
<div class="space-y-5">
<!-- OOM Prevention Toggle -->
<div class="flex items-center justify-between">
<div>
<div class="text-sm text-white/90">OOM Prevention</div>
<div class="text-xs text-white/40 mt-0.5">Stop generation when memory is low</div>
</div>
<!-- The visible label lives in a sibling <div>, so the switch needs its own accessible name. -->
<button
type="button"
onclick={() => { if (draft) draft.memory.oom_prevention = !draft.memory.oom_prevention; }}
class="relative w-11 h-6 rounded-full transition-colors duration-200 cursor-pointer {draft.memory.oom_prevention ? 'bg-exo-yellow' : 'bg-exo-medium-gray'}"
role="switch"
aria-checked={draft.memory.oom_prevention}
aria-label="OOM Prevention"
>
<span
class="absolute top-0.5 left-0.5 w-5 h-5 rounded-full bg-white shadow transition-transform duration-200 {draft.memory.oom_prevention ? 'translate-x-5' : 'translate-x-0'}"
></span>
</button>
</div>
<!-- Memory Threshold Slider -->
<div>
<div class="flex items-center justify-between mb-1.5">
<div>
<div class="text-sm text-white/90">Memory Threshold</div>
<div class="text-xs text-white/40 mt-0.5">KV cache eviction triggers above this level</div>
</div>
<span class="text-sm font-mono text-exo-yellow">{(draft.memory.memory_threshold * 100).toFixed(0)}%</span>
</div>
<input
type="range"
min="0.5"
max="0.99"
step="0.01"
bind:value={draft.memory.memory_threshold}
class="w-full h-1.5 rounded-full appearance-none cursor-pointer bg-exo-medium-gray accent-exo-yellow"
/>
</div>
<!-- Memory Floor -->
<div>
<div class="flex items-center justify-between mb-1.5">
<div>
<div class="text-sm text-white/90">Memory Floor</div>
<div class="text-xs text-white/40 mt-0.5">Minimum free memory to reserve (GB)</div>
</div>
<!-- A cleared number input binds a non-number, so guard before calling toFixed(). -->
<span class="text-sm font-mono text-exo-yellow">{Number.isFinite(draft.memory.memory_floor_gb) ? draft.memory.memory_floor_gb.toFixed(1) : "—"} GB</span>
</div>
<input
type="number"
min="0"
max="64"
step="0.5"
bind:value={draft.memory.memory_floor_gb}
class="w-full bg-exo-medium-gray border border-exo-light-gray/20 rounded px-3 py-1.5 text-sm text-white/90 font-mono focus:outline-none focus:border-exo-yellow/50"
/>
</div>
</div>
</section>
<!-- Generation / Performance -->
<section class="mb-10">
<h2 class="text-sm font-semibold text-white/50 tracking-widest uppercase mb-4">Generation / Performance</h2>
<div class="space-y-5">
<!-- Prefill Step Size -->
<div>
<div class="flex items-center justify-between mb-1.5">
<div>
<div class="text-sm text-white/90">Prefill Step Size</div>
<div class="text-xs text-white/40 mt-0.5">Token chunk size during prompt processing</div>
</div>
<!-- A cleared number input binds a non-number, so guard before calling toLocaleString(). -->
<span class="text-sm font-mono text-exo-yellow">{Number.isFinite(draft.generation.prefill_step_size) ? draft.generation.prefill_step_size.toLocaleString() : "—"}</span>
</div>
<input
type="number"
min="128"
max="32768"
step="128"
bind:value={draft.generation.prefill_step_size}
class="w-full bg-exo-medium-gray border border-exo-light-gray/20 rounded px-3 py-1.5 text-sm text-white/90 font-mono focus:outline-none focus:border-exo-yellow/50"
/>
</div>
<!-- Max Tokens -->
<div>
<div class="flex items-center justify-between mb-1.5">
<div>
<div class="text-sm text-white/90">Max Tokens</div>
<div class="text-xs text-white/40 mt-0.5">Maximum generation length per response</div>
</div>
<!-- A cleared number input binds a non-number, so guard before calling toLocaleString(). -->
<span class="text-sm font-mono text-exo-yellow">{Number.isFinite(draft.generation.max_tokens) ? draft.generation.max_tokens.toLocaleString() : "—"}</span>
</div>
<input
type="number"
min="1"
max="131072"
step="1024"
bind:value={draft.generation.max_tokens}
class="w-full bg-exo-medium-gray border border-exo-light-gray/20 rounded px-3 py-1.5 text-sm text-white/90 font-mono focus:outline-none focus:border-exo-yellow/50"
/>
</div>
<!-- KV Cache Bits -->
<div>
<div class="mb-1.5">
<div class="text-sm text-white/90">KV Cache Quantization</div>
<div class="text-xs text-white/40 mt-0.5">Lower bits save memory at slight quality cost</div>
</div>
<select
bind:value={draft.generation.kv_cache_bits}
class="w-full bg-exo-medium-gray border border-exo-light-gray/20 rounded px-3 py-1.5 text-sm text-white/90 font-mono focus:outline-none focus:border-exo-yellow/50 cursor-pointer"
>
{#each KV_OPTIONS as opt}
<option value={opt.value}>{opt.label}</option>
{/each}
</select>
</div>
</div>
</section>
<!-- Action Buttons -->
<div class="flex items-center gap-3">
<button
onclick={handleSave}
disabled={loading}
class="px-5 py-2 rounded text-sm font-semibold tracking-wider uppercase transition-colors cursor-pointer
bg-exo-yellow text-exo-black hover:bg-exo-yellow-darker
disabled:opacity-50 disabled:cursor-not-allowed"
>
{loading ? "Saving..." : "Save"}
</button>
<button
onclick={handleReset}
disabled={loading}
class="px-5 py-2 rounded text-sm font-semibold tracking-wider uppercase transition-colors cursor-pointer
border border-exo-light-gray/30 text-white/70 hover:border-exo-yellow/50 hover:text-exo-yellow
disabled:opacity-50 disabled:cursor-not-allowed"
>
Reset to Defaults
</button>
</div>
</div>
</div>
{:else}
<div class="min-h-screen bg-background flex items-center justify-center">
<div class="text-white/40 text-sm">Loading settings...</div>
</div>
{/if}

View File

@@ -166,6 +166,13 @@ from exo.shared.types.openai_responses import (
ResponsesRequest,
ResponsesResponse,
)
from exo.shared.types.settings import (
ExoSettings,
load_settings,
)
from exo.shared.types.settings import (
save_settings as save_settings_to_file,
)
from exo.shared.types.state import State
from exo.shared.types.worker.downloads import DownloadCompleted
from exo.shared.types.worker.instances import Instance, InstanceId, InstanceMeta
@@ -349,6 +356,8 @@ class API:
self.app.get("/v1/traces/{task_id}/raw")(self.get_trace_raw)
self.app.get("/onboarding")(self.get_onboarding)
self.app.post("/onboarding")(self.complete_onboarding)
self.app.get("/settings")(self.get_settings)
self.app.post("/settings")(self.save_settings)
async def place_instance(self, payload: PlaceInstanceParams):
command = PlaceInstance(
@@ -1825,3 +1834,13 @@ class API:
ONBOARDING_COMPLETE_FILE.parent.mkdir(parents=True, exist_ok=True)
ONBOARDING_COMPLETE_FILE.write_text("true")
return JSONResponse({"completed": True})
async def get_settings(self) -> JSONResponse:
    """Return the settings produced by ``load_settings()`` as a JSON object."""
    return JSONResponse(load_settings().model_dump())
async def save_settings(self, request: Request) -> JSONResponse:
    """Validate the posted settings, persist them, and echo the stored values.

    Returns HTTP 422 with a ``detail`` message when the body is not valid JSON
    or does not satisfy the ``ExoSettings`` schema, instead of letting the
    exception escape the handler.
    """
    try:
        body = cast(object, await request.json())
        settings = ExoSettings.model_validate(body)
    except ValueError as exc:
        # pydantic's ValidationError and json.JSONDecodeError both derive from
        # ValueError, so this covers schema violations and malformed bodies.
        # NOTE(review): assumes request.json() raises JSONDecodeError on bad input — confirm.
        return JSONResponse({"detail": str(exc)}, status_code=422)
    save_settings_to_file(settings)
    return JSONResponse(settings.model_dump())

View File

@@ -1,5 +1,4 @@
import ctypes
import os
import sys
from math import ceil
from typing import Self, overload
@@ -157,24 +156,12 @@ class Memory(FrozenModel):
return f"{val:.2f} {unit}".rstrip("0").rstrip(".") + f" {unit}"
# Fraction of device memory above which LRU eviction kicks in.
# Smaller machines need more aggressive eviction.
def _default_memory_threshold() -> float:
total_gb = Memory.from_bytes(psutil.virtual_memory().total).in_gb
if total_gb >= 128:
return 0.85
if total_gb >= 64:
return 0.80
if total_gb >= 32:
return 0.75
return 0.70
def _load_memory_settings() -> tuple[float, "Memory"]:
"""Load memory threshold and floor from settings (lazy import to avoid circular dep)."""
from exo.shared.types.settings import load_settings
MEMORY_THRESHOLD = float(
os.environ.get("EXO_MEMORY_THRESHOLD", _default_memory_threshold())
)
MEMORY_FLOOR = Memory.from_gb(float(os.environ.get("EXO_MEMORY_FLOOR", "5")))
s = load_settings()
return s.memory.memory_threshold, Memory.from_gb(s.memory.memory_floor_gb)
_libc: ctypes.CDLL | None = None
@@ -217,9 +204,10 @@ def get_memory_pressure() -> float:
def get_memory_limit() -> Memory:
threshold, floor = _load_memory_settings()
total = psutil.virtual_memory().total
floor = min(int(total * (1 - MEMORY_THRESHOLD)), MEMORY_FLOOR.in_bytes)
return Memory.from_bytes(total - floor)
safety = min(int(total * (1 - threshold)), floor.in_bytes)
return Memory.from_bytes(total - safety)
def get_memory_available_locally() -> Memory:

View File

@@ -0,0 +1,121 @@
import os
import tomllib
from typing import Literal
import psutil
from pydantic import ConfigDict, Field, ValidationError
from exo.shared.constants import EXO_CONFIG_FILE
from exo.shared.logging import logger
from exo.shared.types.memory import Memory
from exo.utils.pydantic_ext import CamelCaseModel
def _default_memory_threshold() -> float:
    """Pick the default memory threshold fraction from the machine's total RAM.

    Machines with less memory get a lower threshold: 0.85 at 128 GB and above,
    0.80 from 64 GB, 0.75 from 32 GB, and 0.70 otherwise.
    """
    installed_gb = Memory.from_bytes(psutil.virtual_memory().total).in_gb
    tiers = ((128, 0.85), (64, 0.80), (32, 0.75))
    for minimum_gb, fraction in tiers:
        if installed_gb >= minimum_gb:
            return fraction
    return 0.70
# Settings stored under the `[memory]` table of the exo config file.
class MemorySettings(CamelCaseModel):
# No alias generator is applied, so keys keep their snake_case field names; unknown keys are rejected.
model_config = ConfigDict(
alias_generator=None,
validate_by_name=True,
extra="forbid",
strict=False,
)
# Off by default.
oom_prevention: bool = False
# Fraction in [0.0, 1.0]; the default depends on the machine's total RAM.
memory_threshold: float = Field(default_factory=_default_memory_threshold, ge=0.0, le=1.0)
# Non-negative amount of memory, in gigabytes.
memory_floor_gb: float = Field(default=5.0, ge=0.0)
# Settings stored under the `[generation]` table of the exo config file.
class GenerationSettings(CamelCaseModel):
# No alias generator is applied, so keys keep their snake_case field names; unknown keys are rejected.
model_config = ConfigDict(
alias_generator=None,
validate_by_name=True,
extra="forbid",
strict=False,
)
# Must be at least 1.
prefill_step_size: int = Field(default=4096, ge=1)
# Must be at least 1.
max_tokens: int = Field(default=32168, ge=1)
# Either 4, 8, or None; save_settings() omits the key from the file when it is None.
kv_cache_bits: Literal[4, 8] | None = None
# Top-level settings model validated from the parsed TOML of EXO_CONFIG_FILE.
class ExoSettings(CamelCaseModel):
# Unlike the nested models, unknown top-level keys are ignored rather than rejected
# (presumably because the config file also carries keys owned by other components — confirm).
model_config = ConfigDict(
alias_generator=None,
validate_by_name=True,
extra="ignore",
strict=False,
)
# Each section falls back to its own defaults when missing from the input.
memory: MemorySettings = Field(default_factory=MemorySettings)
generation: GenerationSettings = Field(default_factory=GenerationSettings)
_cached_settings: ExoSettings | None = None
_cached_mtime: float = 0.0
def _apply_env_overrides(settings: ExoSettings) -> ExoSettings:
    """Return ``settings`` with EXO_MEMORY_THRESHOLD / EXO_MEMORY_FLOOR layered on top."""
    overrides: dict[str, float] = {}
    env_threshold = os.environ.get("EXO_MEMORY_THRESHOLD")
    if env_threshold is not None:
        overrides["memory_threshold"] = float(env_threshold)
    env_floor = os.environ.get("EXO_MEMORY_FLOOR")
    if env_floor is not None:
        overrides["memory_floor_gb"] = float(env_floor)
    if not overrides:
        return settings
    return settings.model_copy(
        update={"memory": settings.memory.model_copy(update=overrides)}
    )


def load_settings() -> ExoSettings:
    """Return the effective settings: config file contents with env-var overrides applied.

    The parsed file is cached and only re-read when its mtime changes. The cache
    holds the values *before* env overrides, and the overrides are re-applied on
    every call, so they stay in effect no matter how the cache entry was filled.

    A missing file yields defaults. An unparsable or invalid file logs one
    warning and yields defaults until the file changes again, rather than being
    re-parsed (and re-logged) on every call.

    Raises:
        ValueError: if EXO_MEMORY_THRESHOLD or EXO_MEMORY_FLOOR is set to
            something ``float()`` cannot parse.
    """
    global _cached_settings, _cached_mtime  # noqa: PLW0603
    try:
        mtime = EXO_CONFIG_FILE.stat().st_mtime
        if _cached_settings is not None and mtime == _cached_mtime:
            settings = _cached_settings
        else:
            try:
                with open(EXO_CONFIG_FILE, "rb") as f:
                    settings = ExoSettings.model_validate(tomllib.load(f))
            except (tomllib.TOMLDecodeError, ValidationError) as e:
                logger.warning(f"Invalid config file {EXO_CONFIG_FILE}: {e}")
                settings = ExoSettings()
            # Remember the outcome for this mtime, including the invalid-file fallback.
            _cached_settings = settings
            _cached_mtime = mtime
    except FileNotFoundError:
        # Drop any stale entry so a file recreated later is always re-read.
        _cached_settings = None
        settings = ExoSettings()
    # Env vars override config file for backward compat.
    return _apply_env_overrides(settings)
def _strip_settings_tables(text: str) -> str:
    """Return ``text`` without the ``memory`` / ``generation`` tables and their sub-tables.

    Deliberately line-based: the caller re-parses the merged result and compares
    it with the original, so a header this scan misreads leads to a refused save
    rather than a corrupted file.
    """
    kept: list[str] = []
    dropping = False
    for line in text.splitlines():
        candidate = line.split("#", 1)[0].strip()
        if candidate.startswith("[") and candidate.endswith("]"):
            root = candidate.strip("[]").split(".", 1)[0].strip().strip("\"'")
            dropping = root in ("memory", "generation")
        if not dropping:
            kept.append(line)
    return "\n".join(kept).strip()


def save_settings(settings: ExoSettings) -> None:
    """Persist ``settings`` into the ``[memory]`` and ``[generation]`` tables of EXO_CONFIG_FILE.

    Everything else already in the file is carried over unchanged, because the
    file is shared with other configuration. The new content goes to a sibling
    temp file and is moved into place with a single rename, so concurrent
    readers never see a half-written file. An existing file that is not valid
    TOML is overwritten, since ``load_settings`` already treats it as defaults.

    Raises:
        ValueError: if the merged content is not valid TOML or would lose or
            alter a key outside the two managed tables; the file is left as is.
    """
    global _cached_settings  # noqa: PLW0603
    EXO_CONFIG_FILE.parent.mkdir(parents=True, exist_ok=True)
    lines = [
        "[memory]",
        f"oom_prevention = {'true' if settings.memory.oom_prevention else 'false'}",
        f"memory_threshold = {settings.memory.memory_threshold}",
        f"memory_floor_gb = {settings.memory.memory_floor_gb}",
        "",
        "[generation]",
        f"prefill_step_size = {settings.generation.prefill_step_size}",
        f"max_tokens = {settings.generation.max_tokens}",
    ]
    if settings.generation.kv_cache_bits is not None:
        lines.append(f"kv_cache_bits = {settings.generation.kv_cache_bits}")
    content = "\n".join(lines) + "\n"

    managed = ("memory", "generation")
    try:
        existing_text = EXO_CONFIG_FILE.read_text(encoding="utf-8")
        existing = tomllib.loads(existing_text)
    except (FileNotFoundError, UnicodeDecodeError, tomllib.TOMLDecodeError):
        existing_text, existing = "", {}
    foreign = {key: value for key, value in existing.items() if key not in managed}

    # Top-level keys and foreign tables come first; the managed tables are appended after them.
    preserved = _strip_settings_tables(existing_text)
    if preserved:
        content = preserved + "\n\n" + content

    try:
        merged = tomllib.loads(content)
    except tomllib.TOMLDecodeError as e:
        raise ValueError(
            f"Refusing to write {EXO_CONFIG_FILE}: merged config is not valid TOML ({e})"
        ) from e
    if {key: value for key, value in merged.items() if key not in managed} != foreign:
        raise ValueError(
            f"Refusing to write {EXO_CONFIG_FILE}: unrelated configuration would be lost"
        )

    tmp_path = EXO_CONFIG_FILE.with_name(EXO_CONFIG_FILE.name + ".tmp")
    tmp_path.write_text(content, encoding="utf-8")
    tmp_path.replace(EXO_CONFIG_FILE)

    # Invalidate rather than seed the cache: `settings` carries no env-var overrides,
    # so the next load_settings() call must rebuild the effective values from the file.
    _cached_settings = None

View File

@@ -12,7 +12,7 @@ from anyio import fail_after, open_process, to_thread
from anyio.streams.buffered import BufferedByteReceiveStream
from anyio.streams.text import TextReceiveStream
from loguru import logger
from pydantic import ValidationError
from pydantic import ConfigDict, ValidationError
from exo.shared.constants import EXO_CONFIG_FILE, EXO_MODELS_DIR
from exo.shared.types.memory import Memory
@@ -295,6 +295,8 @@ class ThunderboltBridgeInfo(TaggedModel):
class NodeConfig(TaggedModel):
"""Node configuration from EXO_CONFIG_FILE, reloaded from the file only at startup. Other changes should come in through the API and propagate from there"""
model_config = ConfigDict(extra="ignore")
@classmethod
async def gather(cls) -> Self | None:
cfg_file = anyio.Path(EXO_CONFIG_FILE)

View File

@@ -10,10 +10,11 @@ from mlx_lm.models.cache import (
)
from mlx_lm.tokenizer_utils import TokenizerWrapper
from exo.shared.types.memory import MEMORY_THRESHOLD, Memory, get_memory_pressure
from exo.shared.types.memory import Memory, get_memory_pressure
from exo.shared.types.mlx import KVCacheType
from exo.shared.types.settings import load_settings
from exo.worker.engines.mlx import Model
from exo.worker.engines.mlx.constants import CACHE_GROUP_SIZE, KV_CACHE_BITS
from exo.worker.engines.mlx.constants import CACHE_GROUP_SIZE
from exo.worker.runner.bootstrap import logger
@@ -206,7 +207,7 @@ class KVPrefixCache:
# Evict LRU entries until below threshold
while (
len(self.caches) > 0
and self.get_memory_used_percentage() > MEMORY_THRESHOLD
and self.get_memory_used_percentage() > load_settings().memory.memory_threshold
):
lru_index = self._last_used.index(min(self._last_used))
evicted_tokens = len(self.prompts[lru_index])
@@ -341,13 +342,14 @@ def make_kv_cache(
return model.make_cache() # type: ignore
if max_kv_size is None:
if KV_CACHE_BITS is None:
kv_cache_bits = load_settings().generation.kv_cache_bits
if kv_cache_bits is None:
logger.info("Using default KV cache")
return [KVCache() for _ in model.layers]
else:
logger.info("Using quantized KV cache")
return [
QuantizedKVCache(group_size=CACHE_GROUP_SIZE, bits=KV_CACHE_BITS)
QuantizedKVCache(group_size=CACHE_GROUP_SIZE, bits=kv_cache_bits)
for _ in model.layers
]
else:

View File

@@ -20,6 +20,7 @@ from exo.shared.types.api import (
from exo.shared.types.common import ModelId
from exo.shared.types.memory import Memory, get_memory_available_locally
from exo.shared.types.mlx import KVCacheType
from exo.shared.types.settings import load_settings
from exo.shared.types.text_generation import InputMessage, TextGenerationTaskParams
from exo.shared.types.worker.runner_response import (
GenerationResponse,
@@ -39,7 +40,6 @@ from exo.worker.engines.mlx.constants import (
DEFAULT_TOP_LOGPROBS,
KV_BITS,
KV_GROUP_SIZE,
MAX_TOKENS,
)
from exo.worker.engines.mlx.utils_mlx import (
apply_chat_template,
@@ -112,7 +112,7 @@ def prefill(
max_tokens=1,
sampler=sampler,
prompt_cache=cache,
prefill_step_size=4096,
prefill_step_size=load_settings().generation.prefill_step_size,
kv_group_size=KV_GROUP_SIZE,
kv_bits=KV_BITS,
prompt_progress_callback=progress_callback,
@@ -343,7 +343,7 @@ def mlx_generate(
f"KV cache hit: {prefix_hit_length}/{len(all_prompt_tokens)} tokens cached ({100 * prefix_hit_length / len(all_prompt_tokens):.1f}%)"
)
if bytes_per_token.in_bytes > 0:
if bytes_per_token.in_bytes > 0 and load_settings().memory.oom_prevention:
oom_error = _check_memory_budget(
bytes_per_token=bytes_per_token,
total_sequence_tokens=len(all_prompt_tokens),
@@ -395,7 +395,7 @@ def mlx_generate(
# stream_generate starts from the last token
last_token = prompt_tokens[-2:]
max_tokens = task.max_output_tokens or MAX_TOKENS
max_tokens = task.max_output_tokens or load_settings().generation.max_tokens
accumulated_text = ""
generated_text_parts: list[str] = []
generation_start_time = time.perf_counter()

View File

@@ -36,6 +36,7 @@ from exo.shared.types.memory import (
get_memory_pressure,
get_memory_pressure_threshold,
)
from exo.shared.types.settings import load_settings
from exo.shared.types.tasks import (
ConnectToGroup,
LoadModel,
@@ -355,7 +356,8 @@ def main(
TaskId("CANCEL_CURRENT_TASK") in cancelled_tasks
)
oom_local = (
bytes_per_token.in_bytes > 0
load_settings().memory.oom_prevention
and bytes_per_token.in_bytes > 0
and get_memory_pressure()
> get_memory_pressure_threshold()
)