Ollama: optional unload after response
This commit is contained in:
@@ -44,6 +44,8 @@ Configure via the environment variables:
|
|||||||
- `LLM_CONTEXT_SHALLOW_COUNT`: Results with headlines only (additional breadth). Default `15`.
|
- `LLM_CONTEXT_SHALLOW_COUNT`: Results with headlines only (additional breadth). Default `15`.
|
||||||
- `LLM_TABS`: Tab whitelist, comma delimiter. Default `general,science,it,news`.
|
- `LLM_TABS`: Tab whitelist, comma delimiter. Default `general,science,it,news`.
|
||||||
- `LLM_INTERACTIVE`: UI mode. Default is `true` (interactive: copy, regenerate, follow up). Set to `false` for simple response only mode.
|
- `LLM_INTERACTIVE`: UI mode. Default is `true` (interactive: copy, regenerate, follow up). Set to `false` for simple response only mode.
|
||||||
|
- `LLM_OLLAMA_UNLOAD_AFTER`: If `true`, unload the Ollama model immediately after each response (calls `/api/chat` with `keep_alive: 0`). Default `false`.
|
||||||
|
- `LLM_OLLAMA_UNLOAD_URL`: Override the unload endpoint. Default is `/api/chat` on the scheme, host, and port of `LLM_URL`.
|
||||||
|
|
||||||
## How It Works
|
## How It Works
|
||||||
1 user initial search
|
1 user initial search
|
||||||
|
|||||||
@@ -17,6 +17,11 @@ TOKEN_EXPIRY_SEC = 3600
|
|||||||
STREAM_CHUNK_SIZE = 128
|
STREAM_CHUNK_SIZE = 128
|
||||||
STREAM_TIMEOUT_SEC = 60
|
STREAM_TIMEOUT_SEC = 60
|
||||||
|
|
||||||
|
def _env_flag(name: str, default: bool = False) -> bool:
|
||||||
|
v = os.getenv(name)
|
||||||
|
if v is None:
|
||||||
|
return default
|
||||||
|
return v.strip().lower() in ('1', 'true', 'yes', 'on')
|
||||||
def _get_streaming_connection(url: str):
|
def _get_streaming_connection(url: str):
|
||||||
parsed = urlparse(url)
|
parsed = urlparse(url)
|
||||||
host = parsed.hostname
|
host = parsed.hostname
|
||||||
@@ -438,6 +443,43 @@ class SXNGPlugin(Plugin):
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _ollama_unload_model(self) -> None:
    """Best-effort request asking Ollama to unload the current model.

    POSTs ``{"model": <self.model>, "messages": [], "keep_alive": 0}`` to
    the configured unload URL. Nothing is sent unless the provider is
    ``'ollama'``, ``ollama_unload_after`` is truthy and ``ollama_unload_url``
    is non-empty.

    Any ``Exception`` raised along the way is logged as a warning and not
    propagated; an HTTP status of 400 or above is likewise only logged.
    """
    try:
        # Guards are evaluated inside the try so that even a missing
        # attribute ends up as a logged warning rather than an exception.
        enabled = self.provider == 'ollama' and getattr(self, 'ollama_unload_after', False)
        target = (getattr(self, 'ollama_unload_url', '') or '').strip() if enabled else ''
        if not target:
            return

        connection, request_path = _get_streaming_connection(target)
        try:
            body = json.dumps({
                "model": self.model,
                "messages": [],
                "keep_alive": 0
            })
            request_headers = {"Content-Type": "application/json"}
            # Reuse LLM_KEY as a bearer token when Ollama sits behind auth;
            # the placeholder keys 'none' and 'ollama' are not sent.
            key = self.api_key
            if key and key not in ('none', 'ollama'):
                request_headers["Authorization"] = f"Bearer {key}"

            connection.request("POST", request_path, body=body, headers=request_headers)
            response = connection.getresponse()
            response.read()  # drain the body before closing
            if response.status >= 400:
                logger.warning(f"{PLUGIN_NAME}: Ollama unload failed: {response.status} {response.reason}")
        finally:
            if connection:
                connection.close()
    except Exception as exc:
        logger.warning(f"{PLUGIN_NAME}: Ollama unload error: {exc}")
|
||||||
|
|
||||||
def _load_config(self):
|
def _load_config(self):
|
||||||
self.interactive = os.getenv('LLM_INTERACTIVE', 'true').lower().strip() in ('true', '1', 'yes', 'on')
|
self.interactive = os.getenv('LLM_INTERACTIVE', 'true').lower().strip() in ('true', '1', 'yes', 'on')
|
||||||
raw_provider = os.getenv('LLM_PROVIDER', '').lower().strip()
|
raw_provider = os.getenv('LLM_PROVIDER', '').lower().strip()
|
||||||
@@ -510,6 +552,26 @@ class SXNGPlugin(Plugin):
|
|||||||
raw_url = f"https://{raw_url}"
|
raw_url = f"https://{raw_url}"
|
||||||
self.endpoint_url = raw_url
|
self.endpoint_url = raw_url
|
||||||
|
|
||||||
|
|
||||||
|
# Ollama: optional "unload after response" behavior (plugin-specific).
|
||||||
|
# Enable with:
|
||||||
|
# LLM_OLLAMA_UNLOAD_AFTER=1
|
||||||
|
# Optional override:
|
||||||
|
# LLM_OLLAMA_UNLOAD_URL=http://<host>:11434/api/chat
|
||||||
|
self.ollama_unload_after = (
|
||||||
|
_env_flag('LLM_OLLAMA_UNLOAD_AFTER', False) or _env_flag('OLLAMA_UNLOAD_AFTER', False)
|
||||||
|
)
|
||||||
|
self.ollama_unload_url = (os.getenv('LLM_OLLAMA_UNLOAD_URL') or os.getenv('OLLAMA_UNLOAD_URL') or '').strip()
|
||||||
|
if self.provider == 'ollama' and self.ollama_unload_after and not self.ollama_unload_url:
|
||||||
|
try:
|
||||||
|
p = urlparse(self.endpoint_url)
|
||||||
|
scheme = p.scheme or 'http'
|
||||||
|
host = p.hostname or 'localhost'
|
||||||
|
port = p.port
|
||||||
|
netloc = f"{host}:{port}" if port else host
|
||||||
|
self.ollama_unload_url = f"{scheme}://{netloc}/api/chat"
|
||||||
|
except Exception:
|
||||||
|
self.ollama_unload_url = "http://localhost:11434/api/chat"
|
||||||
if self.api_key:
|
if self.api_key:
|
||||||
self.secret = os.getenv('SXNG_LLM_SECRET') or hashlib.sha256(self.api_key.encode()).hexdigest()
|
self.secret = os.getenv('SXNG_LLM_SECRET') or hashlib.sha256(self.api_key.encode()).hexdigest()
|
||||||
else:
|
else:
|
||||||
@@ -820,6 +882,25 @@ class SXNGPlugin(Plugin):
|
|||||||
if conn: conn.close()
|
if conn: conn.close()
|
||||||
|
|
||||||
generator = stream_gemini if self.is_gemini else stream_openai_compatible
|
generator = stream_gemini if self.is_gemini else stream_openai_compatible
|
||||||
|
|
||||||
|
|
||||||
|
# If configured, force-unload Ollama model right after finishing the stream.
|
||||||
|
|
||||||
|
# This uses the native /api/chat endpoint with keep_alive=0.
|
||||||
|
|
||||||
|
if self.provider == 'ollama' and getattr(self, 'ollama_unload_after', False):
|
||||||
|
|
||||||
|
gen_fn = generator
|
||||||
|
|
||||||
|
def generator():
|
||||||
|
|
||||||
|
try:
|
||||||
|
|
||||||
|
yield from gen_fn()
|
||||||
|
|
||||||
|
finally:
|
||||||
|
|
||||||
|
self._ollama_unload_model()
|
||||||
return Response(generator(), mimetype='text/event-stream', headers={
|
return Response(generator(), mimetype='text/event-stream', headers={
|
||||||
'X-Accel-Buffering': 'no',
|
'X-Accel-Buffering': 'no',
|
||||||
'Cache-Control': 'no-cache, no-store',
|
'Cache-Control': 'no-cache, no-store',
|
||||||
|
|||||||
Reference in New Issue
Block a user