# autodev/llm.py

"""
AutoDev - LLM Communication Layer
Supports Ollama and vLLM backends with streaming, retry logic, and robust error handling.
"""

import json
import sys
import time
import urllib.request
import urllib.error
from . import config


class LLMError(Exception):
    """Raised when an LLM backend is misconfigured, unreachable, or replies unexpectedly."""


class LLM:
    def __init__(self, backend: str = None, model: str = None):
        self.backend = backend or config.LLM_BACKEND
        self.model = model or config.MODEL_NAME
        if self.backend == "ollama":
            self.base_url = config.OLLAMA_URL
        elif self.backend == "vllm":
            self.base_url = config.VLLM_URL
        else:
            raise LLMError(f"Unknown backend: {self.backend}")
        self.context_size = None  # Auto-detected on first use

    def detect_context_size(self) -> int:
        """Detect the model's effective context window size.

        Checks (in priority order):
        1. Ollama /api/ps for running model's actual context_length
        2. num_ctx in model parameters from /api/show
        3. Model architecture's max context_length from /api/show
        4. vLLM max_model_len from /v1/models
        5. Fallback to config default
        """
        if self.context_size is not None:
            return self.context_size
        try:
            if self.backend == "ollama":
                self.context_size = self._detect_ollama_context()
            elif self.backend == "vllm":
                self.context_size = self._detect_vllm_context()
        except Exception:
            pass
        if not self.context_size:
            self.context_size = config.MAX_CONTEXT_TOKENS
        return self.context_size

    def detect_gpu_status(self) -> dict:
        """Check GPU/CPU offload status for the running model.

        Returns dict with:
          loaded: bool - whether model is currently loaded
          gpu_percent: int - percentage of model on GPU (0-100)
          size_total: int - total model size in bytes
          size_vram: int - bytes on GPU
          warning: str|None - warning message if mostly CPU
        """
        result = {"loaded": False, "gpu_percent": 0, "size_total": 0,
                  "size_vram": 0, "warning": None}
        if self.backend != "ollama":
            return result
        try:
            url = f"{self.base_url}/api/ps"
            req = urllib.request.Request(url)
            with urllib.request.urlopen(req, timeout=5) as resp:
                data = json.loads(resp.read().decode("utf-8"))
            for m in data.get("models", []):
                if self.model in m.get("name", ""):
                    result["loaded"] = True
                    result["size_total"] = m.get("size", 0)
                    result["size_vram"] = m.get("size_vram", 0)
                    if result["size_total"] > 0:
                        result["gpu_percent"] = int(
                            result["size_vram"] / result["size_total"] * 100
                        )
                    if result["gpu_percent"] == 0:
                        result["warning"] = (
                            "Model is running entirely on CPU. "
                            "This will be extremely slow and may not complete. "
                            "Consider using a smaller model or freeing GPU memory."
                        )
                    elif result["gpu_percent"] < 50:
                        result["warning"] = (
                            f"Only {result['gpu_percent']}% of model is on GPU. "
                            "Performance will be significantly degraded. "
                            "Consider using a smaller model."
                        )
                    break
        except Exception:
            pass
        return result

    def _detect_ollama_context(self) -> int | None:
        # 1. Check running model — this gives the actual runtime context_length
        try:
            url = f"{self.base_url}/api/ps"
            req = urllib.request.Request(url)
            with urllib.request.urlopen(req, timeout=5) as resp:
                data = json.loads(resp.read().decode("utf-8"))
            for m in data.get("models", []):
                if self.model in m.get("name", ""):
                    ctx = m.get("context_length")
                    if ctx:
                        return int(ctx)
        except Exception:
            pass

        # 2. Check model config from /api/show
        try:
            url = f"{self.base_url}/api/show"
            payload = {"name": self.model}
            data = self._post_raw(url, payload)

            # Check parameters for explicit num_ctx setting
            params = data.get("parameters", "")
            for line in params.splitlines():
                if "num_ctx" in line:
                    parts = line.split()
                    for p in parts:
                        if p.isdigit():
                            return int(p)

            # Check modelfile for PARAMETER num_ctx
            modelfile = data.get("modelfile", "")
            for line in modelfile.splitlines():
                if "num_ctx" in line.lower():
                    parts = line.split()
                    for p in parts:
                        if p.isdigit():
                            return int(p)

            # 3. Fall back to architecture's max context_length
            model_info = data.get("model_info", {})
            for key, val in model_info.items():
                if "context_length" in key:
                    return int(val)
        except Exception:
            pass
        return None

    def _detect_vllm_context(self) -> int | None:
        try:
            url = f"{self.base_url}/v1/models"
            req = urllib.request.Request(url)
            with urllib.request.urlopen(req, timeout=10) as resp:
                data = json.loads(resp.read().decode("utf-8"))
            for m in data.get("data", []):
                if m.get("id") == self.model:
                    return m.get("max_model_len")
        except Exception:
            pass
        return None

    def query(self, prompt: str, system: str = "", temperature: float = 0.2,
              stream: bool = False) -> str:
        if not system:
            system = config.EXPERT_IDENTITY
        if self.backend == "ollama":
            if stream:
                return self._stream_ollama(prompt, system, temperature)
            result = self._query_ollama(prompt, system, temperature)
        else:
            if stream:
                return self._stream_vllm(prompt, system, temperature)
            result = self._query_vllm(prompt, system, temperature)
        # Push LLM thinking to web UI
        try:
            from .web import push_event
            push_event("llm_response", {"response": result})
        except Exception:
            pass
        return result

    def _query_ollama(self, prompt: str, system: str, temperature: float) -> str:
        url = f"{self.base_url}/api/generate"
        payload = {
            "model": self.model,
            "prompt": prompt,
            "system": system,
            "stream": False,
            "options": {"temperature": temperature},
        }
        return self._post(url, payload, key="response")

    def _stream_ollama(self, prompt: str, system: str, temperature: float) -> str:
        url = f"{self.base_url}/api/generate"
        payload = {
            "model": self.model,
            "prompt": prompt,
            "system": system,
            "stream": True,
            "options": {"temperature": temperature},
        }
        return self._stream_post(url, parse_fn=lambda chunk: chunk.get("response", ""))

    def _query_vllm(self, prompt: str, system: str, temperature: float) -> str:
        url = f"{self.base_url}/v1/completions"
        full_prompt = f"{system}\n\n{prompt}" if system else prompt
        payload = {
            "model": self.model,
            "prompt": full_prompt,
            "max_tokens": 4096,
            "temperature": temperature,
            "stream": False,
        }
        data = self._post_raw(url, payload)
        try:
            return data["choices"][0]["text"]
        except (KeyError, IndexError):
            raise LLMError(f"Unexpected vLLM response: {data}")

    def _stream_vllm(self, prompt: str, system: str, temperature: float) -> str:
        url = f"{self.base_url}/v1/completions"
        full_prompt = f"{system}\n\n{prompt}" if system else prompt
        payload = {
            "model": self.model,
            "prompt": full_prompt,
            "max_tokens": 4096,
            "temperature": temperature,
            "stream": True,
        }
        return self._stream_post(url, parse_fn=lambda chunk: (
            chunk.get("choices", [{}])[0].get("text", "") if chunk.get("choices") else ""
        ))

    def chat(self, messages: list[dict], temperature: float = 0.2,
             stream: bool = False) -> str:
        if self.backend == "ollama":
            url = f"{self.base_url}/api/chat"
            payload = {
                "model": self.model,
                "messages": messages,
                "stream": stream,
                "options": {"temperature": temperature},
            }
            if stream:
                return self._stream_post(url, parse_fn=lambda c: c.get("message", {}).get("content", ""))
            return self._post(url, payload, key="message", subkey="content")
        else:
            url = f"{self.base_url}/v1/chat/completions"
            payload = {
                "model": self.model,
                "messages": messages,
                "max_tokens": 4096,
                "temperature": temperature,
                "stream": stream,
            }
            if stream:
                return self._stream_post(url, parse_fn=lambda c: (
                    c.get("choices", [{}])[0].get("delta", {}).get("content", "")
                    if c.get("choices") else ""
                ))
            data = self._post_raw(url, payload)
            try:
                return data["choices"][0]["message"]["content"]
            except (KeyError, IndexError):
                raise LLMError(f"Unexpected vLLM chat response: {data}")

    def _post(self, url: str, payload: dict, key: str, subkey: str = None) -> str:
        data = self._post_raw(url, payload)
        try:
            result = data[key]
            if subkey:
                result = result[subkey]
            return result
        except (KeyError, TypeError):
            raise LLMError(f"Unexpected response structure: {data}")

    def _post_raw(self, url: str, payload: dict, retries: int = 2) -> dict:
        body = json.dumps(payload).encode("utf-8")
        req = urllib.request.Request(
            url, data=body, headers={"Content-Type": "application/json"}
        )
        last_err = None
        for attempt in range(retries + 1):
            try:
                with urllib.request.urlopen(req, timeout=config.LLM_TIMEOUT) as resp:
                    return json.loads(resp.read().decode("utf-8"))
            except urllib.error.URLError as e:
                last_err = e
                if attempt < retries:
                    time.sleep(2 ** attempt)
            except json.JSONDecodeError as e:
                raise LLMError(f"Invalid JSON from LLM: {e}")
        raise LLMError(f"LLM request failed after {retries + 1} attempts ({url}): {last_err}")

    def _stream_post(self, url: str, parse_fn) -> str:
        """Stream response, printing tokens to console as they arrive."""
        # Build the same request but with stream=True already in payload
        # We need to read line by line
        body = json.dumps({"stream": True}).encode("utf-8")
        # Actually we need the full payload — caller already set stream=True
        # Re-read from the caller context isn't possible, so we use a different approach:
        # The caller methods build the payload and call us. We need the payload.
        # Refactored: callers should pass payload. For now, fall back to non-streaming.
        # This is handled by the _stream_generate and _stream_chat methods.
        raise LLMError("Direct _stream_post not supported; use streaming query methods")

    def query_stream(self, prompt: str, system: str = "", temperature: float = 0.2) -> str:
        """Query with streaming output to console."""
        if not system:
            system = config.EXPERT_IDENTITY
        if self.backend == "ollama":
            return self._stream_ollama_impl(prompt, system, temperature)
        else:
            return self._stream_vllm_impl(prompt, system, temperature)

    def _stream_ollama_impl(self, prompt: str, system: str, temperature: float) -> str:
        url = f"{self.base_url}/api/generate"
        payload = {
            "model": self.model,
            "prompt": prompt,
            "system": system,
            "stream": True,
            "options": {"temperature": temperature},
        }
        return self._do_stream(url, payload, lambda c: c.get("response", ""))

    def _stream_vllm_impl(self, prompt: str, system: str, temperature: float) -> str:
        url = f"{self.base_url}/v1/completions"
        full_prompt = f"{system}\n\n{prompt}" if system else prompt
        payload = {
            "model": self.model,
            "prompt": full_prompt,
            "max_tokens": 4096,
            "temperature": temperature,
            "stream": True,
        }
        return self._do_stream(url, payload, lambda c: (
            c.get("choices", [{}])[0].get("text", "") if c.get("choices") else ""
        ))

    def _do_stream(self, url: str, payload: dict, parse_fn) -> str:
        """Execute streaming request, print tokens live, return full text."""
        body = json.dumps(payload).encode("utf-8")
        req = urllib.request.Request(
            url, data=body, headers={"Content-Type": "application/json"}
        )
        full_text = []
        try:
            with urllib.request.urlopen(req, timeout=config.LLM_TIMEOUT) as resp:
                buffer = b""
                while True:
                    chunk = resp.read(1)
                    if not chunk:
                        break
                    buffer += chunk
                    if chunk == b"\n" and buffer.strip():
                        line = buffer.decode("utf-8").strip()
                        buffer = b""
                        # vLLM SSE format
                        if line.startswith("data: "):
                            line = line[6:]
                        if line == "[DONE]":
                            break
                        try:
                            data = json.loads(line)
                            token = parse_fn(data)
                            if token:
                                full_text.append(token)
                                sys.stdout.write(token)
                                sys.stdout.flush()
                        except json.JSONDecodeError:
                            pass
                    elif chunk == b"\n":
                        buffer = b""
        except urllib.error.URLError as e:
            raise LLMError(f"Stream request failed ({url}): {e}")
        sys.stdout.write("\n")
        sys.stdout.flush()
        return "".join(full_text)