# docker-compose.yaml
version: "3"
services:
  llama-cpp:
    image: logdetective/runtime:latest-cuda
    build:
      context: .
      dockerfile: ./Containerfile.cuda
    hostname: "${LLAMA_CPP_HOST}"
    command: "llama-server --host 0.0.0.0 --port ${LLAMA_CPP_SERVER_PORT}"
    stdin_open: true
    tty: true
    env_file: .env
    ports:
      - "${LLAMA_CPP_SERVER_PORT:-8000}:${LLAMA_CPP_SERVER_PORT:-8000}"
    volumes:
      - ${MODELS_PATH-./models}:/models:Z
    # these 4 lines are needed for CUDA acceleration
    # devices:
    #   - nvidia.com/gpu=all
    # security_opt:
    #   - "label=disable"
  server:
    image: logdetective/runtime:latest
    build:
      context: .
      dockerfile: ./Containerfile
    hostname: logdetective-server
    stdin_open: true
    tty: true
    volumes:
      - .:/src/:z
    ports:
      - "${LOGDETECTIVE_SERVER_PORT:-8080}:${LOGDETECTIVE_SERVER_PORT:-8080}"
    env_file: .env
    # --no-reload: doesn't work in a container - `PermissionError: Permission denied (os error 13) about ["/proc"]`
    # command: fastapi dev /src/logdetective/server.py --host 0.0.0.0 --port $LOGDETECTIVE_SERVER_PORT --no-reload
    # timeout set to 240 - 4 minutes should be enough for one LLM execution locally on a CPU
    command: ["gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--timeout", "240", "logdetective.server:app", "-b", "0.0.0.0:$LOGDETECTIVE_SERVER_PORT"]
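
# Both services read their settings from the `.env` file referenced via `env_file` above.
# A minimal sketch of that file (the values below are illustrative assumptions, not
# defaults shipped with the project; the port numbers mirror the fallbacks used in the
# `ports:` mappings):
#
#   LLAMA_CPP_HOST=llama-cpp
#   LLAMA_CPP_SERVER_PORT=8000
#   LOGDETECTIVE_SERVER_PORT=8080
#   MODELS_PATH=./models
#
# With such a file in place, `docker compose up --build` should build both images and
# start the llama-cpp and server containers.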