# docker-compose.yaml
version: "3"
services:
  llama-cpp:
    image: logdetective/runtime:latest-cuda
    build:
      context: .
      dockerfile: ./Containerfile.cuda
    hostname: "${LLAMA_CPP_HOST}"
    command: "llama-server --host 0.0.0.0 --port ${LLAMA_CPP_SERVER_PORT}"
    stdin_open: true
    tty: true
    env_file: .env
    ports:
      - "${LLAMA_CPP_SERVER_PORT:-8000}:${LLAMA_CPP_SERVER_PORT:-8000}"
    volumes:
      - ${MODELS_PATH-./models}:/models:Z
    # these 4 lines are needed for CUDA acceleration
    # devices:
    #   - nvidia.com/gpu=all
    # security_opt:
    #   - "label=disable"
  server:
    image: logdetective/runtime:latest
    build:
      context: .
      dockerfile: ./Containerfile
    hostname: logdetective-server
    stdin_open: true
    tty: true
    volumes:
      - .:/src/:z
    ports:
      - "${LOGDETECTIVE_SERVER_PORT:-8080}:${LOGDETECTIVE_SERVER_PORT:-8080}"
    env_file: .env
    # --no-reload: doesn't work in a container - `PermissionError: Permission denied (os error 13) about ["/proc"]`
    # command: fastapi dev /src/logdetective/server.py --host 0.0.0.0 --port $LOGDETECTIVE_SERVER_PORT --no-reload
    # timeout set to 240 - 4 minutes should be enough for one LLM execution locally on a CPU
    command: ["gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--timeout", "240", "logdetective.server:app", "-b", "0.0.0.0:$LOGDETECTIVE_SERVER_PORT"]
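
# Both services read their settings from the `.env` file referenced via `env_file` above.
# A minimal sketch of that file (the values below are illustrative assumptions, not
# defaults shipped with the project; the port numbers mirror the fallbacks used in the
# `ports:` mappings):
#
#   LLAMA_CPP_HOST=llama-cpp
#   LLAMA_CPP_SERVER_PORT=8000
#   LOGDETECTIVE_SERVER_PORT=8080
#   MODELS_PATH=./models
#
# With such a file in place, `docker compose up --build` should build both images and
# start the llama-cpp and server containers.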