- Nginx: receives external requests and forwards them to the API server (reverse proxy)
- Redis: cache or queue (stores request results / enables async processing)
- vLLM: LLM inference server (FastAPI-based API)
Overall Flow
[Nginx : 80]
↓
[FastAPI : 5000]
↓ (cache lookup)
[Redis : 6379]
↓ (on cache miss)
[vLLM : 8000]
When using a GPU
nvidia-smi  # check that the GPU driver is installed correctly
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \
sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
sudo apt update
sudo apt install -y nvidia-container-toolkit
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi
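If the toolkit is configured correctly, the container above prints the same nvidia-smi table you see on the host; if it fails with a runtime error, re-check that nvidia-ctk ran and that Docker was restarted.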
Running the vLLM server
# Run the vLLM server
pip install vllm
python -m vllm.entrypoints.openai.api_server \
--model meta-llama/Llama-2-7b-chat-hf \
--host 0.0.0.0 \
--port 8000
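Once the model has loaded, a minimal Python call against the OpenAI-compatible endpoint confirms the server is answering (a sketch, assuming the server runs on localhost:8000 with the model above):

import requests

# Query vLLM's OpenAI-compatible chat endpoint
resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "meta-llama/Llama-2-7b-chat-hf",
        "messages": [{"role": "user", "content": "ping"}],
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])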
Running the Redis server
# Run the Redis server
sudo apt update
sudo apt install redis-server -y
sudo systemctl start redis
sudo systemctl enable redis
pip install redis
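A quick connectivity check from Python (assuming the default localhost:6379):

import redis

# PING returns True when the server is reachable
r = redis.Redis(host="localhost", port=6379, db=0)
print(r.ping())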
FastAPI intermediate server (connecting Redis and vLLM)
Asynchronous version
import hashlib
import json
import httpx
import redis.asyncio as redis
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from contextlib import asynccontextmanager

# 1. Lifespan setup: reuse clients and manage connections
@asynccontextmanager
async def lifespan(app: FastAPI):
    # Connect on startup
    app.state.redis = redis.Redis(host="localhost", port=6379, db=0, decode_responses=True)
    app.state.http_client = httpx.AsyncClient(timeout=30.0)
    yield
    # Release on shutdown
    await app.state.redis.close()
    await app.state.http_client.aclose()

app = FastAPI(lifespan=lifespan)
VLLM_URL = "http://localhost:8000/v1/chat/completions"

class ChatRequest(BaseModel):
    # Request body: {"prompt": "..."} (a bare str parameter would be parsed as a query parameter)
    prompt: str

def make_cache_key(prompt: str):
    # Keyed on the prompt only; include model/params in the hash if they can vary
    return hashlib.sha256(prompt.encode()).hexdigest()

@app.post("/chat")
async def chat(req: ChatRequest):
    prompt = req.prompt
    key = make_cache_key(prompt)
    rd = app.state.redis
    client = app.state.http_client
    # 1. Check the Redis cache (wrapped in try so the service keeps working if Redis is down)
    try:
        cached = await rd.get(key)
        if cached:
            return {"source": "cache", "response": json.loads(cached)}
    except Exception:
        pass  # On a cache-server problem, fall through to vLLM
    # 2. Async request to vLLM
    try:
        payload = {
            "model": "meta-llama/Llama-2-7b-chat-hf",
            "messages": [{"role": "user", "content": prompt}]
        }
        res = await client.post(VLLM_URL, json=payload)
        res.raise_for_status()
        result = res.json()
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"LLM Inference Error: {str(e)}")
    # 3. Store in Redis (on success only; cache failures are ignored here too)
    try:
        await rd.setex(key, 3600, json.dumps(result))
    except Exception:
        pass
    return {"source": "vllm", "response": result}
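The point of the lifespan block above is connection reuse: a single shared AsyncClient and Redis connection pool serve every request, instead of paying connection setup per call, which matters once requests arrive concurrently.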
Synchronous version
from fastapi import FastAPI
from pydantic import BaseModel
import redis
import requests
import hashlib
import json

app = FastAPI()

# Redis connection
r = redis.Redis(host="localhost", port=6379, db=0)
VLLM_URL = "http://localhost:8000/v1/chat/completions"

class ChatRequest(BaseModel):
    prompt: str

def make_cache_key(prompt: str):
    return hashlib.sha256(prompt.encode()).hexdigest()

# Plain def (not async): FastAPI runs it in a threadpool,
# so the blocking requests call does not stall the event loop
@app.post("/chat")
def chat(req: ChatRequest):
    key = make_cache_key(req.prompt)
    # 1. Check the Redis cache
    cached = r.get(key)
    if cached:
        return {"source": "cache", "response": json.loads(cached)}
    # 2. Request to vLLM
    payload = {
        "model": "meta-llama/Llama-2-7b-chat-hf",
        "messages": [
            {"role": "user", "content": req.prompt}
        ]
    }
    res = requests.post(VLLM_URL, json=payload, timeout=60)
    result = res.json()
    # 3. Store in Redis
    r.setex(key, 3600, json.dumps(result))
    return {"source": "vllm", "response": result}
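Either variant is served with uvicorn on the port Nginx proxies to (5000 in this setup). Assuming the file is saved as app.py:

uvicorn app:app --host 0.0.0.0 --port 5000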
Nginx setup
sudo apt install nginx -y
# /etc/nginx/sites-available/default
server {
listen 80;
server_name _;
location / {
proxy_pass http://127.0.0.1:5000;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
}
}
sudo nginx -t
sudo systemctl restart nginx
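One caveat: Nginx's default proxy_read_timeout is 60 seconds, which a long LLM generation can exceed. If slow prompts start returning 504 errors, raise it inside the location block, for example:

proxy_read_timeout 300s;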
Test
curl -X POST "http://localhost/chat" \
  -H "Content-Type: application/json" \
  -d '{"prompt": "Hello, how are you?"}'
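This request enters through Nginx on port 80 and is proxied to FastAPI on 5000. On the first call it should return {"source": "vllm", ...}; repeating the same prompt within an hour should return {"source": "cache", ...} served from Redis, since results are stored with a 3600-second TTL.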
Project structure
project/
├── docker-compose.yml
├── nginx/
│ └── default.conf
└── app/
├── Dockerfile
└── app.py
app/app.py
from fastapi import FastAPI
from pydantic import BaseModel
import redis
import requests
import hashlib
import json
import os

app = FastAPI()

REDIS_HOST = os.getenv("REDIS_HOST", "redis")
VLLM_URL = os.getenv("VLLM_URL", "http://vllm:8000/v1/chat/completions")
r = redis.Redis(host=REDIS_HOST, port=6379, db=0)

class ChatRequest(BaseModel):
    prompt: str

def make_cache_key(prompt: str):
    return hashlib.sha256(prompt.encode()).hexdigest()

@app.post("/chat")
def chat(req: ChatRequest):
    key = make_cache_key(req.prompt)
    cached = r.get(key)
    if cached:
        return {"source": "cache", "response": json.loads(cached)}
    payload = {
        "model": "meta-llama/Llama-2-7b-chat-hf",
        "messages": [{"role": "user", "content": req.prompt}]
    }
    res = requests.post(VLLM_URL, json=payload, timeout=60)
    result = res.json()
    r.setex(key, 3600, json.dumps(result))
    return {"source": "vllm", "response": result}
app/Dockerfile
FROM python:3.10-slim
WORKDIR /app
COPY app.py .
RUN pip install fastapi uvicorn redis requests
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "5000"]
nginx/default.conf
server {
listen 80;
location / {
proxy_pass http://app:5000;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
}
}
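Note that proxy_pass http://app:5000 relies on Docker Compose's built-in DNS: app, redis, and vllm are service names from docker-compose.yml below and resolve only inside the compose network.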
docker-compose.yml
version: "3.9"
services:
nginx:
image: nginx:latest
container_name: nginx
ports:
- "80:80"
volumes:
- ./nginx/default.conf:/etc/nginx/conf.d/default.conf
depends_on:
- app
app:
build: ./app
container_name: fastapi-app
environment:
- REDIS_HOST=redis
- VLLM_URL=http://vllm:8000/v1/chat/completions
depends_on:
- redis
- vllm
redis:
image: redis:7
container_name: redis
ports:
- "6379:6379"
vllm:
image: vllm/vllm-openai:latest
container_name: vllm
    runtime: nvidia  # when using a GPU
environment:
- NVIDIA_VISIBLE_DEVICES=all
ports:
- "8000:8000"
command: >
--model meta-llama/Llama-2-7b-chat-hf
--host 0.0.0.0
--port 8000
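One thing to watch: meta-llama/Llama-2-7b-chat-hf is a gated model on Hugging Face, so the vllm service will likely need an access token, and mounting a model cache avoids re-downloading weights on every restart. A sketch of the extra lines under the vllm service (the token value is a placeholder):

    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - HUGGING_FACE_HUB_TOKEN=<your-hf-token>
    volumes:
      - ~/.cache/huggingface:/root/.cache/huggingface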
Run
docker compose up -d --build
docker compose logs -f
curl -X POST "http://localhost/chat" \
-H "Content-Type: application/json" \
  -d '{"prompt": "Explain Docker in simple terms"}'