Sign up for Hugging Face.
Create a Hugging Face access token under Settings.
sudo apt update
sudo apt install git-lfs
git lfs install
mkdir my_model
cd my_model
git clone https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-1.2B
# Enter your Hugging Face username and your access token (not your account password) when prompted
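Instead of git clone, the model can also be downloaded with the huggingface_hub library. This is only a sketch, assuming huggingface_hub is installed (pip3 install huggingface_hub); the token value is a placeholder for your own access token.

from huggingface_hub import snapshot_download

# Downloads the repository snapshot into the target directory.
# A token is only required for gated or private repositories.
snapshot_download(
    repo_id="LGAI-EXAONE/EXAONE-4.0-1.2B",
    local_dir="/home/ubuntu/my_model/EXAONE-4.0-1.2B",
    token="hf_..."  # placeholder: replace with your own access token
)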
sudo apt update
sudo apt install python3-pip
pip3 install vllm
pip3 install ray
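Before loading the model, it can help to confirm that vLLM is installed and that a GPU is visible. A minimal check (a sketch; the version printed will vary with your installation):

import torch
import vllm

print("vLLM version:", vllm.__version__)
print("CUDA available:", torch.cuda.is_available())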
Run a quick test to check that it works:
from vllm import LLM, SamplingParams
# Path to the EXAONE model
model_path = "/home/ubuntu/my_model/EXAONE-4.0-1.2B"  # check the path
llm = LLM(
    model=model_path,
    trust_remote_code=True,      # required for EXAONE
    gpu_memory_utilization=0.9,
    tensor_parallel_size=1       # 1.2B → usually 1 is enough
)
sampling_params = SamplingParams(
    temperature=0.5,
    top_p=0.7,
    repetition_penalty=1.1,
    max_tokens=1024
)
query = "EXAONE에 대해 알려줘."
response = llm.generate(query, sampling_params)
print(response[0].outputs[0].text)
Integrate with FastAPI so the model can be accessed externally over HTTP.
pip3 install uvicorn fastapi pydantic
import uvicorn
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from vllm import LLM, SamplingParams
app = FastAPI()
model_path = "/home/ubuntu/my_model/EXAONE-4.0-1.2B"  # check the path
llm = LLM(
    model=model_path,
    trust_remote_code=True,
    gpu_memory_utilization=0.9,
    tensor_parallel_size=1
)
sampling_params = SamplingParams(
    temperature=0.5,
    top_p=0.7,
    repetition_penalty=1.1,
    max_tokens=1024
)

class QueryRequest(BaseModel):
    query: str

@app.post("/generate/")
async def generate_response(request: QueryRequest):
    try:
        response = llm.generate(request.query, sampling_params)
        result_text = response[0].outputs[0].text
        return {"response": result_text}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
Save the code above as server.py, then run the server:
uvicorn server:app --host 0.0.0.0 --port 8080 --reload &
If that fails (for example, because uvicorn is not on your PATH), run:
python3 -m uvicorn server:app --host 0.0.0.0 --port 8080 --reload &
import requests
url = "http://127.0.0.1:8080/generate/"  # use the server's external IP if calling remotely
payload = {
    "query": "EXAONE에 대해 알려줘."
}
headers = {
    "Content-Type": "application/json"
}
response = requests.post(url, json=payload, headers=headers)
print("Status Code:", response.status_code)
if response.status_code == 200:
    print("Response:", response.json()["response"])
else:
    print("Error:", response.text)
To stop the server:
ps aux | grep uvicorn
sudo kill <PID>
Setting up the server with Docker
Dockerfile
FROM nvidia/cuda:12.6.2-cudnn-devel-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive \
PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1
WORKDIR /app
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 \
python3-pip \
git \
ca-certificates \
&& rm -rf /var/lib/apt/lists/*
RUN python3 -m pip install --upgrade pip
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
VOLUME ["/app/models"]
COPY . .
CMD ["python3", "-m", "uvicorn", "async_server:app", "--host", "0.0.0.0", "--port", "8080"]
requirements.txt
vllm
uvicorn
fastapi
pydantic
async_server.py
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, Field
from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams
model_path = "/home/ubuntu/my_model/EXAONE-4.0-1.2B"  # check the path (inside the container, point this at the mounted volume, e.g. /app/models/EXAONE-4.0-1.2B)
engine_args = AsyncEngineArgs(
    model=model_path,
    trust_remote_code=True,
    gpu_memory_utilization=0.95,
    tensor_parallel_size=1
)
llm = AsyncLLMEngine.from_engine_args(engine_args)
app = FastAPI()
class QueryRequest(BaseModel):
    request_id: str
    query: str
    n: int = Field(default=1)
    top_p: float = Field(default=0.7)
    temperature: float = Field(default=0.5)
    max_tokens: int = Field(default=1024)
    seed: int = Field(default=42)
@app.post("/generate")
async def generate_post(request: QueryRequest):
    sent_text = ""

    async def stream_response():
        nonlocal sent_text
        sampling_params = SamplingParams(
            n=request.n,
            temperature=request.temperature,
            top_p=request.top_p,
            repetition_penalty=1.1,
            max_tokens=request.max_tokens,
            seed=request.seed
        )
        results_generator = llm.generate(request.query, sampling_params,
                                         request_id=request.request_id)
        # Each iteration yields the full text generated so far;
        # send only the part that has not been sent yet.
        async for output in results_generator:
            text = output.outputs[0].text
            new_text = text[len(sent_text):]
            sent_text = text
            for word in new_text.split(" "):
                if word:
                    yield word + " "
            if output.finished:
                return

    return StreamingResponse(stream_response(), media_type="text/plain")
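Build and run the container. The commands below are a sketch: the image name exaone-server is arbitrary, --gpus all assumes the NVIDIA Container Toolkit is installed, and the host model directory is mounted onto the /app/models volume declared in the Dockerfile (so model_path in async_server.py should point there).

docker build -t exaone-server .
docker run --gpus all -p 8080:8080 \
  -v /home/ubuntu/my_model:/app/models \
  exaone-server

Because /generate streams plain text rather than returning JSON, the earlier JSON client will not work against it. A streaming client sketch (field names follow the QueryRequest model above; the request_id value is arbitrary):

import requests

url = "http://127.0.0.1:8080/generate"  # use the server's external IP if calling remotely
payload = {
    "request_id": "req-001",
    "query": "EXAONE에 대해 알려줘."
}

# stream=True lets us read the response incrementally as the server yields words.
with requests.post(url, json=payload, stream=True) as response:
    response.raise_for_status()
    response.encoding = "utf-8"  # the server streams UTF-8 text
    for chunk in response.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)
print()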
vLLM serving (CUDA environment)
Run vllm serve:
NCCL_P2P_DISABLE=1 ~/.local/bin/vllm serve \
LGAI-EXAONE/EXAONE-4.0-1.2B \
--host 0.0.0.0 \
--port 8080 \
--max-model-len 4096 \
--tensor-parallel-size 1 \
--gpu-memory-utilization 0.9 \
--trust-remote-code
import requests
import json
url = 'http://127.0.0.1:8080/v1/chat/completions'  # use the server's external IP if calling remotely
payload = {
    "model": "LGAI-EXAONE/EXAONE-4.0-1.2B",
    "messages": [
        {
            "role": "user",
            "content": "vLLM에 대해 설명해줘"
        }
    ],
    "max_tokens": 4000
}
headers = {
    "Content-Type": "application/json"
}
response = requests.post(url, headers=headers, data=json.dumps(payload))
# Check the response status code
print("Status Code:", response.status_code)
# Print the JSON response
result = response.json()
print(json.dumps(result, ensure_ascii=False, indent=2))
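Since vllm serve exposes an OpenAI-compatible API, the official openai Python package can also be used. A minimal sketch, assuming pip3 install openai; the api_key value is a placeholder because the server above was started without authentication.

from openai import OpenAI

client = OpenAI(
    base_url="http://127.0.0.1:8080/v1",  # use the server's external IP if calling remotely
    api_key="EMPTY"  # placeholder; not checked unless the server enforces an API key
)

completion = client.chat.completions.create(
    model="LGAI-EXAONE/EXAONE-4.0-1.2B",
    messages=[{"role": "user", "content": "vLLM에 대해 설명해줘"}],
    max_tokens=1024
)
print(completion.choices[0].message.content)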