
Building an EXAONE Server with vLLM + FastAPI


Sign up for a Hugging Face account.

 

Create a Hugging Face access token under Settings.
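If you'd rather not type credentials at the git prompt later, the Hugging Face CLI can store the token once. A minimal sketch, assuming the huggingface_hub package:

pip3 install -U "huggingface_hub[cli]"
huggingface-cli login   # paste the access token when prompted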

 

sudo apt update
sudo apt install git-lfs
git lfs install   # register the Git LFS hooks so the large model files are actually pulled

mkdir my_model
cd my_model
git clone https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-1.2B

# When prompted, enter your Hugging Face username and the access token (not your password)
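As an alternative to git clone, the huggingface_hub Python API can download the same snapshot. A sketch; the local_dir path is just an example:

from huggingface_hub import snapshot_download

# Downloads the full model repo; pass token="hf_..." if the repo requires auth
snapshot_download(
    repo_id="LGAI-EXAONE/EXAONE-4.0-1.2B",
    local_dir="/home/ubuntu/my_model/EXAONE-4.0-1.2B",
)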

sudo apt update
sudo apt install python3-pip
pip3 install vllm
pip3 install ray
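Before loading the model, it's worth confirming that the GPU is visible and vllm imports cleanly:

nvidia-smi
python3 -c "import vllm; print(vllm.__version__)"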

 

A quick test to confirm everything works:

from vllm import LLM, SamplingParams

# Path to the downloaded EXAONE model
model_path = "/home/ubuntu/my_model/EXAONE-4.0-1.2B"  # adjust to your path

llm = LLM(
    model=model_path,
    trust_remote_code=True,          # required for EXAONE
    gpu_memory_utilization=0.9,
    tensor_parallel_size=1           # a single GPU is usually enough for 1.2B
)

sampling_params = SamplingParams(
    temperature=0.5,
    top_p=0.7,
    repetition_penalty=1.1,
    max_tokens=1024
)

query = "Tell me about EXAONE."
response = llm.generate(query, sampling_params)

print(response[0].outputs[0].text)

 

Next, hook the model up to FastAPI so it can be reached over HTTP from outside. Save the following as server.py:

 

pip3 install uvicorn fastapi pydantic

 

import uvicorn
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from vllm import LLM, SamplingParams

app = FastAPI()
model_path = "/home/ubuntu/my_model/EXAONE-4.0-1.2B"  # adjust to your path

llm = LLM(
    model=model_path,
    trust_remote_code=True,          # required for EXAONE
    gpu_memory_utilization=0.9,
    tensor_parallel_size=1
)

sampling_params = SamplingParams(
    temperature=0.5,
    top_p=0.7,
    repetition_penalty=1.1,
    max_tokens=1024
)

class QueryRequest(BaseModel):
    query: str

@app.post("/generate/")
async def generate_response(request: QueryRequest):
    try:
        response = llm.generate(request.query, sampling_params)
        result_text = response[0].outputs[0].text
        return {"response": result_text}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

 

Run the server:

uvicorn server:app --host 0.0.0.0 --port 8080 --reload &

If that command is not found (uvicorn is not on PATH), run it through Python instead:

python3 -m uvicorn server:app --host 0.0.0.0 --port 8080 --reload &
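Once the server is up, a quick curl check against the /generate/ endpoint (run on the same machine):

curl -X POST http://127.0.0.1:8080/generate/ \
  -H "Content-Type: application/json" \
  -d '{"query": "Tell me about EXAONE."}'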

 

import requests

url = "http://127.0.0.1:8080/generate/"  # replace 127.0.0.1 with the server's external IP when calling remotely

payload = {
    "query": " EXAONE에 대해 알려줘."
}

headers = {
    "Content-Type": "application/json"
}

response = requests.post(url, json=payload, headers=headers)
print("Status Code:", response.status_code)
if response.status_code == 200:
    print("Response:", response.json()["response"])
else:
    print("Error:", response.text)

 

To stop the server:

ps aux | grep uvicorn
sudo kill <PID>

 

Setting up the server with Docker

 

Dockerfile

FROM nvidia/cuda:12.6.2-cudnn-devel-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

WORKDIR /app

RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 \
    python3-pip \
    git \
    ca-certificates \
 && rm -rf /var/lib/apt/lists/*
RUN python3 -m pip install --upgrade pip

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

VOLUME ["/app/models"]

COPY . .

CMD ["python3", "-m", "uvicorn", "async_server:app", "--host", "0.0.0.0", "--port", "8080"]

 

requirements.txt

vllm
uvicorn
fastapi
pydantic

 

async_server.py

from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, Field
from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams

# Inside the container the model is mounted at /app/models (see VOLUME in the Dockerfile)
model_path = "/app/models/EXAONE-4.0-1.2B"

engine_args = AsyncEngineArgs(
    model=model_path,
    trust_remote_code=True,
    gpu_memory_utilization=0.95,
    tensor_parallel_size=1
)
llm = AsyncLLMEngine.from_engine_args(engine_args)

app = FastAPI()

class QueryRequest(BaseModel):
    request_id: str
    query: str
    n: int = Field(default=1)
    top_p: float = Field(default=0.7)
    temperature: float = Field(default=0.5)
    max_tokens: int = Field(default=1024)
    seed: int = Field(default=42)

@app.post("/generate")
async def generate_post(request: QueryRequest):
    sent_text = ""
    async def stream_response():
        nonlocal sent_text
        sampling_params = SamplingParams( n=request.n, temperature=request.temperature,
            top_p=request.top_p, repetition_penalty=1.1, max_tokens=request.max_tokens,
            seed=request.seed )
        results_generator = llm.generate(request.query, sampling_params,
                                                   request_id=request.request_id )
        async for output in results_generator:
            text = output.outputs[0].text
            new_text = text[len(sent_text):]
            sent_text = text
            for word in new_text.split(" "):
                if word:
                    yield word + " "
            if output.finished:
                return
    return StreamingResponse( stream_response(),  media_type="text/plain" )
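
With the three files above in one directory, the image can be built and run roughly as follows. A sketch: the image name is arbitrary, and --gpus all assumes the NVIDIA Container Toolkit is installed. The host model directory is mounted to match the model_path used in async_server.py:

docker build -t exaone-server .

docker run --gpus all \
  -v /home/ubuntu/my_model/EXAONE-4.0-1.2B:/app/models/EXAONE-4.0-1.2B \
  -p 8080:8080 \
  exaone-server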

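A minimal client sketch for the streaming endpoint above (the request_id value is an arbitrary example; it just needs to be unique per concurrent request):

import requests

payload = {
    "request_id": "req-001",   # arbitrary, but unique per in-flight request
    "query": "Tell me about EXAONE."
}

# stream=True lets us print each plain-text chunk as it arrives
with requests.post("http://127.0.0.1:8080/generate", json=payload, stream=True) as r:
    r.raise_for_status()
    for chunk in r.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)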
 

vLLM serving (CUDA environment)

 

Running vllm serve:

 

NCCL_P2P_DISABLE=1 ~/.local/bin/vllm serve \
  LGAI-EXAONE/EXAONE-4.0-1.2B \
  --host 0.0.0.0 \
  --port 8080 \
  --max-model-len 4096 \
  --tensor-parallel-size 1 \
  --gpu-memory-utilization 0.9 \
  --trust-remote-code
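
vllm serve exposes an OpenAI-compatible API, so a quick sanity check is to list the served models:

curl http://127.0.0.1:8080/v1/models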

 

import requests
import json

url = 'http://127.0.0.1:8080/v1/chat/completions'  # replace 127.0.0.1 with the server's external IP when calling remotely

payload = {
    "model": "LGAI-EXAONE/EXAONE-4.0-1.2B",
    "messages": [
        {
            "role": "user",
            "content": "Explain vLLM."
        }
    ],
    "max_tokens": 4000
}

headers = {
    "Content-Type": "application/json"
}

response = requests.post(url, headers=headers, data=json.dumps(payload))

# Check the response status code
print("Status Code:", response.status_code)

# Pretty-print the JSON response
result = response.json()
print(json.dumps(result, ensure_ascii=False, indent=2))
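
Because the API is OpenAI-compatible, the official openai Python client works as well. A sketch, assuming openai>=1.0 is installed; the api_key is required by the client but ignored by the local server:

from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:8080/v1", api_key="EMPTY")

completion = client.chat.completions.create(
    model="LGAI-EXAONE/EXAONE-4.0-1.2B",
    messages=[{"role": "user", "content": "Explain vLLM."}],
    max_tokens=1024,
)
print(completion.choices[0].message.content)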
