How to configure the Llama 3.1 8B model for self-hosting, and use it in translate5 as a language resource, on a Hetzner Ubuntu server with an NVIDIA RTX 4000 SFF Ada


Building the NVIDIA driver requires the kernel headers for the running kernel; this step takes care of that.
Step 1: Check the kernel headers (needed for the driver build)
    uname -r
    dpkg -l | grep headers | grep $(uname -r)
If nothing shows, install headers:

    sudo apt install -y linux-headers-$(uname -r)


# Get the recommended driver: 
    ubuntu-drivers devices

# Remove all previously installed NVIDIA drivers if needed
    sudo apt purge -y 'nvidia-*'
    sudo apt autoremove -y

    sudo apt update
    sudo apt install -y linux-headers-$(uname -r) build-essential
    sudo apt install -y nvidia-driver-580


    sudo reboot
# Check that the driver modules are loaded
    lsmod | grep nvidia
    
# The output should look like this:
    aleksandar@Ubuntu-2204-jammy-amd64-base:~$ lsmod | grep nvidia
    nvidia_uvm           1781760  0
    nvidia_drm            110592  0
    nvidia_modeset       1556480  1 nvidia_drm
    nvidia              103890944  2 nvidia_uvm,nvidia_modeset
    drm_kms_helper        315392  2 nvidia_drm
    drm                   622592  4 drm_kms_helper,nvidia,nvidia_drm

    
    nvidia-smi

# The output should look like this:
aleksandar@Ubuntu-2204-jammy-amd64-base:~$ nvidia-smi
Thu Sep 25 09:55:20 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.65.06              Driver Version: 580.65.06      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA RTX 4000 SFF Ada ...    Off |   00000000:01:00.0 Off |                  Off |
| 30%   48C    P8              7W /   70W |       2MiB /  20475MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+



# Check that the driver module was built correctly (DKMS)
    dkms status

# The output should look like this:
nvidia/580.65.06, 5.15.0-153-generic, x86_64: installed


# Install Docker
    curl -fsSL https://get.docker.com -o get-docker.sh
    sudo sh get-docker.sh

# Install Docker Compose
    sudo apt install docker-compose-plugin -y

# Add your user to docker group
    sudo usermod -aG docker $USER
    newgrp docker
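
# Optional: confirm Docker now works for your user without sudo (hello-world is just a throwaway test image)
    docker run --rm hello-world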

# Install Python dependencies for monitoring
    sudo apt install python3-pip -y
    pip3 install requests psutil nvidia-ml-py tabulate
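
# Optional sanity check that the NVML bindings can see the GPU
# (nvidia-ml-py installs the pynvml module; this prints the GPU name if everything is wired up)
    python3 -c "import pynvml; pynvml.nvmlInit(); print(pynvml.nvmlDeviceGetName(pynvml.nvmlDeviceGetHandleByIndex(0)))"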

# Create project directory in /opt
    sudo mkdir -p /opt/vllm
    sudo chown -R $USER:$USER /opt/vllm
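
# Optionally create the sub-directories that docker-compose.yml and the startup script expect
# (the startup script also creates most of these, so this only puts the layout in place up front)
    mkdir -p /opt/vllm/{nginx/ssl,nginx/logs,scripts,models,logs}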
    
    
# In /opt/vllm, create a new .env file containing all required configuration:


# HuggingFace Configuration
HUGGING_FACE_HUB_TOKEN=<your Hugging Face access token>
MODEL_NAME=meta-llama/Meta-Llama-3.1-8B-Instruct

# vLLM Configuration
VLLM_PORT=8000
VLLM_API_KEY=<randomly generated key used to authenticate the services; see the generation example below>
TENSOR_PARALLEL_SIZE=1
GPU_MEMORY_UTILIZATION=0.90
MAX_MODEL_LEN=4096

# Nginx Configuration
NGINX_PORT=80
NGINX_SSL_PORT=443

# Server Configuration
SERVER_IP=<public IP address of the server>
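
# One way to generate a strong value for VLLM_API_KEY (run on the server; openssl ships with Ubuntu),
# then paste the output into .env:
    openssl rand -hex 32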

# Now create docker-compose.yml  (in /opt/vllm) with this content

services:
  vllm:
    image: vllm/vllm-openai:latest
    container_name: vllm-server
    # Option 1: Use runtime (requires nvidia-container-toolkit)
    runtime: nvidia
    environment:
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
    volumes:
      - ~/.cache/huggingface:/root/.cache/huggingface
      - ./models:/models
      - ./logs:/logs
    ports:
      - "127.0.0.1:${VLLM_PORT}:8000"
    command: >
      --model ${MODEL_NAME}
      --tensor-parallel-size ${TENSOR_PARALLEL_SIZE}
      --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION}
      --max-model-len ${MAX_MODEL_LEN}
      --api-key ${VLLM_API_KEY}
      --port 8000
      --host 0.0.0.0
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 300s
    restart: unless-stopped
    networks:
      - vllm-network
    logging:
      driver: "json-file"
      options:
        max-size: "100m"
        max-file: "10"

  nginx:
    image: nginx:alpine
    container_name: nginx-proxy
    ports:
      - "${NGINX_PORT}:80"
      - "${NGINX_SSL_PORT}:443"
    volumes:
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
      - ./nginx/logs:/var/log/nginx
      - ./nginx/ssl:/etc/nginx/ssl:ro
    depends_on:
      - vllm
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost/health"]
      interval: 30s
      timeout: 10s
      retries: 3
    restart: unless-stopped
    networks:
      - vllm-network
    logging:
      driver: "json-file"
      options:
        max-size: "50m"
        max-file: "5"

networks:
  vllm-network:
    driver: bridge
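
# Sanity-check the compose file; this prints it with the values from .env substituted
    cd /opt/vllm
    docker compose config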
    


IMPORTANT: the NVIDIA Container Toolkit must be installed correctly, otherwise the containers cannot access the GPU.
# 1. First, remove any old nvidia-docker packages
    sudo apt-get remove nvidia-docker nvidia-docker2 nvidia-container-runtime

# 2. Set up the NVIDIA Container Toolkit repository
    distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
    curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
    curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
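
# Note: apt-key is deprecated on Ubuntu 22.04. If the commands above warn about this, NVIDIA's current
# documentation sets up the repository via a keyring file instead (verify against their install guide):
    curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \
      sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
    curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
      sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
      sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list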

# 3. Update package list
    sudo apt-get update

# 4. Install NVIDIA Container Toolkit
    sudo apt-get install -y nvidia-container-toolkit

# 5. Configure Docker to use the NVIDIA runtime
    sudo nvidia-ctk runtime configure --runtime=docker

# 6. Restart Docker
    sudo systemctl restart docker

# 7. Test if it works
    docker run --rm --runtime=nvidia --gpus all nvidia/cuda:11.8.0-base-ubuntu22.04 nvidia-smi

Output should look like this:

aleksandar@Ubuntu-2204-jammy-amd64-base:/opt/vllm/scripts$ docker run --rm --runtime=nvidia --gpus all nvidia/cuda:11.8.0-base-ubuntu22.04 nvidia-smi
Unable to find image 'nvidia/cuda:11.8.0-base-ubuntu22.04' locally
11.8.0-base-ubuntu22.04: Pulling from nvidia/cuda
aece8493d397: Pull complete 
5e3b7ee77381: Pull complete 
5bd037f007fd: Pull complete 
4cda774ad2ec: Pull complete 
775f22adee62: Pull complete 
Digest: sha256:f895871972c1c91eb6a896eee68468f40289395a1e58c492e1be7929d0f8703b
Status: Downloaded newer image for nvidia/cuda:11.8.0-base-ubuntu22.04
Thu Sep 25 10:20:14 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.65.06              Driver Version: 580.65.06      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA RTX 4000 SFF Ada ...    Off |   00000000:01:00.0 Off |                  Off |
| 30%   31C    P8              5W /   70W |       2MiB /  20475MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+


# Now create the nginx config (make sure the nginx directory exists first)

    mkdir -p /opt/vllm/nginx
    touch /opt/vllm/nginx/nginx.conf

# Put the following content into nginx.conf:

events {
    worker_connections 1024;
}

http {
    upstream vllm_backend {
        server vllm:8000;
        keepalive 32;
    }

    # Rate limiting
    limit_req_zone $binary_remote_addr zone=api_limit:100m rate=100r/s;
    
    # Logging format
    log_format vllm_log '$remote_addr - $remote_user [$time_local] "$request" '
                        '$status $body_bytes_sent "$http_referer" '
                        '"$http_user_agent" rt=$request_time '
                        'uct="$upstream_connect_time" uht="$upstream_header_time" '
                        'urt="$upstream_response_time"';

    server {
        listen 80;
        server_name _;
        
        access_log /var/log/nginx/vllm_access.log vllm_log;
        error_log /var/log/nginx/vllm_error.log warn;

        # Health check endpoint
        location /health {
            access_log off;
            return 200 "healthy\n";
            add_header Content-Type text/plain;
        }

        # vLLM API endpoints
        location / {
            limit_req zone=api_limit burst=20 nodelay;
            
            proxy_pass http://vllm_backend;
            proxy_http_version 1.1;
            proxy_set_header Upgrade $http_upgrade;
            proxy_set_header Connection "upgrade";
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
            
            # Timeouts for LLM streaming
            proxy_connect_timeout 60s;
            proxy_send_timeout 300s;
            proxy_read_timeout 300s;
            
            # Buffer settings for streaming
            proxy_buffering off;
            proxy_cache off;
            
            # CORS headers (adjust as needed)
            add_header Access-Control-Allow-Origin * always;
            add_header Access-Control-Allow-Methods "GET, POST, OPTIONS" always;
            add_header Access-Control-Allow-Headers "Authorization, Content-Type" always;
        }
    }
}
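
# Once the stack is running, the mounted config can be syntax-checked inside the container
# (this only works with the stack up, because nginx resolves the "vllm" upstream name on the compose network)
    docker compose exec nginx nginx -t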


Finally, the startup script. Create it under /opt/vllm/scripts and run it from there.


#!/bin/bash

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

cd "$PROJECT_DIR"

echo "=========================================="
echo "vLLM Deployment Startup"
echo "=========================================="

# Check if .env file exists
if [ ! -f .env ]; then
    echo "❌ Error: .env file not found!"
    echo "Please create .env file with your configuration"
    exit 1
fi

# Load environment variables
source .env

# Validate required environment variables
if [ -z "$HUGGING_FACE_HUB_TOKEN" ]; then
    echo "❌ Error: HUGGING_FACE_HUB_TOKEN not set in .env!"
    exit 1
fi

if [ -z "$VLLM_API_KEY" ]; then
    echo "⚠️  Warning: VLLM_API_KEY not set. Using default (not secure!)"
fi

# Check GPU availability
echo "🔍 Checking GPU availability..."
if ! nvidia-smi &> /dev/null; then
    echo "❌ Error: NVIDIA GPU not detected!"
    echo "Please ensure NVIDIA drivers are installed"
    exit 1
fi

echo "✅ GPU detected:"
nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv,noheader

# Check Docker
echo "🔍 Checking Docker..."
if ! docker --version &> /dev/null; then
    echo "❌ Error: Docker not installed!"
    exit 1
fi

# Check Docker Compose
if ! docker compose version &> /dev/null; then
    echo "❌ Error: Docker Compose not installed!"
    exit 1
fi

# Create necessary directories
echo "📁 Creating directories..."
mkdir -p nginx/logs
mkdir -p logs
mkdir -p models

# Stop existing containers if running
echo "🔄 Stopping existing containers (if any)..."
docker compose down 2>/dev/null || true

# Pull latest images
echo "📥 Pulling Docker images..."
docker compose pull

# Start services
echo "🚀 Starting services..."
docker compose up -d

# Wait for services to be ready
echo "⏳ Waiting for services to start (this may take a few minutes for model loading)..."
sleep 10

# Check startup logs
echo "📋 Checking startup logs..."
echo "--- vLLM logs ---"
docker compose logs --tail=20 vllm

echo ""
echo "--- Nginx logs ---"
docker compose logs --tail=10 nginx

# Wait more for model loading
echo "⏳ Waiting for model to load completely..."
MAX_WAIT=300  # 5 minutes maximum
WAITED=0

while [ $WAITED -lt $MAX_WAIT ]; do
    if curl -s -f -H "Authorization: Bearer ${VLLM_API_KEY}" http://localhost:8000/health > /dev/null 2>&1; then
        echo "✅ vLLM is ready!"
        break
    fi
    echo -n "."
    sleep 5
    WAITED=$((WAITED + 5))
done

if [ $WAITED -ge $MAX_WAIT ]; then
    echo ""
    echo "⚠️  Warning: vLLM health check timeout. Check logs with: docker compose logs vllm"
fi

# Run health check
echo ""
echo "🏥 Running health check..."
python3 scripts/health_check.py

echo ""
echo "=========================================="
echo "✅ Deployment Complete!"
echo "=========================================="
echo ""
echo "📌 Access Points:"
echo "  - vLLM API: http://${SERVER_IP}"
echo "  - API Docs: http://${SERVER_IP}/docs"
echo ""
echo "📝 Useful Commands:"
echo "  - View logs:        docker compose logs -f [vllm|nginx]"
echo "  - Health check:     python3 scripts/health_check.py"
echo "  - Start monitoring: python3 scripts/monitor.py"
echo "  - Stop services:    docker compose down"
echo "  - Restart services: docker compose restart"
echo ""
echo "🔑 API Usage:"
echo "  curl -X POST http://${SERVER_IP}/v1/chat/completions \\"
echo "    -H \"Content-Type: application/json\" \\"
echo "    -H \"Authorization: Bearer ${VLLM_API_KEY}\" \\"
echo "    -d '{\"model\": \"${MODEL_NAME}\", \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}]}'"
echo ""