Documentation Index
Fetch the complete documentation index at: https://docs.vast.ai/llms.txt
Use this file to discover all available pages before exploring further.
This example deploys a language model using vLLM’s AsyncLLMEngine and serves text generation requests.
Deployment
# deploy.py
from vastai import Deployment
from vastai.data.query import gpu_name, RTX_4090, RTX_5090
# Deployment handle: contexts and remote functions below register under "vllm".
app = Deployment(name="vllm")
# Hugging Face model ID loaded by the vLLM engine context below.
MODEL = "Qwen/Qwen3-0.6B"
@app.context()
class VLLMEngine:
    """Async context owning a shared vLLM AsyncLLMEngine for this deployment.

    Entered once per worker; remote functions retrieve the live instance via
    app.get_context(VLLMEngine) and reuse self.engine across requests.
    """

    async def __aenter__(self):
        # Imported lazily: vllm is only installed inside the deployment image.
        from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams
        args = AsyncEngineArgs(model=MODEL, max_model_len=512)
        self.engine = AsyncLLMEngine.from_engine_args(args)
        # Warmup: run a dummy generation to ensure model is fully loaded
        # before the context is considered ready (drains the full stream).
        async for _ in self.engine.generate(
            "warmup", SamplingParams(max_tokens=1), request_id="warmup"
        ):
            pass
        return self

    async def __aexit__(self, *exc):
        # Stop the engine's background loop on teardown so the worker
        # can exit cleanly.
        self.engine.shutdown_background_loop()
@app.remote(benchmark_dataset=[{"prompt": "Hello"}])
async def generate(prompt: str, max_tokens: int = 128) -> str:
    """Generate a completion for ``prompt`` using the shared vLLM engine.

    Args:
        prompt: Input text to complete.
        max_tokens: Upper bound on tokens to generate (default 128).

    Returns:
        The text of the first candidate in the final engine output.

    Raises:
        RuntimeError: If the engine stream produced no output at all.
    """
    from vllm import SamplingParams
    import uuid

    engine = app.get_context(VLLMEngine)
    params = SamplingParams(max_tokens=max_tokens, temperature=0.7)
    # Unique id per call so concurrent requests don't collide in the engine.
    request_id = str(uuid.uuid4())
    result = None
    # The engine streams incremental outputs; keep only the final one.
    async for output in engine.engine.generate(prompt, params, request_id=request_id):
        result = output
    if result is None:
        # Fix: the original dereferenced `result` unconditionally and would
        # raise AttributeError if the stream yielded nothing.
        raise RuntimeError(f"vLLM engine produced no output for request {request_id}")
    return result.outputs[0].text
# Base container image for the deployment.
# NOTE(review): assumes the second argument (32) is a resource size (likely
# disk GB) — TODO confirm against the Vast.ai SDK docs.
image = app.image("vastai/vllm:v0.11.0-cuda-12.8-mvc-cuda-12.0", 32)
# Use the image's built-in Python environment instead of creating a new one.
image.use_system_python()
# Pin exact package versions for reproducible deployments.
image.pip_install("vllm==0.11.0", "transformers==4.57.0")
# Constrain scheduling to RTX 4090 / RTX 5090 GPUs.
image.require(gpu_name.in_([RTX_4090, RTX_5090]))
app.configure_autoscaling(min_load=100)
# Block until the deployment is provisioned and serving.
app.ensure_ready()
Client
# client.py
import asyncio
from deploy import app, generate
async def main():
    """Send one remote generation request and print the reply."""
    prompt = "Explain quantum computing in one sentence."
    result = await generate(prompt)
    print(f"Response: {result}")


if __name__ == "__main__":
    asyncio.run(main())
What This Demonstrates
- Using `@app.context()` with vLLM's AsyncLLMEngine for high-performance LLM serving
- Warmup generation in `__aenter__` to pre-allocate the KV cache before benchmarking
- Proper cleanup with `shutdown_background_loop()` in `__aexit__`
- Using `image.use_system_python()` to use the image's built-in Python environment
- Specifying exact package versions for reproducibility
- A simple benchmark dataset with a short prompt