# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Simple example demonstrating streaming offline inference with AsyncLLM (V1 engine).

This script shows the core functionality of vLLM's AsyncLLM engine for streaming
token-by-token output in offline inference scenarios. It demonstrates DELTA mode
streaming, where you receive only the new tokens as they are generated.

Usage:
    python examples/offline_inference/async_llm_streaming.py
"""

import asyncio

from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.async_llm import AsyncLLM


async def stream_response(engine: AsyncLLM, prompt: str, request_id: str) -> None:
    """
    Stream a response from AsyncLLM and display tokens as they arrive.

    This function demonstrates the core streaming pattern:
    1. Create SamplingParams with the DELTA output kind
    2. Call engine.generate() and iterate over the async generator
    3. Print new tokens as they arrive
    4. Check the finished flag to know when generation is complete
    """
    print(f"\n🚀 Prompt: {prompt!r}")
    print("💬 Response: ", end="", flush=True)

    # Configure sampling parameters for streaming
    sampling_params = SamplingParams(
        max_tokens=100,
        temperature=0.8,
        top_p=0.95,
        seed=42,  # For reproducible results
        output_kind=RequestOutputKind.DELTA,  # Get only new tokens each iteration
    )

    try:
        # Stream tokens from AsyncLLM
        async for output in engine.generate(
            request_id=request_id, prompt=prompt, sampling_params=sampling_params
        ):
            # Process each completion in the output
            for completion in output.outputs:
                # In DELTA mode, we get only the tokens generated since the
                # last iteration
                new_text = completion.text
                if new_text:
                    print(new_text, end="", flush=True)

            # Check if generation is finished
            if output.finished:
                print("\n✅ Generation complete!")
                break

    except Exception as e:
        print(f"\n❌ Error during streaming: {e}")
        raise


async def main():
    print("🔧 Initializing AsyncLLM...")

    # Create the AsyncLLM engine with a simple configuration
    engine_args = AsyncEngineArgs(
        model="meta-llama/Llama-3.2-1B-Instruct",
        enforce_eager=True,  # Faster startup for examples
    )
    engine = AsyncLLM.from_engine_args(engine_args)

    try:
        # Example prompts to demonstrate streaming
        prompts = [
            "The future of artificial intelligence is",
            "In a galaxy far, far away",
            "The key to happiness is",
        ]

        print(f"🎯 Running {len(prompts)} streaming examples...")

        # Process each prompt
        for i, prompt in enumerate(prompts, 1):
            print(f"\n{'=' * 60}")
            print(f"Example {i}/{len(prompts)}")
            print(f"{'=' * 60}")

            request_id = f"stream-example-{i}"
            await stream_response(engine, prompt, request_id)

            # Brief pause between examples
            if i < len(prompts):
                await asyncio.sleep(0.5)

        print("\n🎉 All streaming examples completed!")

    finally:
        # Always clean up the engine
        print("🔧 Shutting down engine...")
        engine.shutdown()


if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\n🛑 Interrupted by user")
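
# Note: DELTA is not the only output kind. For comparison, a minimal sketch of
# CUMULATIVE mode (assuming the same engine.generate() call as above): each
# iteration yields the full text generated so far, so the caller prints only
# the new suffix itself. This is a sketch, not part of the example's flow:
#
#     sampling_params = SamplingParams(
#         max_tokens=100,
#         output_kind=RequestOutputKind.CUMULATIVE,
#     )
#     printed = 0
#     async for output in engine.generate(
#         request_id=request_id, prompt=prompt, sampling_params=sampling_params
#     ):
#         text = output.outputs[0].text
#         print(text[printed:], end="", flush=True)
#         printed = len(text)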