# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""This script demonstrates how to extend the context length
of a Qwen model using the YaRN method (rope_parameters)
and run a simple chat example.

Usage: python examples/offline_inference/context_extension.py
"""

from vllm import LLM, SamplingParams


def create_llm():
    rope_theta = 1000000
    original_max_position_embeddings = 32768
    factor = 4.0

    # Use YaRN to extend the context window by `factor`
    # (32768 * 4 = 131072 tokens).
    hf_overrides = {
        "rope_parameters": {
            "rope_theta": rope_theta,
            "rope_type": "yarn",
            "factor": factor,
            "original_max_position_embeddings": original_max_position_embeddings,
        },
        "max_model_len": int(original_max_position_embeddings * factor),
    }
    llm = LLM(model="Qwen/Qwen3-0.6B", hf_overrides=hf_overrides)
    return llm


def run_llm_chat(llm):
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
        max_tokens=128,
    )
    conversation = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hello! How can I assist you today?"},
    ]
    outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
    return outputs


def print_outputs(outputs):
    print("\nGenerated Outputs:\n" + "-" * 80)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}\n")
        print(f"Generated text: {generated_text!r}")
        print("-" * 80)


def main():
    llm = create_llm()
    outputs = run_llm_chat(llm)
    print_outputs(outputs)


if __name__ == "__main__":
    main()
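

# --- Optional, hedged sketch (not part of the original example) ---
# One way to sanity-check the YaRN override is to send a prompt longer than
# the base 32768-token window and confirm it is accepted. The helper below is
# a hypothetical addition; the filler string and repeat count are rough
# assumptions chosen to land between the original 32768-token limit and the
# extended 131072-token limit.
def check_extended_context(llm):
    # Each repetition is roughly 6-7 tokens, so ~8000 repeats comfortably
    # exceeds the original 32768-token window.
    long_prompt = "vLLM makes long-context inference easy. " * 8000
    params = SamplingParams(temperature=0.0, max_tokens=8)
    outputs = llm.generate([long_prompt], params, use_tqdm=False)
    print(f"Long-prompt generation succeeded: {outputs[0].outputs[0].text!r}")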