vllm.model_executor.model_loader.tensorizer

__all__ module-attribute

__all__ = [
    "EncryptionParams",
    "DecryptionParams",
    "TensorDeserializer",
    "TensorSerializer",
    "open_stream",
    "convert_bytes",
    "get_mem_usage",
    "no_init_or_tensor",
    "TensorizerConfig",
]

_read_stream module-attribute

_read_stream = placeholder_attr('_read_stream')

_write_stream module-attribute

_write_stream = placeholder_attr('_write_stream')

logger module-attribute

logger = init_logger(__name__)

tensorizer module-attribute

tensorizer = PlaceholderModule('tensorizer')

MetaTensorMode

Bases: TorchDispatchMode

Source code in vllm/model_executor/model_loader/tensorizer.py
class MetaTensorMode(TorchDispatchMode):

    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}

        if func._schema.name == "aten::empty" and "device" not in kwargs:
            kwargs["device"] = "meta"

        return func(*args, **kwargs)

__torch_dispatch__

__torch_dispatch__(func, types, args=(), kwargs=None)
Source code in vllm/model_executor/model_loader/tensorizer.py
def __torch_dispatch__(self, func, types, args=(), kwargs=None):
    kwargs = kwargs or {}

    if func._schema.name == "aten::empty" and "device" not in kwargs:
        kwargs["device"] = "meta"

    return func(*args, **kwargs)
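
A small illustrative sketch of the dispatch hook above (not part of the module): inside the mode, allocations that reach aten::empty without an explicit device argument are redirected to the meta device, while calls that pass a device explicitly keep it.

import torch

from vllm.model_executor.model_loader.tensorizer import MetaTensorMode

# Sketch only: factory calls that reach aten::empty with no device argument
# are expected to land on the meta device, so no real memory is allocated.
with MetaTensorMode():
    weight = torch.empty(1024, 1024)       # redirected to the meta device
    pinned = torch.empty(8, device="cpu")  # explicit device is respected

print(weight.device, pinned.device)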

TensorizerArgs dataclass

Source code in vllm/model_executor/model_loader/tensorizer.py
@dataclass
class TensorizerArgs:
    tensorizer_uri: Union[io.BufferedIOBase, io.RawIOBase, BinaryIO, str,
                          bytes, os.PathLike, int]
    vllm_tensorized: Optional[bool] = False
    verify_hash: Optional[bool] = False
    num_readers: Optional[int] = None
    encryption_keyfile: Optional[str] = None
    s3_access_key_id: Optional[str] = None
    s3_secret_access_key: Optional[str] = None
    s3_endpoint: Optional[str] = None
    """
  Args for the TensorizerAgent class. These are used to configure the behavior 
  of the TensorDeserializer when loading tensors from a serialized model.

  Args:
      tensorizer_uri: Path to serialized model tensors. Can be a local file 
          path or a S3 URI. This is a required field unless lora_dir is 
          provided and the config is meant to be used for the
          `tensorize_lora_adapter` function.
      vllm_tensorized: If True, indicates that the serialized model is a 
          vLLM model. This is used to determine the behavior of the 
          TensorDeserializer when loading tensors from a serialized model.
          It is far faster to deserialize a vLLM model as it utilizes
          tensorizer's optimized GPU loading. Note that this is now
          deprecated, as serialized vLLM models are now automatically
          inferred as vLLM models.
      verify_hash: If True, the hashes of each tensor will be verified against 
          the hashes stored in the metadata. A `HashMismatchError` will be 
          raised if any of the hashes do not match.
      num_readers: Controls how many threads are allowed to read concurrently
          from the source file. Default is `None`, which will dynamically set
          the number of readers based on the number of available 
          resources and model size. This greatly increases performance.
      encryption_keyfile: File path to a binary file containing a  
          binary key to use for decryption. `None` (the default) means 
          no decryption. See the example script in 
          examples/others/tensorize_vllm_model.py. 
      s3_access_key_id: The access key for the S3 bucket. Can also be set via
          the S3_ACCESS_KEY_ID environment variable.
      s3_secret_access_key: The secret access key for the S3 bucket. Can also
          be set via the S3_SECRET_ACCESS_KEY environment variable.
      s3_endpoint: The endpoint for the S3 bucket. Can also be set via the
          S3_ENDPOINT_URL environment variable.
  """

    def __post_init__(self):
        self.file_obj = self.tensorizer_uri
        self.s3_access_key_id = self.s3_access_key_id or envs.S3_ACCESS_KEY_ID
        self.s3_secret_access_key = (self.s3_secret_access_key
                                     or envs.S3_SECRET_ACCESS_KEY)
        self.s3_endpoint = self.s3_endpoint or envs.S3_ENDPOINT_URL
        self.stream_params = {
            "s3_access_key_id": self.s3_access_key_id,
            "s3_secret_access_key": self.s3_secret_access_key,
            "s3_endpoint": self.s3_endpoint,
        }

        self.deserializer_params = {
            "verify_hash": self.verify_hash,
            "encryption": self.encryption_keyfile,
            "num_readers": self.num_readers
        }

        if self.encryption_keyfile:
            with open_stream(
                    self.encryption_keyfile,
                    **self.stream_params,
            ) as stream:
                key = stream.read()
                decryption_params = DecryptionParams.from_key(key)
                self.deserializer_params['encryption'] = decryption_params

    @staticmethod
    def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
        """Tensorizer CLI arguments"""

        # Tensorizer options arg group
        group = parser.add_argument_group(
            'tensorizer options',
            description=('Options for configuring the behavior of the'
                         ' tensorizer deserializer when '
                         'load_format=tensorizer is specified when '
                         'initializing an LLMEngine, either via the CLI '
                         'when running the vLLM OpenAI inference server '
                         'with a JSON string passed to '
                         '--model-loader-extra-config or as arguments given '
                         'to TensorizerConfig when passed to '
                         'model_loader_extra_config in the constructor '
                         'for LLMEngine.'))

        group.add_argument(
            "--tensorizer-uri",
            type=str,
            help="Path to serialized model tensors. Can be a local file path,"
            " or an HTTP(S) or S3 URI.",
        )
        group.add_argument(
            "--verify-hash",
            action="store_true",
            help="If enabled, the hashes of each tensor will be verified"
            " against the hashes stored in the file metadata. An exception"
            " will be raised if any of the hashes do not match.",
        )
        group.add_argument(
            "--encryption-keyfile",
            type=str,
            default=None,
            help="The file path to a binary file containing a binary key to "
            "use for decryption. Can be a file path or S3 network URI.")
        group.add_argument(
            "--num-readers",
            default=None,
            type=int,
            help="Controls how many threads are allowed to read concurrently "
            "from the source file. Default is `None`, which will dynamically "
            "set the number of readers based on the available resources "
            "and model size. This greatly increases performance.")
        group.add_argument(
            "--s3-access-key-id",
            type=str,
            default=None,
            help="The access key for the S3 bucket. Can also be set via the "
            "S3_ACCESS_KEY_ID environment variable.",
        )
        group.add_argument(
            "--s3-secret-access-key",
            type=str,
            default=None,
            help="The secret access key for the S3 bucket. Can also be set via "
            "the S3_SECRET_ACCESS_KEY environment variable.",
        )
        group.add_argument(
            "--s3-endpoint",
            type=str,
            default=None,
            help="The endpoint for the S3 bucket. Can also be set via the "
            "S3_ENDPOINT_URL environment variable.",
        )

        return parser

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> "TensorizerArgs":
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        tensorizer_args = cls(**{
            attr: getattr(args, attr)
            for attr in attrs if hasattr(args, attr)
        })
        return tensorizer_args

encryption_keyfile class-attribute instance-attribute

encryption_keyfile: Optional[str] = None

num_readers class-attribute instance-attribute

num_readers: Optional[int] = None

s3_access_key_id class-attribute instance-attribute

s3_access_key_id: Optional[str] = None

s3_endpoint class-attribute instance-attribute

s3_endpoint: Optional[str] = None

Args for the TensorizerAgent class. These are used to configure the behavior of the TensorDeserializer when loading tensors from a serialized model.

Parameters:

- tensorizer_uri (required): Path to serialized model tensors. Can be a local file path or an S3 URI. This is a required field unless lora_dir is provided and the config is meant to be used for the tensorize_lora_adapter function.
- vllm_tensorized: If True, indicates that the serialized model is a vLLM model. This is used to determine the behavior of the TensorDeserializer when loading tensors from a serialized model. It is far faster to deserialize a vLLM model, as it utilizes tensorizer's optimized GPU loading. Note that this is now deprecated, as serialized vLLM models are now automatically inferred as vLLM models.
- verify_hash: If True, the hashes of each tensor will be verified against the hashes stored in the metadata. A HashMismatchError will be raised if any of the hashes do not match.
- num_readers: Controls how many threads are allowed to read concurrently from the source file. Default is None, which dynamically sets the number of readers based on the available resources and model size. This greatly increases performance.
- encryption_keyfile: File path to a binary file containing a binary key to use for decryption. None (the default) means no decryption. See the example script in examples/others/tensorize_vllm_model.py.
- s3_access_key_id: The access key for the S3 bucket. Can also be set via the S3_ACCESS_KEY_ID environment variable.
- s3_secret_access_key: The secret access key for the S3 bucket. Can also be set via the S3_SECRET_ACCESS_KEY environment variable.
- s3_endpoint: The endpoint for the S3 bucket. Can also be set via the S3_ENDPOINT_URL environment variable.

s3_secret_access_key class-attribute instance-attribute

s3_secret_access_key: Optional[str] = None

tensorizer_uri instance-attribute

verify_hash class-attribute instance-attribute

verify_hash: Optional[bool] = False

vllm_tensorized class-attribute instance-attribute

vllm_tensorized: Optional[bool] = False

__init__

__init__(
    tensorizer_uri: Union[
        BufferedIOBase,
        RawIOBase,
        BinaryIO,
        str,
        bytes,
        PathLike,
        int,
    ],
    vllm_tensorized: Optional[bool] = False,
    verify_hash: Optional[bool] = False,
    num_readers: Optional[int] = None,
    encryption_keyfile: Optional[str] = None,
    s3_access_key_id: Optional[str] = None,
    s3_secret_access_key: Optional[str] = None,
    s3_endpoint: Optional[str] = None,
) -> None

__post_init__

__post_init__()
Source code in vllm/model_executor/model_loader/tensorizer.py
def __post_init__(self):
    self.file_obj = self.tensorizer_uri
    self.s3_access_key_id = self.s3_access_key_id or envs.S3_ACCESS_KEY_ID
    self.s3_secret_access_key = (self.s3_secret_access_key
                                 or envs.S3_SECRET_ACCESS_KEY)
    self.s3_endpoint = self.s3_endpoint or envs.S3_ENDPOINT_URL
    self.stream_params = {
        "s3_access_key_id": self.s3_access_key_id,
        "s3_secret_access_key": self.s3_secret_access_key,
        "s3_endpoint": self.s3_endpoint,
    }

    self.deserializer_params = {
        "verify_hash": self.verify_hash,
        "encryption": self.encryption_keyfile,
        "num_readers": self.num_readers
    }

    if self.encryption_keyfile:
        with open_stream(
                self.encryption_keyfile,
                **self.stream_params,
        ) as stream:
            key = stream.read()
            decryption_params = DecryptionParams.from_key(key)
            self.deserializer_params['encryption'] = decryption_params

add_cli_args staticmethod

add_cli_args(
    parser: FlexibleArgumentParser,
) -> FlexibleArgumentParser

Tensorizer CLI arguments

Source code in vllm/model_executor/model_loader/tensorizer.py
@staticmethod
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
    """Tensorizer CLI arguments"""

    # Tensorizer options arg group
    group = parser.add_argument_group(
        'tensorizer options',
        description=('Options for configuring the behavior of the'
                     ' tensorizer deserializer when '
                     'load_format=tensorizer is specified when '
                     'initializing an LLMEngine, either via the CLI '
                     'when running the vLLM OpenAI inference server '
                     'with a JSON string passed to '
                     '--model-loader-extra-config or as arguments given '
                     'to TensorizerConfig when passed to '
                     'model_loader_extra_config in the constructor '
                     'for LLMEngine.'))

    group.add_argument(
        "--tensorizer-uri",
        type=str,
        help="Path to serialized model tensors. Can be a local file path,"
        " or an HTTP(S) or S3 URI.",
    )
    group.add_argument(
        "--verify-hash",
        action="store_true",
        help="If enabled, the hashes of each tensor will be verified"
        " against the hashes stored in the file metadata. An exception"
        " will be raised if any of the hashes do not match.",
    )
    group.add_argument(
        "--encryption-keyfile",
        type=str,
        default=None,
        help="The file path to a binary file containing a binary key to "
        "use for decryption. Can be a file path or S3 network URI.")
    group.add_argument(
        "--num-readers",
        default=None,
        type=int,
        help="Controls how many threads are allowed to read concurrently "
        "from the source file. Default is `None`, which will dynamically "
        "set the number of readers based on the available resources "
        "and model size. This greatly increases performance.")
    group.add_argument(
        "--s3-access-key-id",
        type=str,
        default=None,
        help="The access key for the S3 bucket. Can also be set via the "
        "S3_ACCESS_KEY_ID environment variable.",
    )
    group.add_argument(
        "--s3-secret-access-key",
        type=str,
        default=None,
        help="The secret access key for the S3 bucket. Can also be set via "
        "the S3_SECRET_ACCESS_KEY environment variable.",
    )
    group.add_argument(
        "--s3-endpoint",
        type=str,
        default=None,
        help="The endpoint for the S3 bucket. Can also be set via the "
        "S3_ENDPOINT_URL environment variable.",
    )

    return parser

from_cli_args classmethod

from_cli_args(args: Namespace) -> TensorizerArgs
Source code in vllm/model_executor/model_loader/tensorizer.py
@classmethod
def from_cli_args(cls, args: argparse.Namespace) -> "TensorizerArgs":
    attrs = [attr.name for attr in dataclasses.fields(cls)]
    tensorizer_args = cls(**{
        attr: getattr(args, attr)
        for attr in attrs if hasattr(args, attr)
    })
    return tensorizer_args
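
A hedged sketch of how add_cli_args and from_cli_args fit together. A plain argparse.ArgumentParser stands in for vLLM's FlexibleArgumentParser, since only the standard add_argument_group/add_argument API is exercised; the URI is a placeholder.

import argparse

from vllm.model_executor.model_loader.tensorizer import TensorizerArgs

parser = argparse.ArgumentParser()
TensorizerArgs.add_cli_args(parser)

args = parser.parse_args([
    "--tensorizer-uri", "s3://my-bucket/model.tensors",
    "--num-readers", "4",
])

# from_cli_args only picks up dataclass fields present on the namespace, so
# options not exposed by add_cli_args (e.g. vllm_tensorized) keep their
# defaults.
tensorizer_args = TensorizerArgs.from_cli_args(args)
print(tensorizer_args.stream_params)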

TensorizerConfig dataclass

Source code in vllm/model_executor/model_loader/tensorizer.py
@dataclass
class TensorizerConfig:
    tensorizer_uri: Union[str, None] = None
    vllm_tensorized: Optional[bool] = False
    verify_hash: Optional[bool] = False
    num_readers: Optional[int] = None
    encryption_keyfile: Optional[str] = None
    s3_access_key_id: Optional[str] = None
    s3_secret_access_key: Optional[str] = None
    s3_endpoint: Optional[str] = None
    model_class: Optional[type[torch.nn.Module]] = None
    hf_config: Optional[PretrainedConfig] = None
    dtype: Optional[Union[str, torch.dtype]] = None
    lora_dir: Optional[str] = None
    _is_sharded: bool = False

    def __post_init__(self):
        # check if the configuration is for a sharded vLLM model
        self._is_sharded = isinstance(self.tensorizer_uri, str) \
            and re.search(r'%0\dd', self.tensorizer_uri) is not None
        if not self.tensorizer_uri and not self.lora_dir:
            raise ValueError("tensorizer_uri must be provided.")
        if not self.tensorizer_uri and self.lora_dir:
            self.tensorizer_uri = f"{self.lora_dir}/adapter_model.tensors"
        assert self.tensorizer_uri is not None, ("tensorizer_uri must be "
                                                 "provided.")
        self.tensorizer_dir = os.path.dirname(self.tensorizer_uri)
        self.lora_dir = self.tensorizer_dir

    @classmethod
    def as_dict(cls, *args, **kwargs) -> dict[str, Any]:
        cfg = TensorizerConfig(*args, **kwargs)
        return dataclasses.asdict(cfg)

    def to_dict(self) -> dict[str, Any]:
        return dataclasses.asdict(self)

    def _construct_tensorizer_args(self) -> "TensorizerArgs":
        tensorizer_args = {
            "tensorizer_uri": self.tensorizer_uri,
            "vllm_tensorized": self.vllm_tensorized,
            "verify_hash": self.verify_hash,
            "num_readers": self.num_readers,
            "encryption_keyfile": self.encryption_keyfile,
            "s3_access_key_id": self.s3_access_key_id,
            "s3_secret_access_key": self.s3_secret_access_key,
            "s3_endpoint": self.s3_endpoint,
        }
        return TensorizerArgs(**tensorizer_args)  # type: ignore

    def verify_with_parallel_config(
        self,
        parallel_config: "ParallelConfig",
    ) -> None:
        if parallel_config.tensor_parallel_size > 1 \
            and not self._is_sharded:
            raise ValueError(
                "For a sharded model, tensorizer_uri should include a"
                " string format template like '%04d' to be formatted"
                " with the rank of the shard")

    def verify_with_model_config(self, model_config: "ModelConfig") -> None:
        if (model_config.quantization is not None
                and self.tensorizer_uri is not None):
            logger.warning(
                "Loading a model using Tensorizer with quantization on vLLM"
                " is unstable and may lead to errors.")

    def open_stream(self, tensorizer_args: Optional["TensorizerArgs"] = None):
        if tensorizer_args is None:
            tensorizer_args = self._construct_tensorizer_args()

        return open_stream(self.tensorizer_uri,
                           **tensorizer_args.stream_params)
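
A hedged usage sketch for loading with this config (model name and URI are placeholders, and the URI is assumed to already hold a model serialized with tensorize_vllm_model): the config is passed as model_loader_extra_config together with load_format="tensorizer".

from vllm import LLM
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig

config = TensorizerConfig(
    tensorizer_uri="s3://my-bucket/opt-125m/model.tensors",
    num_readers=4,
)

llm = LLM(
    model="facebook/opt-125m",
    load_format="tensorizer",
    model_loader_extra_config=config,
)

If a plain dict is needed instead, for example when passing --model-loader-extra-config as a JSON string on the command line, to_dict() or as_dict() produce the equivalent mapping.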

_is_sharded class-attribute instance-attribute

_is_sharded: bool = False

dtype class-attribute instance-attribute

dtype: Optional[Union[str, dtype]] = None

encryption_keyfile class-attribute instance-attribute

encryption_keyfile: Optional[str] = None

hf_config class-attribute instance-attribute

hf_config: Optional[PretrainedConfig] = None

lora_dir class-attribute instance-attribute

lora_dir: Optional[str] = None

model_class class-attribute instance-attribute

model_class: Optional[type[Module]] = None

num_readers class-attribute instance-attribute

num_readers: Optional[int] = None

s3_access_key_id class-attribute instance-attribute

s3_access_key_id: Optional[str] = None

s3_endpoint class-attribute instance-attribute

s3_endpoint: Optional[str] = None

s3_secret_access_key class-attribute instance-attribute

s3_secret_access_key: Optional[str] = None

tensorizer_uri class-attribute instance-attribute

tensorizer_uri: Union[str, None] = None

verify_hash class-attribute instance-attribute

verify_hash: Optional[bool] = False

vllm_tensorized class-attribute instance-attribute

vllm_tensorized: Optional[bool] = False

__init__

__init__(
    tensorizer_uri: Union[str, None] = None,
    vllm_tensorized: Optional[bool] = False,
    verify_hash: Optional[bool] = False,
    num_readers: Optional[int] = None,
    encryption_keyfile: Optional[str] = None,
    s3_access_key_id: Optional[str] = None,
    s3_secret_access_key: Optional[str] = None,
    s3_endpoint: Optional[str] = None,
    model_class: Optional[type[Module]] = None,
    hf_config: Optional[PretrainedConfig] = None,
    dtype: Optional[Union[str, dtype]] = None,
    lora_dir: Optional[str] = None,
    _is_sharded: bool = False,
) -> None

__post_init__

__post_init__()
Source code in vllm/model_executor/model_loader/tensorizer.py
def __post_init__(self):
    # check if the configuration is for a sharded vLLM model
    self._is_sharded = isinstance(self.tensorizer_uri, str) \
        and re.search(r'%0\dd', self.tensorizer_uri) is not None
    if not self.tensorizer_uri and not self.lora_dir:
        raise ValueError("tensorizer_uri must be provided.")
    if not self.tensorizer_uri and self.lora_dir:
        self.tensorizer_uri = f"{self.lora_dir}/adapter_model.tensors"
    assert self.tensorizer_uri is not None, ("tensorizer_uri must be "
                                             "provided.")
    self.tensorizer_dir = os.path.dirname(self.tensorizer_uri)
    self.lora_dir = self.tensorizer_dir

_construct_tensorizer_args

_construct_tensorizer_args() -> TensorizerArgs
Source code in vllm/model_executor/model_loader/tensorizer.py
def _construct_tensorizer_args(self) -> "TensorizerArgs":
    tensorizer_args = {
        "tensorizer_uri": self.tensorizer_uri,
        "vllm_tensorized": self.vllm_tensorized,
        "verify_hash": self.verify_hash,
        "num_readers": self.num_readers,
        "encryption_keyfile": self.encryption_keyfile,
        "s3_access_key_id": self.s3_access_key_id,
        "s3_secret_access_key": self.s3_secret_access_key,
        "s3_endpoint": self.s3_endpoint,
    }
    return TensorizerArgs(**tensorizer_args)  # type: ignore

as_dict classmethod

as_dict(*args, **kwargs) -> dict[str, Any]
Source code in vllm/model_executor/model_loader/tensorizer.py
@classmethod
def as_dict(cls, *args, **kwargs) -> dict[str, Any]:
    cfg = TensorizerConfig(*args, **kwargs)
    return dataclasses.asdict(cfg)

open_stream

open_stream(
    tensorizer_args: Optional[TensorizerArgs] = None,
)
Source code in vllm/model_executor/model_loader/tensorizer.py
def open_stream(self, tensorizer_args: Optional["TensorizerArgs"] = None):
    if tensorizer_args is None:
        tensorizer_args = self._construct_tensorizer_args()

    return open_stream(self.tensorizer_uri,
                       **tensorizer_args.stream_params)

to_dict

to_dict() -> dict[str, Any]
Source code in vllm/model_executor/model_loader/tensorizer.py
def to_dict(self) -> dict[str, Any]:
    return dataclasses.asdict(self)

verify_with_model_config

verify_with_model_config(model_config: ModelConfig) -> None
Source code in vllm/model_executor/model_loader/tensorizer.py
def verify_with_model_config(self, model_config: "ModelConfig") -> None:
    if (model_config.quantization is not None
            and self.tensorizer_uri is not None):
        logger.warning(
            "Loading a model using Tensorizer with quantization on vLLM"
            " is unstable and may lead to errors.")

verify_with_parallel_config

verify_with_parallel_config(
    parallel_config: ParallelConfig,
) -> None
Source code in vllm/model_executor/model_loader/tensorizer.py
def verify_with_parallel_config(
    self,
    parallel_config: "ParallelConfig",
) -> None:
    if parallel_config.tensor_parallel_size > 1 \
        and not self._is_sharded:
        raise ValueError(
            "For a sharded model, tensorizer_uri should include a"
            " string format template like '%04d' to be formatted"
            " with the rank of the shard")

_NoInitOrTensorImpl

Source code in vllm/model_executor/model_loader/tensorizer.py
class _NoInitOrTensorImpl:
    _MODULES = (torch.nn.Linear, torch.nn.Embedding, torch.nn.LayerNorm)
    _MODULE_ORIGINALS = tuple((m, m.reset_parameters) for m in _MODULES)

    is_active = contextvars.ContextVar("_NoInitOrTensorImpl.is_active",
                                       default=False)
    _count_active: int = 0
    _count_active_lock = threading.Lock()

    @classmethod
    @contextlib.contextmanager
    def context_manager(cls):
        if cls.is_active.get():
            yield
            return

        with cls._count_active_lock:
            cls._count_active += 1
            if cls._count_active == 1:
                for mod in cls._MODULES:
                    mod.reset_parameters = cls._disable(mod.reset_parameters)

        reset_token = cls.is_active.set(True)

        try:
            with MetaTensorMode():
                yield
        finally:
            cls.is_active.reset(reset_token)
            with cls._count_active_lock:
                cls._count_active -= 1
                if cls._count_active == 0:
                    for mod, original in cls._MODULE_ORIGINALS:
                        mod.reset_parameters = original

    @staticmethod
    def _disable(func):

        def wrapper(*args, **kwargs):
            if not _NoInitOrTensorImpl.is_active.get():
                return func(*args, **kwargs)

        return wrapper

_MODULES class-attribute instance-attribute

_MODULES = (Linear, Embedding, LayerNorm)

_MODULE_ORIGINALS class-attribute instance-attribute

_MODULE_ORIGINALS = tuple(
    (m, reset_parameters) for m in _MODULES
)

_count_active class-attribute instance-attribute

_count_active: int = 0

_count_active_lock class-attribute instance-attribute

_count_active_lock = Lock()

is_active class-attribute instance-attribute

is_active = ContextVar(
    "_NoInitOrTensorImpl.is_active", default=False
)

_disable staticmethod

_disable(func)
Source code in vllm/model_executor/model_loader/tensorizer.py
@staticmethod
def _disable(func):

    def wrapper(*args, **kwargs):
        if not _NoInitOrTensorImpl.is_active.get():
            return func(*args, **kwargs)

    return wrapper

context_manager classmethod

context_manager()
Source code in vllm/model_executor/model_loader/tensorizer.py
@classmethod
@contextlib.contextmanager
def context_manager(cls):
    if cls.is_active.get():
        yield
        return

    with cls._count_active_lock:
        cls._count_active += 1
        if cls._count_active == 1:
            for mod in cls._MODULES:
                mod.reset_parameters = cls._disable(mod.reset_parameters)

    reset_token = cls.is_active.set(True)

    try:
        with MetaTensorMode():
            yield
    finally:
        cls.is_active.reset(reset_token)
        with cls._count_active_lock:
            cls._count_active -= 1
            if cls._count_active == 0:
                for mod, original in cls._MODULE_ORIGINALS:
                    mod.reset_parameters = original

_check_tensors_on_meta_device

_check_tensors_on_meta_device(model: Module) -> None
Source code in vllm/model_executor/model_loader/tensorizer.py
def _check_tensors_on_meta_device(model: nn.Module) -> None:
    for tensor in model.state_dict().values():
        if tensor.device.type == 'meta':
            raise ValueError(
                "The serialized model contains tensors on the meta device,"
                " indicating that some tensors were not loaded properly."
                " Please check that the parameters of the model being"
                " specified match that of the serialized model, such as"
                " its quantization.")

_resize_lora_embeddings

_resize_lora_embeddings(model: Module)

Modify LoRA embedding layers to use bigger tensors to allow for adapter added tokens.

Source code in vllm/model_executor/model_loader/tensorizer.py
def _resize_lora_embeddings(model: nn.Module):
    """Modify LoRA embedding layers to use bigger tensors
    to allow for adapter added tokens."""
    for child in model.modules():
        if (isinstance(child, VocabParallelEmbedding) and child.weight.shape[0]
                < child.num_embeddings_per_partition):
            new_weight = torch.empty(child.num_embeddings_per_partition,
                                     child.embedding_dim,
                                     dtype=child.weight.dtype,
                                     device=child.weight.device)
            new_weight[:child.weight.shape[0]].copy_(child.weight.data)
            new_weight[child.weight.shape[0]:].fill_(0)
            child.weight.data = new_weight

deserialize_tensorizer_model

deserialize_tensorizer_model(
    model: Module, tensorizer_config: TensorizerConfig
) -> None
Source code in vllm/model_executor/model_loader/tensorizer.py
def deserialize_tensorizer_model(model: nn.Module,
                                 tensorizer_config: TensorizerConfig) -> None:
    tensorizer_args = tensorizer_config._construct_tensorizer_args()
    before_mem = get_mem_usage()
    start = time.perf_counter()
    with _read_stream(
            tensorizer_config.tensorizer_uri,
            **tensorizer_args.stream_params) as stream, TensorDeserializer(
                stream,
                dtype=tensorizer_config.dtype,
                device=f'cuda:{torch.cuda.current_device()}',
                **tensorizer_args.deserializer_params) as deserializer:
        deserializer.load_into_module(model)
        end = time.perf_counter()

    total_bytes_str = convert_bytes(deserializer.total_tensor_bytes)
    duration = end - start
    per_second = convert_bytes(deserializer.total_tensor_bytes / duration)
    after_mem = get_mem_usage()
    deserializer.close()
    logger.info("Deserialized %s in %0.2fs, %s/s", total_bytes_str,
                end - start, per_second)
    logger.info("Memory usage before: %s", before_mem)
    logger.info("Memory usage after: %s", after_mem)

    _check_tensors_on_meta_device(model)
    _resize_lora_embeddings(model)
    del model.vllm_tensorized_marker

init_tensorizer_model

init_tensorizer_model(
    tensorizer_config: TensorizerConfig,
    vllm_config: VllmConfig,
) -> Module
Source code in vllm/model_executor/model_loader/tensorizer.py
def init_tensorizer_model(tensorizer_config: TensorizerConfig,
                          vllm_config: VllmConfig) -> nn.Module:
    assert tensorizer_config.hf_config is not None
    model_args = tensorizer_config.hf_config
    model_args.torch_dtype = tensorizer_config.dtype
    assert tensorizer_config.model_class is not None
    # TODO: Do we need to consider old-style model class?
    with meta_tensor_mode(), set_current_vllm_config(vllm_config,
                                                     check_compile=True):
        return tensorizer_config.model_class(vllm_config=vllm_config)

is_vllm_tensorized

is_vllm_tensorized(
    tensorizer_config: TensorizerConfig,
) -> bool

Infer if the model is a vLLM model by checking the weights for a vLLM tensorized marker.

Parameters:

- tensorizer_config (TensorizerConfig, required): The TensorizerConfig object containing the tensorizer_uri to the serialized model.

Returns:

- bool: True if the model is a vLLM model, False otherwise.

Source code in vllm/model_executor/model_loader/tensorizer.py
def is_vllm_tensorized(tensorizer_config: "TensorizerConfig") -> bool:
    """
    Infer if the model is a vLLM model by checking the weights for
    a vLLM tensorized marker.

    Args:
        tensorizer_config: The TensorizerConfig object containing the
            tensorizer_uri to the serialized model.

    Returns:
        bool: True if the model is a vLLM model, False otherwise.
    """
    tensorizer_args = tensorizer_config._construct_tensorizer_args()
    deserializer = TensorDeserializer(open_stream(
        tensorizer_args.tensorizer_uri, **tensorizer_args.stream_params),
                                      **tensorizer_args.deserializer_params,
                                      lazy_load=True)
    if tensorizer_config.vllm_tensorized:
        logger.warning(
            "Please note that newly serialized vLLM models are automatically "
            "inferred as vLLM models, so setting vllm_tensorized=True is "
            "only necessary for models serialized prior to this change.")
        return True
    return ".vllm_tensorized_marker" in deserializer

meta_tensor_mode

meta_tensor_mode(loading_code=None)
Source code in vllm/model_executor/model_loader/tensorizer.py
def meta_tensor_mode(loading_code=None, ):

    if loading_code is None:
        return _NoInitOrTensorImpl.context_manager()
    elif callable(loading_code):
        with _NoInitOrTensorImpl.context_manager():
            return loading_code()
    else:
        raise TypeError(
            "expected a callable to evaluate,"
            " or None if being used as a context manager;"
            f' got an object of type "{type(loading_code).__name__}" instead.')
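
A hedged usage sketch (layer size is arbitrary): meta_tensor_mode works either as a context manager or with a callable, and in both cases parameter initialization is skipped and tensors are created on the meta device.

from torch import nn

from vllm.model_executor.model_loader.tensorizer import meta_tensor_mode

# Context-manager form.
with meta_tensor_mode():
    layer = nn.Linear(4096, 4096)

# Callable form: the loading code is evaluated inside the same context.
layer = meta_tensor_mode(lambda: nn.Linear(4096, 4096))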

serialize_vllm_model

serialize_vllm_model(
    model: Module, tensorizer_config: TensorizerConfig
) -> Module
Source code in vllm/model_executor/model_loader/tensorizer.py
def serialize_vllm_model(
    model: nn.Module,
    tensorizer_config: TensorizerConfig,
) -> nn.Module:
    model.register_parameter(
        "vllm_tensorized_marker",
        nn.Parameter(torch.tensor((1, ), device="meta"), requires_grad=False))
    tensorizer_args = tensorizer_config._construct_tensorizer_args()

    encryption_params = None
    if (keyfile := tensorizer_config.encryption_keyfile) is not None:
        with open(keyfile, "rb") as f:
            key = f.read()
        encryption_params = EncryptionParams(key=key)

    output_file = tensorizer_args.tensorizer_uri
    if tensorizer_config._is_sharded:
        from vllm.distributed import get_tensor_model_parallel_rank
        output_file = output_file % get_tensor_model_parallel_rank()

    with _write_stream(output_file, **tensorizer_args.stream_params) as stream:
        serializer = TensorSerializer(stream, encryption=encryption_params)
        serializer.write_module(model)
        serializer.close()
    logger.info("Successfully serialized model to %s", str(output_file))
    return model

tensorize_lora_adapter

tensorize_lora_adapter(
    lora_path: str, tensorizer_config: TensorizerConfig
)

Uses tensorizer to serialize a LoRA adapter. Assumes that the files needed to load a LoRA adapter are a safetensors-format file called adapter_model.safetensors and a JSON config file called adapter_config.json.

Serializes the files to tensorizer_config.lora_dir.

Source code in vllm/model_executor/model_loader/tensorizer.py
def tensorize_lora_adapter(lora_path: str,
                           tensorizer_config: TensorizerConfig):
    """
    Uses tensorizer to serialize a LoRA adapter. Assumes that the files
    needed to load a LoRA adapter are a safetensors-format file called
    adapter_model.safetensors and a json config file called adapter_config.json.

    Serializes the files in the tensorizer_config.lora_dir
    """
    import safetensors

    from vllm.lora.utils import get_adapter_absolute_path

    lora_dir = get_adapter_absolute_path(lora_path)

    tensor_path = config_path = ""

    for file in os.listdir(lora_dir):
        if file.startswith("adapter_model"):
            tensor_path = lora_dir + "/" + file
        if file.startswith("adapter_config"):
            config_path = lora_dir + "/" + file
        if tensor_path and config_path:
            break

    if tensor_path.endswith(".safetensors"):
        tensors = safetensors.torch.load_file(tensor_path)
    elif tensor_path.endswith(".bin"):
        tensors = torch.load(tensor_path)
    else:
        raise ValueError("Unsupported file: %s", tensor_path)

    with open(config_path) as f:
        config = json.load(f)

    tensorizer_args = tensorizer_config._construct_tensorizer_args()

    with open_stream(f"{tensorizer_config.lora_dir}/adapter_config.json",
                     mode="wb+",
                     **tensorizer_args.stream_params) as f:

        f.write(json.dumps(config).encode("utf-8"))

    lora_uri = (f"{tensorizer_config.lora_dir}"
                f"/adapter_model.tensors")
    with open_stream(lora_uri, mode="wb+",
                     **tensorizer_args.stream_params) as f:
        serializer = TensorSerializer(f)
        serializer.write_state_dict(tensors)
        serializer.close()

    logger.info("Successfully serialized LoRA files to %s",
                str(tensorizer_config.lora_dir))
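
A hedged sketch of serializing an adapter (the adapter id and output location are placeholders): passing only lora_dir lets TensorizerConfig.__post_init__ derive the adapter_model.tensors URI automatically.

from vllm.model_executor.model_loader.tensorizer import (
    TensorizerConfig, tensorize_lora_adapter)

# Hypothetical HuggingFace adapter id and output directory.
config = TensorizerConfig(lora_dir="s3://my-bucket/loras/my-adapter")
tensorize_lora_adapter("my-org/my-lora-adapter", config)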

tensorize_vllm_model

tensorize_vllm_model(
    engine_args: EngineArgs,
    tensorizer_config: TensorizerConfig,
    generate_keyfile: bool = True,
)

Utility to load a model and then serialize it with Tensorizer

Intended to be used separately from running a vLLM server since it creates its own Engine instance.

Source code in vllm/model_executor/model_loader/tensorizer.py
def tensorize_vllm_model(engine_args: "EngineArgs",
                         tensorizer_config: TensorizerConfig,
                         generate_keyfile: bool = True):
    """Utility to load a model and then serialize it with Tensorizer

       Intended to be used separately from running a vLLM server since it
       creates its own Engine instance.
    """
    engine_config = engine_args.create_engine_config()
    tensorizer_config.verify_with_model_config(engine_config.model_config)
    tensorizer_config.verify_with_parallel_config(
        engine_config.parallel_config)

    # generate the encryption key before creating the engine to support sharding
    if generate_keyfile and (keyfile :=
                             tensorizer_config.encryption_keyfile) is not None:
        encryption_params = EncryptionParams.random()
        with _write_stream(
                keyfile,
                s3_access_key_id=tensorizer_config.s3_access_key_id,
                s3_secret_access_key=tensorizer_config.s3_secret_access_key,
                s3_endpoint=tensorizer_config.s3_endpoint,
        ) as stream:
            stream.write(encryption_params.key)

    from vllm import LLMEngine
    from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine

    if not envs.VLLM_USE_V1:
        engine = LLMEngine.from_engine_args(engine_args)
        engine.model_executor.collective_rpc(
            "save_tensorized_model",
            kwargs=dict(tensorizer_config=tensorizer_config),
        )
    else:
        engine = V1LLMEngine.from_vllm_config(engine_config)
        engine.collective_rpc(
            "save_tensorized_model",
            kwargs=dict(tensorizer_config=tensorizer_config),
        )
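
A hedged sketch of serializing a model (model name, URI, and key location are placeholders): with the default generate_keyfile=True and encryption_keyfile set, a fresh encryption key is written before the engine is created, so all shards can read it.

from vllm import EngineArgs
from vllm.model_executor.model_loader.tensorizer import (
    TensorizerConfig, tensorize_vllm_model)

engine_args = EngineArgs(model="facebook/opt-125m")
config = TensorizerConfig(
    tensorizer_uri="s3://my-bucket/opt-125m/model.tensors",
    encryption_keyfile="s3://my-bucket/opt-125m/model.key",
)

# Loads the model in-process and serializes its tensors to the URI above.
tensorize_vllm_model(engine_args, config)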

tensorizer_weights_iterator

tensorizer_weights_iterator(
    tensorizer_args: TensorizerArgs,
) -> Generator[tuple[str, Tensor], None, None]
Source code in vllm/model_executor/model_loader/tensorizer.py
def tensorizer_weights_iterator(
    tensorizer_args: "TensorizerArgs"
) -> Generator[tuple[str, torch.Tensor], None, None]:
    logger.warning("Deserializing HuggingFace models is not optimized for "
                   "loading on vLLM, as tensorizer is forced to load to CPU. "
                   "Consider deserializing a vLLM model instead for faster "
                   "load times. See the "
                   "examples/others/tensorize_vllm_model.py example script "
                   "for serializing vLLM models.")

    deserializer_args = tensorizer_args.deserializer_params
    stream_params = tensorizer_args.stream_params
    stream = open_stream(tensorizer_args.tensorizer_uri, **stream_params)
    with TensorDeserializer(stream, **deserializer_args,
                            device="cpu") as state:
        yield from state.items()
    del state