
vllm.model_executor.models.florence2

ChannelAttention

Bases: Module

Source code in vllm/model_executor/models/florence2.py
class ChannelAttention(nn.Module):

    def __init__(self, dim, groups=8, qkv_bias=True):
        super().__init__()

        self.groups = groups
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.proj = nn.Linear(dim, dim)

    def forward(self, x, size):
        B, N, C = x.shape

        qkv = self.qkv(x).reshape(B, N, 3, self.groups,
                                  C // self.groups).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        q = q * (float(N)**-0.5)
        attention = q.transpose(-1, -2) @ k
        attention = attention.softmax(dim=-1)
        x = (attention @ v.transpose(-1, -2)).transpose(-1, -2)
        x = x.transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        return x, size

groups instance-attribute

groups = groups

proj instance-attribute

proj = Linear(dim, dim)

qkv instance-attribute

qkv = Linear(dim, dim * 3, bias=qkv_bias)

__init__

__init__(dim, groups=8, qkv_bias=True)
Source code in vllm/model_executor/models/florence2.py
def __init__(self, dim, groups=8, qkv_bias=True):
    super().__init__()

    self.groups = groups
    self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
    self.proj = nn.Linear(dim, dim)

forward

forward(x, size)
Source code in vllm/model_executor/models/florence2.py
def forward(self, x, size):
    B, N, C = x.shape

    qkv = self.qkv(x).reshape(B, N, 3, self.groups,
                              C // self.groups).permute(2, 0, 3, 1, 4)
    q, k, v = qkv[0], qkv[1], qkv[2]

    q = q * (float(N)**-0.5)
    attention = q.transpose(-1, -2) @ k
    attention = attention.softmax(dim=-1)
    x = (attention @ v.transpose(-1, -2)).transpose(-1, -2)
    x = x.transpose(1, 2).reshape(B, N, C)
    x = self.proj(x)
    return x, size
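
ChannelAttention attends across the channel dimension rather than across tokens: queries are scaled by N**-0.5 and the attention matrix formed by q.transpose(-1, -2) @ k has shape (C/groups, C/groups). A minimal standalone sketch of the same shape flow (the dimensions are illustrative, not taken from any model config):

# Standalone shape-flow sketch of channel-wise attention; dims are illustrative.
import torch

B, N, C, groups = 2, 196, 64, 8                        # batch, tokens, channels, groups
qkv = torch.randn(B, N, 3, groups, C // groups).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2]                       # each: (B, groups, N, C // groups)

q = q * (float(N) ** -0.5)
attn = (q.transpose(-1, -2) @ k).softmax(dim=-1)       # (B, groups, C//groups, C//groups)
out = (attn @ v.transpose(-1, -2)).transpose(-1, -2)   # back to (B, groups, N, C//groups)
out = out.transpose(1, 2).reshape(B, N, C)
assert out.shape == (B, N, C)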

ChannelBlock

Bases: Module

Source code in vllm/model_executor/models/florence2.py
class ChannelBlock(nn.Module):

    def __init__(self,
                 dim,
                 groups,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 drop_path_rate=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm,
                 conv_at_attn=True,
                 conv_at_ffn=True):
        super().__init__()

        self.conv1 = PreNorm(None, DepthWiseConv2d(
            dim, 3, 1, 1)) if conv_at_attn else None
        self.channel_attn = PreNorm(
            norm_layer(dim),
            ChannelAttention(dim, groups=groups, qkv_bias=qkv_bias),
        )
        self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1,
                                                   1)) if conv_at_ffn else None
        self.ffn = PreNorm(
            norm_layer(dim),
            Mlp(in_features=dim,
                hidden_features=int(dim * mlp_ratio),
                act_layer=act_layer),
        )

    def forward(self, x, size):
        if self.conv1:
            x, size = self.conv1(x, size)
        x, size = self.channel_attn(x, size)

        if self.conv2:
            x, size = self.conv2(x, size)
        x, size = self.ffn(x, size)

        return x, size

channel_attn instance-attribute

channel_attn = PreNorm(
    norm_layer(dim),
    ChannelAttention(dim, groups=groups, qkv_bias=qkv_bias),
)

conv1 instance-attribute

conv1 = (
    PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1))
    if conv_at_attn
    else None
)

conv2 instance-attribute

conv2 = (
    PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1))
    if conv_at_ffn
    else None
)

ffn instance-attribute

ffn = PreNorm(
    norm_layer(dim),
    Mlp(
        in_features=dim,
        hidden_features=int(dim * mlp_ratio),
        act_layer=act_layer,
    ),
)

__init__

__init__(
    dim,
    groups,
    mlp_ratio=4.0,
    qkv_bias=True,
    drop_path_rate=0.0,
    act_layer=GELU,
    norm_layer=LayerNorm,
    conv_at_attn=True,
    conv_at_ffn=True,
)
Source code in vllm/model_executor/models/florence2.py
def __init__(self,
             dim,
             groups,
             mlp_ratio=4.,
             qkv_bias=True,
             drop_path_rate=0.,
             act_layer=nn.GELU,
             norm_layer=nn.LayerNorm,
             conv_at_attn=True,
             conv_at_ffn=True):
    super().__init__()

    self.conv1 = PreNorm(None, DepthWiseConv2d(
        dim, 3, 1, 1)) if conv_at_attn else None
    self.channel_attn = PreNorm(
        norm_layer(dim),
        ChannelAttention(dim, groups=groups, qkv_bias=qkv_bias),
    )
    self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1,
                                               1)) if conv_at_ffn else None
    self.ffn = PreNorm(
        norm_layer(dim),
        Mlp(in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer),
    )

forward

forward(x, size)
Source code in vllm/model_executor/models/florence2.py
def forward(self, x, size):
    if self.conv1:
        x, size = self.conv1(x, size)
    x, size = self.channel_attn(x, size)

    if self.conv2:
        x, size = self.conv2(x, size)
    x, size = self.ffn(x, size)

    return x, size
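
A minimal usage sketch for ChannelBlock, assuming vLLM is importable so that the helpers from the same file (PreNorm, DepthWiseConv2d, Mlp) are available; the dimensions are illustrative:

import torch
from vllm.model_executor.models.florence2 import ChannelBlock

# Token sequence plus its 2D size; the depthwise convs require N == H * W.
block = ChannelBlock(dim=64, groups=8)
x = torch.randn(2, 14 * 14, 64)
y, size = block(x, (14, 14))
assert y.shape == x.shape and size == (14, 14)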

ConvEmbed

Bases: Module

Image to Patch Embedding

Source code in vllm/model_executor/models/florence2.py
class ConvEmbed(nn.Module):
    """ Image to Patch Embedding
    """

    def __init__(self,
                 patch_size=7,
                 in_chans=3,
                 embed_dim=64,
                 stride=4,
                 padding=2,
                 norm_layer=None,
                 pre_norm=True):
        super().__init__()
        self.patch_size = patch_size

        self.proj = nn.Conv2d(in_chans,
                              embed_dim,
                              kernel_size=patch_size,
                              stride=stride,
                              padding=padding)

        dim_norm = in_chans if pre_norm else embed_dim
        self.norm = norm_layer(dim_norm) if norm_layer else None

        self.pre_norm = pre_norm

    def forward(self, x, size):
        H, W = size
        if len(x.size()) == 3:
            if self.norm and self.pre_norm:
                x = self.norm(x)
            x = rearrange(x, 'b (h w) c -> b c h w', h=H, w=W)

        x = self.proj(x)

        _, _, H, W = x.shape
        x = rearrange(x, 'b c h w -> b (h w) c')
        if self.norm and not self.pre_norm:
            x = self.norm(x)

        return x, (H, W)

norm instance-attribute

norm = norm_layer(dim_norm) if norm_layer else None

patch_size instance-attribute

patch_size = patch_size

pre_norm instance-attribute

pre_norm = pre_norm

proj instance-attribute

proj = Conv2d(
    in_chans,
    embed_dim,
    kernel_size=patch_size,
    stride=stride,
    padding=padding,
)

__init__

__init__(
    patch_size=7,
    in_chans=3,
    embed_dim=64,
    stride=4,
    padding=2,
    norm_layer=None,
    pre_norm=True,
)
Source code in vllm/model_executor/models/florence2.py
def __init__(self,
             patch_size=7,
             in_chans=3,
             embed_dim=64,
             stride=4,
             padding=2,
             norm_layer=None,
             pre_norm=True):
    super().__init__()
    self.patch_size = patch_size

    self.proj = nn.Conv2d(in_chans,
                          embed_dim,
                          kernel_size=patch_size,
                          stride=stride,
                          padding=padding)

    dim_norm = in_chans if pre_norm else embed_dim
    self.norm = norm_layer(dim_norm) if norm_layer else None

    self.pre_norm = pre_norm

forward

forward(x, size)
Source code in vllm/model_executor/models/florence2.py
def forward(self, x, size):
    H, W = size
    if len(x.size()) == 3:
        if self.norm and self.pre_norm:
            x = self.norm(x)
        x = rearrange(x, 'b (h w) c -> b c h w', h=H, w=W)

    x = self.proj(x)

    _, _, H, W = x.shape
    x = rearrange(x, 'b c h w -> b (h w) c')
    if self.norm and not self.pre_norm:
        x = self.norm(x)

    return x, (H, W)
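
The patch embedding is a single strided Conv2d, so the output token grid follows ordinary convolution arithmetic. A standalone sketch with the stage-0 settings used by DaViT below (kernel 7, stride 4, padding 3); the input resolution is illustrative:

import torch
import torch.nn as nn

# Stage-0 patch embedding: kernel 7, stride 4, padding 3.
proj = nn.Conv2d(3, 64, kernel_size=7, stride=4, padding=3)
x = torch.randn(1, 3, 224, 224)
y = proj(x)                                   # (1, 64, 56, 56): spatial size / 4
tokens = y.flatten(2).transpose(1, 2)         # (1, 56 * 56, 64), matching forward()
assert tokens.shape == (1, 56 * 56, 64)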

DaViT

Bases: Module

Source code in vllm/model_executor/models/florence2.py
class DaViT(nn.Module):

    def __init__(
        self,
        in_chans=3,
        num_classes=1000,
        depths=(1, 1, 3, 1),
        patch_size=(7, 2, 2, 2),
        patch_stride=(4, 2, 2, 2),
        patch_padding=(3, 0, 0, 0),
        patch_prenorm=(False, False, False, False),
        embed_dims=(64, 128, 192, 256),
        num_heads=(3, 6, 12, 24),
        num_groups=(3, 6, 12, 24),
        window_size=7,
        mlp_ratio=4.,
        qkv_bias=True,
        drop_path_rate=0.1,
        norm_layer=nn.LayerNorm,
        enable_checkpoint=False,
        conv_at_attn=True,
        conv_at_ffn=True,
    ):
        super().__init__()

        self.num_classes = num_classes
        self.embed_dims = embed_dims
        self.num_heads = num_heads
        self.num_groups = num_groups
        self.num_stages = len(self.embed_dims)
        self.enable_checkpoint = enable_checkpoint
        assert self.num_stages == len(self.num_heads) == len(self.num_groups)

        num_stages = len(embed_dims)
        dpr = [
            x.item() for x in torch.linspace(0, drop_path_rate,
                                             sum(depths) * 2)
        ]

        depth_offset = 0
        convs = []
        blocks = []
        for i in range(num_stages):
            conv_embed = ConvEmbed(
                patch_size=patch_size[i],
                stride=patch_stride[i],
                padding=patch_padding[i],
                in_chans=in_chans if i == 0 else self.embed_dims[i - 1],
                embed_dim=self.embed_dims[i],
                norm_layer=norm_layer,
                pre_norm=patch_prenorm[i])
            convs.append(conv_embed)

            block = MySequential(*[
                MySequential(
                    OrderedDict([('spatial_block',
                                  SpatialBlock(
                                      embed_dims[i],
                                      num_heads[i],
                                      window_size,
                                      drop_path_rate=dpr[depth_offset + j * 2],
                                      qkv_bias=qkv_bias,
                                      mlp_ratio=mlp_ratio,
                                      conv_at_attn=conv_at_attn,
                                      conv_at_ffn=conv_at_ffn,
                                  )),
                                 ('channel_block',
                                  ChannelBlock(
                                      embed_dims[i],
                                      num_groups[i],
                                      drop_path_rate=dpr[depth_offset + j * 2 +
                                                         1],
                                      qkv_bias=qkv_bias,
                                      mlp_ratio=mlp_ratio,
                                      conv_at_attn=conv_at_attn,
                                      conv_at_ffn=conv_at_ffn,
                                  ))])) for j in range(depths[i])
            ])
            blocks.append(block)
            depth_offset += depths[i] * 2

        self.convs = nn.ModuleList(convs)
        self.blocks = nn.ModuleList(blocks)

        self.avgpool = nn.AdaptiveAvgPool1d(1)

    @property
    def dim_out(self):
        return self.embed_dims[-1]

    def forward_features_unpool(self, x):
        """
        forward until avg pooling 
        Args:
            x (_type_): input image tensor
        """
        input_size = (x.size(2), x.size(3))
        for conv, block in zip(self.convs, self.blocks):
            x, input_size = conv(x, input_size)
            x, input_size = block(x, input_size)
        return x

    def forward_features(self, x):
        x = self.forward_features_unpool(x)

        # (batch_size, num_tokens, token_dim)
        x = self.avgpool(x.transpose(1, 2))
        # (batch_size, 1, num_tokens)
        x = torch.flatten(x, 1)
        x = self.norms(x)

        return x

    def forward(self, x):
        x = self.forward_features(x)
        x = self.head(x)
        return x

    @classmethod
    def from_config(cls, config):
        return cls(
            depths=config.depths,
            embed_dims=config.dim_embed,
            num_heads=config.num_heads,
            num_groups=config.num_groups,
            patch_size=config.patch_size,
            patch_stride=config.patch_stride,
            patch_padding=config.patch_padding,
            patch_prenorm=config.patch_prenorm,
            drop_path_rate=config.drop_path_rate,
            window_size=config.window_size,
        )

avgpool instance-attribute

avgpool = AdaptiveAvgPool1d(1)

blocks instance-attribute

blocks = ModuleList(blocks)

convs instance-attribute

convs = ModuleList(convs)

dim_out property

dim_out

embed_dims instance-attribute

embed_dims = embed_dims

enable_checkpoint instance-attribute

enable_checkpoint = enable_checkpoint

num_classes instance-attribute

num_classes = num_classes

num_groups instance-attribute

num_groups = num_groups

num_heads instance-attribute

num_heads = num_heads

num_stages instance-attribute

num_stages = len(embed_dims)

__init__

__init__(
    in_chans=3,
    num_classes=1000,
    depths=(1, 1, 3, 1),
    patch_size=(7, 2, 2, 2),
    patch_stride=(4, 2, 2, 2),
    patch_padding=(3, 0, 0, 0),
    patch_prenorm=(False, False, False, False),
    embed_dims=(64, 128, 192, 256),
    num_heads=(3, 6, 12, 24),
    num_groups=(3, 6, 12, 24),
    window_size=7,
    mlp_ratio=4.0,
    qkv_bias=True,
    drop_path_rate=0.1,
    norm_layer=LayerNorm,
    enable_checkpoint=False,
    conv_at_attn=True,
    conv_at_ffn=True,
)
Source code in vllm/model_executor/models/florence2.py
def __init__(
    self,
    in_chans=3,
    num_classes=1000,
    depths=(1, 1, 3, 1),
    patch_size=(7, 2, 2, 2),
    patch_stride=(4, 2, 2, 2),
    patch_padding=(3, 0, 0, 0),
    patch_prenorm=(False, False, False, False),
    embed_dims=(64, 128, 192, 256),
    num_heads=(3, 6, 12, 24),
    num_groups=(3, 6, 12, 24),
    window_size=7,
    mlp_ratio=4.,
    qkv_bias=True,
    drop_path_rate=0.1,
    norm_layer=nn.LayerNorm,
    enable_checkpoint=False,
    conv_at_attn=True,
    conv_at_ffn=True,
):
    super().__init__()

    self.num_classes = num_classes
    self.embed_dims = embed_dims
    self.num_heads = num_heads
    self.num_groups = num_groups
    self.num_stages = len(self.embed_dims)
    self.enable_checkpoint = enable_checkpoint
    assert self.num_stages == len(self.num_heads) == len(self.num_groups)

    num_stages = len(embed_dims)
    dpr = [
        x.item() for x in torch.linspace(0, drop_path_rate,
                                         sum(depths) * 2)
    ]

    depth_offset = 0
    convs = []
    blocks = []
    for i in range(num_stages):
        conv_embed = ConvEmbed(
            patch_size=patch_size[i],
            stride=patch_stride[i],
            padding=patch_padding[i],
            in_chans=in_chans if i == 0 else self.embed_dims[i - 1],
            embed_dim=self.embed_dims[i],
            norm_layer=norm_layer,
            pre_norm=patch_prenorm[i])
        convs.append(conv_embed)

        block = MySequential(*[
            MySequential(
                OrderedDict([('spatial_block',
                              SpatialBlock(
                                  embed_dims[i],
                                  num_heads[i],
                                  window_size,
                                  drop_path_rate=dpr[depth_offset + j * 2],
                                  qkv_bias=qkv_bias,
                                  mlp_ratio=mlp_ratio,
                                  conv_at_attn=conv_at_attn,
                                  conv_at_ffn=conv_at_ffn,
                              )),
                             ('channel_block',
                              ChannelBlock(
                                  embed_dims[i],
                                  num_groups[i],
                                  drop_path_rate=dpr[depth_offset + j * 2 +
                                                     1],
                                  qkv_bias=qkv_bias,
                                  mlp_ratio=mlp_ratio,
                                  conv_at_attn=conv_at_attn,
                                  conv_at_ffn=conv_at_ffn,
                              ))])) for j in range(depths[i])
        ])
        blocks.append(block)
        depth_offset += depths[i] * 2

    self.convs = nn.ModuleList(convs)
    self.blocks = nn.ModuleList(blocks)

    self.avgpool = nn.AdaptiveAvgPool1d(1)

forward

forward(x)
Source code in vllm/model_executor/models/florence2.py
def forward(self, x):
    x = self.forward_features(x)
    x = self.head(x)
    return x

forward_features

forward_features(x)
Source code in vllm/model_executor/models/florence2.py
def forward_features(self, x):
    x = self.forward_features_unpool(x)

    # (batch_size, num_tokens, token_dim)
    x = self.avgpool(x.transpose(1, 2))
    # (batch_size, 1, num_tokens)
    x = torch.flatten(x, 1)
    x = self.norms(x)

    return x

forward_features_unpool

forward_features_unpool(x)

Forward pass until (but not including) average pooling. Args: x: input image tensor

Source code in vllm/model_executor/models/florence2.py
def forward_features_unpool(self, x):
    """
    forward until avg pooling 
    Args:
        x (_type_): input image tensor
    """
    input_size = (x.size(2), x.size(3))
    for conv, block in zip(self.convs, self.blocks):
        x, input_size = conv(x, input_size)
        x, input_size = block(x, input_size)
    return x

from_config classmethod

from_config(config)
Source code in vllm/model_executor/models/florence2.py
@classmethod
def from_config(cls, config):
    return cls(
        depths=config.depths,
        embed_dims=config.dim_embed,
        num_heads=config.num_heads,
        num_groups=config.num_groups,
        patch_size=config.patch_size,
        patch_stride=config.patch_stride,
        patch_padding=config.patch_padding,
        patch_prenorm=config.patch_prenorm,
        drop_path_rate=config.drop_path_rate,
        window_size=config.window_size,
    )
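
With the default patch strides (4, 2, 2, 2), the first stage reduces the spatial resolution by 4x and every later stage halves it, so the per-stage token count is simple to work out. A small sketch of that arithmetic (the 768x768 input is just an example; with these default kernel/padding settings the integer division matches the exact convolution arithmetic):

# Per-stage feature-map sizes for the default patch strides.
patch_stride = (4, 2, 2, 2)
embed_dims = (64, 128, 192, 256)

h = w = 768
for stride, dim in zip(patch_stride, embed_dims):
    h, w = h // stride, w // stride
    print(f"stage: {h}x{w} grid, {h * w} tokens of dim {dim}")
# 192x192 -> 96x96 -> 48x48 -> 24x24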

DepthWiseConv2d

Bases: Module

Source code in vllm/model_executor/models/florence2.py
class DepthWiseConv2d(nn.Module):

    def __init__(
        self,
        dim_in,
        kernel_size,
        padding,
        stride,
        bias=True,
    ):
        super().__init__()
        self.dw = nn.Conv2d(dim_in,
                            dim_in,
                            kernel_size=kernel_size,
                            padding=padding,
                            groups=dim_in,
                            stride=stride,
                            bias=bias)

    def forward(self, x, size):
        B, N, C = x.shape
        H, W = size
        assert N == H * W

        x = self.dw(x.transpose(1, 2).view(B, C, H, W))
        size = (x.size(-2), x.size(-1))
        x = x.flatten(2).transpose(1, 2)
        return x, size

dw instance-attribute

dw = Conv2d(
    dim_in,
    dim_in,
    kernel_size=kernel_size,
    padding=padding,
    groups=dim_in,
    stride=stride,
    bias=bias,
)

__init__

__init__(dim_in, kernel_size, padding, stride, bias=True)
Source code in vllm/model_executor/models/florence2.py
def __init__(
    self,
    dim_in,
    kernel_size,
    padding,
    stride,
    bias=True,
):
    super().__init__()
    self.dw = nn.Conv2d(dim_in,
                        dim_in,
                        kernel_size=kernel_size,
                        padding=padding,
                        groups=dim_in,
                        stride=stride,
                        bias=bias)

forward

forward(x, size)
Source code in vllm/model_executor/models/florence2.py
def forward(self, x, size):
    B, N, C = x.shape
    H, W = size
    assert N == H * W

    x = self.dw(x.transpose(1, 2).view(B, C, H, W))
    size = (x.size(-2), x.size(-1))
    x = x.flatten(2).transpose(1, 2)
    return x, size
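
DepthWiseConv2d takes the flattened token sequence together with its 2D size, reshapes it back into a feature map for the depthwise convolution, and flattens it again. A standalone sketch of that round trip (dims are illustrative):

import torch
import torch.nn as nn

B, C, H, W = 2, 64, 14, 14
dw = nn.Conv2d(C, C, kernel_size=3, padding=1, groups=C, stride=1)

x = torch.randn(B, H * W, C)                    # token sequence
y = dw(x.transpose(1, 2).view(B, C, H, W))      # back to a feature map
y = y.flatten(2).transpose(1, 2)                # and back to tokens
assert y.shape == (B, H * W, C)                 # 3x3, stride 1, pad 1 keeps H, W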

Florence2DummyInputsBuilder

Bases: BaseDummyInputsBuilder[Florence2ProcessingInfo]

Source code in vllm/model_executor/models/florence2.py
class Florence2DummyInputsBuilder(
        BaseDummyInputsBuilder[Florence2ProcessingInfo]):

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        return ""

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> MultiModalDataDict:
        num_images = mm_counts.get("image", 0)

        target_width = target_height = self.info.get_hf_config().projection_dim

        return {
            "image":
            self._get_dummy_images(width=target_width,
                                   height=target_height,
                                   num_images=num_images)
        }

get_dummy_mm_data

get_dummy_mm_data(
    seq_len: int, mm_counts: Mapping[str, int]
) -> MultiModalDataDict
Source code in vllm/model_executor/models/florence2.py
def get_dummy_mm_data(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
) -> MultiModalDataDict:
    num_images = mm_counts.get("image", 0)

    target_width = target_height = self.info.get_hf_config().projection_dim

    return {
        "image":
        self._get_dummy_images(width=target_width,
                               height=target_height,
                               num_images=num_images)
    }

get_dummy_text

get_dummy_text(mm_counts: Mapping[str, int]) -> str
Source code in vllm/model_executor/models/florence2.py
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
    return ""

Florence2ForConditionalGeneration

Bases: Module, SupportsMultiModal, SupportsV0Only

Source code in vllm/model_executor/models/florence2.py
@MULTIMODAL_REGISTRY.register_processor(
    Florence2MultiModalProcessor,
    info=Florence2ProcessingInfo,
    dummy_inputs=Florence2DummyInputsBuilder)
class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal,
                                        SupportsV0Only):

    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
        if modality.startswith("image"):
            return None

        raise ValueError("Only image modality is supported")

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
        processor_config = vllm_config.model_config.hf_image_processor_config

        self.config = config
        self.vision_config = config.vision_config
        self.processor_config = processor_config
        assert config.vision_config.model_type == 'davit', (
            'only DaViT is supported for now')
        self.vision_tower = DaViT.from_config(config=config.vision_config)
        self._build_image_projection_layers(config)
        self.language_model = Florence2LanguageForConditionalGeneration(
            vllm_config=vllm_config.with_hf_config(config.text_config),
            prefix=f"{prefix}.language_model",
        )
        self.pad_token_id = config.pad_token_id

    def _build_image_projection_layers(self, config: PretrainedConfig):
        image_dim_out = config.vision_config.dim_embed[-1]
        dim_projection = config.vision_config.projection_dim
        self.image_projection = nn.Parameter(
            torch.empty(image_dim_out, dim_projection))
        self.image_proj_norm = nn.LayerNorm(dim_projection)
        image_pos_embed_config = config.vision_config.image_pos_embed
        if image_pos_embed_config['type'] == 'learned_abs_2d':
            self.image_pos_embed = LearnedAbsolutePositionEmbedding2D(
                embedding_dim=image_dim_out,
                num_pos=image_pos_embed_config['max_pos_embeddings'])
        else:
            raise NotImplementedError("Florence2 only supports learned_abs_2d "
                                      "as image position embedding.")

        self.image_feature_source = config.vision_config.image_feature_source

        # temporal embedding
        visual_temporal_embedding_config = (
            self.vision_config.visual_temporal_embedding)
        if visual_temporal_embedding_config['type'] == 'COSINE':
            self.visual_temporal_embed = PositionalEmbeddingCosine1D(
                embed_dim=image_dim_out,
                max_seq_len=visual_temporal_embedding_config[
                    'max_temporal_embeddings'])
        else:
            raise NotImplementedError(
                'Florence2 only supports COSINE as temporal embedding.')

    def _validate_pixel_values(
        self, data: Union[torch.Tensor, list[torch.Tensor]]
    ) -> Union[torch.Tensor, list[torch.Tensor]]:

        size = self.processor_config["size"]
        h, w = size["height"], size["width"]
        expected_dims = (3, h, w)

        def _validate_shape(d: torch.Tensor):
            actual_dims = tuple(d.shape)

            if actual_dims != expected_dims:
                expected_expr = tuple(*map(str, expected_dims))
                raise ValueError(
                    "The expected shape of pixel values per batch "
                    f"is {expected_expr}. You supplied {tuple(d.shape)}.")

        for d in data:
            _validate_shape(d)

        return data

    def _parse_and_validate_image_input(self, **kwargs: object):
        pixel_values: Optional[Union[list[list[torch.Tensor]],
                                     list[torch.Tensor],
                                     torch.Tensor]] = kwargs.pop(
                                         "pixel_values", None)
        image_embeds: Optional[Union[list[list[torch.Tensor]],
                                     list[torch.Tensor],
                                     torch.Tensor]] = kwargs.pop(
                                         "image_embeds", None)

        if pixel_values is None and image_embeds is None:
            return None

        if pixel_values is not None and image_embeds is not None:
            raise ValueError(
                "Both pixel values and image embeds are provided.")

        if pixel_values is not None:
            return Florence2ImagePixelInputs(
                type="pixel_values",
                data=self._validate_pixel_values(
                    flatten_bn(pixel_values, concat=True)),
            )

        if image_embeds is not None:
            raise NotImplementedError

        raise AssertionError("This line should be unreachable.")

    def _encode_image(self, pixel_values: torch.Tensor) -> torch.Tensor:
        dtype = next(self.vision_tower.parameters()).dtype
        pixel_values = pixel_values.to(dtype)

        batch_size, T = pixel_values.size(0), 1
        x = self.vision_tower.forward_features_unpool(pixel_values)
        if self.image_pos_embed is not None:
            x = x.view(batch_size * T, -1, x.shape[-1])
            num_tokens = x.shape[-2]
            h, w = int(num_tokens**0.5), int(num_tokens**0.5)
            assert h * w == num_tokens, (
                'only support square feature maps for now')
            x = x.view(batch_size * T, h, w, x.shape[-1])
            pos_embed = self.image_pos_embed(x)
            x = x + pos_embed
            x = x.view(batch_size, T * h * w, x.shape[-1])

        if self.visual_temporal_embed is not None:
            visual_temporal_embed = self.visual_temporal_embed(
                x.view(batch_size, T, -1, x.shape[-1])[:, :, 0])
            x = x.view(batch_size, T, -1,
                       x.shape[-1]) + visual_temporal_embed.view(
                           1, T, 1, x.shape[-1])

        x_feat_dict = {}

        spatial_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=2)
        x_feat_dict['spatial_avg_pool'] = spatial_avg_pool_x

        temporal_avg_pool_x = x.view(batch_size, T, -1,
                                     x.shape[-1]).mean(dim=1)
        x_feat_dict['temporal_avg_pool'] = temporal_avg_pool_x

        x = x.view(batch_size, T, -1, x.shape[-1])[:, -1]
        x_feat_dict['last_frame'] = x

        new_x = []
        for _image_feature_source in self.image_feature_source:
            if _image_feature_source not in x_feat_dict:
                raise ValueError('invalid image feature source: {}'.format(
                    _image_feature_source))
            new_x.append(x_feat_dict[_image_feature_source])

        x = torch.cat(new_x, dim=1)

        x = x @ self.image_projection
        x = self.image_proj_norm(x)

        return x

    def _process_image_input(
            self, image_input: Florence2ImagePixelInputs) -> torch.Tensor:
        assert image_input["type"] == "pixel_values"
        pixel_values = image_input["data"]
        return self._encode_image(pixel_values)

    def get_language_model(self) -> torch.nn.Module:
        return self.language_model

    def get_multimodal_embeddings(self,
                                  **kwargs: object) -> MultiModalEmbeddings:
        image_input = self._parse_and_validate_image_input(**kwargs)
        if image_input is None:
            return []
        vision_embeddings = self._process_image_input(image_input)
        return vision_embeddings

    def get_input_embeddings(
        self,
        input_ids: torch.Tensor,
        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
    ) -> torch.Tensor:
        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
        if multimodal_embeddings is not None \
            and len(multimodal_embeddings) != 0:
            inputs_embeds = merge_multimodal_embeddings(
                input_ids, inputs_embeds, multimodal_embeddings,
                self.pad_token_id)
        return inputs_embeds

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        *,
        encoder_input_ids: torch.Tensor,
        encoder_positions: torch.Tensor,
        **kwargs,
    ) -> torch.Tensor:
        r"""
        Args:
            input_ids
                torch.Tensor of *decoder* input token ids.
            positions
                torch.Tensor of *decoder* position indices.
            encoder_input_ids
                torch.Tensor of *encoder* input token ids.
            encoder_positions
                torch.Tensor of *encoder* position indices
        Returns:
            Output torch.Tensor
        """
        vision_embeddings = self.get_multimodal_embeddings(**kwargs)
        if encoder_input_ids.numel() > 0 or vision_embeddings is not None:
            inputs_embeds = self.get_input_embeddings(encoder_input_ids,
                                                      vision_embeddings)
        else:
            inputs_embeds = None

        hidden_states = self.language_model(input_ids,
                                            positions,
                                            encoder_input_ids,
                                            encoder_positions,
                                            inputs_embeds=inputs_embeds)
        return hidden_states

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        return self.language_model.compute_logits(hidden_states,
                                                  sampling_metadata)

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        loader = AutoWeightsLoader(self)
        return loader.load_weights(weights)

config instance-attribute

config = config

language_model instance-attribute

language_model = Florence2LanguageForConditionalGeneration(
    vllm_config=with_hf_config(text_config),
    prefix=f"{prefix}.language_model",
)

pad_token_id instance-attribute

pad_token_id = pad_token_id

processor_config instance-attribute

processor_config = processor_config

vision_config instance-attribute

vision_config = vision_config

vision_tower instance-attribute

vision_tower = from_config(config=vision_config)

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/florence2.py
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    super().__init__()
    config = vllm_config.model_config.hf_config
    processor_config = vllm_config.model_config.hf_image_processor_config

    self.config = config
    self.vision_config = config.vision_config
    self.processor_config = processor_config
    assert config.vision_config.model_type == 'davit', (
        'only DaViT is supported for now')
    self.vision_tower = DaViT.from_config(config=config.vision_config)
    self._build_image_projection_layers(config)
    self.language_model = Florence2LanguageForConditionalGeneration(
        vllm_config=vllm_config.with_hf_config(config.text_config),
        prefix=f"{prefix}.language_model",
    )
    self.pad_token_id = config.pad_token_id

_build_image_projection_layers

_build_image_projection_layers(config: PretrainedConfig)
Source code in vllm/model_executor/models/florence2.py
def _build_image_projection_layers(self, config: PretrainedConfig):
    image_dim_out = config.vision_config.dim_embed[-1]
    dim_projection = config.vision_config.projection_dim
    self.image_projection = nn.Parameter(
        torch.empty(image_dim_out, dim_projection))
    self.image_proj_norm = nn.LayerNorm(dim_projection)
    image_pos_embed_config = config.vision_config.image_pos_embed
    if image_pos_embed_config['type'] == 'learned_abs_2d':
        self.image_pos_embed = LearnedAbsolutePositionEmbedding2D(
            embedding_dim=image_dim_out,
            num_pos=image_pos_embed_config['max_pos_embeddings'])
    else:
        raise NotImplementedError("Florence2 only supports learned_abs_2d "
                                  "as image position embedding.")

    self.image_feature_source = config.vision_config.image_feature_source

    # temporal embedding
    visual_temporal_embedding_config = (
        self.vision_config.visual_temporal_embedding)
    if visual_temporal_embedding_config['type'] == 'COSINE':
        self.visual_temporal_embed = PositionalEmbeddingCosine1D(
            embed_dim=image_dim_out,
            max_seq_len=visual_temporal_embedding_config[
                'max_temporal_embeddings'])
    else:
        raise NotImplementedError(
            'Florence2 only supports COSINE as temporal embedding.')

_encode_image

_encode_image(pixel_values: Tensor) -> Tensor
Source code in vllm/model_executor/models/florence2.py
def _encode_image(self, pixel_values: torch.Tensor) -> torch.Tensor:
    dtype = next(self.vision_tower.parameters()).dtype
    pixel_values = pixel_values.to(dtype)

    batch_size, T = pixel_values.size(0), 1
    x = self.vision_tower.forward_features_unpool(pixel_values)
    if self.image_pos_embed is not None:
        x = x.view(batch_size * T, -1, x.shape[-1])
        num_tokens = x.shape[-2]
        h, w = int(num_tokens**0.5), int(num_tokens**0.5)
        assert h * w == num_tokens, (
            'only support square feature maps for now')
        x = x.view(batch_size * T, h, w, x.shape[-1])
        pos_embed = self.image_pos_embed(x)
        x = x + pos_embed
        x = x.view(batch_size, T * h * w, x.shape[-1])

    if self.visual_temporal_embed is not None:
        visual_temporal_embed = self.visual_temporal_embed(
            x.view(batch_size, T, -1, x.shape[-1])[:, :, 0])
        x = x.view(batch_size, T, -1,
                   x.shape[-1]) + visual_temporal_embed.view(
                       1, T, 1, x.shape[-1])

    x_feat_dict = {}

    spatial_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=2)
    x_feat_dict['spatial_avg_pool'] = spatial_avg_pool_x

    temporal_avg_pool_x = x.view(batch_size, T, -1,
                                 x.shape[-1]).mean(dim=1)
    x_feat_dict['temporal_avg_pool'] = temporal_avg_pool_x

    x = x.view(batch_size, T, -1, x.shape[-1])[:, -1]
    x_feat_dict['last_frame'] = x

    new_x = []
    for _image_feature_source in self.image_feature_source:
        if _image_feature_source not in x_feat_dict:
            raise ValueError('invalid image feature source: {}'.format(
                _image_feature_source))
        new_x.append(x_feat_dict[_image_feature_source])

    x = torch.cat(new_x, dim=1)

    x = x @ self.image_projection
    x = self.image_proj_norm(x)

    return x
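
After the positional and temporal embeddings, _encode_image builds a dict of candidate features (spatial_avg_pool, temporal_avg_pool, last_frame) and concatenates whichever entries image_feature_source names. A standalone sketch of that selection step only (shapes and the source list are illustrative; T is 1 for single images):

import torch

batch_size, T, num_tokens, dim = 2, 1, 576, 1024     # illustrative shapes
x = torch.randn(batch_size, T, num_tokens, dim)

x_feat_dict = {
    "spatial_avg_pool": x.mean(dim=2),     # (batch, T, dim)
    "temporal_avg_pool": x.mean(dim=1),    # (batch, num_tokens, dim)
    "last_frame": x[:, -1],                # (batch, num_tokens, dim)
}

image_feature_source = ["spatial_avg_pool", "last_frame"]   # example config value
feats = torch.cat([x_feat_dict[s] for s in image_feature_source], dim=1)
# -> (batch, T + num_tokens, dim) before the projection and LayerNorm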

_parse_and_validate_image_input

_parse_and_validate_image_input(**kwargs: object)
Source code in vllm/model_executor/models/florence2.py
def _parse_and_validate_image_input(self, **kwargs: object):
    pixel_values: Optional[Union[list[list[torch.Tensor]],
                                 list[torch.Tensor],
                                 torch.Tensor]] = kwargs.pop(
                                     "pixel_values", None)
    image_embeds: Optional[Union[list[list[torch.Tensor]],
                                 list[torch.Tensor],
                                 torch.Tensor]] = kwargs.pop(
                                     "image_embeds", None)

    if pixel_values is None and image_embeds is None:
        return None

    if pixel_values is not None and image_embeds is not None:
        raise ValueError(
            "Both pixel values and image embeds are provided.")

    if pixel_values is not None:
        return Florence2ImagePixelInputs(
            type="pixel_values",
            data=self._validate_pixel_values(
                flatten_bn(pixel_values, concat=True)),
        )

    if image_embeds is not None:
        raise NotImplementedError

    raise AssertionError("This line should be unreachable.")

_process_image_input

_process_image_input(
    image_input: Florence2ImagePixelInputs,
) -> Tensor
Source code in vllm/model_executor/models/florence2.py
def _process_image_input(
        self, image_input: Florence2ImagePixelInputs) -> torch.Tensor:
    assert image_input["type"] == "pixel_values"
    pixel_values = image_input["data"]
    return self._encode_image(pixel_values)

_validate_pixel_values

_validate_pixel_values(
    data: Union[Tensor, list[Tensor]],
) -> Union[Tensor, list[Tensor]]
Source code in vllm/model_executor/models/florence2.py
def _validate_pixel_values(
    self, data: Union[torch.Tensor, list[torch.Tensor]]
) -> Union[torch.Tensor, list[torch.Tensor]]:

    size = self.processor_config["size"]
    h, w = size["height"], size["width"]
    expected_dims = (3, h, w)

    def _validate_shape(d: torch.Tensor):
        actual_dims = tuple(d.shape)

        if actual_dims != expected_dims:
            expected_expr = tuple(*map(str, expected_dims))
            raise ValueError(
                "The expected shape of pixel values per batch "
                f"is {expected_expr}. You supplied {tuple(d.shape)}.")

    for d in data:
        _validate_shape(d)

    return data

compute_logits

compute_logits(
    hidden_states: Tensor,
    sampling_metadata: SamplingMetadata,
) -> Optional[Tensor]
Source code in vllm/model_executor/models/florence2.py
def compute_logits(
    self,
    hidden_states: torch.Tensor,
    sampling_metadata: SamplingMetadata,
) -> Optional[torch.Tensor]:
    return self.language_model.compute_logits(hidden_states,
                                              sampling_metadata)

forward

forward(
    input_ids: Tensor,
    positions: Tensor,
    intermediate_tensors: Optional[
        IntermediateTensors
    ] = None,
    *,
    encoder_input_ids: Tensor,
    encoder_positions: Tensor,
    **kwargs,
) -> Tensor

Returns: Output torch.Tensor

Source code in vllm/model_executor/models/florence2.py
def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    intermediate_tensors: Optional[IntermediateTensors] = None,
    *,
    encoder_input_ids: torch.Tensor,
    encoder_positions: torch.Tensor,
    **kwargs,
) -> torch.Tensor:
    r"""
    Args:
        input_ids
            torch.Tensor of *decoder* input token ids.
        positions
            torch.Tensor of *decoder* position indices.
        encoder_input_ids
            torch.Tensor of *encoder* input token ids.
        encoder_positions
            torch.Tensor of *encoder* position indices
    Returns:
        Output torch.Tensor
    """
    vision_embeddings = self.get_multimodal_embeddings(**kwargs)
    if encoder_input_ids.numel() > 0 or vision_embeddings is not None:
        inputs_embeds = self.get_input_embeddings(encoder_input_ids,
                                                  vision_embeddings)
    else:
        inputs_embeds = None

    hidden_states = self.language_model(input_ids,
                                        positions,
                                        encoder_input_ids,
                                        encoder_positions,
                                        inputs_embeds=inputs_embeds)
    return hidden_states

get_input_embeddings

get_input_embeddings(
    input_ids: Tensor,
    multimodal_embeddings: Optional[
        MultiModalEmbeddings
    ] = None,
) -> Tensor
Source code in vllm/model_executor/models/florence2.py
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
    inputs_embeds = self.language_model.get_input_embeddings(input_ids)
    if multimodal_embeddings is not None \
        and len(multimodal_embeddings) != 0:
        inputs_embeds = merge_multimodal_embeddings(
            input_ids, inputs_embeds, multimodal_embeddings,
            self.pad_token_id)
    return inputs_embeds

get_language_model

get_language_model() -> Module
Source code in vllm/model_executor/models/florence2.py
def get_language_model(self) -> torch.nn.Module:
    return self.language_model

get_multimodal_embeddings

get_multimodal_embeddings(
    **kwargs: object,
) -> MultiModalEmbeddings
Source code in vllm/model_executor/models/florence2.py
def get_multimodal_embeddings(self,
                              **kwargs: object) -> MultiModalEmbeddings:
    image_input = self._parse_and_validate_image_input(**kwargs)
    if image_input is None:
        return []
    vision_embeddings = self._process_image_input(image_input)
    return vision_embeddings

get_placeholder_str classmethod

get_placeholder_str(modality: str, i: int) -> Optional[str]
Source code in vllm/model_executor/models/florence2.py
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
    if modality.startswith("image"):
        return None

    raise ValueError("Only image modality is supported")

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]
Source code in vllm/model_executor/models/florence2.py
def load_weights(self, weights: Iterable[tuple[str,
                                               torch.Tensor]]) -> set[str]:
    loader = AutoWeightsLoader(self)
    return loader.load_weights(weights)
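
End to end, the model is driven through vLLM's usual multi-modal offline API. A minimal sketch, assuming a vLLM build with Florence-2 support; the checkpoint name, tokenizer override, and task-prompt string are illustrative and not prescribed by this module:

from PIL import Image
from vllm import LLM, SamplingParams

# Illustrative settings; Florence-2 checkpoints ship custom code and a BART tokenizer.
llm = LLM(
    model="microsoft/Florence-2-large",
    tokenizer="facebook/bart-large",
    trust_remote_code=True,
    limit_mm_per_prompt={"image": 1},
)

image = Image.open("example.jpg").convert("RGB")
outputs = llm.generate(
    {"prompt": "<CAPTION>", "multi_modal_data": {"image": image}},
    SamplingParams(temperature=0, max_tokens=128),
)
print(outputs[0].outputs[0].text)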

Florence2ImagePixelInputs

Bases: TypedDict

Source code in vllm/model_executor/models/florence2.py
class Florence2ImagePixelInputs(TypedDict):
    type: Literal["pixel_values"]
    data: torch.Tensor
    """Shape: (batch_size, num_channel, height, width)"""

data instance-attribute

data: Tensor

Shape: (batch_size, num_channel, height, width)

type instance-attribute

type: Literal['pixel_values']

Florence2LanguageForConditionalGeneration

Bases: Module, SupportsV0Only

Source code in vllm/model_executor/models/florence2.py
class Florence2LanguageForConditionalGeneration(nn.Module, SupportsV0Only):

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()

        config = vllm_config.model_config.hf_config

        self.config = config
        self.model = Florence2LanguageModel(vllm_config=vllm_config,
                                            prefix=f"{prefix}.model")
        embed_scale = math.sqrt(
            config.d_model) if config.scale_embedding else 1.0

        self.vocab_size = config.vocab_size
        self.lm_head = BartParallelLMHead(self.vocab_size,
                                          config.d_model,
                                          embed_scale=embed_scale)

        self.logits_processor = LogitsProcessor(self.vocab_size,
                                                config.vocab_size)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        encoder_input_ids: torch.Tensor,
        encoder_positions: torch.Tensor,
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> torch.Tensor:
        r"""
        Args:
            input_ids
                torch.Tensor of *decoder* input token ids.
            positions
                torch.Tensor of *decoder* position indices.
            encoder_input_ids
                torch.Tensor of *encoder* input token ids.
            encoder_positions
                torch.Tensor of *encoder* position indices
        Returns:
            Output torch.Tensor
        """

        return self.model(input_ids,
                          positions,
                          encoder_input_ids,
                          encoder_positions,
                          inputs_embeds=inputs_embeds)

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.model.encoder.embed_tokens(input_ids)

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
        ]

        params_dict = dict(self.named_parameters())
        loaded_params: set[str] = set()
        for name, loaded_weight in weights:
            for (param_name, weight_name, shard_id) in stacked_params_mapping:
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)
                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                if "final_logits_bias" in name:
                    continue
                if self.config.tie_word_embeddings and "embed_tokens" in name:
                    continue
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                weight_loader(param, loaded_weight)
            loaded_params.add(name)
        return loaded_params

config instance-attribute

config = config

lm_head instance-attribute

lm_head = BartParallelLMHead(
    vocab_size, d_model, embed_scale=embed_scale
)

logits_processor instance-attribute

logits_processor = LogitsProcessor(vocab_size, vocab_size)

model instance-attribute

model = Florence2LanguageModel(
    vllm_config=vllm_config, prefix=f"{prefix}.model"
)

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/florence2.py
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    super().__init__()

    config = vllm_config.model_config.hf_config

    self.config = config
    self.model = Florence2LanguageModel(vllm_config=vllm_config,
                                        prefix=f"{prefix}.model")
    embed_scale = math.sqrt(
        config.d_model) if config.scale_embedding else 1.0

    self.vocab_size = config.vocab_size
    self.lm_head = BartParallelLMHead(self.vocab_size,
                                      config.d_model,
                                      embed_scale=embed_scale)

    self.logits_processor = LogitsProcessor(self.vocab_size,
                                            config.vocab_size)

compute_logits

compute_logits(
    hidden_states: Tensor,
    sampling_metadata: SamplingMetadata,
) -> Optional[Tensor]
Source code in vllm/model_executor/models/florence2.py
def compute_logits(
    self,
    hidden_states: torch.Tensor,
    sampling_metadata: SamplingMetadata,
) -> Optional[torch.Tensor]:
    logits = self.logits_processor(self.lm_head, hidden_states,
                                   sampling_metadata)
    return logits

forward

forward(
    input_ids: Tensor,
    positions: Tensor,
    encoder_input_ids: Tensor,
    encoder_positions: Tensor,
    inputs_embeds: Optional[Tensor] = None,
    **kwargs,
) -> Tensor

Returns: Output torch.Tensor

Source code in vllm/model_executor/models/florence2.py
def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    encoder_input_ids: torch.Tensor,
    encoder_positions: torch.Tensor,
    inputs_embeds: Optional[torch.Tensor] = None,
    **kwargs,
) -> torch.Tensor:
    r"""
    Args:
        input_ids
            torch.Tensor of *decoder* input token ids.
        positions
            torch.Tensor of *decoder* position indices.
        encoder_input_ids
            torch.Tensor of *encoder* input token ids.
        encoder_positions
            torch.Tensor of *encoder* position indices
    Returns:
        Output torch.Tensor
    """

    return self.model(input_ids,
                      positions,
                      encoder_input_ids,
                      encoder_positions,
                      inputs_embeds=inputs_embeds)

get_input_embeddings

get_input_embeddings(input_ids: Tensor) -> Tensor
Source code in vllm/model_executor/models/florence2.py
def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
    return self.model.encoder.embed_tokens(input_ids)

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]
Source code in vllm/model_executor/models/florence2.py
def load_weights(self, weights: Iterable[tuple[str,
                                               torch.Tensor]]) -> set[str]:
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
    ]

    params_dict = dict(self.named_parameters())
    loaded_params: set[str] = set()
    for name, loaded_weight in weights:
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            if weight_name not in name:
                continue
            name = name.replace(weight_name, param_name)
            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)
            break
        else:
            if "final_logits_bias" in name:
                continue
            if self.config.tie_word_embeddings and "embed_tokens" in name:
                continue
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)
        loaded_params.add(name)
    return loaded_params
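
Weight loading folds the checkpoint's separate q/k/v projection weights into the fused qkv_proj parameter by rewriting parameter names and passing a shard id to the parameter's weight loader. A standalone sketch of the renaming step only (the checkpoint name is illustrative):

# Name-rewrite step used when folding q/k/v checkpoint weights into qkv_proj.
stacked_params_mapping = [
    ("qkv_proj", "q_proj", "q"),
    ("qkv_proj", "k_proj", "k"),
    ("qkv_proj", "v_proj", "v"),
]

name = "model.encoder.layers.0.self_attn.k_proj.weight"   # illustrative checkpoint name
for param_name, weight_name, shard_id in stacked_params_mapping:
    if weight_name in name:
        print(name.replace(weight_name, param_name), "-> shard", shard_id)
        break
# model.encoder.layers.0.self_attn.qkv_proj.weight -> shard k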

Florence2LanguageModel

Bases: Module

Source code in vllm/model_executor/models/florence2.py
class Florence2LanguageModel(nn.Module):

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()

        config = vllm_config.model_config.hf_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config

        self.config = config

        self.vocab_size = config.vocab_size

        self.shared = BartScaledWordEmbedding(self.vocab_size, config.d_model)
        self.encoder = BartEncoder(config,
                                   cache_config=cache_config,
                                   quant_config=quant_config,
                                   prefix=f"{prefix}.encoder")
        self.decoder = BartDecoder(config,
                                   cache_config=cache_config,
                                   quant_config=quant_config,
                                   prefix=f"{prefix}.decoder")

        if self.config.tie_word_embeddings:
            self.encoder.embed_tokens.weight = self.shared.weight
            self.decoder.embed_tokens.weight = self.shared.weight

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        encoder_input_ids: torch.Tensor,
        encoder_positions: torch.Tensor,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        r"""
        Args:
            input_ids
                Indices of *decoder* input sequence tokens in the vocabulary.
                Padding will be ignored by default should you
                provide it.
            positions
                Positions of *decoder* input sequence tokens.
            encoder_input_ids
                Indices of *encoder* input sequence tokens in the vocabulary.
            encoder_positions:
                Positions of *encoder* input sequence tokens.
        Returns:
            Model output torch.Tensor
        """

        encoder_hidden_states = None

        if inputs_embeds is not None or encoder_input_ids.numel() > 0:
            # Run encoder attention if a non-zero number of encoder tokens
            # are provided as input
            encoder_hidden_states = self.encoder(input_ids=encoder_input_ids,
                                                 positions=encoder_positions,
                                                 inputs_embeds=inputs_embeds)

        # decoder outputs consists of
        # (dec_features, past_key_value, dec_hidden, dec_attn)
        decoder_outputs = self.decoder(
            decoder_input_ids=input_ids,
            decoder_positions=positions,
            encoder_hidden_states=encoder_hidden_states)

        return decoder_outputs

config instance-attribute

config = config

decoder instance-attribute

decoder = BartDecoder(
    config,
    cache_config=cache_config,
    quant_config=quant_config,
    prefix=f"{prefix}.decoder",
)

encoder instance-attribute

encoder = BartEncoder(
    config,
    cache_config=cache_config,
    quant_config=quant_config,
    prefix=f"{prefix}.encoder",
)

shared instance-attribute

shared = BartScaledWordEmbedding(vocab_size, d_model)

vocab_size instance-attribute

vocab_size = vocab_size

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/florence2.py
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
    super().__init__()

    config = vllm_config.model_config.hf_config
    cache_config = vllm_config.cache_config
    quant_config = vllm_config.quant_config

    self.config = config

    self.vocab_size = config.vocab_size

    self.shared = BartScaledWordEmbedding(self.vocab_size, config.d_model)
    self.encoder = BartEncoder(config,
                               cache_config=cache_config,
                               quant_config=quant_config,
                               prefix=f"{prefix}.encoder")
    self.decoder = BartDecoder(config,
                               cache_config=cache_config,
                               quant_config=quant_config,
                               prefix=f"{prefix}.decoder")

    if self.config.tie_word_embeddings:
        self.encoder.embed_tokens.weight = self.shared.weight
        self.decoder.embed_tokens.weight = self.shared.weight

forward

forward(
    input_ids: Tensor,
    positions: Tensor,
    encoder_input_ids: Tensor,
    encoder_positions: Tensor,
    inputs_embeds: Optional[Tensor] = None,
) -> Tensor

Parameters:

Name              | Type   | Description                                  | Default
encoder_positions | Tensor | Positions of encoder input sequence tokens.  | required

Returns: Model output torch.Tensor

Source code in vllm/model_executor/models/florence2.py
def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    encoder_input_ids: torch.Tensor,
    encoder_positions: torch.Tensor,
    inputs_embeds: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    r"""
    Args:
        input_ids
            Indices of *decoder* input sequence tokens in the vocabulary.
            Padding will be ignored by default should you
            provide it.
        positions
            Positions of *decoder* input sequence tokens.
        encoder_input_ids
            Indices of *encoder* input sequence tokens in the vocabulary.
        encoder_positions:
            Positions of *encoder* input sequence tokens.
    Returns:
        Model output torch.Tensor
    """

    encoder_hidden_states = None

    if inputs_embeds is not None or encoder_input_ids.numel() > 0:
        # Run encoder attention if a non-zero number of encoder tokens
        # are provided as input
        encoder_hidden_states = self.encoder(input_ids=encoder_input_ids,
                                             positions=encoder_positions,
                                             inputs_embeds=inputs_embeds)

    # decoder outputs consists of
    # (dec_features, past_key_value, dec_hidden, dec_attn)
    decoder_outputs = self.decoder(
        decoder_input_ids=input_ids,
        decoder_positions=positions,
        encoder_hidden_states=encoder_hidden_states)

    return decoder_outputs
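
The encoder call above is gated on whether any encoder tokens (or precomputed embeddings) are present, which is what separates the prefill step from later decode steps. A minimal sketch of that gating logic with toy tensors; needs_encoder_run is a hypothetical helper, not part of vLLM:

import torch
from typing import Optional

def needs_encoder_run(encoder_input_ids: torch.Tensor,
                      inputs_embeds: Optional[torch.Tensor]) -> bool:
    # Mirrors the condition in forward(): run the encoder only when encoder
    # tokens or embeddings are actually provided.
    return inputs_embeds is not None or encoder_input_ids.numel() > 0

print(needs_encoder_run(torch.tensor([0, 250, 2]), None))         # prefill -> True
print(needs_encoder_run(torch.empty(0, dtype=torch.long), None))  # decode  -> False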

Florence2MultiModalProcessor

Bases: EncDecMultiModalProcessor[Florence2ProcessingInfo]

Source code in vllm/model_executor/models/florence2.py
class Florence2MultiModalProcessor(
        EncDecMultiModalProcessor[Florence2ProcessingInfo]):

    def _hf_processor_applies_updates(
        self,
        prompt_text: str,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        tokenization_kwargs: Mapping[str, object],
    ) -> bool:
        return False

    def create_encoder_prompt(
        self,
        prompt: Union[str, list[int]],
        mm_data: MultiModalDataDict,
    ) -> Union[str, list[int]]:
        return prompt

    def create_decoder_prompt(
        self,
        prompt: Union[str, list[int]],
        mm_data: MultiModalDataDict,
    ) -> Union[str, list[int]]:
        return [self.info.get_hf_config().eos_token_id]

    def _apply_hf_processor_tokens_only(
        self,
        prompt_tokens: list[int],
    ) -> list[int]:
        hf_processor = self.info.get_hf_processor()
        tokenizer: BartTokenizer = hf_processor.tokenizer
        prompt_text = tokenizer.decode(prompt_tokens)
        # convert task tokens to prompt
        prompt_text = hf_processor._construct_prompts([prompt_text])[0]
        prompt_tokens = tokenizer.encode(prompt_text, add_special_tokens=False)
        return prompt_tokens

    def _call_hf_processor(
        self,
        prompt: str,
        mm_data: Mapping[str, object],
        mm_kwargs: Mapping[str, object],
        tok_kwargs: Mapping[str, object],
    ) -> BatchFeature:
        if mm_data:
            processed_outputs = super()._call_hf_processor(
                prompt, mm_data, mm_kwargs, tok_kwargs)
        else:
            hf_processor = self.info.get_hf_processor()
            tokenizer = hf_processor.tokenizer
            prompt = hf_processor._construct_prompts([prompt])[0]
            processed_outputs = tokenizer(prompt,
                                          add_special_tokens=True,
                                          return_tensors="pt")
        return processed_outputs

    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        return dict(pixel_values=MultiModalFieldConfig.batched("image"))

    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargs,
    ) -> Sequence[PromptUpdate]:
        hf_config = self.info.get_hf_config()
        pad_token_id = hf_config.pad_token_id
        num_image_tokens = self.info.get_num_image_tokens()
        image_tokens = [pad_token_id] * num_image_tokens

        return [
            PromptInsertion(
                modality="image",
                target=PromptIndexTargets.start(),
                insertion=image_tokens,
            )
        ]

_apply_hf_processor_tokens_only

_apply_hf_processor_tokens_only(
    prompt_tokens: list[int],
) -> list[int]
Source code in vllm/model_executor/models/florence2.py
def _apply_hf_processor_tokens_only(
    self,
    prompt_tokens: list[int],
) -> list[int]:
    hf_processor = self.info.get_hf_processor()
    tokenizer: BartTokenizer = hf_processor.tokenizer
    prompt_text = tokenizer.decode(prompt_tokens)
    # convert task tokens to prompt
    prompt_text = hf_processor._construct_prompts([prompt_text])[0]
    prompt_tokens = tokenizer.encode(prompt_text, add_special_tokens=False)
    return prompt_tokens
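
As a rough illustration of the round trip above, the same decode → _construct_prompts → encode sequence can be exercised directly against the HF Florence-2 processor. The checkpoint id and the availability of the private _construct_prompts helper are assumptions here, not guaranteed by vLLM:

from transformers import AutoProcessor

# Assumed checkpoint; any Florence-2 processor with a BART tokenizer should behave similarly.
hf_processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base",
                                             trust_remote_code=True)
tokenizer = hf_processor.tokenizer

prompt_tokens = tokenizer.encode("<CAPTION>", add_special_tokens=False)
prompt_text = tokenizer.decode(prompt_tokens)
# Task tokens such as <CAPTION> are expanded into a natural-language prompt here.
prompt_text = hf_processor._construct_prompts([prompt_text])[0]
prompt_tokens = tokenizer.encode(prompt_text, add_special_tokens=False)
print(prompt_text)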

_call_hf_processor

_call_hf_processor(
    prompt: str,
    mm_data: Mapping[str, object],
    mm_kwargs: Mapping[str, object],
    tok_kwargs: Mapping[str, object],
) -> BatchFeature
Source code in vllm/model_executor/models/florence2.py
def _call_hf_processor(
    self,
    prompt: str,
    mm_data: Mapping[str, object],
    mm_kwargs: Mapping[str, object],
    tok_kwargs: Mapping[str, object],
) -> BatchFeature:
    if mm_data:
        processed_outputs = super()._call_hf_processor(
            prompt, mm_data, mm_kwargs, tok_kwargs)
    else:
        hf_processor = self.info.get_hf_processor()
        tokenizer = hf_processor.tokenizer
        prompt = hf_processor._construct_prompts([prompt])[0]
        processed_outputs = tokenizer(prompt,
                                      add_special_tokens=True,
                                      return_tensors="pt")
    return processed_outputs

_get_mm_fields_config

_get_mm_fields_config(
    hf_inputs: BatchFeature,
    hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]
Source code in vllm/model_executor/models/florence2.py
def _get_mm_fields_config(
    self,
    hf_inputs: BatchFeature,
    hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
    return dict(pixel_values=MultiModalFieldConfig.batched("image"))

_get_prompt_updates

_get_prompt_updates(
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    out_mm_kwargs: MultiModalKwargs,
) -> Sequence[PromptUpdate]
Source code in vllm/model_executor/models/florence2.py
def _get_prompt_updates(
    self,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    out_mm_kwargs: MultiModalKwargs,
) -> Sequence[PromptUpdate]:
    hf_config = self.info.get_hf_config()
    pad_token_id = hf_config.pad_token_id
    num_image_tokens = self.info.get_num_image_tokens()
    image_tokens = [pad_token_id] * num_image_tokens

    return [
        PromptInsertion(
            modality="image",
            target=PromptIndexTargets.start(),
            insertion=image_tokens,
        )
    ]
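
The PromptInsertion above prepends num_image_tokens copies of the pad token at the very start of the encoder prompt, where they serve as placeholders for the image feature embeddings. A toy sketch of the resulting token layout; pad_token_id, num_image_tokens, and the text tokens are made-up example values:

pad_token_id = 1               # assumed example value
num_image_tokens = 577         # assumed example value
text_tokens = [0, 13841, 2]    # hypothetical encoder text prompt

encoder_prompt = [pad_token_id] * num_image_tokens + text_tokens
assert encoder_prompt[:num_image_tokens] == [pad_token_id] * num_image_tokens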

_hf_processor_applies_updates

_hf_processor_applies_updates(
    prompt_text: str,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    tokenization_kwargs: Mapping[str, object],
) -> bool
Source code in vllm/model_executor/models/florence2.py
def _hf_processor_applies_updates(
    self,
    prompt_text: str,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    tokenization_kwargs: Mapping[str, object],
) -> bool:
    return False

create_decoder_prompt

create_decoder_prompt(
    prompt: Union[str, list[int]],
    mm_data: MultiModalDataDict,
) -> Union[str, list[int]]
Source code in vllm/model_executor/models/florence2.py
def create_decoder_prompt(
    self,
    prompt: Union[str, list[int]],
    mm_data: MultiModalDataDict,
) -> Union[str, list[int]]:
    return [self.info.get_hf_config().eos_token_id]

create_encoder_prompt

create_encoder_prompt(
    prompt: Union[str, list[int]],
    mm_data: MultiModalDataDict,
) -> Union[str, list[int]]
Source code in vllm/model_executor/models/florence2.py
def create_encoder_prompt(
    self,
    prompt: Union[str, list[int]],
    mm_data: MultiModalDataDict,
) -> Union[str, list[int]]:
    return prompt

Florence2ProcessingInfo

Bases: BaseProcessingInfo

Source code in vllm/model_executor/models/florence2.py
class Florence2ProcessingInfo(BaseProcessingInfo):

    def get_hf_config(self):
        return self.ctx.get_hf_config()

    def get_hf_processor(self):
        return self.ctx.get_hf_processor()

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": 1}

    def get_num_image_tokens(self) -> int:
        processor_config = self.ctx.get_hf_image_processor_config()
        return processor_config["image_seq_length"]

get_hf_config

get_hf_config()
Source code in vllm/model_executor/models/florence2.py
def get_hf_config(self):
    return self.ctx.get_hf_config()

get_hf_processor

get_hf_processor()
Source code in vllm/model_executor/models/florence2.py
def get_hf_processor(self):
    return self.ctx.get_hf_processor()

get_num_image_tokens

get_num_image_tokens() -> int
Source code in vllm/model_executor/models/florence2.py
def get_num_image_tokens(self) -> int:
    processor_config = self.ctx.get_hf_image_processor_config()
    return processor_config["image_seq_length"]

get_supported_mm_limits

get_supported_mm_limits() -> Mapping[str, Optional[int]]
Source code in vllm/model_executor/models/florence2.py
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
    return {"image": 1}

LearnedAbsolutePositionEmbedding2D

Bases: Module

This module learns positional embeddings up to a fixed maximum size.

Source code in vllm/model_executor/models/florence2.py
class LearnedAbsolutePositionEmbedding2D(nn.Module):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, embedding_dim=256, num_pos=50):
        super().__init__()
        self.row_embeddings = nn.Embedding(num_pos, embedding_dim // 2)
        self.column_embeddings = nn.Embedding(
            num_pos, embedding_dim - (embedding_dim // 2))

    def forward(self, pixel_values):
        """
        pixel_values: (batch_size, height, width, num_channels) 
        returns: (batch_size, height, width, embedding_dim * 2)
        """
        if len(pixel_values.shape) != 4:
            raise ValueError('pixel_values must be a 4D tensor')
        height, width = pixel_values.shape[1:3]
        width_values = torch.arange(width, device=pixel_values.device)
        height_values = torch.arange(height, device=pixel_values.device)
        x_emb = self.column_embeddings(width_values)
        y_emb = self.row_embeddings(height_values)
        # (height, width, embedding_dim * 2)
        pos = torch.cat([
            x_emb.unsqueeze(0).repeat(height, 1, 1),
            y_emb.unsqueeze(1).repeat(1, width, 1)
        ],
                        dim=-1)
        # (embedding_dim * 2, height, width)
        pos = pos.permute(2, 0, 1)
        pos = pos.unsqueeze(0)
        # (batch_size, embedding_dim * 2, height, width)
        pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)
        # (batch_size, height, width, embedding_dim * 2)
        pos = pos.permute(0, 2, 3, 1)
        return pos

column_embeddings instance-attribute

column_embeddings = Embedding(
    num_pos, embedding_dim - embedding_dim // 2
)

row_embeddings instance-attribute

row_embeddings = Embedding(num_pos, embedding_dim // 2)

__init__

__init__(embedding_dim=256, num_pos=50)
Source code in vllm/model_executor/models/florence2.py
def __init__(self, embedding_dim=256, num_pos=50):
    super().__init__()
    self.row_embeddings = nn.Embedding(num_pos, embedding_dim // 2)
    self.column_embeddings = nn.Embedding(
        num_pos, embedding_dim - (embedding_dim // 2))

forward

forward(pixel_values)

pixel_values: (batch_size, height, width, num_channels)
returns: (batch_size, height, width, embedding_dim * 2)

Source code in vllm/model_executor/models/florence2.py
def forward(self, pixel_values):
    """
    pixel_values: (batch_size, height, width, num_channels) 
    returns: (batch_size, height, width, embedding_dim * 2)
    """
    if len(pixel_values.shape) != 4:
        raise ValueError('pixel_values must be a 4D tensor')
    height, width = pixel_values.shape[1:3]
    width_values = torch.arange(width, device=pixel_values.device)
    height_values = torch.arange(height, device=pixel_values.device)
    x_emb = self.column_embeddings(width_values)
    y_emb = self.row_embeddings(height_values)
    # (height, width, embedding_dim * 2)
    pos = torch.cat([
        x_emb.unsqueeze(0).repeat(height, 1, 1),
        y_emb.unsqueeze(1).repeat(1, width, 1)
    ],
                    dim=-1)
    # (embedding_dim * 2, height, width)
    pos = pos.permute(2, 0, 1)
    pos = pos.unsqueeze(0)
    # (batch_size, embedding_dim * 2, height, width)
    pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)
    # (batch_size, height, width, embedding_dim * 2)
    pos = pos.permute(0, 2, 3, 1)
    return pos
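
A minimal shape check for this module, assuming it can be imported from vllm.model_executor.models.florence2. Note that the concatenated row and column halves give a last dimension equal to embedding_dim (256 below), even though the docstring describes it as embedding_dim * 2:

import torch
from vllm.model_executor.models.florence2 import LearnedAbsolutePositionEmbedding2D

pos_embed = LearnedAbsolutePositionEmbedding2D(embedding_dim=256, num_pos=50)
pixel_values = torch.zeros(2, 24, 24, 3)   # (batch_size, height, width, num_channels)
pos = pos_embed(pixel_values)
print(pos.shape)                           # torch.Size([2, 24, 24, 256])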

Mlp

Bases: Module

Source code in vllm/model_executor/models/florence2.py
class Mlp(nn.Module):

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
    ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.net = nn.Sequential(
            OrderedDict([("fc1", nn.Linear(in_features, hidden_features)),
                         ("act", act_layer()),
                         ("fc2", nn.Linear(hidden_features, out_features))]))

    def forward(self, x, size):
        return self.net(x), size

net instance-attribute

net = Sequential(
    OrderedDict(
        [
            ("fc1", Linear(in_features, hidden_features)),
            ("act", act_layer()),
            ("fc2", Linear(hidden_features, out_features)),
        ]
    )
)

__init__

__init__(
    in_features,
    hidden_features=None,
    out_features=None,
    act_layer=GELU,
)
Source code in vllm/model_executor/models/florence2.py
def __init__(
    self,
    in_features,
    hidden_features=None,
    out_features=None,
    act_layer=nn.GELU,
):
    super().__init__()
    out_features = out_features or in_features
    hidden_features = hidden_features or in_features
    self.net = nn.Sequential(
        OrderedDict([("fc1", nn.Linear(in_features, hidden_features)),
                     ("act", act_layer()),
                     ("fc2", nn.Linear(hidden_features, out_features))]))

forward

forward(x, size)
Source code in vllm/model_executor/models/florence2.py
def forward(self, x, size):
    return self.net(x), size

MySequential

Bases: Sequential

Source code in vllm/model_executor/models/florence2.py
class MySequential(nn.Sequential):

    def forward(self, *inputs):
        for module in self._modules.values():
            if isinstance(inputs, tuple):
                inputs = module(*inputs)
            else:
                inputs = module(inputs)
        return inputs

forward

forward(*inputs)
Source code in vllm/model_executor/models/florence2.py
def forward(self, *inputs):
    for module in self._modules.values():
        if isinstance(inputs, tuple):
            inputs = module(*inputs)
        else:
            inputs = module(inputs)
    return inputs
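
MySequential exists because the blocks in this file return (x, size) tuples rather than a single tensor; the tuple is unpacked into the next module's positional arguments. A small sketch using the Mlp class defined above (dimensions are arbitrary example values):

import torch
from vllm.model_executor.models.florence2 import Mlp, MySequential

blocks = MySequential(
    Mlp(in_features=32, hidden_features=64),
    Mlp(in_features=32, hidden_features=64),
)
x = torch.randn(2, 49, 32)      # (batch, tokens, channels)
x, size = blocks(x, (7, 7))     # each Mlp returns (x, size); MySequential re-unpacks it
print(x.shape, size)            # torch.Size([2, 49, 32]) (7, 7)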

PositionalEmbeddingCosine1D

Bases: Module

This class implements a very simple positional encoding. It follows closely the encoder from https://pytorch.org/tutorials/beginner/translation_transformer.html

Args:
    embed_dim: The dimension of the embeddings.
    max_seq_len: The maximum length to precompute the positional encodings.

Source code in vllm/model_executor/models/florence2.py
class PositionalEmbeddingCosine1D(nn.Module):
    """
    This class implements a very simple positional encoding. It follows closely
    the encoder from the link below:
    https://pytorch.org/tutorials/beginner/translation_transformer.html
    Args:
        embed_dim: The dimension of the embeddings.
        dropout_prob: The dropout probability.
        max_seq_len: The maximum length to precompute the positional encodings.
    """

    def __init__(self, embed_dim: int = 512, max_seq_len: int = 1024) -> None:
        super().__init__()
        self.embed_dim = embed_dim
        self.max_seq_len = max_seq_len
        # Generate the sinusoidal arrays.
        factor = math.log(10000)
        denominator = torch.exp(-factor * torch.arange(0, self.embed_dim, 2) /
                                self.embed_dim)
        # Matrix where rows correspond to a positional embedding as a function
        # of the position index (i.e., the row index).
        frequencies = \
            torch.arange(0, self.max_seq_len) \
            .reshape(self.max_seq_len, 1) * denominator
        pos_idx_to_embed = torch.zeros((self.max_seq_len, self.embed_dim))
        # Populate uneven entries.
        pos_idx_to_embed[:, 0::2] = torch.sin(frequencies)
        pos_idx_to_embed[:, 1::2] = torch.cos(frequencies)
        # Save the positional embeddings in a constant buffer.
        # self.register_buffer("pos_idx_to_embed", pos_idx_to_embed)
        self.pos_idx_to_embed = nn.Parameter(pos_idx_to_embed,
                                             requires_grad=False)

    def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor:
        """
        Args:
            seq_embeds: The sequence embeddings in order. Allowed size:
                1. [T, D], where T is the length of the sequence, and D is the
                frame embedding dimension.
                2. [B, T, D], where B is the batch size and T and D are the
                same as above.
        Returns a tensor of with the same dimensions as the input: i.e.,
        [1, T, D] or [T, D].
        """
        shape_len = len(seq_embeds.shape)
        assert 2 <= shape_len <= 3
        len_seq = seq_embeds.size(-2)
        assert len_seq <= self.max_seq_len
        pos_embeds = self.pos_idx_to_embed[0:seq_embeds.size(-2), :]
        # Adapt pre-computed positional embeddings to the input.
        if shape_len == 3:
            pos_embeds = pos_embeds.view(
                (1, pos_embeds.size(0), pos_embeds.size(1)))
        return pos_embeds

embed_dim instance-attribute

embed_dim = embed_dim

max_seq_len instance-attribute

max_seq_len = max_seq_len

pos_idx_to_embed instance-attribute

pos_idx_to_embed = Parameter(
    pos_idx_to_embed, requires_grad=False
)

__init__

__init__(
    embed_dim: int = 512, max_seq_len: int = 1024
) -> None
Source code in vllm/model_executor/models/florence2.py
def __init__(self, embed_dim: int = 512, max_seq_len: int = 1024) -> None:
    super().__init__()
    self.embed_dim = embed_dim
    self.max_seq_len = max_seq_len
    # Generate the sinusoidal arrays.
    factor = math.log(10000)
    denominator = torch.exp(-factor * torch.arange(0, self.embed_dim, 2) /
                            self.embed_dim)
    # Matrix where rows correspond to a positional embedding as a function
    # of the position index (i.e., the row index).
    frequencies = \
        torch.arange(0, self.max_seq_len) \
        .reshape(self.max_seq_len, 1) * denominator
    pos_idx_to_embed = torch.zeros((self.max_seq_len, self.embed_dim))
    # Populate uneven entries.
    pos_idx_to_embed[:, 0::2] = torch.sin(frequencies)
    pos_idx_to_embed[:, 1::2] = torch.cos(frequencies)
    # Save the positional embeddings in a constant buffer.
    # self.register_buffer("pos_idx_to_embed", pos_idx_to_embed)
    self.pos_idx_to_embed = nn.Parameter(pos_idx_to_embed,
                                         requires_grad=False)

forward

forward(seq_embeds: Tensor) -> Tensor

Parameters:

Name        Type    Description                                              Default
seq_embeds  Tensor  The sequence embeddings in order. Allowed sizes:
                    [T, D], where T is the sequence length and D is the
                    frame embedding dimension, or [B, T, D], where B is
                    the batch size and T and D are the same as above.        required

Returns a tensor with the same dimensions as the input: i.e., [1, T, D] or [T, D].

Source code in vllm/model_executor/models/florence2.py
def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor:
    """
    Args:
        seq_embeds: The sequence embeddings in order. Allowed size:
            1. [T, D], where T is the length of the sequence, and D is the
            frame embedding dimension.
            2. [B, T, D], where B is the batch size and T and D are the
            same as above.
    Returns a tensor of with the same dimensions as the input: i.e.,
    [1, T, D] or [T, D].
    """
    shape_len = len(seq_embeds.shape)
    assert 2 <= shape_len <= 3
    len_seq = seq_embeds.size(-2)
    assert len_seq <= self.max_seq_len
    pos_embeds = self.pos_idx_to_embed[0:seq_embeds.size(-2), :]
    # Adapt pre-computed positional embeddings to the input.
    if shape_len == 3:
        pos_embeds = pos_embeds.view(
            (1, pos_embeds.size(0), pos_embeds.size(1)))
    return pos_embeds
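
A brief usage sketch (import path assumed from the source file above). forward returns only the positional embeddings, broadcastable against the input, so the caller adds them to seq_embeds itself:

import torch
from vllm.model_executor.models.florence2 import PositionalEmbeddingCosine1D

pos_enc = PositionalEmbeddingCosine1D(embed_dim=512, max_seq_len=1024)
seq_embeds = torch.randn(4, 100, 512)   # (batch, seq_len, embed_dim)
pos = pos_enc(seq_embeds)               # shape (1, 100, 512)
seq_embeds = seq_embeds + pos           # broadcast over the batch dimension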

PreNorm

Bases: Module

Source code in vllm/model_executor/models/florence2.py
class PreNorm(nn.Module):

    def __init__(self, norm, fn):
        super().__init__()
        self.norm = norm
        self.fn = fn

    def forward(self, x, *args, **kwargs):
        shortcut = x
        if self.norm is not None:
            x, size = self.fn(self.norm(x), *args, **kwargs)
        else:
            x, size = self.fn(x, *args, **kwargs)

        x = shortcut + x

        return x, size

fn instance-attribute

fn = fn

norm instance-attribute

norm = norm

__init__

__init__(norm, fn)
Source code in vllm/model_executor/models/florence2.py
def __init__(self, norm, fn):
    super().__init__()
    self.norm = norm
    self.fn = fn

forward

forward(x, *args, **kwargs)
Source code in vllm/model_executor/models/florence2.py
def forward(self, x, *args, **kwargs):
    shortcut = x
    if self.norm is not None:
        x, size = self.fn(self.norm(x), *args, **kwargs)
    else:
        x, size = self.fn(x, *args, **kwargs)

    x = shortcut + x

    return x, size
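
PreNorm is a thin residual wrapper: it optionally normalizes the input, calls a (x, size) -> (x, size) module, and adds the shortcut back. A short sketch combining it with the Mlp class above (sizes are example values):

import torch
import torch.nn as nn
from vllm.model_executor.models.florence2 import Mlp, PreNorm

block = PreNorm(nn.LayerNorm(32), Mlp(in_features=32, hidden_features=64))
x = torch.randn(2, 49, 32)
y, size = block(x, (7, 7))   # y = x + Mlp(LayerNorm(x)); size passes through unchanged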

SpatialBlock

Bases: Module

Source code in vllm/model_executor/models/florence2.py
class SpatialBlock(nn.Module):

    def __init__(self,
                 dim,
                 num_heads,
                 window_size,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 drop_path_rate=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm,
                 conv_at_attn=True,
                 conv_at_ffn=True):
        super().__init__()

        self.conv1 = PreNorm(None, DepthWiseConv2d(
            dim, 3, 1, 1)) if conv_at_attn else None
        self.window_attn = PreNorm(
            norm_layer(dim),
            WindowAttention(dim, num_heads, window_size, qkv_bias=qkv_bias),
        )
        self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1,
                                                   1)) if conv_at_ffn else None
        self.ffn = PreNorm(
            norm_layer(dim),
            Mlp(in_features=dim,
                hidden_features=int(dim * mlp_ratio),
                act_layer=act_layer),
        )

    def forward(self, x, size):
        if self.conv1:
            x, size = self.conv1(x, size)
        x, size = self.window_attn(x, size)

        if self.conv2:
            x, size = self.conv2(x, size)
        x, size = self.ffn(x, size)
        return x, size

conv1 instance-attribute

conv1 = (
    PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1))
    if conv_at_attn
    else None
)

conv2 instance-attribute

conv2 = (
    PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1))
    if conv_at_ffn
    else None
)

ffn instance-attribute

ffn = PreNorm(
    norm_layer(dim),
    Mlp(
        in_features=dim,
        hidden_features=int(dim * mlp_ratio),
        act_layer=act_layer,
    ),
)

window_attn instance-attribute

window_attn = PreNorm(
    norm_layer(dim),
    WindowAttention(
        dim, num_heads, window_size, qkv_bias=qkv_bias
    ),
)

__init__

__init__(
    dim,
    num_heads,
    window_size,
    mlp_ratio=4.0,
    qkv_bias=True,
    drop_path_rate=0.0,
    act_layer=GELU,
    norm_layer=LayerNorm,
    conv_at_attn=True,
    conv_at_ffn=True,
)
Source code in vllm/model_executor/models/florence2.py
def __init__(self,
             dim,
             num_heads,
             window_size,
             mlp_ratio=4.,
             qkv_bias=True,
             drop_path_rate=0.,
             act_layer=nn.GELU,
             norm_layer=nn.LayerNorm,
             conv_at_attn=True,
             conv_at_ffn=True):
    super().__init__()

    self.conv1 = PreNorm(None, DepthWiseConv2d(
        dim, 3, 1, 1)) if conv_at_attn else None
    self.window_attn = PreNorm(
        norm_layer(dim),
        WindowAttention(dim, num_heads, window_size, qkv_bias=qkv_bias),
    )
    self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1,
                                               1)) if conv_at_ffn else None
    self.ffn = PreNorm(
        norm_layer(dim),
        Mlp(in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer),
    )

forward

forward(x, size)
Source code in vllm/model_executor/models/florence2.py
def forward(self, x, size):
    if self.conv1:
        x, size = self.conv1(x, size)
    x, size = self.window_attn(x, size)

    if self.conv2:
        x, size = self.conv2(x, size)
    x, size = self.ffn(x, size)
    return x, size

WindowAttention

Bases: Module

Source code in vllm/model_executor/models/florence2.py
class WindowAttention(nn.Module):

    def __init__(self, dim, num_heads, window_size, qkv_bias=True):

        super().__init__()
        self.dim = dim
        self.window_size = window_size
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = float(head_dim)**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.proj = nn.Linear(dim, dim)

        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, size):

        H, W = size
        B, L, C = x.shape
        assert L == H * W, "input feature has wrong size"

        x = x.view(B, H, W, C)

        pad_l = pad_t = 0
        pad_r = (self.window_size - W % self.window_size) % self.window_size
        pad_b = (self.window_size - H % self.window_size) % self.window_size
        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
        _, Hp, Wp, _ = x.shape

        x = window_partition(x, self.window_size)
        x = x.view(-1, self.window_size * self.window_size, C)

        # W-MSA/SW-MSA
        # attn_windows = self.attn(x_windows)

        B_, N, C = x.shape
        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads,
                                  C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))
        attn = self.softmax(attn)

        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)

        # merge windows
        x = x.view(-1, self.window_size, self.window_size, C)
        x = window_reverse(x, B, self.window_size, Hp, Wp)

        if pad_r > 0 or pad_b > 0:
            x = x[:, :H, :W, :].contiguous()

        x = x.view(B, H * W, C)

        return x, size

dim instance-attribute

dim = dim

num_heads instance-attribute

num_heads = num_heads

proj instance-attribute

proj = Linear(dim, dim)

qkv instance-attribute

qkv = Linear(dim, dim * 3, bias=qkv_bias)

scale instance-attribute

scale = float(head_dim) ** -0.5

softmax instance-attribute

softmax = Softmax(dim=-1)

window_size instance-attribute

window_size = window_size

__init__

__init__(dim, num_heads, window_size, qkv_bias=True)
Source code in vllm/model_executor/models/florence2.py
def __init__(self, dim, num_heads, window_size, qkv_bias=True):

    super().__init__()
    self.dim = dim
    self.window_size = window_size
    self.num_heads = num_heads
    head_dim = dim // num_heads
    self.scale = float(head_dim)**-0.5

    self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
    self.proj = nn.Linear(dim, dim)

    self.softmax = nn.Softmax(dim=-1)

forward

forward(x, size)
Source code in vllm/model_executor/models/florence2.py
def forward(self, x, size):

    H, W = size
    B, L, C = x.shape
    assert L == H * W, "input feature has wrong size"

    x = x.view(B, H, W, C)

    pad_l = pad_t = 0
    pad_r = (self.window_size - W % self.window_size) % self.window_size
    pad_b = (self.window_size - H % self.window_size) % self.window_size
    x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
    _, Hp, Wp, _ = x.shape

    x = window_partition(x, self.window_size)
    x = x.view(-1, self.window_size * self.window_size, C)

    # W-MSA/SW-MSA
    # attn_windows = self.attn(x_windows)

    B_, N, C = x.shape
    qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads,
                              C // self.num_heads).permute(2, 0, 3, 1, 4)
    q, k, v = qkv[0], qkv[1], qkv[2]

    q = q * self.scale
    attn = (q @ k.transpose(-2, -1))
    attn = self.softmax(attn)

    x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
    x = self.proj(x)

    # merge windows
    x = x.view(-1, self.window_size, self.window_size, C)
    x = window_reverse(x, B, self.window_size, Hp, Wp)

    if pad_r > 0 or pad_b > 0:
        x = x[:, :H, :W, :].contiguous()

    x = x.view(B, H * W, C)

    return x, size

window_partition

window_partition(x, window_size: int)
Source code in vllm/model_executor/models/florence2.py
def window_partition(x, window_size: int):
    B, H, W, C = x.shape
    x = x.view(B, H // window_size, window_size, W // window_size, window_size,
               C)
    windows = x.permute(0, 1, 3, 2, 4,
                        5).contiguous().view(-1, window_size, window_size, C)
    return windows

window_reverse

window_reverse(
    windows,
    batch_size: int,
    window_size: int,
    H: int,
    W: int,
)
Source code in vllm/model_executor/models/florence2.py
def window_reverse(windows, batch_size: int, window_size: int, H: int, W: int):
    B = batch_size

    x = windows.view(B, H // window_size, W // window_size, window_size,
                     window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x
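
window_partition and window_reverse are exact inverses when height and width are divisible by window_size, which is what the padding in WindowAttention guarantees before partitioning. A round-trip sketch with example shapes:

import torch
from vllm.model_executor.models.florence2 import window_partition, window_reverse

B, H, W, C, window_size = 2, 14, 14, 96, 7
x = torch.randn(B, H, W, C)

windows = window_partition(x, window_size)   # (B * H/ws * W/ws, ws, ws, C)
print(windows.shape)                         # torch.Size([8, 7, 7, 96])

restored = window_reverse(windows, B, window_size, H, W)
assert torch.equal(restored, x)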