Skip to content

vllm.transformers_utils.processors.nemotron_vl

LlamaNemotronVLEmbedProcessor

Bases: NemotronVLProcessor

Processor for LlamaNemotronVL embedding model.

Inherits from NemotronVLProcessor and specializes it for embedding tasks: - Uses SigLIP transform with normalization instead of base transform - Uses different image context token (`<IMG_CONTEXT>` vs `<image>`)

Source code in vllm/transformers_utils/processors/nemotron_vl.py
class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
    """
    Processor for the LlamaNemotronVL embedding model.

    Specializes ``NemotronVLProcessor`` for embedding tasks:
    - applies the SigLIP transform (with normalization) instead of the
      base transform
    - uses a different image context token (<IMG_CONTEXT> vs <image>)
    """

    IMG_CONTEXT = "<IMG_CONTEXT>"

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        processor_config: dict,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        # For each tiling parameter the caller left unset, fall back to the
        # processor config first and then to the model config.
        if min_dynamic_patch is None:
            config_default = getattr(config, "min_dynamic_patch", 1)
            min_dynamic_patch = processor_config.get(
                "min_input_tiles", config_default
            )
        if max_dynamic_patch is None:
            config_default = getattr(config, "max_dynamic_patch", 1)
            max_dynamic_patch = processor_config.get(
                "max_input_tiles", config_default
            )
        if dynamic_image_size is None:
            config_default = getattr(config, "dynamic_image_size", True)
            dynamic_image_size = processor_config.get(
                "dynamic_image_size", config_default
            )
        # The embedding model has no HF image processor; the parent handles
        # image_processor=None by reading use_thumbnail from the config.
        super().__init__(
            config=config,
            tokenizer=tokenizer,
            image_processor=None,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )

    def _get_transform(self) -> T.Compose:
        """Override to add SigLIP normalization."""
        return build_siglip_transform(input_size=self.image_size)

    def _replace_image_tokens(
        self,
        text: list[str],
        pixel_values_lst: list[torch.Tensor],
    ) -> list[str]:
        """Override with simpler token replacement for embedding model.

        No temporary placeholder needed because IMG_CONTEXT is <IMG_CONTEXT>,
        not <image>, so there's no collision risk.
        """
        for pixel_values in pixel_values_lst:
            patch_count = pixel_values.shape[0]
            repl = self.get_image_repl(
                patch_count * self.num_image_token, patch_count
            )
            # Consume one <image> placeholder per image, in order.
            text = [item.replace("<image>", repl.full, 1) for item in text]
        return text

_get_transform

_get_transform() -> Compose

Override to add SigLIP normalization.

Source code in vllm/transformers_utils/processors/nemotron_vl.py
def _get_transform(self) -> T.Compose:
    """Override to add SigLIP normalization."""
    # Wraps the base transform with SigLIP mean/std normalization.
    return build_siglip_transform(input_size=self.image_size)

_replace_image_tokens

_replace_image_tokens(
    text: list[str], pixel_values_lst: list[Tensor]
) -> list[str]

Override with simpler token replacement for embedding model.

No temporary placeholder needed because IMG_CONTEXT is `<IMG_CONTEXT>`, not `<image>`, so there's no collision risk.

Source code in vllm/transformers_utils/processors/nemotron_vl.py
def _replace_image_tokens(
    self,
    text: list[str],
    pixel_values_lst: list[torch.Tensor],
) -> list[str]:
    """Override with simpler token replacement for embedding model.

    No temporary placeholder needed because IMG_CONTEXT is <IMG_CONTEXT>,
    not <image>, so there's no collision risk.
    """
    for pixel_values in pixel_values_lst:
        # One tensor per image; dim 0 is the number of dynamic patches.
        num_patches = pixel_values.shape[0]
        feature_size = num_patches * self.num_image_token
        image_repl = self.get_image_repl(feature_size, num_patches)
        # Replace one <image> placeholder per image, left to right.
        text = [t.replace("<image>", image_repl.full, 1) for t in text]
    return text

NemotronVLProcessor

Bases: InternVLProcessor

Source code in vllm/transformers_utils/processors/nemotron_vl.py
class NemotronVLProcessor(InternVLProcessor):
    """Processor for Nemotron-VL models.

    Builds on the InternVL processing pipeline, computing dynamic tiling
    targets with the Nemotron-VL helpers and wrapping image features in
    <img>...</img> markers.
    """

    IMG_START = "<img>"
    IMG_END = "</img>"
    IMG_CONTEXT = "<image>"

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        image_processor: BaseImageProcessorFast,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        # Intentionally bypass InternVLProcessor.__init__; this class derives
        # its configuration differently and only needs the ABC machinery.
        ABC.__init__(self)
        self.config = config
        self.tokenizer = tokenizer
        self.image_processor = image_processor

        image_size: int = config.force_image_size
        patch_size: int = config.patch_size

        min_dynamic_patch = 1 if min_dynamic_patch is None else min_dynamic_patch
        assert isinstance(min_dynamic_patch, int)

        if max_dynamic_patch is None:
            # Default comes from the HF image processor when one is provided.
            max_dynamic_patch = self.image_processor.max_num_tiles
        assert isinstance(max_dynamic_patch, int)

        dynamic_image_size = True if dynamic_image_size is None else dynamic_image_size
        assert isinstance(dynamic_image_size, bool)

        # Tokens per tile after spatial downsampling of the patch grid.
        self.num_image_token = int(
            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
        )
        self.image_size = image_size
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size

        # With no image processor (embedding variant), read the thumbnail
        # setting from the model config instead.
        self.use_thumbnail = (
            image_processor.use_thumbnail
            if image_processor is not None
            else getattr(config, "use_thumbnail", True)
        )

    @property
    def image_token_id(self) -> int:
        """Vocabulary id of the image-context token."""
        return self.tokenizer.get_vocab()[self.IMG_CONTEXT]

    def _get_transform(self) -> T.Compose:
        """Return the base (un-normalized) image transform."""
        return build_transform(input_size=self.image_size)

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """Number of prompt tokens one image of the given size expands to."""
        ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
        )

        tiles, _, _ = calculate_nemotron_vl_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=ratios,
            use_thumbnail=self.use_thumbnail,
        )

        return tiles * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Convert each image to a (num_patches, C, H, W) tensor."""
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )

        return [
            image_to_pixel_values_nemotron_vl(
                img,
                input_size=self.image_size,
                min_num=min_num,
                max_num=max_num,
                use_thumbnail=self.use_thumbnail,
                transform=self._get_transform(),
            )
            for img in images
        ]

    def _replace_image_tokens(
        self,
        text: list[str],
        pixel_values_lst: list[torch.Tensor],
    ) -> list[str]:
        """Replace <image> placeholders with image tokens."""
        for pixel_values in pixel_values_lst:
            patch_count = pixel_values.shape[0]
            repl = self.get_image_repl(
                patch_count * self.num_image_token, patch_count
            )
            # IMG_CONTEXT is itself "<image>", so swap the inserted tokens for
            # a temporary placeholder to keep later replacements from
            # matching text we just produced.
            NVL_IMAGE_CONTEXT = repl.full.replace("<image>", "<NVL_IMG_CONTEXT>")
            text = [item.replace("<image>", NVL_IMAGE_CONTEXT, 1) for item in text]
        # Restore the real context token everywhere at the end.
        return [item.replace("<NVL_IMG_CONTEXT>", self.IMG_CONTEXT) for item in text]

    def _preprocess_image(
        self,
        text: list[str],
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> tuple[list[str], dict[str, torch.Tensor]]:
        """Tile/transform images and expand their prompt placeholders."""
        if len(images) == 0:
            return text, {}

        pixel_values_lst = self._images_to_pixel_values_lst(
            images,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )
        image_inputs = {
            "pixel_values_flat": torch.cat(pixel_values_lst),
            "image_num_patches": torch.tensor(
                [len(item) for item in pixel_values_lst]
            ),
        }

        text = self._replace_image_tokens(text, pixel_values_lst)
        return text, image_inputs

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the <img>...</img>-wrapped replacement for one image."""
        features = self.IMG_CONTEXT * feature_size
        full = f"{self.IMG_START}{features}{self.IMG_END}"

        return PromptUpdateDetails.select_text(full, self.IMG_CONTEXT)

_replace_image_tokens

_replace_image_tokens(
    text: list[str], pixel_values_lst: list[Tensor]
) -> list[str]

Replace `<image>` placeholders with image tokens.

Source code in vllm/transformers_utils/processors/nemotron_vl.py
def _replace_image_tokens(
    self,
    text: list[str],
    pixel_values_lst: list[torch.Tensor],
) -> list[str]:
    """Replace <image> placeholders with image tokens."""
    for pixel_values in pixel_values_lst:
        # One tensor per image; dim 0 is the number of dynamic patches.
        num_patches = pixel_values.shape[0]
        feature_size = num_patches * self.num_image_token
        image_repl = self.get_image_repl(feature_size, num_patches)
        # Use temporary placeholder to avoid replacing tokens we just inserted
        # (IMG_CONTEXT is itself "<image>" for this class).
        NVL_IMAGE_CONTEXT = image_repl.full.replace("<image>", "<NVL_IMG_CONTEXT>")
        text = [t.replace("<image>", NVL_IMAGE_CONTEXT, 1) for t in text]
    return [t.replace("<NVL_IMG_CONTEXT>", self.IMG_CONTEXT) for t in text]

build_siglip_transform

build_siglip_transform(input_size: int)

Build transform for SigLIP vision encoder with normalization.

Extends the base transform from nemotron_vl with SigLIP-specific normalization.

Source code in vllm/transformers_utils/processors/nemotron_vl.py
def build_siglip_transform(input_size: int):
    """Build transform for SigLIP vision encoder with normalization.

    Extends the base transform from nemotron_vl with SigLIP-specific normalization.
    """
    base = build_transform(input_size=input_size)
    normalize = T.Normalize(mean=SIGLIP_MEAN, std=SIGLIP_STD)
    return T.Compose([base, normalize])