Source code for wibench.attacks.adversarial.adversarial

import numpy as np
import torch

from ..base import BaseAttack

from .feature_extractors import ClipEmbedding, ResNet18Embedding, VAEEmbedding


[docs]class AdversarialEmbedding(BaseAttack):
    """Adversarial embedding attack from `WAVES <https://github.com/umd-huang-lab/WAVES>`_ benchmark."""

    def __init__(self,
                 encoder: str = "resnet18",
                 device: torch.device | str = "cuda" if torch.cuda.is_available() else "cpu",
                 loss_type: str = "l2",  # metric between attacked and non-attacked embeddings
                 strength: int = 2,  # 2,4,6,8
                 eps_factor: float = 1 / 255,
                 alpha_factor: float = 0.05,
                 n_steps: int = 200,
                 random_start: bool = True,
                 ) -> None:
        super().__init__()

        # load embedding model
        if encoder == "resnet18":
            # we use last layer's state as the embedding
            embedding_model = ResNet18Embedding("last")
        elif encoder == "clip":
            embedding_model = ClipEmbedding()
        elif encoder == "klvae8":
            # same vae as used in generator
            embedding_model = VAEEmbedding("WIBE-HuggingFace/sd-vae-ft-mse")
        elif encoder == "sdxlvae":
            embedding_model = VAEEmbedding("WIBE-HuggingFace/sdxl-vae")
        else:
            raise ValueError(f"Unsupported encoder: {encoder}")
        embedding_model = embedding_model.to(device)
        embedding_model.eval()

        # # class that performs PGD
        # self.module = WarmupPGDEmbedding(model=embedding_model,
        #                                  device=device,
        #                                  eps=eps_factor * strength,
        #                                  alpha=alpha_factor * eps_factor * strength,
        #                                  steps=n_steps,
        #                                  loss_type=loss_type,
        #                                  random_start=True,
        #                                  )
        self.model = embedding_model
        self.device = device
        self.eps = eps_factor * strength
        self.alpha = alpha_factor * eps_factor * strength
        self.steps = n_steps
        self.loss_type = loss_type
        self.random_start = random_start

        # Initialize the loss function
        if self.loss_type == "l1":
            self.loss_fn = torch.nn.L1Loss()
        elif self.loss_type == "l2":
            self.loss_fn = torch.nn.MSELoss()
        else:
            raise ValueError("Unsupported loss type")

    def pgd(self, images: torch.Tensor, init_delta: torch.Tensor = None) -> torch.Tensor:
        self.model.eval()
        images = images.clone().detach().to(self.device)

        # Get the original embeddings
        original_embeddings = self.model(images).detach()

        # initialize adv images
        if self.random_start:
            adv_images = images.clone().detach()
            # Starting at a uniformly random point
            adv_images = adv_images + torch.empty_like(adv_images).uniform_(-self.eps, self.eps)
            adv_images = torch.clamp(adv_images, min=0, max=1).detach()

        elif init_delta is not None:
            clamped_delta = torch.clamp(init_delta, min=-self.eps, max=self.eps)
            adv_images = images.clone().detach() + clamped_delta
            adv_images = torch.clamp(adv_images, min=0, max=1).detach()
        else:
            assert False

        # PGD
        for _ in range(self.steps):
            self.model.zero_grad()
            adv_images.requires_grad = True
            adv_embeddings = self.model(adv_images)

            # Calculate loss
            cost = self.loss_fn(adv_embeddings, original_embeddings)

            # Update adversarial images
            grad = torch.autograd.grad(cost, adv_images, retain_graph=False, create_graph=False)[0]
            adv_images = adv_images.detach() + self.alpha * grad.sign()
            delta = torch.clamp(adv_images - images, min=-self.eps, max=self.eps)
            adv_images = torch.clamp(images + delta, min=0, max=1).detach()

        return adv_images

    def __call__(self, img: torch.Tensor) -> torch.Tensor:
        img = img.unsqueeze(0)
        return self.pgd(img).squeeze(0).cpu()


[docs]class AdversarialEmbeddingPSNR(BaseAttack):
    r"""Modification of adversarial embedding attack that uses PSNR instead of :math:`\ell_\infty` norm to measure closeness between images."""

    def __init__(self,
                 encoder: str = "resnet18",
                 device: torch.device | str = "cuda" if torch.cuda.is_available() else "cpu",
                 psnr: float = 40,
                 loss_type: str = "l2",  # metric between attacked and non-attacked embeddings
                 alpha: float = 10.,
                 n_steps: int = 100,
                 ) -> None:
        super().__init__()

        # load embedding model
        if encoder == "resnet18":
            # we use last layer's state as the embedding
            embedding_model = ResNet18Embedding("last")
        elif encoder == "clip":
            embedding_model = ClipEmbedding()
        elif encoder == "klvae8":
            # same vae as used in generator
            embedding_model = VAEEmbedding("WIBE-HuggingFace/sd-vae-ft-mse")
        elif encoder == "sdxlvae":
            embedding_model = VAEEmbedding("WIBE-HuggingFace/sdxl-vae")
        else:
            raise ValueError(f"Unsupported encoder: {encoder}")
        embedding_model = embedding_model.to(device)
        embedding_model.eval()


        self.eps = self.psnr_to_eps(psnr)


        self.model = embedding_model
        self.alpha = alpha
        self.steps = n_steps
        self.loss_type = loss_type
        self.random_start = True
        self.device = device

        # Initialize the loss function
        if self.loss_type == "l1":
            self.loss_fn = torch.nn.L1Loss()
        elif self.loss_type == "l2":
            self.loss_fn = torch.nn.MSELoss()
        else:
            raise ValueError("Unsupported loss type")

    def psnr_to_eps(self, psnr: float, height: int = 512, width: int = 512, value_range: float = 1.) -> float:
        return 10 ** (-psnr / 20) * (value_range * np.sqrt(height * width))

    def pgd(self, images: torch.Tensor, init_delta: torch.Tensor = None) -> torch.Tensor:
        self.model.eval()
        images = images.clone().detach().to(self.device)

        # Get the original embeddings
        original_embeddings = self.model(images).detach()

        # initialize adv images
        if self.random_start:
            adv_images = images.clone().detach()
            # Starting at a uniformly random point
            adv_images = adv_images + torch.empty_like(adv_images).uniform_(-self.eps, self.eps)
            adv_images = torch.clamp(adv_images, min=0, max=1).detach()

        elif init_delta is not None:
            clamped_delta = torch.clamp(init_delta, min=-self.eps, max=self.eps)
            adv_images = images.clone().detach() + clamped_delta
            adv_images = torch.clamp(adv_images, min=0, max=1).detach()
        else:
            raise AssertionError

        # PGD
        for _ in range(self.steps):
            self.model.zero_grad()
            adv_images.requires_grad = True
            adv_embeddings = self.model(adv_images)

            # Calculate loss
            cost = self.loss_fn(adv_embeddings, original_embeddings)

            # Update adversarial images
            grad = torch.autograd.grad(cost, adv_images, retain_graph=False, create_graph=False)[0]
            adv_images = adv_images.detach() + self.alpha * grad / torch.linalg.vector_norm(grad, 2, dim=(-1, -2), keepdim=True)
            delta = adv_images - images
            delta = delta * self.eps / torch.linalg.vector_norm(delta, 2, dim=(-1, -2), keepdim=True).clamp(min=self.eps)
            adv_images = torch.clamp(images + delta, min=0, max=1).detach()

        return adv_images

    def __call__(self, img: torch.Tensor) -> torch.Tensor:
        img = img.unsqueeze(0)
        return self.pgd(img).squeeze(0).cpu()