Source code for wibench.attacks.image_editing.ImageEditingFluxContext

import torch
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

from diffusers import FluxKontextPipeline

import json
from wibench.typing import TorchImg
from wibench.attacks.base import BaseAttack


class ImageEditingFLuxContext(BaseAttack):
    """Attack that regenerates an image with an instruction-guided editor.

    The attack runs in two stages:

    1. A vision-language model loaded from ``internvl_path`` is shown the
       input image together with a question (either an entry of the prompts
       JSON file or ``custom_prompt``) and produces a textual response.
    2. That response is passed as the prompt of a ``FluxKontextPipeline``
       loaded from ``fluxcontext_path``, which edits the input image.

    The edited image is resized back to the input resolution and returned.

    Args:
        device_vl: Device the vision-language model is moved to.
        device_flux: Device the FLUX pipeline is moved to.
        internvl_path: Model id or local path of the vision-language model
            and its tokenizer.
        fluxcontext_path: Model id or local path of the FLUX Kontext pipeline.
        prompts_path: Path to a JSON file whose top-level keys are indexed
            by ``mode``. The file is always read, even when
            ``is_prompts`` is False.
        guidance_scale: ``guidance_scale`` forwarded to the FLUX pipeline.
        num_inference_steps: ``num_inference_steps`` forwarded to the FLUX
            pipeline.
        is_prompts: If True, the question is ``prompts[mode]``; otherwise
            ``custom_prompt`` is used.
        mode: Key looked up in the prompts file when ``is_prompts`` is True.
        custom_prompt: Question used when ``is_prompts`` is False.
    """

    # Normalisation statistics applied to every tile fed to the
    # vision-language model (see ``build_transform``).
    IMAGENET_MEAN = (0.485, 0.456, 0.406)
    IMAGENET_STD = (0.229, 0.224, 0.225)

    def __init__(
        self,
        device_vl: str = "cuda:0" if torch.cuda.is_available() else "cpu",
        device_flux: str = "cuda:1" if torch.cuda.is_available() else "cpu",
        internvl_path: str = "OpenGVLab/InternVL2_5-8B",
        fluxcontext_path: str = "black-forest-labs/FLUX.1-Kontext-dev",
        prompts_path: str = "./resources/flux_prompts.json",
        guidance_scale: float = 7.5,
        num_inference_steps: int = 28,
        is_prompts: bool = True,
        mode: str = "base",
        custom_prompt: str = None,
    ):
        super().__init__()
        self.is_prompts = is_prompts
        self.mode = mode
        self.custom_prompt = custom_prompt
        self.guidance_scale = guidance_scale
        self.num_inference_steps = num_inference_steps

        # Read the prompts before touching the models: it is cheap, so a bad
        # path fails immediately instead of after loading both checkpoints.
        # The context manager guarantees the file handle is closed.
        with open(prompts_path, encoding="utf-8") as prompts_file:
            self.prompts = json.load(prompts_file)

        self.device_vl = device_vl
        self.device_flux = device_flux

        self.internvl_path = internvl_path
        self.internvl_model = (
            AutoModel.from_pretrained(
                self.internvl_path,
                torch_dtype=torch.bfloat16,
                low_cpu_mem_usage=True,
                trust_remote_code=True,
            )
            .eval()
            .to(self.device_vl)
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.internvl_path, trust_remote_code=True, use_fast=False
        )

        self.flux_path = fluxcontext_path
        self.flux = FluxKontextPipeline.from_pretrained(
            fluxcontext_path, torch_dtype=torch.bfloat16
        )
        self.flux = self.flux.to(self.device_flux)

    def build_transform(self, input_size):
        """Return the per-tile preprocessing pipeline.

        The pipeline converts a PIL image to RGB if needed, resizes it to
        ``input_size`` x ``input_size`` with bicubic interpolation, converts
        it to a tensor and normalises it with ``IMAGENET_MEAN`` /
        ``IMAGENET_STD``.
        """
        return T.Compose(
            [
                T.Lambda(
                    lambda img: (
                        img.convert("RGB") if img.mode != "RGB" else img
                    )
                ),
                T.Resize(
                    (input_size, input_size),
                    interpolation=InterpolationMode.BICUBIC,
                ),
                T.ToTensor(),
                T.Normalize(mean=self.IMAGENET_MEAN, std=self.IMAGENET_STD),
            ]
        )

    def find_closest_aspect_ratio(
        self, aspect_ratio, target_ratios, width, height, image_size
    ):
        """Pick the ``(cols, rows)`` grid whose ratio best matches the image.

        Args:
            aspect_ratio: Width / height of the source image.
            target_ratios: Iterable of candidate ``(cols, rows)`` pairs.
            width: Source image width in pixels.
            height: Source image height in pixels.
            image_size: Side length of one square tile in pixels.

        Returns:
            The candidate with the smallest absolute aspect-ratio difference,
            or ``(1, 1)`` if ``target_ratios`` is empty. On an exact tie the
            later candidate wins only when the image area exceeds half the
            area of that candidate's tile grid.
        """
        best_ratio_diff = float("inf")
        best_ratio = (1, 1)
        area = width * height
        for ratio in target_ratios:
            ratio_diff = abs(aspect_ratio - ratio[0] / ratio[1])
            if ratio_diff < best_ratio_diff:
                best_ratio_diff = ratio_diff
                best_ratio = ratio
            elif ratio_diff == best_ratio_diff:
                # Tie: take this candidate only if the image is large enough
                # relative to the candidate grid.
                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                    best_ratio = ratio
        return best_ratio

    def dynamic_preprocess(
        self, image, min_num=1, max_num=12, image_size=448, use_thumbnail=False
    ):
        """Split a PIL image into a grid of square tiles.

        A ``cols x rows`` grid with ``min_num <= cols * rows <= max_num`` is
        chosen to match the image aspect ratio; the image is resized to
        ``(cols * image_size, rows * image_size)`` and cropped into tiles in
        row-major order.

        Args:
            image: PIL image to tile.
            min_num: Minimum number of tiles in the grid.
            max_num: Maximum number of tiles in the grid.
            image_size: Side length of each tile in pixels.
            use_thumbnail: If True and more than one tile was produced, a
                copy of the whole image resized to one tile is appended.

        Returns:
            List of PIL images (the tiles, optionally followed by the
            thumbnail).
        """
        orig_width, orig_height = image.size
        aspect_ratio = orig_width / orig_height

        # Enumerate every grid shape with an admissible tile count, ordered
        # by tile count.
        target_ratios = set(
            (i, j)
            for n in range(min_num, max_num + 1)
            for i in range(1, n + 1)
            for j in range(1, n + 1)
            if min_num <= i * j <= max_num
        )
        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

        cols, rows = self.find_closest_aspect_ratio(
            aspect_ratio, target_ratios, orig_width, orig_height, image_size
        )

        resized_img = image.resize((image_size * cols, image_size * rows))

        processed_images = []
        for index in range(cols * rows):
            # Row-major walk over the grid.
            row, col = divmod(index, cols)
            box = (
                col * image_size,
                row * image_size,
                (col + 1) * image_size,
                (row + 1) * image_size,
            )
            processed_images.append(resized_img.crop(box))

        if use_thumbnail and len(processed_images) != 1:
            processed_images.append(image.resize((image_size, image_size)))
        return processed_images

    def load_image(self, image, input_size=448, max_num=12):
        """Turn a PIL image into the stacked tile tensor for the VLM.

        The image is tiled with ``dynamic_preprocess`` (thumbnail enabled),
        each tile is passed through ``build_transform`` and the results are
        stacked along a new leading dimension.

        Args:
            image: PIL image to preprocess.
            input_size: Side length of each tile in pixels.
            max_num: Maximum number of grid tiles.

        Returns:
            Tensor produced by ``torch.stack`` over the transformed tiles.
        """
        transform = self.build_transform(input_size=input_size)
        tiles = self.dynamic_preprocess(
            image, image_size=input_size, use_thumbnail=True, max_num=max_num
        )
        return torch.stack([transform(tile) for tile in tiles])

    def __call__(self, image: TorchImg) -> TorchImg:
        """Apply the attack to a single image.

        The question given to the vision-language model is
        ``self.prompts[self.mode]`` when ``is_prompts`` is True and
        ``self.custom_prompt`` otherwise. The model's response becomes the
        editing prompt of the FLUX pipeline.

        Args:
            image: A single image tensor (not a batch) accepted by
                ``torchvision.transforms.ToPILImage``.

        Returns:
            The edited image, resized to the input resolution and converted
            back to a tensor with ``torchvision.transforms.ToTensor``.

        Raises:
            ValueError: If ``is_prompts`` is False and ``custom_prompt`` is
                None.
            KeyError: If ``is_prompts`` is True and ``mode`` is not a key of
                the loaded prompts.
        """
        if self.is_prompts:
            question = self.prompts[self.mode]
        else:
            if self.custom_prompt is None:
                # Fail here with a clear message rather than inside the
                # model's chat implementation.
                raise ValueError(
                    "custom_prompt must be provided when is_prompts=False"
                )
            question = self.custom_prompt

        # NOTE: must be applied to one image only.
        pil_image = T.ToPILImage()(image)
        pil_img_size = pil_image.size

        # Stage 1: ask the vision-language model for an editing instruction.
        pixel_values = (
            self.load_image(pil_image, max_num=12)
            .to(torch.bfloat16)
            .to(self.device_vl)
        )
        generation_config = dict(max_new_tokens=1024, do_sample=False)
        response, _ = self.internvl_model.chat(
            self.tokenizer,
            pixel_values,
            question,
            generation_config,
            history=None,
            return_history=True,
        )

        # Stage 2: edit the image with FLUX Kontext using that instruction.
        attacked_image = self.flux(
            image=pil_image,
            prompt=response,
            height=1024,
            width=1024,
            num_inference_steps=self.num_inference_steps,
            guidance_scale=self.guidance_scale,
        ).images[0]

        # The pipeline renders at 1024x1024; restore the input resolution.
        attacked_image = attacked_image.resize(pil_img_size)
        return T.ToTensor()(attacked_image)