import ImageReward as RM
import torch
from typing_extensions import Any
from pathlib import Path
from wibench.utils import (
torch_img2numpy_bgr,
save_tmp_images,
delete_tmp_images
)
from wibench.typing import TorchImg
from wibench.metrics.base import PostEmbedMetric
from wibench.download import requires_download
from ..aesthetic.aesthetic import (
URL,
NAME,
REQUIRED_FILES,
DEFAULT_DOWNLOAD_ROOT
)
@requires_download(URL, NAME, REQUIRED_FILES)
class CLIPScore(PostEmbedMetric):
    """`CLIPScore <https://arxiv.org/abs/2104.08718>`_: A Reference-free Evaluation Metric for Image Captioning.

    The implementation is taken from the github `repository <https://github.com/zai-org/ImageReward>`__. Based on `CLIP code base <https://github.com/openai/CLIP>`_.

    Initialization Parameters
    -------------------------
    device : str
        Device to run the model on ('cuda', 'cpu'). The default is chosen
        once, when the class is defined: 'cuda' if
        ``torch.cuda.is_available()`` is true, otherwise 'cpu'.
    download_root : str
        Directory forwarded to ``RM.load_score`` as ``download_root``. It is
        resolved to an absolute path before being passed on.

    Call Parameters
    ---------------
    prompt : str
        Text prompt for comparison
    img : TorchImg
        Input image tensor in (C, H, W) format
    watermark_data : Any
        Not used, can be anything

    Returns
    -------
    float
        The value produced by ``self.model.score(prompt, paths)``, returned
        unchanged.

    Notes
    -----
    - The watermark_data field is required for the pipeline to work correctly
    - The image is handed to the model through the paths returned by
      ``save_tmp_images``; those paths are always passed to
      ``delete_tmp_images`` afterwards, even when scoring raises.
    """

    def __init__(self,
                 device: str = "cuda" if torch.cuda.is_available() else "cpu",
                 download_root: str = DEFAULT_DOWNLOAD_ROOT):
        # Hand the loader an absolute path so it does not depend on the
        # current working directory at load time.
        download_root = str(Path(download_root).resolve())
        self.model = RM.load_score("CLIP", device=device, download_root=download_root)

    def __call__(self,
                 prompt: str,
                 img: TorchImg,
                 watermark_data: Any) -> float:
        numpy_image = torch_img2numpy_bgr(img)
        # The model is given paths rather than the array itself.
        tmp_paths = save_tmp_images([numpy_image])
        try:
            return self.model.score(prompt, tmp_paths)
        finally:
            # Clean up on both the success and the failure path so a scoring
            # error does not leave the temporary images behind.
            delete_tmp_images(tmp_paths)