Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 27 additions & 7 deletions src/vulca/providers/gemini.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import asyncio
import base64
import io
import math
import os

Expand Down Expand Up @@ -66,6 +67,20 @@ def _detect_mime_type(data: bytes) -> str:
return "image/png"


def _build_visible_mask_reference(mask_bytes: bytes) -> bytes:
    """Turn an alpha-channel edit mask into a black/white PNG reference.

    The encoded mask is decoded with Pillow, converted to RGBA, and its
    alpha band is thresholded: pixels whose alpha is below 128 (the
    transparent edit region) come out white, all other pixels come out
    black.

    Args:
        mask_bytes: Encoded image bytes of the edit mask.

    Returns:
        PNG-encoded bytes of an RGB image with the same dimensions as the
        mask, containing only pure white and pure black pixels.

    NOTE(review): a mask that carries no alpha channel presumably converts
    to fully opaque RGBA and would therefore render entirely black (no
    edit region) -- confirm callers always supply an RGBA mask.
    """
    from PIL import Image

    # Alpha 0..127 -> 255 (white, editable); alpha 128..255 -> 0 (black, keep).
    alpha_to_visible = [255] * 128 + [0] * 128

    with Image.open(io.BytesIO(mask_bytes)) as source:
        alpha_band = source.convert("RGBA").getchannel("A")

    edit_region = alpha_band.point(alpha_to_visible)

    # Start from an all-black canvas and paint white only where the
    # thresholded band is set.
    canvas = Image.new("RGB", edit_region.size, (0, 0, 0))
    canvas.paste((255, 255, 255), mask=edit_region)

    with io.BytesIO() as encoded:
        canvas.save(encoded, format="PNG")
        return encoded.getvalue()


class GeminiImageProvider:
"""Image generation via Google Gemini API.

Expand Down Expand Up @@ -279,6 +294,7 @@ async def inpaint_with_mask(
image_bytes = image_fh.read()
with open(mask_path, "rb") as mask_fh:
mask_bytes = mask_fh.read()
visible_mask_bytes = _build_visible_mask_reference(mask_bytes)

if size and "x" in size:
try:
Expand All @@ -295,11 +311,15 @@ async def inpaint_with_mask(
aspect_ratio = _dims_to_aspect_ratio(width, height)
full_prompt = (
f"{prompt}\n\n"
"Use the first image as the source crop. Use the second image as an "
"RGBA edit mask: transparent mask pixels mark the edit region, and "
"opaque mask pixels mark source context that should stay visually "
"preserved. Repaint only the transparent mask pixels. Do not create "
"a new scene outside the masked replacement area."
"Use the first image as the source crop. Use the second image as a "
"visible binary edit mask rendered from the original RGBA mask: "
"white mask pixels (original transparent mask pixels) mark the edit "
"region, and black mask pixels (original opaque mask pixels) mark "
"source context that should stay visually preserved. Repaint only "
"the white edit region. Do not create a new scene outside the masked "
"replacement area. The mask image is not part of the output: do not "
"draw the mask, do not copy its white or black shapes, and do not "
"leave mask-colored background behind."
)
if tradition and tradition != "default":
full_prompt += (
Expand All @@ -314,8 +334,8 @@ async def inpaint_with_mask(
mime_type=_detect_mime_type(image_bytes),
),
types.Part.from_bytes(
data=mask_bytes,
mime_type=_detect_mime_type(mask_bytes),
data=visible_mask_bytes,
mime_type="image/png",
),
full_prompt,
]
Expand Down
9 changes: 9 additions & 0 deletions tests/test_gemini_image_size.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,8 +202,17 @@ def __init__(self, api_key):
assert recorded["model"] == "gemini-3.1-flash-image-preview"
assert recorded["contents"][0].mime_type == "image/png"
assert recorded["contents"][1].mime_type == "image/png"
visible_mask = Image.open(io.BytesIO(recorded["contents"][1].data)).convert(
"RGB"
)
assert visible_mask.getpixel((8, 6)) == (255, 255, 255)
assert visible_mask.getpixel((0, 0)) == (0, 0, 0)
assert "transparent mask pixels" in recorded["contents"][2]
assert "opaque mask pixels" in recorded["contents"][2]
assert "white mask pixels" in recorded["contents"][2]
assert "black mask pixels" in recorded["contents"][2]
assert "do not draw the mask" in recorded["contents"][2].lower()
assert "not part of the output" in recorded["contents"][2].lower()
assert "paint one compact yellow flower head" in recorded["contents"][2]
assert result.mime == "image/png"
assert base64.b64decode(result.image_b64).startswith(b"\x89PNG")
Expand Down