Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
181 changes: 159 additions & 22 deletions src/tools/imagegen.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,17 @@ interface ImageGenInput {
output_path?: string;
size?: string;
model?: string;
/**
* Optional reference image for image-to-image generation (style transfer,
* character consistency, edits). When set, the call is routed to
* /v1/images/image2image instead of /v1/images/generations and only
* edit-capable models may be used — currently openai/gpt-image-1 and
* openai/gpt-image-2 (see EDIT_SUPPORTED_MODELS). Accepts:
* - http(s) URL — fetched client-side and inlined as a data URI
* - data URI (data:image/...;base64,...) — passed through unchanged
* - local file path — read and base64-encoded
* URL and file inputs are capped at ~4 MB (REFERENCE_IMAGE_MAX_BYTES).
*/
image_url?: string;
/**
* Optional Content id to attach this generation to. When provided:
* (1) Budget is checked BEFORE the paid generation — refusing up-front
Expand All @@ -37,6 +48,75 @@ interface ImageGenInput {
contentId?: string;
}

/**
 * Models that accept a reference image via /v1/images/image2image. Currently
 * limited to OpenAI's edit endpoint — Gemini Nano Banana Pro and Grok Imagine
 * Image Pro need gateway-side support before they can be wired in here.
 */
export const EDIT_SUPPORTED_MODELS = new Set([
  'openai/gpt-image-1',
  'openai/gpt-image-2',
]);

/** Hard cap (~4 MB) on reference image payloads — applied to every input form. */
export const REFERENCE_IMAGE_MAX_BYTES = 4_000_000;

/**
 * Normalize a reference image into a base64 data URI for the gateway. The
 * /v1/images/image2image endpoint validates `image` against /^data:image\//,
 * so http(s) URLs and local paths both have to be inlined client-side before
 * posting. Already-formed data URIs pass through after a size check, so an
 * oversized inline payload cannot sidestep the cap enforced on the other forms.
 *
 * @param input - data URI, http(s) URL, or file path (absolute, or relative
 *   to `workingDir`).
 * @param workingDir - base directory used to resolve relative file paths.
 * @returns a `data:image/...;base64,...` string ready to post to the gateway.
 * @throws Error on fetch failure, non-image content-type, missing/unreadable
 *   file, unsupported extension, or payload above REFERENCE_IMAGE_MAX_BYTES.
 */
export async function resolveReferenceImage(input: string, workingDir: string): Promise<string> {
  if (input.startsWith('data:image/')) {
    // Estimate decoded size from the base64 payload (4 chars ≈ 3 bytes) so
    // inline data URIs honor the same cap as fetched / on-disk images.
    const comma = input.indexOf(',');
    const approxBytes = comma >= 0 ? Math.floor(((input.length - comma - 1) * 3) / 4) : 0;
    if (approxBytes > REFERENCE_IMAGE_MAX_BYTES) {
      throw new Error(
        `Reference image too large: ${(approxBytes / 1_000_000).toFixed(1)}MB > ${(REFERENCE_IMAGE_MAX_BYTES / 1_000_000).toFixed(1)}MB cap.`,
      );
    }
    return input;
  }

  if (/^https?:\/\//i.test(input)) {
    // Abort slow upstreams after 30s so a hung fetch can't stall the tool.
    const ctrl = new AbortController();
    const timeout = setTimeout(() => ctrl.abort(), 30_000);
    try {
      const resp = await fetch(input, { signal: ctrl.signal });
      if (!resp.ok) {
        throw new Error(`Reference image fetch failed: ${resp.status} ${resp.statusText}`);
      }
      // Reject non-image responses up front — cheaper than a failed paid call.
      const contentType = (resp.headers.get('content-type') || '').toLowerCase().split(';')[0].trim();
      if (!contentType.startsWith('image/')) {
        throw new Error(`Reference image URL returned non-image content-type: ${contentType || '(none)'}`);
      }
      const buf = Buffer.from(await resp.arrayBuffer());
      if (buf.byteLength > REFERENCE_IMAGE_MAX_BYTES) {
        throw new Error(
          `Reference image too large: ${(buf.byteLength / 1_000_000).toFixed(1)}MB > ${(REFERENCE_IMAGE_MAX_BYTES / 1_000_000).toFixed(1)}MB cap.`,
        );
      }
      return `data:${contentType};base64,${buf.toString('base64')}`;
    } finally {
      clearTimeout(timeout);
    }
  }

  // Treat as local file path; relative paths resolve against workingDir.
  const resolved = path.isAbsolute(input) ? input : path.resolve(workingDir, input);
  let stat: fs.Stats;
  try {
    stat = fs.statSync(resolved);
  } catch {
    // Surface a clear, actionable message instead of a raw ENOENT stack.
    throw new Error(`Reference image not found or unreadable: ${resolved}`);
  }
  if (stat.size > REFERENCE_IMAGE_MAX_BYTES) {
    throw new Error(
      `Reference image too large: ${(stat.size / 1_000_000).toFixed(1)}MB > ${(REFERENCE_IMAGE_MAX_BYTES / 1_000_000).toFixed(1)}MB cap. Resize or crop first.`,
    );
  }
  // MIME type is inferred from the extension — the gateway needs it in the URI.
  const ext = path.extname(resolved).toLowerCase();
  const mimeMap: Record<string, string> = {
    '.png': 'image/png',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.gif': 'image/gif',
    '.webp': 'image/webp',
  };
  const mime = mimeMap[ext];
  if (!mime) {
    throw new Error(`Unsupported reference image extension ${ext || '(none)'}. Use .png/.jpg/.jpeg/.gif/.webp.`);
  }
  const bytes = fs.readFileSync(resolved);
  return `data:${mime};base64,${bytes.toString('base64')}`;
}

export interface ImageGenDeps {
/** Optional Content library for auto-recording generations into a piece. */
library?: ContentLibrary;
Expand All @@ -50,12 +130,24 @@ function buildExecute(deps: ImageGenDeps) {
ctx: ExecutionScope,
): Promise<CapabilityResult> {
const rawInput = input as unknown as ImageGenInput;
const { output_path, size, model, contentId } = rawInput;
const { output_path, size, model, contentId, image_url } = rawInput;

if (!rawInput.prompt) {
return { output: 'Error: prompt is required', isError: true };
}

// Resolve the reference image (if any) before any paid call so we fail
// cheaply on bad paths / oversize attachments. resolveReferenceImage always
// yields a base64 data URI, which is what gets posted to /v1/images/image2image.
let referenceImage: string | undefined;
if (image_url) {
try {
referenceImage = await resolveReferenceImage(image_url, ctx.workingDir);
} catch (err) {
return { output: `Error: ${(err as Error).message}`, isError: true };
}
}

// One-shot refinement opt-out: leading `///` tells Franklin "don't
// refine this prompt, I wrote it the way I want it." Strip the prefix
// and pass skipRefine through to the router.
Expand All @@ -72,12 +164,29 @@ function buildExecute(deps: ImageGenDeps) {
// step and use the old default. Otherwise: classifier picks a fitting
// model + rewrites the prompt, the preview goes to AskUser, user
// chooses or cancels.
let imageModel = model || 'openai/gpt-image-1';
// Reference-image mode forces an edit-capable model. If the caller named
// an unsupported one, fail loudly so we don't silently downgrade their
// request to text-only generation.
if (referenceImage && model && !EDIT_SUPPORTED_MODELS.has(model)) {
return {
output:
`Error: model ${model} does not support reference images. ` +
`Use one of: ${[...EDIT_SUPPORTED_MODELS].join(', ')}.`,
isError: true,
};
}

let imageModel = model || (referenceImage ? 'openai/gpt-image-2' : 'openai/gpt-image-1');
const imageSize = size || '1024x1024';
let chosenPrompt = prompt;

// Skip the proposal flow when a reference image is set: the media router
// doesn't know which models support image-to-image, so its suggestions
// would frequently be unusable (text-only models). Default to gpt-image-2
// for now; a future router upgrade can pick between the edit-capable
// models based on the prompt.
const autoApprove = process.env.FRANKLIN_MEDIA_AUTO_APPROVE_ALL === '1';
if (!model && !autoApprove && ctx.onAskUser) {
if (!model && !autoApprove && ctx.onAskUser && !referenceImage) {
try {
const chain = loadChain();
const client = new ModelClient({ apiUrl: API_URLS[chain], chain });
Expand Down Expand Up @@ -137,20 +246,34 @@ function buildExecute(deps: ImageGenDeps) {

const chain = loadChain();
const apiUrl = API_URLS[chain];
const endpoint = `${apiUrl}/v1/images/generations`;
// Reference-image mode hits the dedicated /v1/images/image2image endpoint;
// otherwise stay on text-to-image generations.
const endpoint = referenceImage
? `${apiUrl}/v1/images/image2image`
: `${apiUrl}/v1/images/generations`;

// Default output path
const outPath = output_path
? (path.isAbsolute(output_path) ? output_path : path.resolve(ctx.workingDir, output_path))
: path.resolve(ctx.workingDir, `generated-${Date.now()}.png`);

const body = JSON.stringify({
model: imageModel,
prompt: chosenPrompt,
n: 1,
size: imageSize,
response_format: 'b64_json',
});
const body = JSON.stringify(
referenceImage
? {
model: imageModel,
prompt: chosenPrompt,
image: referenceImage,
size: imageSize,
n: 1,
}
: {
model: imageModel,
prompt: chosenPrompt,
n: 1,
size: imageSize,
response_format: 'b64_json',
},
);

const headers: Record<string, string> = {
'Content-Type': 'application/json',
Expand All @@ -173,7 +296,7 @@ function buildExecute(deps: ImageGenDeps) {
if (response.status === 402) {
const paymentHeaders = await signPayment(response, chain, endpoint);
if (!paymentHeaders) {
return { output: 'Payment failed. Check wallet balance with: runcode balance', isError: true };
return { output: 'Payment failed. Check wallet balance with: franklin balance', isError: true };
}

response = await fetch(endpoint, {
Expand All @@ -198,11 +321,21 @@ function buildExecute(deps: ImageGenDeps) {
return { output: 'No image data returned from API', isError: true };
}

// Save image
// Save image. The /v1/images/image2image endpoint returns Gemini results
// as a data URI in `url`, so decode those locally instead of going through
// fetch — saves a network round-trip and avoids data:-URI fetch quirks.
if (imageData.b64_json) {
const buffer = Buffer.from(imageData.b64_json, 'base64');
fs.mkdirSync(path.dirname(outPath), { recursive: true });
fs.writeFileSync(outPath, buffer);
} else if (imageData.url && imageData.url.startsWith('data:')) {
const match = imageData.url.match(/^data:[^;]+;base64,(.+)$/);
if (!match) {
return { output: 'Malformed data URI in response', isError: true };
}
const buffer = Buffer.from(match[1], 'base64');
fs.mkdirSync(path.dirname(outPath), { recursive: true });
fs.writeFileSync(outPath, buffer);
} else if (imageData.url) {
// Download from URL (with 30s timeout)
const dlCtrl = new AbortController();
Expand Down Expand Up @@ -290,7 +423,7 @@ async function signPayment(
feePayer as string,
{
resourceUrl: details.resource?.url || endpoint,
resourceDescription: details.resource?.description || 'RunCode image generation',
resourceDescription: details.resource?.description || 'Franklin image generation',
maxTimeoutSeconds: details.maxTimeoutSeconds || 300,
extra: details.extra as Record<string, unknown> | undefined,
}
Expand All @@ -309,7 +442,7 @@ async function signPayment(
details.network || 'eip155:8453',
{
resourceUrl: details.resource?.url || endpoint,
resourceDescription: details.resource?.description || 'RunCode image generation',
resourceDescription: details.resource?.description || 'Franklin image generation',
maxTimeoutSeconds: details.maxTimeoutSeconds || 300,
extra: details.extra as Record<string, unknown> | undefined,
}
Expand Down Expand Up @@ -347,20 +480,24 @@ export function createImageGenCapability(deps: ImageGenDeps = {}): CapabilityHan
spec: {
name: 'ImageGen',
description:
"Generate an image from a text prompt. Costs USDC from the user's wallet " +
"— confirm before generating. Saves to a local file. Default size: " +
"1024x1024. Do NOT call repeatedly to iterate on style — ask the user " +
"first. Pass contentId to attach the result to an existing Content " +
"piece: the content's budget is checked BEFORE paying, and on success " +
"the image is recorded as an asset with its estimated cost. Skipping " +
"contentId generates a one-off image with no budget tracking.",
"Generate an image from a text prompt — optionally with a reference " +
"image for style transfer / character consistency / edits. Costs USDC " +
"from the user's wallet — confirm before generating. Saves to a local " +
"file. Default size: 1024x1024. Do NOT call repeatedly to iterate on " +
"style — ask the user first. Pass contentId to attach the result to " +
"an existing Content piece: the content's budget is checked BEFORE " +
"paying, and on success the image is recorded as an asset with its " +
"estimated cost. Skipping contentId generates a one-off image with no " +
"budget tracking. When image_url is set, only edit-capable models " +
"(openai/gpt-image-1, openai/gpt-image-2) are accepted.",
input_schema: {
type: 'object',
properties: {
prompt: { type: 'string', description: 'Text description of the image to generate' },
output_path: { type: 'string', description: 'Where to save the image. Default: generated-<timestamp>.png in working directory' },
size: { type: 'string', description: 'Image size: 1024x1024, 1792x1024, or 1024x1792. Default: 1024x1024' },
model: { type: 'string', description: 'Image model to use. Default: openai/gpt-image-1' },
image_url: { type: 'string', description: 'Optional reference image (image-to-image / style transfer). Accepts an http(s) URL, a data URI, or a local file path. Only works with edit-capable models.' },
contentId: { type: 'string', description: 'Optional Content id to attach this generation to. Pre-flight budget check + auto-record on success.' },
},
required: ['prompt'],
Expand Down
108 changes: 107 additions & 1 deletion test/local.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ test('write capability allows files under system temp directory', async () => {
test('session storage falls back to temp dir when HOME is not writable', async () => {
const originalHome = process.env.HOME;
const fakeHome = mkdtempSync(join(tmpdir(), 'rc-home-ro-'));
const fallbackDir = join(tmpdir(), 'runcode', 'sessions');
const fallbackDir = join(tmpdir(), 'franklin', 'sessions');

try {
mkdirSync(fakeHome, { recursive: true });
Expand Down Expand Up @@ -3944,3 +3944,109 @@ test('version-check: getAvailableUpdate reflects cache vs installed version', as
else if (fs.existsSync(cacheFile)) fs.unlinkSync(cacheFile);
}
});

test('imagegen: resolveReferenceImage passes data URIs through unchanged', async () => {
  const { resolveReferenceImage } = await import('../dist/tools/imagegen.js');

  // Already in the data:image/... shape the gateway requires — no conversion.
  const passthrough = 'data:image/png;base64,iVBORw0KGgo=';
  const result = await resolveReferenceImage(passthrough, '/tmp');
  assert.equal(result, passthrough);
});

test('imagegen: resolveReferenceImage fetches http(s) URLs and inlines them as data URIs', async () => {
  const { resolveReferenceImage } = await import('../dist/tools/imagegen.js');
  const onePixelPng = Buffer.from(
    'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==',
    'base64',
  );

  // Tiny fixture server: a real PNG, an HTML page, and a 404 for each branch.
  const srv = createServer((req, res) => {
    switch (req.url) {
      case '/img.png':
        res.writeHead(200, { 'Content-Type': 'image/png', 'Content-Length': onePixelPng.length });
        res.end(onePixelPng);
        break;
      case '/text.html':
        res.writeHead(200, { 'Content-Type': 'text/html' });
        res.end('<html></html>');
        break;
      case '/missing.png':
        res.writeHead(404, { 'Content-Type': 'text/plain' });
        res.end('not found');
        break;
      default:
        res.writeHead(500);
        res.end();
    }
  });
  await new Promise(resolve => srv.listen(0, '127.0.0.1', resolve));
  const base = `http://127.0.0.1:${srv.address().port}`;
  try {
    const inlined = await resolveReferenceImage(`${base}/img.png`, '/tmp');
    assert.match(inlined, /^data:image\/png;base64,/, 'url should round-trip into a data URI');
    const roundTripped = Buffer.from(inlined.split(',')[1], 'base64');
    assert.ok(roundTripped.equals(onePixelPng), 'fetched bytes must match original');

    // Non-image content-type → reject before we waste a paid call.
    await assert.rejects(
      () => resolveReferenceImage(`${base}/text.html`, '/tmp'),
      /non-image content-type/,
    );

    // Upstream errors surface clearly.
    await assert.rejects(
      () => resolveReferenceImage(`${base}/missing.png`, '/tmp'),
      /Reference image fetch failed: 404/,
    );
  } finally {
    await new Promise(resolve => srv.close(resolve));
  }
});

test('imagegen: resolveReferenceImage reads and base64-encodes a local image', async () => {
  const { resolveReferenceImage } = await import('../dist/tools/imagegen.js');
  const workDir = mkdtempSync(join(tmpdir(), 'imagegen-ref-'));
  const absolutePath = join(workDir, 'pixel.png');
  // 1x1 transparent PNG.
  const pixel = Buffer.from(
    'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==',
    'base64',
  );
  writeFileSync(absolutePath, pixel);

  try {
    const viaAbsolute = await resolveReferenceImage(absolutePath, '/tmp');
    assert.match(viaAbsolute, /^data:image\/png;base64,/);
    const recovered = Buffer.from(viaAbsolute.split(',')[1], 'base64');
    assert.ok(recovered.equals(pixel), 'round-trip should preserve bytes');

    // Relative paths resolve against workingDir.
    const viaRelative = await resolveReferenceImage('pixel.png', workDir);
    assert.equal(viaRelative, viaAbsolute);
  } finally {
    rmSync(workDir, { recursive: true, force: true });
  }
});

test('imagegen: resolveReferenceImage rejects unsupported extensions and oversized files', async () => {
  const { resolveReferenceImage, REFERENCE_IMAGE_MAX_BYTES } = await import('../dist/tools/imagegen.js');
  const scratch = mkdtempSync(join(tmpdir(), 'imagegen-ref-'));

  try {
    // Unsupported extension.
    const textFile = join(scratch, 'note.txt');
    writeFileSync(textFile, 'hello');
    await assert.rejects(
      () => resolveReferenceImage(textFile, '/tmp'),
      /Unsupported reference image extension/,
    );

    // Oversized PNG.
    const hugeFile = join(scratch, 'huge.png');
    writeFileSync(hugeFile, Buffer.alloc(REFERENCE_IMAGE_MAX_BYTES + 1, 0));
    await assert.rejects(
      () => resolveReferenceImage(hugeFile, '/tmp'),
      /Reference image too large/,
    );
  } finally {
    rmSync(scratch, { recursive: true, force: true });
  }
});

test('imagegen: EDIT_SUPPORTED_MODELS lists OpenAI image-edit models', async () => {
  const { EDIT_SUPPORTED_MODELS } = await import('../dist/tools/imagegen.js');
  for (const supported of ['openai/gpt-image-1', 'openai/gpt-image-2']) {
    assert.ok(EDIT_SUPPORTED_MODELS.has(supported));
  }
  // Other providers can be added once the gateway wires them up.
  for (const pending of ['google/nano-banana-pro', 'xai/grok-imagine-image-pro']) {
    assert.ok(!EDIT_SUPPORTED_MODELS.has(pending));
  }
});
Loading