Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
2d7c46a
initial cloud modal mvp commit
int-chaos Mar 1, 2026
dd9f2e6
stream stdout to local
int-chaos Mar 1, 2026
982380a
doc
int-chaos Mar 1, 2026
3a6ade1
imports
int-chaos Mar 5, 2026
4c23857
remove unused fields
int-chaos Mar 5, 2026
c92a178
stream modal stdout
int-chaos Mar 5, 2026
7bee0a8
add context paths to cloud
int-chaos Mar 7, 2026
99a67b6
modal requirement
int-chaos Mar 7, 2026
ae0c228
user guide
int-chaos Mar 7, 2026
7890a60
fix
int-chaos Mar 7, 2026
bcf77eb
basic functionalites unit test
int-chaos Mar 7, 2026
4e3d6a1
Merge branch 'main' into cloud-modal-mvp
int-chaos Mar 8, 2026
af504b1
remove timeout dead code
int-chaos Mar 8, 2026
68b213e
use modal secret to upload env vars
int-chaos Mar 8, 2026
f9fed8c
Run the shared prompt/config preprocessing before the cloud return
int-chaos Mar 8, 2026
8e80938
Use absolute path to resolve modal_app.py from the installed package.
int-chaos Mar 8, 2026
c33a813
Fix test isolation to work without optional Modal dependency.
int-chaos Mar 8, 2026
02c4402
remove envs from tests
int-chaos Mar 8, 2026
c5a9ce5
stderr logging
int-chaos Mar 8, 2026
8b34144
bandage fix for config ui validation issue
int-chaos Mar 8, 2026
44d82ca
Merge branch 'cloud-modal-mvp' of github.com:massgen/MassGen into clo…
int-chaos Mar 8, 2026
4abdd54
Rewrite agent-level backend.context_paths before launching the cloud …
int-chaos Mar 8, 2026
d159321
remove single agent check
int-chaos Mar 8, 2026
1a209ef
preserve the exception chain with raise ... from err
int-chaos Mar 8, 2026
075c4d7
docstrings for helper funcs
int-chaos Mar 8, 2026
b9f312b
cloud output file always final_answer.txt
int-chaos Mar 8, 2026
3797a9b
use cloud job id to prevent marker spoofing
int-chaos Mar 8, 2026
6dbcaad
find LOG_DIR without scraping stderr
int-chaos Mar 8, 2026
ff5a6d6
events.jsonl only has json
int-chaos Mar 8, 2026
95549f0
fix result marker
int-chaos Mar 9, 2026
61fe947
precommit
int-chaos Mar 9, 2026
d84a1b6
resolve duplicate constants
int-chaos Mar 9, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions docs/source/user_guide/cloud.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
Cloud
=====

Overview
--------

MassGen Cloud allows users to run a MassGen job in the cloud.

Currently, MassGen supports:

- running jobs in the Modal cloud.
- single-agent jobs.

Quick Start
-----------
To start using MassGen Cloud, you need to have a Modal account and install the Modal CLI.

.. code-block:: bash

   pip install modal
   modal setup
   modal secret create massgen-env --from-dotenv .env

To run a MassGen job in the cloud, use the ``--cloud`` flag:

.. code-block:: bash

   massgen --cloud --config config.yaml "Your question"

MassGen will upload the config file, context paths, prompt, and any other necessary files to the cloud and run the job there. You can monitor the progress in the local terminal and view the results when the job is complete.

Results and logs will be saved to the local directory ``.massgen/cloud_jobs/job_{job_id}/artifacts/``.
116 changes: 113 additions & 3 deletions massgen/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,84 @@ def record_event(event):
)


def _run_cloud_job(args: argparse.Namespace, config: dict[str, Any], config_path_label: str | None) -> None:
    """Launch a MassGen run in Modal cloud and materialize results locally.

    The caller's ``config`` is never mutated: a deep copy is adjusted for the
    remote run (context paths rewritten, a ``silent`` UI section dropped),
    serialized to YAML and handed to ``ModalCloudJobLauncher``. The final
    answer is then written to ``--output-file`` when given, otherwise to
    ``final_answer.txt`` inside the job's artifacts directory.

    Args:
        args: Parsed CLI namespace. Reads ``question``, ``cloud_timeout``,
            ``output_file`` and ``automation``.
        config: Loaded MassGen configuration.
        config_path_label: Label of the config source, echoed as
            ``CLOUD_CONFIG_SOURCE`` in automation mode; falsy to omit.

    Raises:
        ConfigurationError: If no question was supplied.
    """
    if not args.question:
        raise ConfigurationError("--cloud requires a question argument")

    import uuid

    import yaml

    from .cloud.cloud_job import CloudJobRequest
    from .cloud.modal_launcher import ModalCloudJobLauncher
    from .cloud.utils import process_context_paths

    config_copy = copy.deepcopy(config)

    cloud_job_id = uuid.uuid4().hex[:8]

    def _rewrite_context_paths(section: Any) -> None:
        """Swap ``section["context_paths"]`` for its rewritten form, in place."""
        # A key that is present but empty in YAML loads as None, not {}.
        if not isinstance(section, dict):
            return
        local_paths = section.get("context_paths")
        if not local_paths:
            return
        rewritten_paths = process_context_paths(local_paths, cloud_job_id=cloud_job_id)
        # Keep the original entries when nothing was rewritten.
        if rewritten_paths:
            section["context_paths"] = rewritten_paths

    # Package context path files and rewrite config paths for remote,
    # both at orchestrator level and per agent backend.
    _rewrite_context_paths(config_copy.get("orchestrator"))
    for agent_cfg in config_copy.get("agents") or []:
        if isinstance(agent_cfg, dict):
            _rewrite_context_paths(agent_cfg.get("backend"))

    # Cloud validation compatibility for PyPI version
    # The PyPI version of massgen on Modal fails if display_type is "silent".
    # Since modal_app.py passes `--automation`, it will be re-set to "silent"
    # internally *after* validation in the cloud. We remove it here just for validation.
    ui_cfg = config_copy.get("ui")
    if isinstance(ui_cfg, dict) and ui_cfg.get("display_type") == "silent":
        config_copy.pop("ui", None)

    launcher = ModalCloudJobLauncher()
    request = CloudJobRequest(
        prompt=args.question,
        config_yaml=yaml.safe_dump(config_copy, sort_keys=False),
        timeout_seconds=args.cloud_timeout,
        cloud_job_id=cloud_job_id,
    )
    result = launcher.launch(request)

    final_answer = result.final_answer
    if args.output_file:
        output_path = Path(args.output_file)
    else:
        output_path = result.artifacts_dir / "final_answer.txt"
    # Ensure the destination directory exists for either target before writing.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(final_answer, encoding="utf-8")

    if not args.automation:
        print(final_answer)
        return

    # Always print location in automation mode for machine parsing.
    _automation_print(f"OUTPUT_FILE: {output_path.resolve()}")
    _automation_print(f"CLOUD_ARTIFACTS_DIR: {result.artifacts_dir.resolve()}")
    if result.local_log_dir:
        _automation_print(f"LOG_DIR: {result.local_log_dir.resolve()}")
    if result.local_events_path:
        _automation_print(f"EVENTS_FILE: {result.local_events_path.resolve()}")
    if config_path_label:
        _automation_print(f"CLOUD_CONFIG_SOURCE: {config_path_label}")


def _build_coordination_ui(ui_config: dict[str, Any]) -> CoordinationUI:
"""Create a CoordinationUI with display_kwargs passthrough (incl. theme)."""
display_kwargs = dict(ui_config.get("display_kwargs", {}) or {})
Expand Down Expand Up @@ -9093,9 +9171,6 @@ def _save_prompt_metadata_failure_fallback(
# Validate that all context paths exist before proceeding
validate_context_paths(config)

# Relocate all filesystem paths to .massgen/ directory
relocate_filesystem_paths(config)

# Generate unique instance ID for parallel execution safety
# This prevents Docker container naming conflicts when running multiple instances
import uuid
Expand Down Expand Up @@ -9432,6 +9507,21 @@ def _save_prompt_metadata_failure_fallback(
f"[Spec Mode] Prepended spec creation instructions " f"(target_chunks={plan_target_chunks}, broadcast={broadcast_mode})",
)

# Cloud execution path (Modal MVP)
if getattr(args, "cloud", False):
if not args.automation:
logger.info("Cloud mode requires automation output; enabling --automation")
args.automation = True
_run_cloud_job(
args=args,
config=config,
config_path_label=str(resolved_path) if resolved_path else None,
)
return
Comment on lines +9510 to +9520
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

This early return skips shared session finalization.

Cloud runs register memory_session_id above, but returning here bypasses the later finally block that calls SessionRegistry.complete_session(...). Successful cloud runs can therefore remain stuck in an active state.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@massgen/cli.py` around lines 9510 - 9520, The early return after calling
_run_cloud_job bypasses the later finally block that calls
SessionRegistry.complete_session and leaves memory_session_id active; remove the
immediate return (or otherwise ensure control flows through to the existing
finally) so that after _run_cloud_job returns the shared session finalization
logic (SessionRegistry.complete_session(...)) is executed for cloud runs; locate
the cloud-path block using _run_cloud_job and memory_session_id to make this
change.


# Relocate all filesystem paths to .massgen/ directory
relocate_filesystem_paths(config)

# For interactive mode without initial question, defer agent creation until first prompt
# This allows @path references in the first prompt to be included in Docker mounts
is_interactive_without_question = not args.question and not getattr(
Expand Down Expand Up @@ -9854,6 +9944,12 @@ def cleanup_agent(
_save_prompt_metadata_failure_fallback("timeout_error", failure_error=e)
sys.exit(EXIT_TIMEOUT)
except Exception as e:
# Keep cloud-specific timeout mapping distinct from generic execution failures.
from .cloud.modal_launcher import CloudJobError

if isinstance(e, CloudJobError) and "timeout" in str(e).lower():
print(f"❌ Timeout error: {e}", flush=True)
sys.exit(EXIT_TIMEOUT)
Comment on lines +9947 to +9952
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

# First, check if the file exists and its size
wc -l massgen/cli.py

Repository: massgen/MassGen

Length of output: 79


🏁 Script executed:

# Look at the specific lines mentioned in the review
sed -n '9930,9950p' massgen/cli.py

Repository: massgen/MassGen

Length of output: 931


🏁 Script executed:

# Get broader context - look at the exception handler that contains these lines
sed -n '9900,9960p' massgen/cli.py

Repository: massgen/MassGen

Length of output: 2800


🏁 Script executed:

# Find the function containing this exception handler
sed -n '9700,9945p' massgen/cli.py | head -100

Repository: massgen/MassGen

Length of output: 4695


🏁 Script executed:

# Search for other imports of CloudJobError or modal_launcher in the file
rg -n "from .cloud.modal_launcher|CloudJobError" massgen/cli.py

Repository: massgen/MassGen

Length of output: 261


🏁 Script executed:

# Check if there are any conditional imports or try/except blocks around modal_launcher imports
rg -n "modal_launcher|ModalCloud" massgen/cli.py -B 2 -A 2

Repository: massgen/MassGen

Length of output: 922


🏁 Script executed:

# Find the function containing the exception handler with line 9939-9944
sed -n '9800,9945p' massgen/cli.py | grep -n "^async def\|^def " | tail -1

Repository: massgen/MassGen

Length of output: 41


🏁 Script executed:

# Get more context to find the function name
sed -n '9850,9870p' massgen/cli.py

Repository: massgen/MassGen

Length of output: 1065


🏁 Script executed:

# Check the structure of cloud.modal_launcher module
head -50 massgen/cloud/modal_launcher.py

Repository: massgen/MassGen

Length of output: 1566


🏁 Script executed:

# Search for CloudJobError definition and its dependencies
rg -n "class CloudJobError|^CloudJobError" massgen/cloud/modal_launcher.py

Repository: massgen/MassGen

Length of output: 41


🏁 Script executed:

# Check if CloudJobError depends on Modal or has conditional imports
sed -n '1,100p' massgen/cloud/modal_launcher.py | rg -n "import|CloudJobError"

Repository: massgen/MassGen

Length of output: 699


🏁 Script executed:

# Find function name containing lines 9939-9944
sed -n '9800,9945p' massgen/cli.py | grep -E "^async def|^def " | tail -5

Repository: massgen/MassGen

Length of output: 41


🏁 Script executed:

# Get better context by looking around line 9900
sed -n '9895,9950p' massgen/cli.py

Repository: massgen/MassGen

Length of output: 2586


🏁 Script executed:

# Check what cloud_job.py exports/defines
head -100 massgen/cloud/cloud_job.py

Repository: massgen/MassGen

Length of output: 2324


🏁 Script executed:

# Verify if CloudJobError is actually re-exported from modal_launcher or just imported
rg -n "CloudJobError" massgen/cloud/modal_launcher.py

Repository: massgen/MassGen

Length of output: 526


🏁 Script executed:

# Find the function name containing the exception handler
grep -n "^async def\|^def " massgen/cli.py | grep -B5 "9939"

Repository: massgen/MassGen

Length of output: 41


🏁 Script executed:

# Look for _run_cloud_job function
rg -n "def _run_cloud_job" massgen/cli.py

Repository: massgen/MassGen

Length of output: 174


🏁 Script executed:

# Check what happens around the _run_cloud_job call
sed -n '470,530p' massgen/cli.py

Repository: massgen/MassGen

Length of output: 2590


🏁 Script executed:

# Verify the full exception handler context to ensure the review comment is accurate
sed -n '9935,9950p' massgen/cli.py

Repository: massgen/MassGen

Length of output: 739


🏁 Script executed:

# Find where _run_cloud_job is called and verify it's in the try block
rg -n "_run_cloud_job" massgen/cli.py | head -20

Repository: massgen/MassGen

Length of output: 207


🏁 Script executed:

# Check the full try/except context by looking at larger range
sed -n '9700,9950p' massgen/cli.py | tail -100

Repository: massgen/MassGen

Length of output: 4510


🏁 Script executed:

# Verify if CloudJobError is imported anywhere else at module level
rg -n "^from.*CloudJobError|^import.*CloudJobError" massgen/cli.py

Repository: massgen/MassGen

Length of output: 41


🏁 Script executed:

# Check if cloud_job.py has any complex imports that could fail
head -50 massgen/cloud/cloud_job.py

Repository: massgen/MassGen

Length of output: 1172


🏁 Script executed:

# Check if modal_launcher.py has problematic imports at module level
head -20 massgen/cloud/modal_launcher.py

Repository: massgen/MassGen

Length of output: 645


🏁 Script executed:

# Verify what exceptions could be raised from _run_cloud_job
sed -n '471,530p' massgen/cli.py | grep -E "raise|CloudJobError"

Repository: massgen/MassGen

Length of output: 224


🏁 Script executed:

# Check what the try/except block contains at line 9171
sed -n '9100,9180p' massgen/cli.py

Repository: massgen/MassGen

Length of output: 3587


🏁 Script executed:

# Check what imports could fail in modal_launcher and its dependencies
cat massgen/cloud/__init__.py 2>/dev/null || echo "File not found"

Repository: massgen/MassGen

Length of output: 470


🏁 Script executed:

# Verify the full import chain
rg -n "import.*modal|from.*modal" massgen/cloud/modal_launcher.py

Repository: massgen/MassGen

Length of output: 41


🏁 Script executed:

# Check if there are any optional/conditional imports in modal_launcher
sed -n '1,50p' massgen/cloud/modal_launcher.py

Repository: massgen/MassGen

Length of output: 1566


🏁 Script executed:

# Verify the exact exception handler context to understand what exceptions could occur
sed -n '9160,9180p' massgen/cli.py

Repository: massgen/MassGen

Length of output: 860


🏁 Script executed:

# Check if CloudJobError is re-exported from modal_launcher or if it needs to be imported from cloud_job
python3 <<'EOF'
import sys
sys.path.insert(0, '/dev/null')
# Try to understand the module structure
with open('massgen/cloud/modal_launcher.py') as f:
    content = f.read()
    if 'CloudJobError' in content:
        # Find where it's used
        lines = content.split('\n')
        for i, line in enumerate(lines[:50], 1):
            if 'CloudJobError' in line:
                print(f"Line {i}: {line}")
EOF

Repository: massgen/MassGen

Length of output: 155


🏁 Script executed:

# Check if utils.py has any problematic imports
head -30 massgen/cloud/utils.py

Repository: massgen/MassGen

Length of output: 981


🏁 Script executed:

# Look for any existing try/except blocks around cloud imports to understand best practices
rg -n "try:" massgen/cloud/ -A 10 | head -50

Repository: massgen/MassGen

Length of output: 813


🏁 Script executed:

# Verify if there are any other similar patterns where imports happen inside exception handlers
rg -n "except.*as.*:\s*from" massgen/ -A 2

Repository: massgen/MassGen

Length of output: 50374


Guard the import inside the exception handler to avoid masking the original error.

If _run_cloud_job() fails due to a missing Modal dependency or an import error from massgen.cloud.modal_launcher, the handler will raise a secondary exception when attempting to import CloudJobError, masking the original failure. Wrap the import in a try/except to ensure the fallback error handling path is reached.

🔧 Suggested fix
-        from .cloud.modal_launcher import CloudJobError
-
-        if isinstance(e, CloudJobError) and "timeout" in str(e).lower():
+        try:
+            from .cloud.modal_launcher import CloudJobError
+        except Exception:  # Don't mask the original failure.
+            CloudJobError = None
+
+        if CloudJobError and isinstance(e, CloudJobError) and "timeout" in str(e).lower():
             print(f"❌ Timeout error: {e}", flush=True)
             sys.exit(EXIT_TIMEOUT)
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
# Keep cloud-specific timeout mapping distinct from generic execution failures.
from .cloud.modal_launcher import CloudJobError
if isinstance(e, CloudJobError) and "timeout" in str(e).lower():
print(f"❌ Timeout error: {e}", flush=True)
sys.exit(EXIT_TIMEOUT)
# Keep cloud-specific timeout mapping distinct from generic execution failures.
try:
from .cloud.modal_launcher import CloudJobError
except Exception: # Don't mask the original failure.
CloudJobError = None
if CloudJobError and isinstance(e, CloudJobError) and "timeout" in str(e).lower():
print(f"❌ Timeout error: {e}", flush=True)
sys.exit(EXIT_TIMEOUT)
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@massgen/cli.py` around lines 9939 - 9944, The exception handler currently
does a top-level import of CloudJobError from massgen.cloud.modal_launcher which
can raise ImportError and mask the original failure from _run_cloud_job(); wrap
the import in a try/except (catch ImportError/Exception) inside the except block
so the handler falls back to the generic error path if the import fails, and
only perform isinstance(e, CloudJobError) and the timeout string check when the
import succeeded (otherwise use the existing generic error handling/exit codes).

print(f"❌ Error: {e}", flush=True)
_save_prompt_metadata_failure_fallback("execution_error", failure_error=e)
sys.exit(EXIT_EXECUTION_ERROR)
Expand Down Expand Up @@ -10328,6 +10424,17 @@ def cli_main():
help="Enable automation mode: silent output (~10 lines), status.json tracking, meaningful exit codes. "
"REQUIRED for LLM agents and background execution. Automatically isolates workspaces for parallel runs.",
)
parser.add_argument(
"--cloud",
action="store_true",
help="Run the job in Modal cloud (MVP: single-agent automation).",
)
parser.add_argument(
"--cloud-timeout",
type=int,
default=3600,
help="Cloud job timeout in seconds (default: 3600).",
)
parser.add_argument(
"--stream-events",
action="store_true",
Expand Down Expand Up @@ -10585,6 +10692,9 @@ def cli_main():
if args.plan_chunks is not None and args.plan_chunks <= 0:
print("❌ --plan-chunks must be a positive integer")
sys.exit(2)
if args.cloud_timeout <= 0:
print("❌ --cloud-timeout must be a positive integer")
sys.exit(2)

# Validate mode flag combinations
mode_errors = validate_mode_flag_combinations(args)
Expand Down
18 changes: 18 additions & 0 deletions massgen/cloud/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/usr/bin/env python3
"""Cloud execution utilities for MassGen."""

from massgen.cloud.cloud_job import (
CloudJobError,
CloudJobLauncher,
CloudJobRequest,
CloudJobResult,
)
from massgen.cloud.modal_launcher import ModalCloudJobLauncher

# Public API of the package: every name listed here is imported above from
# massgen.cloud.cloud_job or massgen.cloud.modal_launcher.
__all__ = [
    "CloudJobError",
    "CloudJobLauncher",
    "CloudJobRequest",
    "CloudJobResult",
    "ModalCloudJobLauncher",
]
43 changes: 43 additions & 0 deletions massgen/cloud/cloud_job.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/env python3

from dataclasses import dataclass
from pathlib import Path


class CloudJobError(RuntimeError):
    """Raised when a cloud job fails or returns invalid output.

    Derives from ``RuntimeError``, so handlers written for ``RuntimeError``
    or ``Exception`` catch it as well.
    """


@dataclass
class CloudJobRequest:
    """Payload for launching a cloud run.

    The three leading fields are required; ``cloud_job_id`` is optional and
    defaults to an empty string.
    """

    # Prompt text for the run.
    prompt: str
    # Run configuration, already serialized as a YAML document.
    config_yaml: str
    # Timeout for the cloud job, in seconds.
    timeout_seconds: int
    # Identifier for this job; "" when the caller did not supply one.
    cloud_job_id: str = ""


@dataclass
class CloudJobResult:
    """Result returned from the cloud launcher.

    All fields are required at construction; the three log-related fields
    may be ``None``.
    """

    # Final answer text produced by the run.
    final_answer: str
    # Local directory holding the artifacts of this job.
    artifacts_dir: Path
    # Local log directory, or None.
    local_log_dir: Path | None
    # Local path of the events file, or None.
    local_events_path: Path | None
    # Presumably the log directory on the cloud side — confirm against the launcher.
    remote_log_dir: str | None


class CloudJobLauncher:
    """Base interface for cloud job launchers.

    Concrete launchers override :meth:`launch`; this base class only sets up
    the local workspace directory.
    """

    RESULT_MARKER = "__MASSGEN_CLOUD_JOB_RESULT__"

    def __init__(self, workspace_root: Path | None = None):
        """Record the workspace root and create it on disk if missing.

        Args:
            workspace_root: Directory to use as the workspace root. When
                omitted (or falsy), ``.massgen/cloud_jobs`` under the current
                working directory is used.
        """
        if workspace_root:
            self.workspace_root = workspace_root
        else:
            self.workspace_root = Path.cwd().joinpath(".massgen", "cloud_jobs")
        self.workspace_root.mkdir(parents=True, exist_ok=True)

    def launch(self, request: CloudJobRequest) -> CloudJobResult:
        """Run ``request`` in the cloud; must be overridden by subclasses.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError
Loading
Loading