CellForge/main.py at main · gersteinlab/CellForge · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
#!/usr/bin/env python3
"""
cellforge Main Entry Point
End-to-End Intelligent Multi-Agent System for Automated Single-Cell Data Analysis and Method Design
"""

import os
import sys
import json
import argparse
from pathlib import Path
from typing import Dict, Any, Optional
from dotenv import load_dotenv

# Load environment variables from the .env file that sits next to this script
# (if it exists). This runs at import time, before any cellforge module is
# imported below.
env_file = Path(__file__).parent / ".env"
try:
    if env_file.exists():
        load_dotenv(env_file)
        print(f"✅ Loaded environment variables from {env_file}")
    else:
        # A missing .env is reported but is not fatal at this point.
        print(f"⚠️  .env file not found at {env_file}")
        print("   Run 'python setup_env.py' to create it")
except Exception as e:
    # Best-effort: any failure while checking/loading the file is only
    # reported as a warning and start-up continues.
    print(f"⚠️  Warning: Could not load .env file: {e}")
    print("   This is normal if .env file doesn't exist yet")

# Append the directory containing this script to sys.path so the `cellforge`
# package can be imported from it.
project_root = Path(__file__).parent
sys.path.append(str(project_root))

# Import cellforge components. LLMInterface is not referenced anywhere else in
# this file as shown; the import effectively acts as an early check that the
# package is importable, and the script exits with status 1 when it is not.
try:
    from cellforge.llm import LLMInterface
except ImportError:
    print("⚠️  cellforge package not found. Please run 'python install.py' first.")
    sys.exit(1)

# Default task description - EDIT THIS VARIABLE TO CUSTOMIZE YOUR TASK.
# load_config() writes this text into a newly created config file and also
# overwrites the "task_description" entry of an existing config with it, so
# this constant (not the JSON file) is the effective source of the task text.
DEFAULT_TASK_DESCRIPTION = """Your task is to develop a predictive model that accurately estimates gene expression profiles of individual K562 cells following CRISPR interference (CRISPRi), using the dataset from Norman et al. (2019, Science).

Task Definition:
- Input: Baseline gene expression profile of an unperturbed K562 cell and the identity of the target gene(s) for perturbation
- Output: Predicted gene expression profile after perturbation

Evaluation Scenarios:
1. Unseen Perturbations: Predict effects of gene perturbations not present during training
2. Unseen Cell Contexts: Predict responses in cells with gene expression profiles not observed during training

Evaluation Metrics:
- Mean Squared Error (MSE): Measures the average squared difference between predicted and observed gene expression.
- Pearson Correlation Coefficient (PCC): Quantifies linear correlation between predicted and observed profiles.
- R² (Coefficient of Determination): Represents the proportion of variance in the observed gene expression that can be explained by the predicted values.
- MSE for Differentially Expressed (DE) Genes (MSE_DE): Same as MSE but computed specifically for genes identified as differentially expressed.
- PCC for Differentially Expressed (DE) Genes (PCC_DE): Same as PCC but computed specifically for genes identified as differentially expressed.
- R² for Differentially Expressed (DE) Genes (R2_DE): Same as R² but computed specifically for genes identified as differentially expressed."""

def load_config(config_path: str = "config.json") -> Dict[str, Any]:
    """Return the workflow configuration stored at ``config_path``.

    When the file does not exist, a default configuration is built, written
    to ``config_path`` as JSON and returned. When the file exists, it is
    parsed and its ``"task_description"`` entry is replaced with
    ``DEFAULT_TASK_DESCRIPTION`` before being returned.

    Args:
        config_path: Path of the JSON configuration file.

    Returns:
        The configuration dictionary.
    """
    if not os.path.exists(config_path):
        # First run: assemble the defaults. Model name and Qdrant endpoint
        # are taken from the environment when set.
        llm_settings = {
            "provider": "openai",  # openai, anthropic, local
            "model": os.getenv("MODEL_NAME", "gpt-4"),
            "api_key": "loaded_from_env",  # API keys are loaded from .env file
        }
        qdrant_settings = {
            "host": os.getenv("QDRANT_URL", "localhost"),
            "port": int(os.getenv("QDRANT_PORT", "6333")),
        }
        fresh_config = {
            "task_description": DEFAULT_TASK_DESCRIPTION,
            "dataset_path": "cellforge/data/datasets/",
            "output_dir": "results/",
            "llm_config": llm_settings,
            "workflow_phases": ["task_analysis", "method_design", "code_generation"],
            "qdrant_config": qdrant_settings,
        }

        # Persist the defaults so the user has a file to edit afterwards.
        with open(config_path, 'w', encoding='utf-8') as out_file:
            out_file.write(json.dumps(fresh_config, ensure_ascii=False, indent=2))

        for notice in (
            f"✅ Default configuration file created: {config_path}",
            "⚠️  Please configure your API keys in .env file",
            "💡 To customize your task, edit the DEFAULT_TASK_DESCRIPTION variable in main.py",
        ):
            print(notice)
        return fresh_config

    with open(config_path, 'r', encoding='utf-8') as in_file:
        loaded = json.load(in_file)

    # The task text always comes from the module-level variable, even when
    # the file on disk carries its own value.
    loaded["task_description"] = DEFAULT_TASK_DESCRIPTION
    return loaded

def _is_configured_key(value: Optional[str]) -> bool:
    """Return True when ``value`` looks like a real credential.

    Unset, empty and whitespace-only values are rejected, and so is any
    template placeholder of the form ``your_..._here`` (case-insensitive),
    whichever service it belongs to.
    """
    if not value:
        return False
    normalized = value.strip().lower()
    if not normalized:
        return False
    return not (normalized.startswith("your_") and normalized.endswith("_here"))


def validate_config(config: Dict[str, Any]) -> bool:
    """Validate configuration completeness and API-key availability.

    Args:
        config: Configuration dictionary as returned by ``load_config``.

    Returns:
        ``False`` when a required field is missing from ``config`` or when no
        LLM API key is set in the environment; ``True`` otherwise. Missing
        search API keys only produce warnings.
    """
    required_fields = ["task_description", "dataset_path", "llm_config"]

    for field in required_fields:
        if field not in config:
            print(f"❌ Configuration file missing required field: {field}")
            return False

    # Check if at least one LLM API key is configured in .env file.
    llm_env_vars = (
        "OPENAI_API_KEY",
        "ANTHROPIC_API_KEY",
        "DEEPSEEK_API_KEY",
        "LLAMA_API_KEY",
        "QWEN_API_KEY",
    )
    # Every provider's value goes through the same placeholder check, so an
    # untouched template value for any provider is not counted as configured.
    configured_llm_keys = [
        env_var for env_var in llm_env_vars if _is_configured_key(os.getenv(env_var))
    ]

    if not configured_llm_keys:
        print("⚠️  No LLM API keys found in .env file")
        print("💡 Please copy env.example to .env and configure at least one LLM API key")
        return False

    print(f"✅ {len(configured_llm_keys)} LLM API key(s) configured")

    # Check search API keys (optional for basic functionality).
    search_env_vars = {
        "GitHub": "GITHUB_TOKEN",
        "SerpAPI": "SERPAPI_KEY",
        "PubMed": "PUBMED_API_KEY",
    }

    missing_search_keys = []
    configured_search_keys = []
    for name, env_var in search_env_vars.items():
        if _is_configured_key(os.getenv(env_var)):
            configured_search_keys.append(name)
        else:
            missing_search_keys.append(name)

    # The search summary is only printed when at least one key is missing.
    if missing_search_keys:
        print(f"⚠️  Missing optional search API keys: {', '.join(missing_search_keys)}")
        print("💡 These keys are optional for enhanced RAG functionality")
        if configured_search_keys:
            print(f"✅ Configured search APIs: {', '.join(configured_search_keys)}")
        else:
            print("⚠️  No search APIs configured - RAG functionality will be limited")

    return True

def run_task_analysis(config: Dict[str, Any]) -> bool:
    """Run the Task Analysis phase (phase 1).

    Args:
        config: Configuration dictionary; ``"dataset_path"`` and
            ``"task_description"`` are read from it.

    Returns:
        ``True`` when the analysis routine returns a truthy result, ``False``
        when it returns a falsy one or when any exception (including a
        failed import) is raised.
    """
    try:
        separator = "=" * 60
        print("\n" + separator)
        print("PHASE 1: TASK ANALYSIS")
        print(separator)

        # Imported lazily and under an alias so the package-level routine is
        # not confused with this wrapper, which carries the same name.
        from cellforge.Task_Analysis.main import run_task_analysis as analyze_task

        # Dataset descriptor handed to the analysis routine; everything but
        # the path is fixed to the Norman 2019 K562 CRISPRi dataset.
        dataset_info = {
            "dataset_path": config["dataset_path"],
            "dataset_name": "norman_2019_k562",
            "data_type": "scRNA-seq",
            "cell_line": "K562",
            "perturbation_type": "CRISPRi",
        }

        outcome = analyze_task(config["task_description"], dataset_info)

        if not outcome:
            print("❌ Task analysis failed")
            return False

        print("✅ Task analysis completed")
        return True

    except Exception as e:
        print(f"❌ Error in task analysis: {str(e)}")
        return False

def run_method_design(config: Dict[str, Any]) -> bool:
    """Run the Method Design phase (phase 2).

    Loads the most recently modified ``task_analysis_*.json`` report from
    ``cellforge/Task_Analysis/results`` and passes it to
    ``generate_research_plan`` with automatic code generation enabled,
    writing into ``cellforge/data/results``.

    Args:
        config: Configuration dictionary (not read by this phase).

    Returns:
        ``True`` when a truthy plan is produced; ``False`` when the task
        analysis results are missing, the plan is falsy, or any exception is
        raised (the traceback is printed in that case).
    """
    try:
        separator = "=" * 60
        print("\n" + separator)
        print("PHASE 2: METHOD DESIGN")
        print(separator)

        # Import method design modules
        from cellforge.Method_Design import generate_research_plan

        # Locate the output of the previous phase.
        reports_dir = Path("cellforge/Task_Analysis/results")
        if not reports_dir.exists():
            print("❌ Task analysis results not found. Please run task analysis first.")
            return False

        candidate_reports = list(reports_dir.glob("task_analysis_*.json"))
        if not candidate_reports:
            print("❌ No task analysis reports found. Please run task analysis first.")
            return False

        # The newest report (by modification time) is the one that is used.
        newest_report = max(candidate_reports, key=lambda report: report.stat().st_mtime)

        with open(newest_report, 'r', encoding='utf-8') as report_file:
            task_analysis = json.load(report_file)

        # Generate research plan with automatic code generation
        output_dir = "cellforge/data/results"
        Path(output_dir).mkdir(parents=True, exist_ok=True)

        print("🔧 Generating research plan...")
        plan = generate_research_plan(
            task_analysis=task_analysis,
            rag_retriever=None,  # Will be initialized in the module
            task_type=task_analysis.get("task_type", "gene_knockout"),
            output_dir=output_dir,
            auto_generate_code=True,  # Enable automatic code generation
        )

        if not plan:
            print("❌ Method design failed")
            return False

        print("✅ Method design completed")

        # Show generated files
        if 'generated_files' in plan:
            base_filename = plan['generated_files']['base_filename']
            print("📁 Generated files:")
            artifacts = (
                (".md", "Research plan"),
                (".json", "Detailed data"),
                (".mmd", "Architecture diagram"),
                ("_consensus.png", "Consensus progress"),
            )
            for suffix, label in artifacts:
                print(f"  - {output_dir}/{base_filename}{suffix} ({label})")

            # Show code generation result; only reported alongside the
            # generated-files listing.
            if 'code_generation' in plan:
                code_info = plan['code_generation']
                status = code_info['status']
                if status == 'success':
                    print(f"  - {output_dir}/result.py (Generated code)")
                    print("✅ Code generation completed successfully")
                elif status == 'failed':
                    print(f"❌ Code generation failed: {code_info.get('error', 'Unknown error')}")
                elif status == 'error':
                    print(f"❌ Code generation error: {code_info.get('error', 'Unknown error')}")

        # A code-generation failure is reported above but does not change
        # the phase result.
        return True

    except Exception as e:
        print(f"❌ Error in method design: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

def run_code_generation(config: Dict[str, Any]) -> bool:
    """Run the Code Generation phase (phase 3).

    If ``cellforge/data/results/result.py`` already exists, the phase is
    treated as done. Otherwise the most recently modified
    ``research_plan_*.json`` in that directory is loaded and passed to
    ``generate_code_from_plan``.

    Args:
        config: Configuration dictionary (not read by this phase).

    Returns:
        ``True`` when generated code already exists or is produced by this
        call; ``False`` when the results directory or research plans are
        missing, the code generation module cannot be imported, generation
        yields no existing file, or any exception is raised (the traceback
        is printed in that case).
    """
    try:
        print("\n" + "="*60)
        print("PHASE 3: CODE GENERATION")
        print("="*60)

        # Check if code generation was already completed in method design phase
        results_dir = Path("cellforge/data/results")
        if not results_dir.exists():
            print("❌ Results directory not found. Please run method design first.")
            return False

        # Check for generated code file
        code_file = results_dir / "result.py"
        if code_file.exists():
            print("✅ Code generation already completed in method design phase")
            print(f"📁 Generated code: {code_file}")
            return True

        # Check for research plan files
        plan_files = list(results_dir.glob("research_plan_*.json"))
        if not plan_files:
            print("❌ No research plans found. Please run method design first.")
            return False

        # The newest plan (by modification time) is the one that is used.
        latest_plan = max(plan_files, key=lambda x: x.stat().st_mtime)
        print(f"📋 Found research plan: {latest_plan}")

        # Import code generation module
        try:
            from cellforge.Code_Generation import generate_code_from_plan
        except ImportError as e:
            print(f"❌ Code generation module not available: {e}")
            print("💡 Code generation requires OpenHands setup")
            return False

        # Generate code from plan
        print("🔧 Generating code from research plan...")
        # Read the plan inside a context manager so the file handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(latest_plan, 'r', encoding='utf-8') as plan_file:
            research_plan = json.load(plan_file)

        code_file_path = generate_code_from_plan(
            research_plan=research_plan,
            output_dir=str(results_dir)
        )

        if code_file_path and Path(code_file_path).exists():
            print("✅ Code generation completed")
            print(f"📁 Generated code: {code_file_path}")
            return True
        else:
            print("❌ Code generation failed")
            return False

    except Exception as e:
        print(f"❌ Error in code generation: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

def run_complete_workflow(config: Dict[str, Any]) -> bool:
    """Run the complete end-to-end workflow.

    The configuration is validated first. Then every phase listed under
    ``"workflow_phases"`` is run in order; all listed phases are attempted
    even if an earlier one fails. When ``"workflow_phases"`` is absent, the
    three standard phases are run. A phase name that is not recognised is
    reported and counts as a failure, so a typo cannot produce a run that
    claims every phase completed.

    Args:
        config: Configuration dictionary as returned by ``load_config``.

    Returns:
        ``True`` only when validation passes and every listed phase succeeds.
    """
    print("🚀 Starting cellforge End-to-End Workflow")
    print("="*80)

    # Validate configuration
    if not validate_config(config):
        print("❌ Configuration validation failed, please check .env file")
        return False

    phase_runners = {
        "task_analysis": run_task_analysis,
        "method_design": run_method_design,
        "code_generation": run_code_generation,
    }

    # validate_config() does not require "workflow_phases", so a hand-written
    # config may omit it; fall back to the standard phase order in that case.
    phases = config.get("workflow_phases", list(phase_runners))

    success = True

    # Run each phase
    for phase in phases:
        runner = phase_runners.get(phase) if isinstance(phase, str) else None
        if runner is None:
            print(f"⚠️  Unknown workflow phase skipped: {phase}")
            success = False
            continue
        if not runner(config):
            success = False

    if success:
        print("\n" + "="*80)
        print("🎉 All phases completed!")
        print("="*80)
        # "output_dir" is optional as well; "results/" mirrors the default
        # written by load_config().
        print(f"Results saved to: {config.get('output_dir', 'results/')}")
    else:
        print("\n" + "="*80)
        print("❌ Workflow execution failed")
        print("="*80)

    return success

def create_sample_dataset():
    """Create the dataset/result directory skeleton and a dataset README.

    All paths are relative to the current working directory. Existing
    directories are left in place; the README is (re)written on every call.
    """
    print("📁 Creating sample dataset directory structure...")

    datasets_root = "cellforge/data/datasets"
    wanted_dirs = (
        f"{datasets_root}/scRNA-seq",
        f"{datasets_root}/scATAC-seq",
        f"{datasets_root}/perturbation",
        "results",
    )

    for dir_name in wanted_dirs:
        Path(dir_name).mkdir(parents=True, exist_ok=True)
        print(f"  ✅ Created: {dir_name}")

    # README placed at the root of the datasets directory.
    readme_content = """# Dataset Directory

Please place your single-cell datasets in the appropriate directories:

- `scRNA-seq/`: Single-cell RNA-seq data (.h5ad files)
- `scATAC-seq/`: Single-cell ATAC-seq data (.h5ad files)
- `perturbation/`: Drug perturbation data (.h5ad files)

## Data Format Requirements

Recommended AnnData format (.h5ad):
- Gene expression matrix stored in `adata.X`
- Cell metadata stored in `adata.obs`
- Gene metadata stored in `adata.var`
- Required annotations: cell type, condition, batch (if applicable)

## Example Datasets

You can download datasets from [scPerturb](https://projects.sanderlab.org/scperturb/):
- Norman et al. (2019) K562 CRISPRi data
- Adamson et al. (2016) Drug perturbation data
"""

    readme_path = Path(datasets_root) / "README.md"
    readme_path.write_text(readme_content, encoding='utf-8')

    print("✅ Sample dataset directory structure created")

def main() -> int:
    """Command-line entry point.

    Options:
        --config PATH   Configuration file path (default ``config.json``).
        --init          Create the directory skeleton and a default
                        configuration, then stop.
        --phase NAME    Run a single phase instead of the whole workflow.

    Returns:
        Process exit code: ``0`` on success (including ``--init``), ``1``
        when the selected phase or the complete workflow reports failure.
    """
    parser = argparse.ArgumentParser(description="cellforge - Intelligent Single-Cell Analysis System")
    parser.add_argument("--config", default="config.json", help="Configuration file path")
    parser.add_argument("--init", action="store_true", help="Initialize project structure")
    parser.add_argument("--phase", choices=["task_analysis", "method_design", "code_generation"],
                       help="Run specific phase")

    args = parser.parse_args()

    if args.init:
        print("🚀 Initializing cellforge project...")
        create_sample_dataset()
        load_config(args.config)  # Create default configuration
        print("\n✅ Project initialization completed!")
        print("📝 Please copy env.example to .env and configure your API keys")
        print("💡 To customize your task, edit the DEFAULT_TASK_DESCRIPTION variable in main.py")
        return 0

    # Load configuration
    config = load_config(args.config)

    if args.phase:
        # Run specific phase; argparse's `choices` guarantees the name is
        # one of the keys below.
        phase_runners = {
            "task_analysis": run_task_analysis,
            "method_design": run_method_design,
            "code_generation": run_code_generation,
        }
        succeeded = phase_runners[args.phase](config)
    else:
        # Run complete workflow
        succeeded = run_complete_workflow(config)

    # Surface the outcome so shells and CI can detect a failed run.
    return 0 if succeeded else 1


if __name__ == "__main__":
    # Propagate main()'s result as the process exit status.
    sys.exit(main())