
diff --git a/app/client/src/pages/Home/WelcomePage.tsx b/app/client/src/pages/Home/WelcomePage.tsx
index 82498ab..135f70d 100644
--- a/app/client/src/pages/Home/WelcomePage.tsx
+++ b/app/client/src/pages/Home/WelcomePage.tsx
@@ -1,10 +1,12 @@
-import { Button, Col, Flex, Layout, Row, Image } from 'antd';
+import toString from 'lodash/toString';
+import { Button, Col, Flex, Layout, Row, Image, Checkbox } from 'antd';
import React from 'react';
import styled from 'styled-components';
import SDGIcon from '../../assets/sdg-landing.svg';
import LightBulbIcon from '../../assets/ic-lightbulb.svg';
import QueryPromptIcon from '../../assets/ic-query-prompt.svg';
import NumbersIcon from '../../assets/ic-numbers.svg';
+import { CheckboxChangeEvent } from 'antd/es/checkbox';
const { Content } = Layout;
@@ -107,6 +109,11 @@ const InfoSection = styled.div`
const WelcomePage: React.FC = () => {
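+ // Persist the user's choice; routes.tsx reads this key on startup to decide whether to show the welcome page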
+ const onChange = (e: CheckboxChangeEvent) => {
+ const checked = e.target.checked;
+ window.localStorage.setItem('sds_mute_welcome_page', toString(checked));
+ };
+
return (
@@ -148,6 +155,10 @@ const WelcomePage: React.FC = () => {
+
+ <Checkbox onChange={onChange}>
+ {`Don't show me this again`}
+ </Checkbox>
diff --git a/app/client/src/routes.tsx b/app/client/src/routes.tsx
index 853257c..59e20a3 100644
--- a/app/client/src/routes.tsx
+++ b/app/client/src/routes.tsx
@@ -2,7 +2,7 @@ import { Navigate, createBrowserRouter } from "react-router-dom";
import Layout from "./Container";
import DataGenerator from "./pages/DataGenerator";
import HomePage from "./pages/Home";
-import { Pages } from "./types";
+import { Pages, WizardModeType } from "./types";
import EvaluatorPage from "./pages/Evaluator";
import ReevaluatorPage from "./pages/Evaluator/ReevaluatorPage";
import DatasetDetailsPage from "./pages/DatasetDetails/DatasetDetailsPage";
@@ -12,6 +12,10 @@ import EvaluationDetailsPage from "./pages/EvaluationDetails/EvaluationDetailsPa
//import TelemetryDashboard from "./components/TelemetryDashboard";
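+// Checks the flag written by the WelcomePage "Don't show me this again" checkbox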
+const isWelcomePageMuted = () => {
+ return window.localStorage.getItem('sds_mute_welcome_page') === 'true';
+}
+
const router = createBrowserRouter([
{
path: '/',
@@ -19,7 +23,9 @@ const router = createBrowserRouter([
children: [
{
path: '/', // Redirect root to Pages.WELCOME
- element: <Navigate to={Pages.WELCOME} />,
+ element: isWelcomePageMuted() ? <Navigate to={Pages.HOME} /> :
+ <Navigate to={Pages.WELCOME} />,
+ errorElement:
},
{
path: Pages.HOME,
@@ -29,7 +35,13 @@ const router = createBrowserRouter([
},
{
path: Pages.GENERATOR,
- element: ,
+ element: ,
+ errorElement: ,
+ loader: async () => null
+ },
+ {
+ path: Pages.DATA_AUGMENTATION,
+ element: ,
errorElement: ,
loader: async () => null
},
diff --git a/app/client/src/types.ts b/app/client/src/types.ts
index ee3381b..2490874 100644
--- a/app/client/src/types.ts
+++ b/app/client/src/types.ts
@@ -1,5 +1,6 @@
export enum Pages {
GENERATOR = 'data-generator',
+ DATA_AUGMENTATION = 'data-augmentation',
REGENERATE = 're-generate',
EVALUATOR = 'evaluator',
HISTORY = 'history',
@@ -45,4 +46,16 @@ export const EXPORT_TYPE_LABELS: Record<string, string> = {
export type JobStatus = 'ENGINE_STOPPED' | 'ENGINE_SUCCEEDED' | 'ENGINE_TIMEDOUT' | 'ENGINE_SCHEDULING' | 'ENGINE_RUNNING' | 'null' | 'default';
-export const HuggingFaceIconUrl = "https://huggingface.co/front/assets/huggingface_logo-noborder.svg";
\ No newline at end of file
+export const HuggingFaceIconUrl = "https://huggingface.co/front/assets/huggingface_logo-noborder.svg";
+
+export interface UseCase {
+ name: string;
+ id: string;
+ label: string;
+ value: string;
+}
+
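+// Wizard flow selector for the DataGenerator page (plain generation vs. data augmentation)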
+export enum WizardModeType {
+ DATA_GENERATION = 'data-generation',
+ DATA_AUGMENTATION = 'data-augmentation'
+}
\ No newline at end of file
diff --git a/app/core/config.py b/app/core/config.py
index 777a6e4..78a14f7 100644
--- a/app/core/config.py
+++ b/app/core/config.py
@@ -17,6 +17,7 @@ class UseCase(str, Enum):
LENDING_DATA = "lending_data"
#HOUSING_DATA = "housing_data"
CREDIT_CARD_DATA = "credit_card_data"
+ TICKETING_DATASET = "ticketing_dataset"
class Technique(str, Enum):
SFT = "sft"
@@ -595,9 +596,46 @@ class UseCaseMetadataEval(BaseModel):
""",
- schema=None
+ schema=None
),
-}
+
+ UseCase.TICKETING_DATASET: UseCaseMetadata(
+ name="Ticketing Dataset",
+ description="Synthetic dataset for a ticketing system",
+ topics=["Technical Issues", "Billing Queries", "Payment Queries"],
+ default_examples=[
+ {
+ "Prompt": "I have received this message that I owe $300 and I was instructed to pay the bill online. I already paid this amount and I am wondering why I received this message.",
+ "Completion": "report_payment_issue"
+ },
+ {
+ "Prompt": "I will not be able to attend the presentation and would like to cancel my rsvp.",
+ "Completion": "cancel_ticket"
+ },
+ {
+ "Prompt": "I am having questions regarding the exact time, location, and requirements of the event and would like to talk to customer service.",
+ "Completion": "Customer_service"
+ }
+ ],
+ prompt="""
+ Generate authentic customer support ticket interactions that have a user query and system response.
+ For each user query, the system generates a keyword that is used to forward the user to the specific subsystem.
+ Requirements for user queries:
+ - Use professional, respectful language
+ - Follow standard customer service best practices
+ Each response should be a single keyword from the following list:
+ cancel_ticket,customer_service,report_payment_issue
+ Here are the explanations of the responses:
+ cancel_ticket means that the customer wants to cancel the ticket.
+ customer_service means that the customer wants to talk to customer service.
+ report_payment_issue means that the customer is facing payment issues and wants to be forwarded to the billing department to resolve the issue.
+
+ """,
+ schema=None
+ )
+ }
+
USE_CASE_CONFIGS_EVALS = {
@@ -916,6 +954,43 @@ class UseCaseMetadataEval(BaseModel):
Give a score rating 1-10 for the given data. If there are more than 9 points to subtract use 1 as the absolute minimum scoring. List all justification as list.
+ """
+ ),
+ UseCase.TICKETING_DATASET: UseCaseMetadataEval(
+ name="Ticketing Dataset",
+ default_examples=[
+ {
+ "score": 5,
+ "justification": """
+ The query is professionally written, respectful, and follows customer service best practices.
+ The response 'report_payment_issue' is one of the allowed keywords.
+ The matching between the query and response is perfect according to the provided definitions.
+
+ """},
+ {
+ "score": 3,
+ "justification": """
+ The query is professionally written and respectful.
+ The response 'cancel_ticket' is one of the allowed keywords.
+ While the response uses a valid keyword, it doesn't match the most appropriate category for the specific query content.
+ """
+ },
+
+ ],
+ prompt="""
+ You are given a user query for a ticketing support system and the system response, which is a keyword used to forward the user to a specific subsystem.
+ Evaluate whether the queries:
+ - Use professional, respectful language
+ - Follow standard customer service best practices
+ Evaluate whether the responses use only one of the following keywords: cancel_ticket,customer_service,report_payment_issue
+ Evaluate whether the queries and responses are correctly matched based on the following definitions:
+ cancel_ticket means that the customer wants to cancel the ticket.
+ customer_service means that the customer wants to talk to customer service.
+ report_payment_issue means that the customer is facing payment issues and wants to be forwarded to the billing department to resolve the issue.
+ Give a score of 1-5 based on the following instructions:
+ If a response does not match one of the three keywords, always give a score of 1.
+ Otherwise, rate the quality of the queries and responses based on the instructions above, giving a score between 1 and 5.
+
"""
)
}
diff --git a/app/core/database.py b/app/core/database.py
index f5c7682..57e286a 100644
--- a/app/core/database.py
+++ b/app/core/database.py
@@ -71,8 +71,9 @@ def init_db(self):
job_id TEXT,
job_name TEXT UNIQUE,
job_status TEXT,
- job_creator_name TEXT
-
+ job_creator_name TEXT,
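+ -- number of rows generated; written on job completion and on partial failure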
+ completed_rows INTEGER
+
)
""")
@@ -163,8 +164,8 @@ def save_generation_metadata(self, metadata: Dict) -> int:
custom_prompt, model_parameters, input_key, output_key, output_value, generate_file_name,
display_name, local_export_path, hf_export_path, s3_export_path,
num_questions, total_count, topics, examples,
- schema, doc_paths, input_path, job_id, job_name, job_status, job_creator_name
- ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+ schema, doc_paths, input_path, job_id, job_name, job_status, job_creator_name, completed_rows
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"""
values = (
@@ -194,7 +195,8 @@ def save_generation_metadata(self, metadata: Dict) -> int:
metadata.get('job_id', None),
metadata.get('job_name', None),
metadata.get('job_status', None),
- metadata.get('job_creator_name', None)
+ metadata.get('job_creator_name', None),
+ metadata.get('completed_rows', None)
)
cursor.execute(query, values)
@@ -212,7 +214,7 @@ def save_generation_metadata(self, metadata: Dict) -> int:
print(f"Error saving metadata to database: {str(e)}")
raise
- def update_job_generate(self, job_name: str, generate_file_name: str, local_export_path: str, timestamp: str, job_status):
+ def update_job_generate(self, job_name: str, generate_file_name: str, local_export_path: str, timestamp: str, job_status, completed_rows):
"""Update job generate with retry mechanism"""
max_retries = 3
retry_delay = 1 # seconds
@@ -244,11 +246,12 @@ def update_job_generate(self, job_name: str, generate_file_name: str, local_expo
SET generate_file_name = ?,
local_export_path = ?,
timestamp = ?,
- job_status = ?
+ job_status = ?,
+ completed_rows = ?
WHERE job_name = ?
AND job_name IS NOT NULL
AND job_name != ''
- """, (generate_file_name, local_export_path, timestamp, job_status, job_name))
+ """, (generate_file_name, local_export_path, timestamp, job_status, completed_rows,job_name))
rows_affected = cursor.rowcount
conn.commit()
diff --git a/app/main.py b/app/main.py
index 1ce5d47..88c6a9e 100644
--- a/app/main.py
+++ b/app/main.py
@@ -773,6 +773,7 @@ async def get_use_cases():
{"id": UseCase.CUSTOM, "name": "Custom"},
{"id": UseCase.LENDING_DATA, "name": "Lending Data"},
{"id": UseCase.CREDIT_CARD_DATA, "name": "Credit Card Data"},
+ {"id": UseCase.TICKETING_DATASET, "name": "Ticketing Dataset"},
]
}
@@ -1420,13 +1421,22 @@ async def perform_upgrade():
# 2. Database migrations
try:
- db_success, db_message = await alembic_manager.handle_database_upgrade()
- if db_success:
- db_upgraded = True
- messages.append(db_message)
- else:
- messages.append(f"Database upgrade failed: {db_message}")
- raise HTTPException(status_code=500, detail=db_message)
+ # Debug output: confirm the working directory is the project root where alembic.ini lives
+ print(f"Current working directory: {os.getcwd()}")
+ print(f"Alembic.ini exists: {os.path.exists('alembic.ini')}")
+ print("--- Starting database migration via external script ---")
+ # Use `uv run` to ensure the script runs within the project's virtual environment
+ # This is more robust than just calling 'python'
+ result = subprocess.run(
+ ["uv", "run", "python", "run_migrations.py"],
+ capture_output=True,
+ text=True,
+ check=True # This will raise CalledProcessError on failure
+ )
+
+ print(result.stdout) # Log the output from the script
+ db_upgraded = True
+ messages.append("Database migration check completed successfully.")
except Exception as e:
messages.append(f"Database migration failed: {str(e)}")
raise HTTPException(status_code=500, detail=str(e))
diff --git a/app/migrations/alembic_manager.py b/app/migrations/alembic_manager.py
index 51538c6..7e1b257 100644
--- a/app/migrations/alembic_manager.py
+++ b/app/migrations/alembic_manager.py
@@ -1,61 +1,84 @@
# app/migrations/alembic_manager.py
-from alembic.config import Config
-from alembic import command
-from alembic.script import ScriptDirectory
-from alembic.runtime.migration import MigrationContext
-from pathlib import Path
-import os
-from sqlalchemy import create_engine
+import re
+import subprocess
class AlembicMigrationManager:
def __init__(self, db_path: str = None):
- """Initialize Alembic with the same database path as DatabaseManager"""
- self.app_path = Path(__file__).parent.parent.parent
-
- if db_path is None:
- db_path = os.path.join(self.app_path, "metadata.db")
- self.db_path = db_path
-
- # Initialize Alembic config
- self.alembic_cfg = Config(str(self.app_path / "alembic.ini"))
- self.alembic_cfg.set_main_option('script_location', str(self.app_path / "alembic"))
- self.alembic_cfg.set_main_option('sqlalchemy.url', f'sqlite:///{db_path}')
-
- # Create engine for version checks
- self.engine = create_engine(f'sqlite:///{db_path}')
-
- async def get_db_version(self) -> str:
- """Get current database version"""
- with self.engine.connect() as conn:
- context = MigrationContext.configure(conn)
- return context.get_current_revision()
+ """Initialize with database path (kept for interface compatibility)"""
+ self.db_path = db_path or "metadata.db"
async def handle_database_upgrade(self) -> tuple[bool, str]:
"""
- Handle database migrations carefully to avoid disrupting existing data
+ Simple database migration using alembic upgrade head
+ No directory changes needed - already in project root
"""
try:
- # First check if alembic_version table exists
- try:
- version = await self.get_db_version()
- if version is None:
- # Database exists but no alembic version - stamp current
- command.stamp(self.alembic_cfg, "head")
- return True, "Existing database stamped with current version"
- except Exception:
- # No alembic_version table - stamp current
- command.stamp(self.alembic_cfg, "head")
- return True, "Existing database stamped with current version"
+ # Run upgrade head - we're already in the right directory
+ result = subprocess.run(
+ ["alembic", "upgrade", "head"],
+ capture_output=True,
+ text=True,
+ check=True
+ )
- # Now check for and apply any new migrations
- script = ScriptDirectory.from_config(self.alembic_cfg)
- head_revision = script.get_current_head()
+ # Check if anything was actually upgraded
+ if "Running upgrade" in result.stdout:
+ return True, f"Database upgraded successfully: {result.stdout.strip()}"
+ else:
+ return True, "Database is already up to date"
+
+ except subprocess.CalledProcessError as e:
+ error_msg = e.stderr or e.stdout or str(e)
+ return False, f"Database upgrade failed: {error_msg}"
+ except Exception as e:
+ return False, f"Error during database upgrade: {str(e)}"
+
+ async def get_migration_status(self) -> dict:
+ """Get detailed migration status for debugging"""
+ try:
+ # Get current version
+ current_result = subprocess.run(
+ ["alembic", "current"],
+ capture_output=True,
+ text=True,
+ check=True
+ )
- if version != head_revision:
- command.upgrade(self.alembic_cfg, "head")
- return True, "Database schema updated successfully"
+ # Get head version
+ head_result = subprocess.run(
+ ["alembic", "show", "head"],
+ capture_output=True,
+ text=True,
+ check=True
+ )
- return True, "Database schema is up to date"
-
+ return {
+ "current": current_result.stdout.strip(),
+ "head": head_result.stdout.strip(),
+ "status": "ready"
+ }
+
+ except subprocess.CalledProcessError as e:
+ error_msg = e.stderr or e.stdout or str(e)
+ return {"error": f"Command failed: {error_msg}", "status": "error"}
except Exception as e:
- return False, f"Error during database upgrade: {str(e)}"
\ No newline at end of file
+ return {"error": str(e), "status": "error"}
+
+ async def get_current_version(self) -> str:
+ """Get current database version using alembic current command"""
+ try:
+ result = subprocess.run(
+ ["alembic", "current"],
+ capture_output=True,
+ text=True,
+ check=True
+ )
+
+ # Extract just the version ID from output like "2b4e8d9f6c3a (head)"
+ match = re.search(r'([a-f0-9]{12})', result.stdout)
+ return match.group(1) if match else "none"
+
+ except subprocess.CalledProcessError:
+ return "none"
+ except Exception:
+ return "unknown"
\ No newline at end of file
diff --git a/app/migrations/alembic_schema_models.py b/app/migrations/alembic_schema_models.py
index 3967a11..5eb735f 100644
--- a/app/migrations/alembic_schema_models.py
+++ b/app/migrations/alembic_schema_models.py
@@ -35,6 +35,7 @@ class GenerationMetadataModel(Base):
job_name = Column(Text, unique=True)
job_status = Column(Text)
job_creator_name = Column(Text)
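+ # Mirrors the completed_rows column added to generation_metadata in app/core/database.py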
+ completed_rows = Column(Integer)
class EvaluationMetadataModel(Base):
__tablename__ = 'evaluation_metadata'
diff --git a/app/services/synthesis_service.py b/app/services/synthesis_service.py
index 5aa25bb..7f893f9 100644
--- a/app/services/synthesis_service.py
+++ b/app/services/synthesis_service.py
@@ -1093,7 +1093,8 @@ def json_serializable(obj):
'input_path': input_path_str,
'input_key': request.input_key,
'output_key': request.output_key,
- 'output_value': request.output_value
+ 'output_value': request.output_value,
+ 'completed_rows': len(final_output) if final_output else 0
}
if is_demo:
@@ -1109,7 +1110,7 @@ def json_serializable(obj):
generate_file_name = os.path.basename(file_path) if final_output else ''
final_output_path = file_path if final_output else ''
- self.db.update_job_generate(job_name, generate_file_name, final_output_path, timestamp, job_status)
+ self.db.update_job_generate(job_name, generate_file_name, final_output_path, timestamp, job_status, len(final_output) if final_output else 0)
self.db.backup_and_restore_db()
return {
"status": "completed" if final_output else "failed",
@@ -1157,13 +1158,14 @@ def json_serializable(obj):
if saved_partial_results:
# Update with actual file information for partial results
generate_file_name = os.path.basename(file_path)
- final_output_path = file_path
+ final_output_path = file_path
+ completed_rows = len(final_output) if final_output else 0
else:
# No results saved, use empty values
generate_file_name = ''
final_output_path = ''
-
- self.db.update_job_generate(job_name, generate_file_name, final_output_path, timestamp, job_status)
+ completed_rows = 0
+ self.db.update_job_generate(job_name, generate_file_name, final_output_path, timestamp, job_status, completed_rows = completed_rows )
raise
def get_health_check(self) -> Dict:
diff --git a/images/synthetic-data-studio-banner.svg b/images/synthetic-data-studio-banner.svg
new file mode 100644
index 0000000..6804615
--- /dev/null
+++ b/images/synthetic-data-studio-banner.svg
@@ -0,0 +1,16 @@
+
diff --git a/run_migrations.py b/run_migrations.py
new file mode 100644
index 0000000..dc83e49
--- /dev/null
+++ b/run_migrations.py
@@ -0,0 +1,33 @@
+import asyncio
+import sys
+from pathlib import Path
+
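+# Standalone entry point: app/main.py's upgrade endpoint runs this via `uv run python run_migrations.py`
+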
+# Ensure the project root is on the Python path so the 'app' package is importable
+ROOT_DIR = Path(__file__).parent
+sys.path.append(str(ROOT_DIR))
+
+from app.migrations.alembic_manager import AlembicMigrationManager
+
+async def main():
+ """
+ Initializes the migration manager and runs the database upgrade.
+ This will always use the latest code from disk.
+ """
+ print("--- Running dedicated migration script ---")
+ # Assumes your DB file is named metadata.db in the root
+ db_path = str(ROOT_DIR / "metadata.db")
+ alembic_manager = AlembicMigrationManager(db_path)
+
+ success, message = await alembic_manager.handle_database_upgrade()
+
+ if not success:
+ print(f"Migration Error: {message}")
+ # Exit with a non-zero status code to indicate failure
+ sys.exit(1)
+
+ print(f"Migration Success: {message}")
+ print("--- Migration script finished ---")
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file