Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions python_backend/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,8 @@ PORT=3002

# CORS Allowed Origin (default: * — restrict in production)
# ALLOWED_ORIGIN=https://your-frontend-domain.com

# Text anonymization surrogate salt.
# Keep the real value in your local .env or deployment secrets.
# Do not commit the real salt.
# BIOBLOCK_STUDY_SALT=replace-with-a-long-random-secret
33 changes: 29 additions & 4 deletions python_backend/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,13 @@
# Preview factory imports
from services.preview.factory import PreviewFactory
from services.audit_logger import AuditLogger
from services.ingestion import HEADER_READ_LIMIT, IngestionError, route_for_ingestion
from services.ingestion import (
HEADER_READ_LIMIT,
TEXT_READ_LIMIT_BYTES,
IngestionError,
detect_modality,
route_for_ingestion,
)

# PDF extraction imports
try:
Expand Down Expand Up @@ -300,23 +306,42 @@ async def ingest_file(
profile: str = Form("strict"),
):
"""
Privacy-safe Week 1 upload entry point.
Privacy-safe upload entry point.

This endpoint only detects modality and selects a placeholder handler.
It does not store raw uploads, index metadata, or trigger downstream flows.
Text uploads are anonymized. Other modalities still route to placeholder
handlers until their later milestones.
"""
try:
header = await file.read(HEADER_READ_LIMIT)
if not header:
raise HTTPException(status_code=400, detail="Uploaded file is empty")

text_content = None
try:
modality = detect_modality(file.filename, file.content_type, header)
except IngestionError:
modality = None

if modality == "text":
await file.seek(0)
text_content = await file.read(TEXT_READ_LIMIT_BYTES + 1)
if len(text_content) > TEXT_READ_LIMIT_BYTES:
raise HTTPException(
status_code=413,
detail=(
f"Text uploads must be {TEXT_READ_LIMIT_BYTES} "
"bytes or smaller"
),
)

await file.seek(0)

return route_for_ingestion(
filename=file.filename,
content_type=file.content_type,
header=header,
profile=profile,
text_content=text_content,
)
except IngestionError as e:
raise HTTPException(status_code=e.status_code, detail=e.detail)
Expand Down
57 changes: 52 additions & 5 deletions python_backend/services/ingestion.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
from typing import Any, Callable, Dict, Optional

from services.text_anonymization import (
TextAnonymizationError,
anonymize_clinical_text,
)

SUPPORTED_PROFILES = {"strict", "research"}
SUPPORTED_MODALITIES = {"csv", "text", "dicom", "nifti", "wsi"}
HEADER_READ_LIMIT = 4096
TEXT_READ_LIMIT_BYTES = 256 * 1024


class IngestionError(ValueError):
Expand Down Expand Up @@ -93,8 +98,33 @@ def anonymize_csv() -> Dict[str, str]:
return _placeholder_result("anonymize_csv")


def anonymize_text() -> Dict[str, str]:
return _placeholder_result("anonymize_text")
def anonymize_text(
    text_content: bytes,
    profile: str,
    study_salt: Optional[str] = None,
) -> Dict[str, Any]:
    """Anonymize an uploaded text document and build the handler result.

    Args:
        text_content: Raw bytes of the upload. Must be valid UTF-8; a
            leading UTF-8 byte-order mark is accepted and discarded.
        profile: Privacy profile name, forwarded to
            ``anonymize_clinical_text``.
        study_salt: Optional study salt, forwarded unchanged to
            ``anonymize_clinical_text``.

    Returns:
        A dict with the handler name, routing status and completion
        message, plus the ``anonymization_status``, ``anonymized_text`` and
        ``detected_entities`` values reported by
        ``anonymize_clinical_text``.

    Raises:
        IngestionError: If the bytes are not valid UTF-8, or if
            ``anonymize_clinical_text`` raises ``TextAnonymizationError``
            (its detail and status code are carried over).
    """
    try:
        # "utf-8-sig" decodes BOM-less UTF-8 exactly like "utf-8" but drops
        # a leading byte-order mark, which would otherwise survive as U+FEFF
        # at the start of the text handed to the anonymizer and echoed back.
        text = text_content.decode("utf-8-sig")
    except UnicodeDecodeError as exc:
        raise IngestionError("Text uploads must be UTF-8 encoded") from exc

    try:
        result = anonymize_clinical_text(
            text=text,
            profile=profile,
            study_salt=study_salt,
        )
    except TextAnonymizationError as exc:
        # Re-raise as the ingestion layer's own error type so callers only
        # need to handle IngestionError.
        raise IngestionError(exc.detail, status_code=exc.status_code) from exc

    return {
        "handler": "anonymize_text",
        "routing_status": "handler_selected",
        "anonymization_status": result["anonymization_status"],
        "message": "Text anonymization completed.",
        "anonymized_text": result["anonymized_text"],
        "detected_entities": result["detected_entities"],
    }


def anonymize_dicom() -> Dict[str, str]:
Expand All @@ -109,7 +139,7 @@ def anonymize_wsi() -> Dict[str, str]:
return _placeholder_result("anonymize_wsi")


HANDLER_REGISTRY: Dict[str, Callable[[], Dict[str, str]]] = {
HANDLER_REGISTRY: Dict[str, Callable[..., Dict[str, Any]]] = {
"csv": anonymize_csv,
"text": anonymize_text,
"dicom": anonymize_dicom,
Expand All @@ -123,6 +153,8 @@ def route_for_ingestion(
content_type: Optional[str],
header: bytes,
profile: str,
text_content: Optional[bytes] = None,
study_salt: Optional[str] = None,
) -> Dict[str, Any]:
safe_name = _safe_filename(filename)
privacy_profile = validate_privacy_profile(profile)
Expand All @@ -135,8 +167,17 @@ def route_for_ingestion(
status_code=500,
)

handler_result = handler()
return {
if modality == "text":
if text_content is None:
raise IngestionError(
"Text content was not provided for anonymization",
status_code=500,
)
handler_result = handler(text_content, privacy_profile, study_salt)
else:
handler_result = handler()

response = {
"status": "success",
"filename": safe_name,
"detected_modality": modality,
Expand All @@ -152,3 +193,9 @@ def route_for_ingestion(
"blockchain_transaction": "pending",
},
}
if "anonymized_text" in handler_result:
response["anonymized_text"] = handler_result["anonymized_text"]
if "detected_entities" in handler_result:
response["detected_entities"] = handler_result["detected_entities"]

return response
Loading
Loading