broadinstitute · kilonzi · Apr 17, 2025 · Apr 17, 2025 · Apr 17, 2025 · lucidtronix
diff --git a/model_zoo/ECG2AF/deployment/__init__.py b/model_zoo/ECG2AF/deployment/__init__.py
diff --git a/model_zoo/ECG2AF/deployment/v1/__init__.py b/model_zoo/ECG2AF/deployment/v1/__init__.py
diff --git a/model_zoo/ECG2AF/deployment/v1/processing_image/Dockerfile b/model_zoo/ECG2AF/deployment/v1/processing_image/Dockerfile
@@ -0,0 +1,7 @@
+FROM python:3.9-slim
+WORKDIR /app
+COPY prepare.py /app/
+COPY finalize.py /app/
+COPY requirements.txt /app/
+RUN pip install -r /app/requirements.txt
+ENTRYPOINT ["python"]
diff --git a/model_zoo/ECG2AF/deployment/v1/processing_image/finalize.py b/model_zoo/ECG2AF/deployment/v1/processing_image/finalize.py
@@ -0,0 +1,44 @@
+import argparse
+import json
+import numpy as np
+import pandas as pd
+
+
+def convert_survival_curve_to_risk_score(curve):
+    curve = np.array(curve)
+    return 1 - np.cumprod(curve[:25])[-1]
+
+
+def finalize(input_csv, predictions_json, output_csv):
+    with open(predictions_json, "r") as f:
+        prediction_data = json.load(f)
+
+    df = pd.read_csv(input_csv, dtype={"file_id": str})
+
+    age = prediction_data["output_age_from_wide_csv_continuous"]
+    af = prediction_data["output_af_in_read_categorical"]
+    sex = prediction_data["output_sex_from_wide_categorical"]
+    curves = prediction_data["output_survival_curve_af_survival_curve"]
+
+    if len(age) != len(df):
+        raise ValueError(f"Mismatch: {len(age)} predictions but {len(df)} rows in input CSV!")
+
+    df["output_age"] = [row[0] for row in age]
+    df["output_af_0"] = [row[0] for row in af]
+    df["output_af_1"] = [row[1] for row in af]
+    df["output_sex_male"] = [row[0] for row in sex]
+    df["output_sex_female"] = [row[1] for row in sex]
+    df["af_risk_score"] = [convert_survival_curve_to_risk_score(row) for row in curves]
+
+    df.to_csv(output_csv, index=False)
+    print(f"✅ Predictions written to {output_csv} ({len(df)} rows).")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input", required=True, help="Path to input CSV")
+    parser.add_argument("--output", required=True, help="Path to final CSV with predictions")
+    parser.add_argument("--predictions", required=True, help="Path to predictions JSON")
+    args = parser.parse_args()
+
+    finalize(args.input, args.predictions, args.output)
diff --git a/model_zoo/ECG2AF/deployment/v1/processing_image/prepare.py b/model_zoo/ECG2AF/deployment/v1/processing_image/prepare.py
@@ -0,0 +1,53 @@
+import argparse
+
+import h5py
+import numpy as np
+import pandas as pd
+import smart_open
+
+ECG_REST_LEADS = {
+    'strip_I': 0, 'strip_II': 1, 'strip_III': 2, 'strip_V1': 6, 'strip_V2': 7, 'strip_V3': 8,
+    'strip_V4': 9, 'strip_V5': 10, 'strip_V6': 11, 'strip_aVF': 5, 'strip_aVL': 4, 'strip_aVR': 3,
+}
+ECG_SHAPE = (5000, 12)
+ECG_HD5_PATH = 'ukb_ecg_rest'
+
+
+def ecg_as_tensor(ecg_file):
+    with smart_open.open(ecg_file, 'rb') as f:
+        with h5py.File(f, 'r') as hd5:
+            tensor = np.zeros(ECG_SHAPE, dtype=np.float32)
+            for lead in ECG_REST_LEADS:
+                data = np.array(hd5[f'{ECG_HD5_PATH}/{lead}/instance_0'])
+                tensor[:, ECG_REST_LEADS[lead]] = data
+
+            mean = np.mean(tensor)
+            std = np.std(tensor) + 1e-7
+            tensor = (tensor - mean) / std
+    return tensor
+
+
+def prepare(input_csv, output_h5):
+    """Processes ECG files into HDF5 tensor format from GCS/Azure/Local."""
+    df = pd.read_csv(input_csv, dtype={"file": str})
+    h5_file = h5py.File(output_h5, "w")
+    tensors_group = h5_file.create_group("tensors")
+    df = df.dropna(subset=["file"])
+    df["file"] = df["file"].astype(str)
+    for _, row in df.iterrows():
+        sample_id, file_path = row["file_id"], row["file"]
+        print(f"Processing: sample_id={sample_id}, file_path={file_path}, type={type(file_path)}")
+        tensor = ecg_as_tensor(file_path)
+        tensors_group.create_dataset(str(sample_id), data=tensor)
+
+    h5_file.close()
+    print(f"Processed ECG tensors saved to {output_h5}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input", required=True, help="Path to input CSV")
+    parser.add_argument("--output", required=True, help="Path to output HDF5 file")
+    args = parser.parse_args()
+
+    prepare(args.input, args.output)
diff --git a/model_zoo/ECG2AF/deployment/v1/processing_image/requirements.txt b/model_zoo/ECG2AF/deployment/v1/processing_image/requirements.txt
@@ -0,0 +1,4 @@
+pandas
+numpy
+h5py
+smart-open[gcs]