Merge pull request #12 from compbiocore/copilot/update-data-integration-section

EricSalomaki · web-flow · commit 2c277502b4a1 · 2026-06-02T14:34:16.000-04:00
Update Section 13 to transfer PBMC reference labels onto workshop clusters and add DE on transferred cell types
diff --git a/notebooks/scRNAseq_in_Python.ipynb b/notebooks/scRNAseq_in_Python.ipynb
@@ -1328,19 +1328,13 @@
     "\n",
     "*Reference: [Integrating data using ingest and BBKNN](https://scanpy.readthedocs.io/en/1.10.x/tutorials/basics/integrating-data-using-ingest.html)*\n",
     "\n",
-    "When you have a well-annotated **reference dataset** and a new **query dataset**, you can transfer labels from the reference to the query using `sc.tl.ingest`.\n",
+    "In this section, we use a pre-annotated PBMC reference dataset (Scanpy's built-in `pbmc3k_processed`) to transfer cell-type labels onto the PBMC3k cells we clustered in Section 12.\n",
     "\n",
-    "### How it works\n",
-    "1. Fit a PCA + kNN model on the reference\n",
-    "2. Project query cells into the reference's PCA space\n",
-    "3. Assign each query cell the label of its nearest neighbor in the reference\n",
-    "\n",
-    "**Advantages over batch correction approaches (Harmony, BBKNN, etc.)**:\n",
-    "- Transparent and fast\n",
-    "- Solves the label-transfer problem directly\n",
-    "- Maintains the reference embedding structure\n",
-    "\n",
-    "> **Note:** This asymmetric approach (*ingesting* annotations from reference → query) is different from jointly integrating datasets.\n"
+    "### Goal\n",
+    "1. Use an annotated PBMC reference as the source of labels\n",
+    "2. Project our Section 12 cells into the reference space with `sc.tl.ingest`\n",
+    "3. Save transferred labels as a **new annotation** on `adata` (`cell_type_transfer`)\n",
+    "4. Run differential expression between the two most abundant transferred cell types\n"
    ]
   },
   {
@@ -1350,15 +1344,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Load the pre-processed PBMC3k as reference\n",
-    "# (Scanpy's built-in processed version already has cell type labels)\n",
+    "# Load an annotated PBMC reference and choose a label column for transfer\n",
     "adata_ref = sc.datasets.pbmc3k_processed()\n",
     "\n",
-    "# Load the PBMC 68k reduced dataset as query\n",
-    "adata_query = sc.datasets.pbmc68k_reduced()\n",
+    "ref_label_key = 'bulk_labels' if 'bulk_labels' in adata_ref.obs.columns else 'louvain'\n",
+    "adata_ref.obs['cell_type_ref'] = adata_ref.obs[ref_label_key].astype('category')\n",
+    "\n",
+    "# Use the analysis object from Sections 1–12 as query\n",
+    "adata_query = adata.copy()\n",
     "\n",
-    "print('Reference:', adata_ref.shape, '| Labels:', adata_ref.obs['louvain'].unique().tolist())\n",
-    "print('Query:    ', adata_query.shape)\n"
+    "print('Reference:', adata_ref.shape, '| Label column:', ref_label_key)\n",
+    "print('Query (Section 12 object):', adata_query.shape)\n"
    ]
   },
   {
@@ -1368,9 +1364,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# ingest requires datasets to share the same variable names\n",
+    "# ingest requires reference/query to share the same gene names\n",
     "var_names = adata_ref.var_names.intersection(adata_query.var_names)\n",
-    "adata_ref   = adata_ref[:, var_names].copy()\n",
+    "adata_ref = adata_ref[:, var_names].copy()\n",
     "adata_query = adata_query[:, var_names].copy()\n",
     "\n",
     "print('Shared genes:', len(var_names))\n"
@@ -1383,13 +1379,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Train the PCA/kNN model on the reference\n",
+    "# Train the PCA + kNN model on the reference\n",
     "sc.pp.pca(adata_ref)\n",
     "sc.pp.neighbors(adata_ref)\n",
     "sc.tl.umap(adata_ref)\n",
     "\n",
-    "# Check the reference UMAP\n",
-    "sc.pl.umap(adata_ref, color='louvain', title='Reference PBMC3k (annotated)')\n"
+    "# Inspect reference annotation\n",
+    "sc.pl.umap(adata_ref, color='cell_type_ref', title='Reference PBMC (annotation source)')\n"
    ]
   },
   {
@@ -1399,14 +1395,18 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Ingest: project query cells into reference space and transfer 'louvain' labels\n",
-    "sc.tl.ingest(adata_query, adata_ref, obs='louvain')\n",
+    "# Transfer reference labels to the Section 12 query object\n",
+    "sc.tl.ingest(adata_query, adata_ref, obs='cell_type_ref')\n",
     "\n",
-    "# Visualise query cells projected onto the reference UMAP\n",
+    "# Save transferred labels as a new annotation in the main object\n",
+    "adata.obs['cell_type_transfer'] = adata_query.obs.loc[adata.obs_names, 'cell_type_ref'].astype('category')\n",
+    "\n",
+    "# Compare manual (Section 12) vs transferred annotations\n",
     "sc.pl.umap(\n",
-    "    adata_query,\n",
-    "    color=['louvain'],\n",
-    "    title='Query PBMC68k — transferred labels',\n",
+    "    adata,\n",
+    "    color=['cell_type', 'cell_type_transfer'],\n",
+    "    title=['Manual annotation (Section 12)', 'Transferred annotation (reference)'],\n",
+    "    ncols=2,\n",
     ")\n"
    ]
   },
@@ -1417,18 +1417,29 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Concatenate reference and query for a combined view\n",
-    "adata_combined = adata_ref.concatenate(\n",
-    "    adata_query,\n",
-    "    batch_categories=['3k', '68k'],\n",
+    "# Differential expression between the two most abundant transferred cell types\n",
+    "# value_counts() on a categorical also lists unobserved categories (count 0), so drop those before picking\n",
+    "transfer_counts = adata.obs['cell_type_transfer'].value_counts()\n",
+    "top_two = transfer_counts[transfer_counts > 0].index[:2].tolist()\n",
+    "if len(top_two) < 2:\n",
+    "    raise ValueError('Need at least two transferred cell types for DE analysis.')\n",
+    "\n",
+    "sc.tl.rank_genes_groups(\n",
+    "    adata,\n",
+    "    groupby='cell_type_transfer',\n",
+    "    groups=[top_two[0]],\n",
+    "    reference=top_two[1],\n",
+    "    method='wilcoxon',\n",
+    "    use_raw=True,\n",
+    "    n_genes=25,\n",
+    "    key_added='de_cell_type_transfer',\n",
     ")\n",
     "\n",
-    "sc.pl.umap(\n",
-    "    adata_combined,\n",
-    "    color=['louvain', 'batch'],\n",
-    "    title=['Cell type (transferred)', 'Dataset batch'],\n",
-    "    ncols=2,\n",
-    ")\n"
+    "print(f'DE comparison (transferred labels): {top_two[0]} vs {top_two[1]}')\n",
+    "sc.pl.rank_genes_groups(adata, key='de_cell_type_transfer', groups=[top_two[0]], n_genes=15, sharey=False)\n",
+    "\n",
+    "de_transfer_df = sc.get.rank_genes_groups_df(adata, group=top_two[0], key='de_cell_type_transfer')\n",
+    "de_transfer_df.head(10)\n"
    ]
   },
   {