Skip to content

Commit 2c27750

Browse files
authored
Merge pull request #12 from compbiocore/copilot/update-data-integration-section
Update Section 13 to transfer PBMC reference labels onto workshop clusters and add DE on transferred cell types
2 parents c196d1b + 3c2cc76 commit 2c27750

1 file changed

Lines changed: 50 additions & 39 deletions

File tree

notebooks/scRNAseq_in_Python.ipynb

Lines changed: 50 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -1328,19 +1328,13 @@
13281328
"\n",
13291329
"*Reference: [Integrating data using ingest and BBKNN](https://scanpy.readthedocs.io/en/1.10.x/tutorials/basics/integrating-data-using-ingest.html)*\n",
13301330
"\n",
1331-
"When you have a well-annotated **reference dataset** and a new **query dataset**, you can transfer labels from the reference to the query using `sc.tl.ingest`.\n",
1331+
"In this section, we use a pre-annotated PBMC reference dataset (Scanpy's `pbmc3k_processed`) to transfer cell-type labels onto the PBMC3k cells we clustered in Section 12.\n",
13321332
"\n",
1333-
"### How it works\n",
1334-
"1. Fit a PCA + kNN model on the reference\n",
1335-
"2. Project query cells into the reference's PCA space\n",
1336-
"3. Assign each query cell the label of its nearest neighbor in the reference\n",
1337-
"\n",
1338-
"**Advantages over batch correction approaches (Harmony, BBKNN, etc.)**:\n",
1339-
"- Transparent and fast\n",
1340-
"- Solves the label-transfer problem directly\n",
1341-
"- Maintains the reference embedding structure\n",
1342-
"\n",
1343-
"> **Note:** This asymmetric approach (*ingesting* annotations from reference → query) is different from jointly integrating datasets.\n"
1333+
"### Goal\n",
1334+
"1. Use an annotated PBMC reference as the source of labels\n",
1335+
"2. Project our Section 12 cells into the reference space with `sc.tl.ingest`\n",
1336+
"3. Save transferred labels as a **new annotation** on `adata` (`cell_type_transfer`)\n",
1337+
"4. Run differential expression between the two most abundant transferred cell types\n"
13441338
]
13451339
},
13461340
{
@@ -1350,15 +1344,17 @@
13501344
"metadata": {},
13511345
"outputs": [],
13521346
"source": [
1353-
"# Load the pre-processed PBMC3k as reference\n",
1354-
"# (Scanpy's built-in processed version already has cell type labels)\n",
1347+
"# Load an annotated PBMC reference and choose a label column for transfer\n",
13551348
"adata_ref = sc.datasets.pbmc3k_processed()\n",
13561349
"\n",
1357-
"# Load the PBMC 68k reduced dataset as query\n",
1358-
"adata_query = sc.datasets.pbmc68k_reduced()\n",
1350+
"ref_label_key = 'bulk_labels' if 'bulk_labels' in adata_ref.obs.columns else 'louvain'\n",
1351+
"adata_ref.obs['cell_type_ref'] = adata_ref.obs[ref_label_key].astype('category')\n",
1352+
"\n",
1353+
"# Use the analysis object from Sections 1–12 as query\n",
1354+
"adata_query = adata.copy()\n",
13591355
"\n",
1360-
"print('Reference:', adata_ref.shape, '| Labels:', adata_ref.obs['louvain'].unique().tolist())\n",
1361-
"print('Query: ', adata_query.shape)\n"
1356+
"print('Reference:', adata_ref.shape, '| Label column:', ref_label_key)\n",
1357+
"print('Query (Section 12 object):', adata_query.shape)\n"
13621358
]
13631359
},
13641360
{
@@ -1368,9 +1364,9 @@
13681364
"metadata": {},
13691365
"outputs": [],
13701366
"source": [
1371-
"# ingest requires datasets to share the same variable names\n",
1367+
"# ingest requires reference/query to share the same gene names\n",
13721368
"var_names = adata_ref.var_names.intersection(adata_query.var_names)\n",
1373-
"adata_ref = adata_ref[:, var_names].copy()\n",
1369+
"adata_ref = adata_ref[:, var_names].copy()\n",
13741370
"adata_query = adata_query[:, var_names].copy()\n",
13751371
"\n",
13761372
"print('Shared genes:', len(var_names))\n"
@@ -1383,13 +1379,13 @@
13831379
"metadata": {},
13841380
"outputs": [],
13851381
"source": [
1386-
"# Train the PCA/kNN model on the reference\n",
1382+
"# Train the PCA + kNN model on the reference\n",
13871383
"sc.pp.pca(adata_ref)\n",
13881384
"sc.pp.neighbors(adata_ref)\n",
13891385
"sc.tl.umap(adata_ref)\n",
13901386
"\n",
1391-
"# Check the reference UMAP\n",
1392-
"sc.pl.umap(adata_ref, color='louvain', title='Reference PBMC3k (annotated)')\n"
1387+
"# Inspect reference annotation\n",
1388+
"sc.pl.umap(adata_ref, color='cell_type_ref', title='Reference PBMC (annotation source)')\n"
13931389
]
13941390
},
13951391
{
@@ -1399,14 +1395,18 @@
13991395
"metadata": {},
14001396
"outputs": [],
14011397
"source": [
1402-
"# Ingest: project query cells into reference space and transfer 'louvain' labels\n",
1403-
"sc.tl.ingest(adata_query, adata_ref, obs='louvain')\n",
1398+
"# Transfer reference labels to the Section 12 query object\n",
1399+
"sc.tl.ingest(adata_query, adata_ref, obs='cell_type_ref')\n",
14041400
"\n",
1405-
"# Visualise query cells projected onto the reference UMAP\n",
1401+
"# Save transferred labels as a new annotation in the main object\n",
1402+
"adata.obs['cell_type_transfer'] = adata_query.obs.loc[adata.obs_names, 'cell_type_ref'].astype('category')\n",
1403+
"\n",
1404+
"# Compare manual (Section 12) vs transferred annotations\n",
14061405
"sc.pl.umap(\n",
1407-
" adata_query,\n",
1408-
" color=['louvain'],\n",
1409-
" title='Query PBMC68k — transferred labels',\n",
1406+
" adata,\n",
1407+
" color=['cell_type', 'cell_type_transfer'],\n",
1408+
" title=['Manual annotation (Section 12)', 'Transferred annotation (reference)'],\n",
1409+
" ncols=2,\n",
14101410
")\n"
14111411
]
14121412
},
@@ -1417,18 +1417,29 @@
14171417
"metadata": {},
14181418
"outputs": [],
14191419
"source": [
1420-
"# Concatenate reference and query for a combined view\n",
1421-
"adata_combined = adata_ref.concatenate(\n",
1422-
" adata_query,\n",
1423-
" batch_categories=['3k', '68k'],\n",
1420+
"# Differential expression between the two most abundant transferred cell types\n",
1421+
"transfer_counts = adata.obs['cell_type_transfer'].value_counts()\n",
1422+
"top_two = transfer_counts.index[:2].tolist()\n",
1423+
"\n",
1424+
"if len(top_two) < 2:\n",
1425+
" raise ValueError('Need at least two transferred cell types for DE analysis.')\n",
1426+
"\n",
1427+
"sc.tl.rank_genes_groups(\n",
1428+
" adata,\n",
1429+
" groupby='cell_type_transfer',\n",
1430+
" groups=[top_two[0]],\n",
1431+
" reference=top_two[1],\n",
1432+
" method='wilcoxon',\n",
1433+
" use_raw=True,\n",
1434+
" n_genes=25,\n",
1435+
" key_added='de_cell_type_transfer',\n",
14241436
")\n",
14251437
"\n",
1426-
"sc.pl.umap(\n",
1427-
" adata_combined,\n",
1428-
" color=['louvain', 'batch'],\n",
1429-
" title=['Cell type (transferred)', 'Dataset batch'],\n",
1430-
" ncols=2,\n",
1431-
")\n"
1438+
"print(f'DE comparison (transferred labels): {top_two[0]} vs {top_two[1]}')\n",
1439+
"sc.pl.rank_genes_groups(adata, key='de_cell_type_transfer', groups=[top_two[0]], n_genes=15, sharey=False)\n",
1440+
"\n",
1441+
"de_transfer_df = sc.get.rank_genes_groups_df(adata, group=top_two[0], key='de_cell_type_transfer')\n",
1442+
"de_transfer_df.head(10)\n"
14321443
]
14331444
},
14341445
{

0 commit comments

Comments
 (0)