Refine comments and add optimization notes in notebook

EricSalomaki · web-flow · commit f34ec2f2f8a1 · 2026-06-02T16:12:33.000-04:00
diff --git a/notebooks/scRNAseq_in_Python.ipynb b/notebooks/scRNAseq_in_Python.ipynb
@@ -1902,9 +1902,10 @@
     "\n",
     "if _gseapy_ok and _integration_ok:\n",
     "    # ── Run one-vs-rest Wilcoxon marker analysis on Harmony clusters ──────────\n",
-    "    _cluster_key = 'leiden_harmony_r0.5'  # adjust to your chosen resolution from §16\n",
+    "    _cluster_key = 'leiden_harmony_r0.5'  # adjust to your chosen resolution\n",
     "    _mk_key      = f'rank_genes_{_cluster_key}'\n",
     "\n",
+    "    # Optimization: Only run the computationally heavy Wilcoxon test if the results don't already exist in the AnnData object\n",
     "    if _mk_key not in adata_int.uns:\n",
     "        print(f\"Running Wilcoxon one-vs-rest for {_cluster_key}...\")\n",
     "        sc.tl.rank_genes_groups(\n",
@@ -1942,6 +1943,7 @@
     "    _ora_padj_cutoff  = 0.05\n",
     "    _ora_logfc_cutoff = 0.0   # keep genes with logFC > 0 (upregulated)\n",
     "\n",
+    "    # Extract marker gene results structure from the AnnData object\n",
     "    _mk      = adata_int.uns[_mk_key]\n",
     "    _groups  = _mk['names'].dtype.names\n",
     "    _n_genes = _mk['names'].shape[0]\n",
@@ -1957,6 +1959,7 @@
     "\n",
     "    _all_ora = {lib: [] for lib in _gene_set_libraries}\n",
     "\n",
+    "    # Loop through each cluster and filter genes that are significant and upregulated\n",
     "    for _cl in _groups:\n",
     "        _sig_genes = [\n",
     "            _mk['names'][_cl][r]\n",
@@ -1968,6 +1971,7 @@
     "        if not _sig_genes:\n",
     "            continue\n",
     "\n",
+    "        # Query each gene-set library with this cluster's significant genes\n",
     "        for _lib_name, _lib_id in _gene_set_libraries.items():\n",
     "            try:\n",
     "                _enr = gp.enrich(\n",
@@ -1980,6 +1984,7 @@
     "                _df = _enr.results.copy()\n",
     "                _df.insert(0, 'cluster', _cl)\n",
     "                _df['query_size'] = len(_sig_genes)\n",
+    "                # Parse the 'Overlap' string (e.g., '5/45') to compute ratios\n",
     "                if 'Overlap' in _df.columns:\n",
     "                    _ov = _df['Overlap'].str.split('/')\n",
     "                    _df['count']      = _ov.str[0].astype(int)\n",