StatFunGen · gaow · May 13, 2026 · May 12, 2026 · May 12, 2026
diff --git a/code/enrichment/sldsc_enrichment.ipynb b/code/enrichment/sldsc_enrichment.ipynb
@@ -409,7 +409,11 @@
    },
    "source": [
     "##### 1.2 joint tau\n",
-    "with more than one annotation as the input"
+    "with more than one annotation as the input\n",
+    "\n",
+    "`--snp_list <file>` is **optional** and **orthogonal to single/joint**: it restricts which SNP *rows* are written to the `.l2.ldscore` output (HM3-style restriction) — it does **not** change the LD-score *values* of retained SNPs (the r²-window still uses all `.bim` SNPs), nor `.l2.M` / `.l2.M_5_50`. With `--snp_list` the step runs polyfun's `ldsc.py --print-snps` and emits `.l2.ldscore.gz` (without it: `compute_ldscores.py` → `.l2.ldscore.parquet`), and the target annot is normalized to the plink `.bim`. Downstream `get_heritability` / `postprocess` **commands are unchanged** — they just read whichever `.l2.ldscore.{gz,parquet}` exists; the regression runs on the intersection sumstats ∩ baseline ∩ weights ∩ target, so for the HM3 restriction to take full effect the baseline & weights LD scores should also be HM3-restricted.\n",
+    "\n",
+    "**Which polyfun script & what to align.** `--snp_list` present → step 1 runs `ldsc.py --l2 --print-snps` (output `.l2.ldscore.gz`); absent → `compute_ldscores.py` (output `.l2.ldscore.parquet`). `ldsc.py` requires the target annot to line up with the plink `.bim` — **same SNP set and same row order**. The pipeline handles this internally (`normalize_for_ldsc` re-expands the annot onto the `.bim` rows, filling 0), but if you assemble inputs by hand, keep the **plink `.bim`, `.frq`, target annot, and baseline annot all on the same reference panel with consistent SNP set/order**: target and baseline annot row counts must match (polyfun hstacks them), and the `.frq` MAF filter (`--frqfile-chr`, m50) plus the by-SNP merge in `get_heritability` both assume a consistent panel. Mixing panels (e.g. 1000G plink with an ADSP-derived annot, or a `.frq` whose row count/order differs from the annot) is the most common source of wrong `Prop_SNPs` / Enrichment.\n"
    ]
   },
   {
@@ -493,7 +497,8 @@
     "    --weight-name weights_chr \\\n",
     "    --annotation-name my_analysis \\\n",
     "    --cwd output/heritability \\\n",
-    "    --maf-cutoff 0.05"
+    "    --maf-cutoff 0.05\n",
+    "# (allm variant: use --maf-cutoff 0 — no MAF filter; polyfun then uses --not-M-5-50)\n"
    ]
   },
   {
@@ -508,7 +513,9 @@
     "(which contains the $N$ single-target subdirectories and optionally the\n",
     "joint subdirectory) and calls `pecotmr::sldsc_postprocessing_pipeline()`\n",
     "to produce per-trait standardized tables and the default random-effects\n",
-    "meta across all traits."
+    "meta across all traits.\n",
+    "\n",
+    "Use `--target-categories-label` (same order as `--target-categories`) to give the target annotations friendly names in the output — e.g. `--target-categories ANNOT_1_0 ANNOT_2_0 --target-categories-label quantile_eQTL eQTL` makes the `target` column read `quantile_eQTL` / `eQTL` instead of `ANNOT_1_0` / `ANNOT_2_0` (the original names are kept in `params$target_categories_orig`). Omit it to keep the polyfun `.results` names.\n"
    ]
   },
   {
@@ -524,16 +531,25 @@
     "# pecotmr::sldsc_postprocessing_pipeline. Single and joint target subdirectories\n",
     "# are auto-detected from <annotation_name>_single_<i> and <annotation_name>_joint.\n",
     "\n",
+    "# --target-categories: the joint-run target columns as they appear in the .results \"Category\"\n",
+    "#   column. The 2-target joint dir here has columns ANNOT_1, ANNOT_2 (compute_ldscores.py keeps\n",
+    "#   the annot col names) and polyfun appends \"_0\" (target = ref-ld file 0) -> \"ANNOT_1_0\", \"ANNOT_2_0\".\n",
+    "#   (Single-target dirs would be \"ANNOT_0\"; with --snp_list/ldsc.py and a single annot it is \"L2_0\".)\n",
+    "#   You can drop --target-categories entirely to auto-detect from the joint-run results.\n",
+    "# --target-categories-label (optional, same order as --target-categories): friendly display\n",
+    "#   names; renames the \"target\" column in the output (originals kept in params$target_categories_orig).\n",
     "sos run pipeline/sldsc_enrichment.ipynb postprocess \\\n",
     "    --traits-file data/all_traits.txt \\\n",
-    "    --heritability-cwd output/polyfun/heritability \\\n",
-    "    --target-categories anno1 anno2 \\\n",
-    "    --target-anno-dir output/polyfun/ldscores/my_analysis_joint \\\n",
+    "    --heritability-cwd output/heritability \\\n",
+    "    --target-categories ANNOT_1_0 ANNOT_2_0 \\\n",
+    "    --target-categories-label quantile_eQTL eQTL \\\n",
+    "    --target-anno-dir output/my_analysis_joint \\\n",
     "    --frqfile-dir data/ADSP/frq \\\n",
     "    --plink-name ADSP_chr \\\n",
     "    --annotation-name my_analysis \\\n",
     "    --cwd output/sldsc_postprocess \\\n",
-    "    --maf-cutoff 0.05"
+    "    --maf-cutoff 0.05\n",
+    "# (allm variant: use --maf-cutoff 0 — no MAF filter; polyfun then uses --not-M-5-50)\n"
    ]
   },
   {
@@ -561,7 +577,7 @@
     "    --postprocess-rds output/sldsc_postprocess/my_analysis.sldsc_postprocess.rds \\\n",
     "    --subset-traits-file data/neuro_traits.txt \\\n",
     "    --subset-name neuro \\\n",
-    "    --target-categories anno1 anno2 \\\n",
+    "    --target-categories ANNOT_1_0 ANNOT_2_0 \\\n",
     "    --annotation-name my_analysis \\\n",
     "    --cwd output/sldsc_postprocess"
    ]
@@ -610,7 +626,6 @@
     "parameter: weights_dir = path()  # Directory containing LD weights (computed against our panel)\n",
     "parameter: baseline_name = \"baseline_chr\"  # Prefix of baseline annotation files\n",
     "parameter: weight_name = \"weights_chr\"  # Prefix of LD weights files\n",
-    "parameter: Mref = -1 # Reference number of SNPs in our panel (MAF > maf_cutoff). If > 0, use provided value; if <= 0, auto-calculate from ldscore files\n",
     "parameter: n_blocks = 200\n",
     "\n",
     "# Number of threads\n",
@@ -664,6 +679,33 @@
     "#               Run on a NEW annotation_file with the K selected annotations\n",
     "#               -> 1 joint dir with the conditional model.\n",
     "\n",
+    "#\n",
+    "# --- snplist (--snp_list) vs no-snplist: which polyfun script, output format,\n",
+    "#     column name, and the CM requirement ---\n",
+    "#   --snp_list given  -> ldsc.py --l2 --print-snps   -> output .l2.ldscore.gz\n",
+    "#   --snp_list absent -> compute_ldscores.py         -> output .l2.ldscore.parquet\n",
+    "#\n",
+    "#   LD-score column name (this is what becomes the .results \"Category\" in\n",
+    "#   [get_heritability], with a \"_<ref-ld-index>\" suffix appended there):\n",
+    "#     * compute_ldscores.py  ALWAYS keeps the annot column name(s):\n",
+    "#         single annot column \"ANNOT\"          -> ldscore column \"ANNOT\"\n",
+    "#         joint  annot columns \"ANNOT_1\",\"ANNOT_2\",...  -> \"ANNOT_1\",\"ANNOT_2\",...\n",
+    "#     * ldsc.py --l2 has a quirk: with EXACTLY ONE annotation (n_annot == 1) it\n",
+    "#       HARD-CODES the ldscore column name to \"L2\" and DROPS the annot's original\n",
+    "#       column name. With >=2 annotations it uses \"<annot_name>L2\"\n",
+    "#       (\"ANNOT_1L2\",\"ANNOT_2L2\",...).\n",
+    "#     => a single-target snplist run reports \"L2_0\" in .results, while a\n",
+    "#        single-target no-snplist run reports \"ANNOT_0\".  [postprocess] auto-\n",
+    "#        detects either; only matters if you pass --target-categories explicitly.\n",
+    "#\n",
+    "#   CM column requirement for snplist:  ldsc.py --l2 --print-snps requires the\n",
+    "#   target annot to (a) carry a \"CM\" (centimorgan) column and (b) line up with\n",
+    "#   the plink .bim (same SNP set, same row order). This step handles both\n",
+    "#   internally (normalize_for_ldsc: takes CM from the .bim 4th column, re-expands\n",
+    "#   the annot onto the .bim rows, filling 0). Therefore the plink .bim files MUST\n",
+    "#   carry genetic-map (cM) positions when using --ld-wind-cm (the default);\n",
+    "#   if your .bim has 0 in the cM column, switch to --ld-wind-kb instead.\n",
+    "#\n",
     "parameter: compute_single = True\n",
     "parameter: compute_joint = True\n",
     "parameter: score_column = 3\n",
@@ -1067,6 +1109,25 @@
     "# `target_anno_dirs` is the list produced by [make_annotation_files_ldscore]:\n",
     "# typically N _single_<i> directories plus optionally one _joint directory.\n",
     "\n",
+    "#\n",
+    "# --- about the \".results\" Category column and the \"_0 / _1\" suffix ---\n",
+    "# Each (trait, target_dir) pair is ONE polyfun call; its `ldsc.py --ref-ld-chr`\n",
+    "# always gets exactly two LD-score sources, in this order:\n",
+    "#     \"<target_dir>/<target>.\"   (index 0)  ,  \"<baseline_dir>/<baseline>\"   (index 1)\n",
+    "# With --overlap-annot, every annotation column in the .results \"Category\" is\n",
+    "# named  <ldscore_column_name>_<ref-ld-index>:\n",
+    "#     index 0 = the target file   -> \"ANNOT_0\"  (no-snplist; compute_ldscores.py keeps the annot col name)\n",
+    "#                                  -> \"L2_0\"    (snplist + single annot; ldsc.py hard-codes \"L2\", see below)\n",
+    "#                                  -> \"ANNOT_1_0\",\"ANNOT_2_0\"      (no-snplist joint dir, N>=2 annot cols)\n",
+    "#                                  -> \"ANNOT_1L2_0\",\"ANNOT_2L2_0\"  (snplist joint dir, N>=2 -> \"<name>L2\")\n",
+    "#     index 1 = the baseline file -> \"base_1\",\"Coding_UCSC_1\", ...  (the 97 baseline annots)\n",
+    "# So in this pipeline the suffix is only ever 0 (target) or 1 (baseline); it would\n",
+    "# continue 0,1,2,... only if you handed `ldsc.py --ref-ld-chr` more than two sources.\n",
+    "# (Why ANNOT_0 vs L2_0: see the [make_annotation_files_ldscore] header — ldsc.py's\n",
+    "#  \"n_annot == 1 -> column name 'L2'\" quirk vs compute_ldscores.py keeping the annot\n",
+    "#  column name.)  [postprocess] auto-detects the target Category; if you instead pass\n",
+    "# --target-categories, the names must match this column exactly.\n",
+    "#\n",
     "parameter: target_anno_dirs = paths()\n",
     "parameter: all_traits = []\n",
     "\n",
@@ -1173,6 +1234,10 @@
     "parameter: traits_file = path()             # text file: one trait sumstats filename per line\n",
     "parameter: heritability_cwd = path()        # parent directory of [get_heritability] outputs (contains <annotation_name>_single_<i>/ subdirs and optionally <annotation_name>_joint/)\n",
     "parameter: target_categories = []           # target annotation names. Auto-detected from the joint-run results if empty.\n",
+    "parameter: target_categories_label = []     # optional display names, same order as target_categories;\n",
+    "                                            # when given, every \"target\" column / tau*-block colname in\n",
+    "                                            # the output RDS is renamed to these (params$target_categories\n",
+    "                                            # holds the labels, params$target_categories_orig the originals).\n",
     "parameter: target_anno_dir = path()         # directory of target .annot.gz files used for sd_C and binary detection (typically the joint dir, since it carries all target columns)\n",
     "\n",
     "input: traits_file\n",
@@ -1184,6 +1249,7 @@
     "\n",
     "    traits <- readLines(\"${traits_file}\")\n",
     "    target_cats <- c(${\",\".join('\"%s\"' % c for c in target_categories)})\n",
+    "    target_lab  <- c(${\",\".join('\"%s\"' % c for c in target_categories_label)})\n",
     "\n",
     "    # Auto-detect single-target and joint-target output directories.\n",
     "    her_root  <- \"${heritability_cwd}\"\n",
@@ -1218,7 +1284,8 @@
     "        frqfile_dir           = \"${frqfile_dir}\",\n",
     "        plink_name            = \"${plink_name}\",\n",
     "        maf_cutoff            = ${maf_cutoff},\n",
-    "        target_categories     = if (length(target_cats) > 0) target_cats else NULL\n",
+    "        target_categories     = if (length(target_cats) > 0) target_cats else NULL,\n",
+    "        target_labels         = if (length(target_lab)  > 0) target_lab  else NULL\n",
     "    )\n",
     "\n",
     "    saveRDS(res, \"${_output[0]}\")\n",
@@ -1241,6 +1308,9 @@
     "parameter: subset_traits_file = path()        # text file: one trait id per line, subset of those passed to [postprocess]\n",
     "parameter: subset_name = str                  # label used in the output filename\n",
     "parameter: target_categories = []             # target annotation names to meta on; if empty, uses all from postprocess output\n",
+    "# If [postprocess] was run with --target-categories-label, the cached RDS already\n",
+    "# carries the display names (params$target_categories = the labels), so leave\n",
+    "# --target-categories empty here (or pass the labels, not the original ANNOT_* names).\n",
     "\n",
     "input: postprocess_rds, subset_traits_file\n",
     "output: f\"{cwd:a}/{annotation_name}.{subset_name}.meta.rds\"\n",
@@ -1312,4 +1382,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}