diff --git a/code/enrichment/sldsc_enrichment.ipynb b/code/enrichment/sldsc_enrichment.ipynb index f94d40fb5..381c49e29 100644 --- a/code/enrichment/sldsc_enrichment.ipynb +++ b/code/enrichment/sldsc_enrichment.ipynb @@ -409,7 +409,11 @@ }, "source": [ "##### 1.2 joint tau\n", - "with more than one annotation as the input" + "with more than one annotation as the input\n", + "\n", + "`--snp_list ` is **optional** and **orthogonal to single/joint**: it restricts which SNP *rows* are written to the `.l2.ldscore` output (HM3-style restriction) — it does **not** change the LD-score *values* of retained SNPs (the r²-window still uses all `.bim` SNPs), nor `.l2.M` / `.l2.M_5_50`. With `--snp_list` the step runs polyfun's `ldsc.py --print-snps` and emits `.l2.ldscore.gz` (without it: `compute_ldscores.py` → `.l2.ldscore.parquet`), and the target annot is normalized to the plink `.bim`. Downstream `get_heritability` / `postprocess` **commands are unchanged** — they just read whichever `.l2.ldscore.{gz,parquet}` exists; the regression runs on the intersection sumstats ∩ baseline ∩ weights ∩ target, so for the HM3 restriction to take full effect the baseline & weights LD scores should also be HM3-restricted.\n", + "\n", + "**Which polyfun script & what to align.** `--snp_list` present → step 1 runs `ldsc.py --l2 --print-snps` (output `.l2.ldscore.gz`); absent → `compute_ldscores.py` (output `.l2.ldscore.parquet`). `ldsc.py` requires the target annot to line up with the plink `.bim` — **same SNP set and same row order**. The pipeline handles this internally (`normalize_for_ldsc` re-expands the annot onto the `.bim` rows, filling 0), but if you assemble inputs by hand, keep the **plink `.bim`, `.frq`, target annot, and baseline annot all on the same reference panel with consistent SNP set/order**: target and baseline annot row counts must match (polyfun hstacks them), and the `.frq` MAF filter (`--frqfile-chr`, m50) plus the by-SNP merge in `get_heritability` both assume a consistent panel. Mixing panels (e.g. 1000G plink with an ADSP-derived annot, or a `.frq` whose row count/order differs from the annot) is the most common source of wrong `Prop_SNPs` / Enrichment.\n" ] }, { @@ -493,7 +497,8 @@ " --weight-name weights_chr \\\n", " --annotation-name my_analysis \\\n", " --cwd output/heritability \\\n", - " --maf-cutoff 0.05" + " --maf-cutoff 0.05\n", + "# (allm variant: use --maf-cutoff 0 — no MAF filter; polyfun then uses --not-M-5-50)\n" ] }, { @@ -508,7 +513,9 @@ "(which contains the $N$ single-target subdirectories and optionally the\n", "joint subdirectory) and calls `pecotmr::sldsc_postprocessing_pipeline()`\n", "to produce per-trait standardized tables and the default random-effects\n", - "meta across all traits." + "meta across all traits.\n", + "\n", + "Use `--target-categories-label` (same order as `--target-categories`) to give the target annotations friendly names in the output — e.g. `--target-categories ANNOT_1_0 ANNOT_2_0 --target-categories-label quantile_eQTL eQTL` makes the `target` column read `quantile_eQTL` / `eQTL` instead of `ANNOT_1_0` / `ANNOT_2_0` (the original names are kept in `params$target_categories_orig`). Omit it to keep the polyfun `.results` names.\n" ] }, { @@ -524,16 +531,25 @@ "# pecotmr::sldsc_postprocessing_pipeline. Single and joint target subdirectories\n", "# are auto-detected from _single_ and _joint.\n", "\n", + "# --target-categories: the joint-run target columns as they appear in the .results \"Category\"\n", + "# column. The 2-target joint dir here has columns ANNOT_1, ANNOT_2 (compute_ldscores.py keeps\n", + "# the annot col names) and polyfun appends \"_0\" (target = ref-ld file 0) -> \"ANNOT_1_0\", \"ANNOT_2_0\".\n", + "# (Single-target dirs would be \"ANNOT_0\"; with --snp_list/ldsc.py and a single annot it is \"L2_0\".)\n", + "# You can drop --target-categories entirely to auto-detect from the joint-run results.\n", + "# --target-categories-label (optional, same order as --target-categories): friendly display\n", + "# names; renames the \"target\" column in the output (originals kept in params$target_categories_orig).\n", "sos run pipeline/sldsc_enrichment.ipynb postprocess \\\n", " --traits-file data/all_traits.txt \\\n", - " --heritability-cwd output/polyfun/heritability \\\n", - " --target-categories anno1 anno2 \\\n", - " --target-anno-dir output/polyfun/ldscores/my_analysis_joint \\\n", + " --heritability-cwd output/heritability \\\n", + " --target-categories ANNOT_1_0 ANNOT_2_0 \\\n", + " --target-categories-label quantile_eQTL eQTL \\\n", + " --target-anno-dir output/my_analysis_joint \\\n", " --frqfile-dir data/ADSP/frq \\\n", " --plink-name ADSP_chr \\\n", " --annotation-name my_analysis \\\n", " --cwd output/sldsc_postprocess \\\n", - " --maf-cutoff 0.05" + " --maf-cutoff 0.05\n", + "# (allm variant: use --maf-cutoff 0 — no MAF filter; polyfun then uses --not-M-5-50)\n" ] }, { @@ -561,7 +577,7 @@ " --postprocess-rds output/sldsc_postprocess/my_analysis.sldsc_postprocess.rds \\\n", " --subset-traits-file data/neuro_traits.txt \\\n", " --subset-name neuro \\\n", - " --target-categories anno1 anno2 \\\n", + " --target-categories ANNOT_1_0 ANNOT_2_0 \\\n", " --annotation-name my_analysis \\\n", " --cwd output/sldsc_postprocess" ] @@ -610,7 +626,6 @@ "parameter: weights_dir = path() # Directory containing LD weights (computed against our panel)\n", "parameter: baseline_name = \"baseline_chr\" # Prefix of baseline annotation files\n", "parameter: weight_name = \"weights_chr\" # Prefix of LD weights files\n", - "parameter: Mref = -1 # Reference number of SNPs in our panel (MAF > maf_cutoff). If > 0, use provided value; if <= 0, auto-calculate from ldscore files\n", "parameter: n_blocks = 200\n", "\n", "# Number of threads\n", @@ -664,6 +679,33 @@ "# Run on a NEW annotation_file with the K selected annotations\n", "# -> 1 joint dir with the conditional model.\n", "\n", + "#\n", + "# --- snplist (--snp_list) vs no-snplist: which polyfun script, output format,\n", + "# column name, and the CM requirement ---\n", + "# --snp_list given -> ldsc.py --l2 --print-snps -> output .l2.ldscore.gz\n", + "# --snp_list absent -> compute_ldscores.py -> output .l2.ldscore.parquet\n", + "#\n", + "# LD-score column name (this is what becomes the .results \"Category\" in\n", + "# [get_heritability], with a \"_\" suffix appended there):\n", + "# * compute_ldscores.py ALWAYS keeps the annot column name(s):\n", + "# single annot column \"ANNOT\" -> ldscore column \"ANNOT\"\n", + "# joint annot columns \"ANNOT_1\",\"ANNOT_2\",... -> \"ANNOT_1\",\"ANNOT_2\",...\n", + "# * ldsc.py --l2 has a quirk: with EXACTLY ONE annotation (n_annot == 1) it\n", + "# HARD-CODES the ldscore column name to \"L2\" and DROPS the annot's original\n", + "# column name. With >=2 annotations it uses \"L2\"\n", + "# (\"ANNOT_1L2\",\"ANNOT_2L2\",...).\n", + "# => a single-target snplist run reports \"L2_0\" in .results, while a\n", + "# single-target no-snplist run reports \"ANNOT_0\". [postprocess] auto-\n", + "# detects either; only matters if you pass --target-categories explicitly.\n", + "#\n", + "# CM column requirement for snplist: ldsc.py --l2 --print-snps requires the\n", + "# target annot to (a) carry a \"CM\" (centimorgan) column and (b) line up with\n", + "# the plink .bim (same SNP set, same row order). This step handles both\n", + "# internally (normalize_for_ldsc: takes CM from the .bim 4th column, re-expands\n", + "# the annot onto the .bim rows, filling 0). Therefore the plink .bim files MUST\n", + "# carry genetic-map (cM) positions when using --ld-wind-cm (the default);\n", + "# if your .bim has 0 in the cM column, switch to --ld-wind-kb instead.\n", + "#\n", "parameter: compute_single = True\n", "parameter: compute_joint = True\n", "parameter: score_column = 3\n", @@ -1067,6 +1109,25 @@ "# `target_anno_dirs` is the list produced by [make_annotation_files_ldscore]:\n", "# typically N _single_ directories plus optionally one _joint directory.\n", "\n", + "#\n", + "# --- about the \".results\" Category column and the \"_0 / _1\" suffix ---\n", + "# Each (trait, target_dir) pair is ONE polyfun call; its `ldsc.py --ref-ld-chr`\n", + "# always gets exactly two LD-score sources, in this order:\n", + "# \"/.\" (index 0) , \"/\" (index 1)\n", + "# With --overlap-annot, every annotation column in the .results \"Category\" is\n", + "# named _:\n", + "# index 0 = the target file -> \"ANNOT_0\" (no-snplist; compute_ldscores.py keeps the annot col name)\n", + "# -> \"L2_0\" (snplist + single annot; ldsc.py hard-codes \"L2\", see below)\n", + "# -> \"ANNOT_1_0\",\"ANNOT_2_0\" (no-snplist joint dir, N>=2 annot cols)\n", + "# -> \"ANNOT_1L2_0\",\"ANNOT_2L2_0\" (snplist joint dir, N>=2 -> \"L2\")\n", + "# index 1 = the baseline file -> \"base_1\",\"Coding_UCSC_1\", ... (the 97 baseline annots)\n", + "# So in this pipeline the suffix is only ever 0 (target) or 1 (baseline); it would\n", + "# continue 0,1,2,... only if you handed `ldsc.py --ref-ld-chr` more than two sources.\n", + "# (Why ANNOT_0 vs L2_0: see the [make_annotation_files_ldscore] header — ldsc.py's\n", + "# \"n_annot == 1 -> column name 'L2'\" quirk vs compute_ldscores.py keeping the annot\n", + "# column name.) [postprocess] auto-detects the target Category; if you instead pass\n", + "# --target-categories, the names must match this column exactly.\n", + "#\n", "parameter: target_anno_dirs = paths()\n", "parameter: all_traits = []\n", "\n", @@ -1173,6 +1234,10 @@ "parameter: traits_file = path() # text file: one trait sumstats filename per line\n", "parameter: heritability_cwd = path() # parent directory of [get_heritability] outputs (contains _single_/ subdirs and optionally _joint/)\n", "parameter: target_categories = [] # target annotation names. Auto-detected from the joint-run results if empty.\n", + "parameter: target_categories_label = [] # optional display names, same order as target_categories;\n", + " # when given, every \"target\" column / tau*-block colname in\n", + " # the output RDS is renamed to these (params$target_categories\n", + " # holds the labels, params$target_categories_orig the originals).\n", "parameter: target_anno_dir = path() # directory of target .annot.gz files used for sd_C and binary detection (typically the joint dir, since it carries all target columns)\n", "\n", "input: traits_file\n", @@ -1184,6 +1249,7 @@ "\n", " traits <- readLines(\"${traits_file}\")\n", " target_cats <- c(${\",\".join('\"%s\"' % c for c in target_categories)})\n", + " target_lab <- c(${\",\".join('\"%s\"' % c for c in target_categories_label)})\n", "\n", " # Auto-detect single-target and joint-target output directories.\n", " her_root <- \"${heritability_cwd}\"\n", @@ -1218,7 +1284,8 @@ " frqfile_dir = \"${frqfile_dir}\",\n", " plink_name = \"${plink_name}\",\n", " maf_cutoff = ${maf_cutoff},\n", - " target_categories = if (length(target_cats) > 0) target_cats else NULL\n", + " target_categories = if (length(target_cats) > 0) target_cats else NULL,\n", + " target_labels = if (length(target_lab) > 0) target_lab else NULL\n", " )\n", "\n", " saveRDS(res, \"${_output[0]}\")\n", @@ -1241,6 +1308,9 @@ "parameter: subset_traits_file = path() # text file: one trait id per line, subset of those passed to [postprocess]\n", "parameter: subset_name = str # label used in the output filename\n", "parameter: target_categories = [] # target annotation names to meta on; if empty, uses all from postprocess output\n", + "# If [postprocess] was run with --target-categories-label, the cached RDS already\n", + "# carries the display names (params$target_categories = the labels), so leave\n", + "# --target-categories empty here (or pass the labels, not the original ANNOT_* names).\n", "\n", "input: postprocess_rds, subset_traits_file\n", "output: f\"{cwd:a}/{annotation_name}.{subset_name}.meta.rds\"\n", @@ -1312,4 +1382,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +}