diff --git a/code/mnm_analysis/mnm_postprocessing.ipynb b/code/mnm_analysis/mnm_postprocessing.ipynb index b63df912c..013db6d87 100644 --- a/code/mnm_analysis/mnm_postprocessing.ipynb +++ b/code/mnm_analysis/mnm_postprocessing.ipynb @@ -18,248 +18,150 @@ "kernel": "SoS" }, "source": [ - "This notebook is to post-process the susie results into different text file\n" + "This notebook is to post-process the susie results into exported tables. \n" ] }, { "cell_type": "markdown", - "id": "thirty-jurisdiction", + "id": "7be7fed9", "metadata": { "kernel": "SoS" }, "source": [ - "### Extracting susie results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fifteen-routine", - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "sos run pipeline/SuSiE_post_processing.ipynb susie_to_tsv \\\n", - " --cwd output/test --rds_path `ls output/test/cache/*rds | head ` --region-list <(head -50 ./dlpfc_region_list) --container containers/stephenslab.sif " - ] - }, - { - "cell_type": "markdown", - "id": "spatial-speed", - "metadata": { - "kernel": "SoS" - }, - "source": [ - "### Extracting susie_rss results for ADGWAS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "unavailable-dayton", - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "sos run pipeline/SuSiE_post_processing.ipynb susie_to_tsv \\\n", - " --cwd output/ADGWAS_finemapping_extracted/Bellenguez/ --rds_path `ls GWAS_Finemapping_Results/Bellenguez/ADGWAS2022*rds ` \\\n", - " --region-list ~/1300_hg38_EUR_LD_blocks_orig.tsv \\\n", - " --container containers/stephenslab.sif \n", - "\n", - "sos run pipeline/SuSiE_post_processing.ipynb susie_tsv_collapse \\\n", - " --cwd output/ADGWAS_finemapping_extracted --tsv_path `ls output/ADGWAS_finemapping_extracted/*lbf.tsv` \\\n", - " --container containers/stephenslab.sif " - ] - }, - { - "cell_type": "markdown", - "id": "trained-index", - "metadata": { - "kernel": "SoS" - }, - "source": [ - "### Extracting fsusie results " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "improving-broadcast", - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "sos run pipeline/SuSiE_post_processing.ipynb fsusie_to_tsv \\\n", - " --cwd output/f_susie_tad_haQTL_pos --rds_path `ls output/f_susie_tad_haQTL_pos/cache/*rds ` \\\n", - " --region-list ../eqtl/dlpfc_tad_list \\\n", - " --container containers/stephenslab.sif -s build" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "least-remark", - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "sos run pipeline/SuSiE_post_processing.ipynb fsusie_to_tsv \\\n", - " --cwd output/f_susie_tad_meQTL_pos_selected/ --rds_path `ls output/f_susie_tad_meQTL_pos_selected//cache/*1204*rds ` \\\n", - " --region-list ../eqtl/dlpfc_tad_list \\\n", - " --container containers/stephenslab.sif -s build" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "vietnamese-embassy", - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "sos run pipeline/SuSiE_post_processing.ipynb fsusie_to_tsv \\\n", - " --cwd output/f_susie_tad_meQTL_pos_2/ --rds_path `ls output/f_susie_tad_meQTL_pos_2//cache/*rds ` \\\n", - " --region-list ../eqtl/dlpfc_tad_list \\\n", - " --container containers/stephenslab.sif -s build" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "placed-rwanda", - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "sos run pipeline/SuSiE_post_processing.ipynb fsusie_to_tsv \\\n", - " --cwd output/f_susie_tad_meQTL_pos/ --rds_path `ls output/f_susie_tad_meQTL_pos//cache/*rds ` \\\n", - " --region-list ../eqtl/dlpfc_tad_list \\\n", - " --container containers/stephenslab.sif -s build" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "marine-retailer", - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "sos run pipeline/SuSiE_post_processing.ipynb fsusie_to_tsv \\\n", - " --cwd output/f_susie_tad_haQTL_pos_check_pure_2 --rds_path `ls output/f_susie_tad_haQTL_pos_check_pure_2/cache/*rds ` \\\n", - " --region-list ../eqtl/dlpfc_tad_list \\\n", - " --container containers/stephenslab.sif -s build" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "optional-shepherd", - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "sos run pipeline/SuSiE_post_processing.ipynb fsusie_to_tsv \\\n", - " --cwd output/f_susie_tad_meQTL_pos_2/ --rds_path `ls output/f_susie_tad_meQTL_pos_2//cache/*rds ` \\\n", - " --region-list ../eqtl/dlpfc_tad_list \\\n", - " --container containers/stephenslab.sif -s build" + "#### Workflow choices\n", + "\n", + "| Command | Produces | When to use |\n", + "|---|---|---|\n", + "| `export_top_loci` (**recommended**) | per-region RDS + per-region `top_loci.bed.gz` + combined `{name}.{qtl_type}.top_loci.bed.gz` | Default path. Chains the export, the top_loci extraction, and the combine step in one `sos run` invocation. See concrete examples under \"Export all top loci\" below. |\n", + "| `cis_results_export --step1_only` | per-region RDS only | When a downstream consumer needs the raw exported RDS but not the top_loci summary. Equivalent to running just `export_top_loci_1`. |\n", + "| `cis_results_export` (no flag) | per-region RDS + combined meta TSV `{name}.cis_results_db.tsv` | Used to be the default workflow; now mostly superseded by `export_top_loci`. Step 2 produces the meta TSV that listed per-region paths for legacy downstream tools. |\n", + "\n", + "#### Input modality flags\n", + "\n", + "Both workflows share step 1, so these flags work identically for `cis_results_export` and `export_top_loci`:\n", + "\n", + "| Flag | Use for | Effect on intermediate filename |\n", + "|---|---|---|\n", + "| *(none)* | univariate SuSiE (default) | `{name}.{region}.cis_results_db.rds` |\n", + "| `--fsusie` | functional SuSiE (mQTL, haQTL, ...) | `{name}.epigenomics_{region}.cis_results_db.rds` |\n", + "| `--metaQTL` | meta-analysis QTL | `{name}.metabolomics_{region}.cis_results_db.rds` |\n", + "| `--mnm` | multi-gene multivariate SuSiE | same as default but skips the sumstats RDS write |\n", + "\n", + "Common per-mode `--suffix` values for the upstream fine-mapping output:\n", + "\n", + "- susie: `univariate_susie_twas_weights.rds` or `univariate_bvsr.rds`\n", + "- fsusie: `fsusie_mixture_normal_top_pc_weights.rds`\n", + "- mnm: `multigene_bvsr.rds`\n", + "\n", + "Common `--region_file` choices: `TADB_enhanced_cis.coding.bed` (cis), `extended_TADB.bed` (fsusie / multi-gene), `hg38_1362_blocks.bed` (metaQTL).\n", + "\n", + "All concrete invocation examples are under [Export all top loci](#Export-all-top-loci) below; add the modality flag (`--fsusie` / `--metaQTL` / `--mnm`) plus the matching `--suffix` to switch input type. To run only the RDS export without top_loci extraction or combine, swap the workflow name `export_top_loci` → `cis_results_export --step1_only`.\n" ] }, { "cell_type": "markdown", - "id": "amazing-measure", + "id": "a5e4ea3a-8e02-4767-b533-b88f39197e83", "metadata": { "kernel": "SoS", "tags": [] }, "source": [ - "### Plotting the pip plot" + "### Export all top loci" ] }, { "cell_type": "code", "execution_count": null, - "id": "documented-secretary", + "id": "893d187b", "metadata": { - "kernel": "SoS" + "vscode": { + "languageId": "plaintext" + } }, "outputs": [], "source": [ - "sos run pipeline/SuSiE_post_processing.ipynb susie_pip_landscape_plot \\\n", - " --cwd output/test/ --plot_list plot_recipe --annot_tibble ~/Annotatr_builtin_annotation_tibble.tsv -s force &" + "# Required CLI args (substitute placeholders <...> with your real values):\n", + "# --file_path dir containing fine-mapping RDS files\n", + "# --prefix filename part BEFORE the region id\n", + "# --suffix filename part AFTER the region id\n", + "# expected per-region path:\n", + "# {file_path}/{prefix}.{region}.{suffix}\n", + "# example: ROSMAP_AC_eQTL.ENSG00000012779.univariate_bvsr.rds\n", + "# \\__ prefix __/ \\___ region ___/ \\___ suffix ___/\n", + "# --name tag used in output filenames and intermediate RDS names\n", + "# (final output: {cwd}/summary/{name}.{qtl_type}.top_loci.bed.gz)\n", + "# --qtl_type cis | trans | trans_ appears in the combined filename\n", + "# --region_file bed of regions (chr, start, end, region_id) to enumerate\n", + "# --geno_ref tabix-indexed bim.gz for allele-flipping checks\n", + "#\n", + "# Outputs produced:\n", + "# {cwd}/{name}.{region}.cis_results_db.rds per-region intermediate RDS\n", + "# {cwd}/summary/{name}.{region}.top_loci.bed.gz per-region top_loci\n", + "# {cwd}/summary/{name}.{qtl_type}.top_loci.bed.gz combined public-facing file\n", + "\n", + "# Full run: export all regions via SLURM and combine into one study-level file.\n", + "sos run ./mnm_postprocessing.ipynb export_top_loci \\\n", + " --region_file ./TADB_enhanced_cis.coding.bed \\\n", + " --file_path /path/to/fine_mapping \\\n", + " --prefix \\\n", + " --suffix \\\n", + " --name \\\n", + " --min_corr 0.8 \\\n", + " --geno_ref ./Fungen_xQTL.ROSMAP_NIA_WGS.ROSMAP_NIA_WGS.MSBB_WGS_ADSP_hg38.MiGA.MAP_Brain-xQTL_Gwas_geno_0.STARNET.aligned.bim.gz \\\n", + " --qtl_type cis \\\n", + " --cwd ~/tmp/export_run \\\n", + " --mem 14G --walltime 72h \\\n", + " -q slurm -j 100 -s build\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "understood-blackjack", + "id": "4ac34f02", "metadata": { - "kernel": "SoS" + "vscode": { + "languageId": "plaintext" + } }, "outputs": [], "source": [ - "sos run pipeline/SuSiE_post_processing.ipynb susie_upsetR_plot \\\n", - " --cwd output/test/ --plot_list plot_recipe_1 -s force &" - ] - }, - { - "cell_type": "markdown", - "id": "varying-drilling", - "metadata": { - "kernel": "SoS" - }, - "source": [ - "The required input for this step is a tab-delimited plot_recipe file that specifies the path to each of the variant.tsv files generated from this module. Each column represents a molecular phenotype, and each row indicates the files that share common variants. Since one TAD may correspond to multiple genes, additional eQTL are permitted. If there are additional molecular phenotypes or ADGWAS datasets, additional columns can be appended.\n", - "\n", - "The built-in Annotatr_builtin_annotation_tibble.tsv can be downloaded from [synapse](https://www.synapse.org/#!Synapse:syn51198526), please download it and specify the path." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "nuclear-chancellor", - "metadata": { - "kernel": "Bash", - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "haQTL\tmQTL\teQTL\teQTL\tADGWAS\n", - "/mnt/vast/hpc/csg/molecular_phenotype_calling/QTL_fine_mapping/output/f_susie_tad_meQTL_pos//meQTL.yuqi_mqtl.tad100.uni_Fsusie.mixture_normal_per_scale.variant.tsv\t/mnt/vast/hpc/csg/molecular_phenotype_calling/QTL_fine_mapping/output/f_susie_tad_haQTL_pos//haQTL.rosmap_haqtl.tad100.uni_Fsusie.mixture_normal_per_scale.variant.tsv\t/mnt/vast/hpc/csg/molecular_phenotype_calling/eqtl//output/susie_per_gene_tad//demo.ENSG00000117322.unisusie.fit.variant.tsv\t/mnt/vast/hpc/csg/molecular_phenotype_calling/eqtl//output/susie_per_gene_tad//demo.ENSG00000203710.unisusie.fit.variant.tsv\t/mnt/vast/hpc/csg/xqtl_workflow_testing/susie_rss/output/ADGWAS_finemapping_extracted/Bellenguez/ADGWAS2022.chr1.sumstat.chr1_205972031_208461272.unisusie_rss.fit.variant.tsv\n" - ] - } - ], - "source": [ - "cat /mnt/vast/hpc/csg/molecular_phenotype_calling/QTL_fine_mapping/plot_recipe" + "# Subset by region: step 3 (combine) is auto-skipped because --region-name is set.\n", + "# Use this for one-region debugging or a re-run of a few failed regions.\n", + "sos run ./mnm_postprocessing.ipynb export_top_loci \\\n", + " --region_file ./TADB_enhanced_cis.coding.bed \\\n", + " --file_path /path/to/fine_mapping \\\n", + " --prefix \\\n", + " --suffix \\\n", + " --name \\\n", + " --min_corr 0.8 \\\n", + " --geno_ref ./Fungen_xQTL.ROSMAP_NIA_WGS.ROSMAP_NIA_WGS.MSBB_WGS_ADSP_hg38.MiGA.MAP_Brain-xQTL_Gwas_geno_0.STARNET.aligned.bim.gz \\\n", + " --qtl_type cis \\\n", + " --cwd ~/tmp/export_run \\\n", + " --region-name ENSG00000048740 \\\n", + " --mem 14G --walltime 72h \\\n", + " -q slurm -j 100 -s build\n", + "\n", + "# Combine-step overrides:\n", + "# --combine yes force run step 3 on a subset (rare; e.g. re-combining after partial re-run)\n", + "# --combine no skip step 3 on a full run (rare; e.g. combine offline)\n", + "# default 'auto': run if no region filter, skip if --region-name / --region-list is set\n", + "#\n", + "# Note: --job_size defaults to 1 (one substep per sbatch). Pass --job_size N to bundle\n", + "# N substeps in parallel inside one task; requires N x cores and N x mem on the node.\n" ] }, { "cell_type": "markdown", - "id": "9fe78e94-87f7-4de7-b46d-fae572aa1194", - "metadata": { - "kernel": "SoS" - }, + "id": "bd963f0c", + "metadata": {}, "source": [ "### Exporting cis_analysis susie_twas results " ] }, { "cell_type": "markdown", - "id": "8c6c5448-ef4d-4e09-8a55-f87aabf8da1c", - "metadata": { - "kernel": "SoS" - }, + "id": "944e2c3f", + "metadata": {}, "source": [ "meta file is produced by:\n", "```\n", @@ -285,12 +187,9 @@ }, { "cell_type": "code", - "execution_count": 2, - "id": "7fd09e57-ee13-4822-a4a7-8a3fc640f999", - "metadata": { - "kernel": "Bash", - "tags": [] - }, + "execution_count": null, + "id": "25f18719", + "metadata": {}, "outputs": [ { "name": "stdout", @@ -313,109 +212,10 @@ "head /mnt/vast/hpc/csg/rf2872/Work/Multivariate/susie_2024_new/eQTL_meta.tsv" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "137961b5-1c6c-4c4b-b29a-a27643d5b1d1", - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "#susie\n", - "sos run /home/rf2872/codes/xqtl-pipeline/pipeline/fine_mapping_post_processing.ipynb cis_results_export \\\n", - " --region_file /mnt/vast/hpc/homes/rf2872/codes/xqtl-analysis/resource/TADB_enhanced_cis.protein_coding.bed \\\n", - " --file_path /mnt/vast/hpc/homes/rf2872/aws/rds_files \\\n", - " --name demo_susie \\\n", - " --suffix univariate_susie_twas_weights.rds \\\n", - " --prefix MiGA_eQTL KNIGHT_pQTL \\\n", - " --min_corr 0.8 \\\n", - " --geno_ref /mnt/vast/hpc/csg/rf2872/data/Fungen_xqtl/geno_align/Fungen_xQTL.ROSMAP_NIA_WGS.ROSMAP_NIA_WGS.MSBB_WGS_ADSP_hg38.MiGA.MAP_Brain-xQTL_Gwas_geno_0.STARNET.aligned.bim.gz \\\n", - " --context-meta /mnt/vast/hpc/homes/rf2872/codes/xqtl-analysis/resource/context_meta.tsv \\\n", - " --cwd demo_susie \\\n", - " --step1_only #optional to keep cache file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8b2f9f93-82d8-4093-aff0-4843758a7dd2", - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "#fsusie\n", - "sos run /home/rf2872/codes/xqtl-pipeline/pipeline/fine_mapping_post_processing.ipynb cis_results_export \\\n", - " --region_file /mnt/vast/hpc/homes/rf2872/codes/xqtl-analysis/resource/extended_TADB.bed \\\n", - " --file_path /mnt/vast/hpc/homes/rf2872/aws/rds_files \\\n", - " --name demo_fsusie \\\n", - " --suffix fsusie_mixture_normal_top_pc_weights.rds \\\n", - " --prefix ROSMAP_mQTL ROSMAP_haQTL \\\n", - " --min_corr 0.8 \\\n", - " --geno_ref /mnt/vast/hpc/csg/rf2872/data/Fungen_xqtl/geno_align/Fungen_xQTL.ROSMAP_NIA_WGS.ROSMAP_NIA_WGS.MSBB_WGS_ADSP_hg38.MiGA.MAP_Brain-xQTL_Gwas_geno_0.STARNET.aligned.bim.gz \\\n", - " --context-meta /mnt/vast/hpc/homes/rf2872/codes/xqtl-analysis/resource/context_meta.tsv \\\n", - " --cwd demo_fsusie \\\n", - " --fsusie \\\n", - " --step1_only #optional to keep cache file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "74eb199a-a490-41af-acf3-1b483f6cba11", - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "#meta QTL\n", - "sos run /home/rf2872/codes/xqtl-pipeline/pipeline/fine_mapping_post_processing.ipynb cis_results_export \\\n", - " --region_file /mnt/vast/hpc/homes/rf2872/codes/xqtl-analysis/resource/hg38_1362_blocks.bed \\\n", - " --file_path /mnt/vast/hpc/homes/rf2872/aws/rds_files \\\n", - " --name demo_metaQTL \\\n", - " --suffix univariate_susie_twas_weights.rds \\\n", - " --prefix ROSMAP_metaQTL \\\n", - " --min_corr 0.8 \\\n", - " --geno_ref /mnt/vast/hpc/csg/rf2872/data/Fungen_xqtl/geno_align/Fungen_xQTL.ROSMAP_NIA_WGS.ROSMAP_NIA_WGS.MSBB_WGS_ADSP_hg38.MiGA.MAP_Brain-xQTL_Gwas_geno_0.STARNET.aligned.bim.gz \\\n", - " --context-meta /mnt/vast/hpc/homes/rf2872/codes/xqtl-analysis/resource/context_meta.tsv \\\n", - " --cwd demo_metaQTL \\\n", - " --metaQTL \\\n", - " --step1_only #optional to keep cache file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0dc253a4-eac3-4408-9f63-da73ffed62d0", - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "# multi gene mvsusie\n", - "# multi gene mvsusie\n", - " sos run /data/interactive_analysis/rf2872/codes/Jan/xqtl-pipeline/pipeline/fine_mapping_post_processing.ipynb cis_results_export \\\n", - " --region_file /data/xqtl-analysis/resource/extended_TADB.bed \\\n", - " --file_path /data/analysis_result/mnm/ROSMAP/mnm_genes/ \\\n", - " --name ROSMAP_AC_DeJager_eQTL \\\n", - " --suffix multigene_bvsr.rds \\\n", - " --prefix ROSMAP_AC_DeJager_eQTL \\\n", - " --min_corr 0.8 \\\n", - " --geno_ref /data/resource/geno_align/Fungen_xQTL.ROSMAP_NIA_WGS.ROSMAP_NIA_WGS.MSBB_WGS_ADSP_hg38.MiGA.MAP_Brain-xQTL_Gwas_geno_0.STARNET.aligned.bim.gz \\\n", - " --cwd demo_multigene \\\n", - " --mnm \\\n", - " --region-name chr10_100845599_104855543 \\#optional for testing with one region\n", - " --step1_only #optional to keep cache file" - ] - }, { "cell_type": "markdown", - "id": "65f3b184-552a-4ca0-b11d-0e56516ed6ff", - "metadata": { - "kernel": "SoS", - "tags": [] - }, + "id": "23e5334d", + "metadata": {}, "source": [ "### Export gwas data" ] @@ -423,10 +223,8 @@ { "cell_type": "code", "execution_count": null, - "id": "46c6b547-b86a-4014-9377-07bbad828d62", - "metadata": { - "kernel": "SoS" - }, + "id": "88c9f65a", + "metadata": {}, "outputs": [], "source": [ " sos run /home/rf2872/codes/xqtl-pipeline/pipeline/fine_mapping_post_processing.ipynb gwas_results_export \\\n", @@ -446,11 +244,8 @@ }, { "cell_type": "markdown", - "id": "56ab5c55-2668-4a99-8ad8-1d28c33a5ca8", - "metadata": { - "kernel": "SoS", - "tags": [] - }, + "id": "82d628c0", + "metadata": {}, "source": [ "### combine seperate meta file" ] @@ -458,10 +253,8 @@ { "cell_type": "code", "execution_count": null, - "id": "3fa6916a-6f8e-458b-aca0-8a33fe708a5b", - "metadata": { - "kernel": "SoS" - }, + "id": "d410d36d", + "metadata": {}, "outputs": [], "source": [ "sos run /home/rf2872/codes/xqtl-pipeline/pipeline/fine_mapping_post_processing.ipynb combine_export_meta \\\n", @@ -471,11 +264,8 @@ }, { "cell_type": "markdown", - "id": "815bcc90-94ef-4d9b-b3eb-57abaacfd5c9", - "metadata": { - "kernel": "SoS", - "tags": [] - }, + "id": "73e2799e", + "metadata": {}, "source": [ "### Overlapped gwas data and eQTL data " ] @@ -483,10 +273,8 @@ { "cell_type": "code", "execution_count": null, - "id": "c0efd99d-cd04-4f8d-9d2b-418b67df5d3e", - "metadata": { - "kernel": "SoS" - }, + "id": "a4954429", + "metadata": {}, "outputs": [], "source": [ "sos run /home/rf2872/codes/xqtl-pipeline/pipeline/fine_mapping_post_processing.ipynb overlap_qtl_gwas \\\n", @@ -496,33 +284,6 @@ " --cwd demo_overlap" ] }, - { - "cell_type": "markdown", - "id": "a5e4ea3a-8e02-4767-b533-b88f39197e83", - "metadata": { - "kernel": "SoS", - "tags": [] - }, - "source": [ - "### Export all top loci" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a7f9625-2368-4505-8f9b-6b359f2641db", - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "sos run /home/rf2872/codes/xqtl-pipeline/pipeline/fine_mapping_post_processing.ipynb export_top_loci \\\n", - "--export_path /home/ubuntu/demo_singlecontext/ \\\n", - "--region ENSG00000197106 \\\n", - "--prefix ROSMAP_DeJager \\\n", - "--suffix cis_results_db.export.rds" - ] - }, { "cell_type": "code", "execution_count": null, @@ -543,7 +304,7 @@ "## Containers that contains the necessary packages\n", "parameter: container = \"\"\n", "# For cluster jobs, number commands to run per job\n", - "parameter: job_size = 50\n", + "parameter: job_size = 1\n", "# Wall clock time expected\n", "parameter: walltime = \"96h\"\n", "# Memory expected\n", @@ -1495,7 +1256,7 @@ "outputs": [], "source": [ "# Exporting cis susie_twas results\n", - "[cis_results_export_1, gwas_results_export_1]\n", + "[cis_results_export_1, gwas_results_export_1, export_top_loci_1]\n", "# per chunk we process at most 200 datasets\n", "parameter: per_chunk = 200\n", "# Region list should have last column being region name. \n", @@ -1802,10 +1563,10 @@ " filter(rowSums(select(., starts_with('cs_coverage_'))) > 0)\n", " \n", " susie_vars <- rbind(top_loci_others %>% filter(pip >= 0.05), top_loci_cs) %>% pull(variant_id)\n", - " if(length(susie_vars) > 0) susie_vars <- align_to_genoref(var_list = susie_vars, geno_ref = geno_ref, region = paste0(gsub(\"chr\",\"\",\"${_meta_info[0]}\"), \":\", \"${_meta_info[1]}\", \"-\", \"${_meta_info[2]}\"))\n", + " if(length(susie_vars) > 0) susie_vars <- align_to_genoref(var_list = susie_vars, geno_ref = geno_ref, region = paste0(sub(\"^(chr)?\",\"chr\",\"${_meta_info[0]}\"), \":\", \"${_meta_info[1]}\", \"-\", \"${_meta_info[2]}\"))\n", " \n", " susie_vars_cs <- top_loci_cs %>% pull(variant_id)\n", - " if(length(susie_vars_cs) > 0) susie_vars_cs <- align_to_genoref(var_list = susie_vars_cs, geno_ref = geno_ref, region = paste0(gsub(\"chr\",\"\",\"${_meta_info[0]}\"), \":\", \"${_meta_info[1]}\", \"-\", \"${_meta_info[2]}\"))\n", + " if(length(susie_vars_cs) > 0) susie_vars_cs <- align_to_genoref(var_list = susie_vars_cs, geno_ref = geno_ref, region = paste0(sub(\"^(chr)?\",\"chr\",\"${_meta_info[0]}\"), \":\", \"${_meta_info[1]}\", \"-\", \"${_meta_info[2]}\"))\n", " \n", " top_loci <- top_loci %>% mutate(annotated_susie = ifelse(variant_id %in% susie_vars, 1, 0),\n", " annotated_susie_cs = ifelse(variant_id %in% susie_vars_cs, 1, 0))\n", @@ -2151,7 +1912,7 @@ " if(has_rows(dat_con$top_loci) || has_rows(dat_con$preset_top_loci)) cons_top_loci[[id]][[context]] <- context else cons_top_loci[[id]][[context]] <- NULL \n", " variant_ids <- c(dat_con$top_loci$variant_id, dat_con$variant_names, dat_con$preset_variants_result$top_loci$variant_id, dat_con$preset_variants_result$variant_names)\n", " unique_variant_ids <- unique(variant_ids)\n", - " aligned_variant_ids <- align_to_genoref(unique_variant_ids, geno_ref, paste0(gsub(\"chr\",\"\",\"${_meta_info[0]}\"), \":\", \"${_meta_info[1]}\", \"-\", \"${_meta_info[2]}\"))\n", + " aligned_variant_ids <- align_to_genoref(unique_variant_ids, geno_ref, paste0(sub(\"^(chr)?\",\"chr\",\"${_meta_info[0]}\"), \":\", \"${_meta_info[1]}\", \"-\", \"${_meta_info[2]}\"))\n", " names(aligned_variant_ids) <- unique_variant_ids\n", "\n", " # change beta or z in top loci and sumstats\n", @@ -2244,8 +2005,8 @@ " cons_top_loci <- if(length(cons_top_loci) > 0) cons_top_loci else NA\n", "\n", " combine_data = combine_data_sumstats = cons_top_loci_minp = ''\n", - " combine_data = paste0(\"${_output:add}\",\"/\",\"${name}\", \".\", ${'\"epigenomics_\"' if fsusie else '\"\"'}, ${ '\"metabolomics_\"' if metaQTL else '\"\"'}, gene, \".cis_results_db.export.rds\")\n", - " if (${\"FALSE\" if fsusie else \"TRUE\"}) combine_data_sumstats = gsub(\"export.rds$\", \"export_sumstats.rds\", combine_data)\n", + " combine_data = paste0(\"${_output:add}\",\"/\",\"${name}\", \".\", ${'\"epigenomics_\"' if fsusie else '\"\"'}, ${ '\"metabolomics_\"' if metaQTL else '\"\"'}, gene, \".cis_results_db.rds\")\n", + " if (${\"FALSE\" if fsusie else \"TRUE\"}) combine_data_sumstats = gsub(\"\\\\.rds$\", \".sumstats.rds\", combine_data)\n", " \n", " if (${\"TRUE\" if exported_file.is_file() else \"FALSE\"}){\n", " if (file.exists(combine_data)) {\n", @@ -2462,16 +2223,22 @@ }, "outputs": [], "source": [ - "[export_top_loci]\n", - "parameter: export_path=path\n", - "parameter: region = str\n", - "parameter: prefix = str\n", - "parameter: suffix = str\n", + "[export_top_loci_2]\n", + "# `export_suffix` deliberately differs from step 1's `--suffix` (input rds) to avoid CLI collision.\n", + "parameter: export_path = cwd\n", + "parameter: export_prefix = '' # falls back to step-1 `name` if empty\n", + "parameter: export_suffix = 'cis_results_db.rds'\n", "parameter: fsusie_prefix = ''\n", "parameter: preset_top_loci = False\n", "parameter: lfsr_thres = 0.01\n", - "input: f\"{export_path}/{prefix}.{fsusie_prefix}{region}.{suffix}\" \n", - "output: f\"{cwd:a}/{prefix}.{region}.toploci.bed.gz\" \n", + "\n", + "_pfx = export_prefix if export_prefix else name\n", + "import glob as _glob\n", + "_rds_files = sorted(_glob.glob(f\"{export_path}/{_pfx}.{fsusie_prefix}*.{export_suffix}\"))\n", + "stop_if(len(_rds_files) == 0, f\"No RDS files matched: {export_path}/{_pfx}.{fsusie_prefix}*.{export_suffix}\")\n", + "\n", + "input: _rds_files, group_by = 1\n", + "output: f\"{cwd}/summary/{_input:bnn}.top_loci.bed.gz\"\n", "task: trunk_workers = 1, walltime = '1h', trunk_size = job_size, mem = '16G', cores = 1, tags = f'{_output:bn}'\n", "R: expand = \"${ }\", container = container, stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'\n", " library(tidyverse)\n", @@ -2543,7 +2310,7 @@ " MAF = maf,\n", " betahat = betahat,\n", " sebetahat = sebetahat,\n", - " z=z,\n", + " z = if (\"z\" %in% colnames(.)) z else NA,\n", " \n", " ) %>%\n", " {\n", @@ -2596,6 +2363,53 @@ " }\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "72d3c547", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "[export_top_loci_3]\n", + "# Combine per-region .top_loci.bed.gz files into one study-level file.\n", + "# Public-facing name: {name}.{qtl_type}[.{variant_tag}].top_loci.bed.gz\n", + "parameter: qtl_type = str\n", + "parameter: variant_tag = ''\n", + "# 'auto' (default): skip step 3 if region_name or region_list is set; 'yes': force run; 'no': force skip.\n", + "parameter: combine = 'auto'\n", + "# Re-declared so this step can detect step-1 subset filters.\n", + "parameter: region_list = path()\n", + "parameter: region_name = []\n", + "stop_if(combine not in ('auto', 'yes', 'no'), f\"--combine must be auto|yes|no, got {combine}\")\n", + "_region_filter_set = (region_list.is_file() or len(region_name) > 0)\n", + "_do_combine = (combine == 'yes') or (combine == 'auto' and not _region_filter_set)\n", + "skip_if(not _do_combine, f\"Skipping combine: combine={combine}, region_filter_set={_region_filter_set}\")\n", + "_tag = f\".{variant_tag}\" if variant_tag else \"\"\n", + "input: group_by = 'all'\n", + "output: f\"{cwd}/summary/{name}.{qtl_type}{_tag}.top_loci.bed.gz\"\n", + "task: trunk_workers = 1, walltime = '2h', trunk_size = 1, mem = '8G', cores = 1, tags = f'{_output:bn}'\n", + "bash: expand = \"${ }\", container = container, stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', entrypoint=entrypoint\n", + " set -euo pipefail\n", + " mkdir -p \"$(dirname \"${_output}\")\"\n", + " GZ=$(command -v bgzip || echo gzip)\n", + " {\n", + " # header from first non-empty file\n", + " for f in ${_input}; do\n", + " if [ \"$(zcat -f \"$f\" 2>/dev/null | wc -l)\" -ge 2 ]; then\n", + " zcat \"$f\" | head -1\n", + " break\n", + " fi\n", + " done\n", + " # data rows from every non-empty file\n", + " for f in ${_input}; do\n", + " if [ \"$(zcat -f \"$f\" 2>/dev/null | wc -l)\" -ge 2 ]; then\n", + " zcat \"$f\" | tail -n +2\n", + " fi\n", + " done\n", + " } | $GZ > ${_output}\n" + ] + }, { "cell_type": "markdown", "id": "1a563cfe-7cff-4536-b440-c34126e7a192",