
Commit 52c7cdf

update dev scripts
1 parent 09ad7b3 commit 52c7cdf

File tree

3 files changed: +480 −699 lines changed

dev/cellxgene_to_metadata.R

Lines changed: 0 additions & 1 deletion
@@ -71,7 +71,6 @@ tar_script({
     tasks_max = 5,
     verbose = T,
     seconds_idle = 30
-    script_directory = paste0("/vast/scratch/users/mangiola.s/cellxgenedp/crew_cluster/", basename(tempdir()))
   ),
   crew_controller_slurm(
     name = "slurm_1_40",

dev/make_pseudobulk.R

Lines changed: 52 additions & 45 deletions
@@ -8,11 +8,11 @@ library(duckdb)
 # Get input


-result_directory = "/vast/scratch/users/mangiola.s/pseudobulk_cellNexus_1_0_0_coarse"
+result_directory = "/vast/scratch/users/mangiola.s/pseudobulk_cellNexus_1_0_2"

 tar_script({

-  result_directory = "/vast/scratch/users/mangiola.s/pseudobulk_cellNexus_1_0_0_coarse"
+  result_directory = "/vast/scratch/users/mangiola.s/pseudobulk_cellNexus_1_0_2"


   #-----------------------#
@@ -123,30 +123,30 @@ tar_script({
     #
   )

-split_metadata = function(){
+split_metadata = function(metadata_parquet){

   # CAQ_directory = "/vast/projects/cellxgene_curated/cellNexus"

   my_metadata =
     tbl(
       dbConnect(duckdb::duckdb(), dbdir = ":memory:"),
-      sql("SELECT * FROM read_parquet('/vast/projects/cellxgene_curated/cellNexus/cell_metadata_cell_type_consensus_v1_0_0.parquet')")
+      sql(glue("SELECT * FROM read_parquet('{metadata_parquet}')"))
     ) |>

     #

     # Keep only these columns because, for the CuratedAtlasQuery query, I don't need the other metadata;
     # I can add it back later from the full metadata
-    select(sample_id, cell_, file_id_cellNexus, cell_type_consensus_harmonised) |>
-    filter(cell_type_consensus_harmonised != "other", !cell_type_consensus_harmonised |> is.na()) |>
-    filter(!file_id_cellNexus |> is.na()) |>
+    select(sample_id, cell__id, file_id_cellNexus_single_Cell, cell_type_unified_ensemble) |>
+    filter(cell_type_unified_ensemble != "other", !cell_type_unified_ensemble |> is.na()) |>
+    filter(!file_id_cellNexus_single_Cell |> is.na()) |>

     # THIS SHOULD BE REMOVED IN THE FUTURE
-    mutate(file_id_cellNexus = paste0(file_id_cellNexus, ".h5ad")) |>
+    mutate(file_id_cellNexus_single_Cell = paste0(file_id_cellNexus_single_Cell, ".h5ad")) |>

     # NEST
     as_tibble() |>
-    mutate(chunk = file_id_cellNexus) |>
+    mutate(chunk = file_id_cellNexus_single_Cell) |>
     nest(data = -c(chunk)) |>

     mutate(number_of_cells = map_int(data, nrow))
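The key change here is that split_metadata() now receives the parquet path as an argument instead of hard-coding it, with glue interpolating the path into the read_parquet SQL. A minimal standalone sketch of that pattern, using a hypothetical local file name:

library(dplyr)
library(duckdb)
library(glue)

metadata_parquet = "cell_metadata.parquet"  # hypothetical path

# Lazy duckdb table over the parquet file; nothing is read until collected
my_metadata =
  tbl(
    dbConnect(duckdb::duckdb(), dbdir = ":memory:"),
    sql(glue("SELECT * FROM read_parquet('{metadata_parquet}')"))
  )

Plain glue() interpolation is fine for trusted, pipeline-controlled paths like this one; for untrusted input, glue::glue_sql() would be the safer choice.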
@@ -162,19 +162,19 @@ tar_script({
       data, chunk,
       ~ {

-        file_id_cellNexus = .y
+        file_id_cellNexus_single_Cell = .y

         # THIS IS TEMPORARY BECAUSE I HAVE BRAIN DATASETS WITH 1M CELLS THAT ARE HARD TO SAVE IN THE DB
-        if(!file.exists(paste0(cache.path, "/original/", file_id_cellNexus))) {
-          warning(paste0(cache.path, "/original/", file_id_cellNexus, " does not exist in the cache"))
+        if(!file.exists(paste0(cache.path, "/original/", file_id_cellNexus_single_Cell))) {
+          warning(paste0(cache.path, "/original/", file_id_cellNexus_single_Cell, " does not exist in the cache"))
           return(NULL)
         }

         .x |>
           CuratedAtlasQueryR:::get_data_container(
             repository = NULL,
             cache_directory = cache.path,
-            grouping_column = "file_id_cellNexus"
+            grouping_column = "file_id_cellNexus_single_Cell"
           )
       }
@@ -198,12 +198,12 @@ tar_script({
       pseudobulk =
         aggregateAcrossCells(
           .x,
-          colData(.x)[, c("sample_id", "cell_type_consensus_harmonised")],
+          colData(.x)[, c("sample_id", "cell_type_unified_ensemble")],
           BPPARAM = MulticoreParam(workers = if_else(.y > 50000, 20, 5))
         )
-      colnames(pseudobulk) = paste0(colData(pseudobulk)$sample_id, "___", colData(pseudobulk)$cell_type_consensus_harmonised)
+      colnames(pseudobulk) = paste0(colData(pseudobulk)$sample_id, "___", colData(pseudobulk)$cell_type_unified_ensemble)

-      pseudobulk = pseudobulk |> select(.cell, sample_id, file_id_cellNexus, cell_type_consensus_harmonised)
+      pseudobulk = pseudobulk |> select(.cell, sample_id, file_id_cellNexus_single_Cell, cell_type_unified_ensemble)

       # Decrease size
       # We will reattach rownames later
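The renamed grouping column feeds the pseudobulk step: one summed expression profile per sample × cell type, with column names built as sample___celltype. A toy sketch of that step on a synthetic SingleCellExperiment, assuming aggregateAcrossCells() is scuttle's (the real pipeline runs it per chunk with a BiocParallel backend):

library(scuttle)
library(SingleCellExperiment)

# Toy data: 10 genes x 20 cells across 2 samples and 2 cell types
counts = matrix(rpois(200, lambda = 5), nrow = 10)
sce = SingleCellExperiment(
  assays = list(counts = counts),
  colData = DataFrame(
    sample_id = rep(c("s1", "s2"), each = 10),
    cell_type_unified_ensemble = rep(c("t_cell", "b_cell"), times = 10)
  )
)

# Sum counts within each sample x cell-type combination
pseudobulk = aggregateAcrossCells(
  sce,
  colData(sce)[, c("sample_id", "cell_type_unified_ensemble")]
)

# Unique column names, mirroring the pipeline's "sample___celltype" convention
colnames(pseudobulk) = paste0(
  colData(pseudobulk)$sample_id, "___",
  colData(pseudobulk)$cell_type_unified_ensemble
)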
@@ -226,7 +226,7 @@ tar_script({

       # Add columns and filter
       mutate(data = map2(
-        data, file_id_cellNexus,
+        data, file_id_cellNexus_single_Cell,
         ~ {

           # ## TEMPORARY FIX BECAUSE I FORGOT TO ADD ASSAY NAME
@@ -236,8 +236,8 @@ tar_script({
           # Add columns
           se =
             .x |>
-            mutate(file_id_cellNexus = .y) |>
-            select(-any_of(c("file_id_cellNexus", ".cell", "original_cell_id")))
+            mutate(file_id_cellNexus_single_Cell = .y) |>
+            select(-any_of(c("file_id_cellNexus_single_Cell", ".cell", "original_cell_id")))


           # # Identify samples with many genes
@@ -260,7 +260,7 @@ tar_script({
         .progress = TRUE
       )) |>

-      nest(data = -file_id_cellNexus) |>
+      nest(data = -file_id_cellNexus_single_Cell) |>
       mutate(data = map(data,
         ~ {
           se = .x |>
@@ -383,33 +383,40 @@ tar_script({
   # Pipeline
   #-----------------------#
   list(
-    tar_target(cache.path, "/vast/projects/cellxgene_curated/cellNexus/cellxgene/29_10_2024/", deployment = "main"),
-
-    # Get rownames
+    tar_target(cache.path, "/vast/scratch/users/mangiola.s/cellNexus/cellxgene/15_11_2024/", deployment = "main"),
     tar_target(
-      sce_rownames,
-      tbl(
-        dbConnect(duckdb::duckdb(), dbdir = ":memory:"),
-        sql("SELECT * FROM read_parquet('/vast/projects/cellxgene_curated/cellNexus/cell_metadata_cell_type_consensus_v1_0_0.parquet')")
-      ) |>
-        head(1) |>
-        mutate(file_id_cellNexus = paste0(file_id_cellNexus, ".h5ad")) |>
-        CuratedAtlasQueryR:::get_data_container(
-          repository = NULL,
-          cache_directory = cache.path,
-          grouping_column = "file_id_cellNexus"
-        ) |>
-        rownames(),
-      resources = tar_resources(crew = tar_resources_crew("slurm_1_20")),
-      packages = c("arrow", "dplyr", "duckdb")
+      metadata_parquet,
+      "/vast/projects/cellxgene_curated/cellNexus/cell_metadata_cell_type_consensus_v1_0_2.parquet",
+      format = "file",
+      deployment = "main"
     ),

+
+    # # Get rownames
+    # tar_target(
+    #   sce_rownames,
+    #   tbl(
+    #     dbConnect(duckdb::duckdb(), dbdir = ":memory:"),
+    #     sql("SELECT * FROM read_parquet('/vast/projects/cellxgene_curated/cellNexus/cell_metadata_cell_type_consensus_v1_0_2.parquet')")
+    #   ) |>
+    #     head(1) |>
+    #     mutate(file_id_cellNexus_single_Cell = paste0(file_id_cellNexus_single_Cell, ".h5ad")) |>
+    #     CuratedAtlasQueryR:::get_data_container(
+    #       repository = NULL,
+    #       cache_directory = cache.path,
+    #       grouping_column = "file_id_cellNexus_single_Cell"
+    #     ) |>
+    #     rownames(),
+    #   resources = tar_resources(crew = tar_resources_crew("slurm_1_20")),
+    #   packages = c("arrow", "dplyr", "duckdb")
+    # ),
+
     # Do metadata
     tar_target(
       metadata_split,
-      split_metadata(),
+      split_metadata(metadata_parquet),
       resources = tar_resources(crew = tar_resources_crew("slurm_1_20")),
-      packages = c("arrow", "dplyr", "duckdb", "tidyr", "dplyr", "purrr")
+      packages = c("arrow", "dplyr", "duckdb", "tidyr", "dplyr", "purrr", "glue")
     ),

     # Get SCE SMALL
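The new metadata_parquet target is declared with format = "file", so targets hashes the file contents at that path and invalidates metadata_split whenever the parquet is regenerated, not only when the path string changes. A minimal standalone sketch of the pattern, with a hypothetical path and a toy stand-in for the split_metadata() defined earlier in this script:

library(targets)

tar_script({
  # Toy stand-in for split_metadata(); assumes arrow is installed and the
  # parquet file exists when the pipeline runs
  split_metadata = function(metadata_parquet) nrow(arrow::read_parquet(metadata_parquet))
  list(
    # format = "file": the target's value is a path; targets tracks its hash
    tar_target(metadata_parquet, "cell_metadata.parquet", format = "file"),
    tar_target(metadata_split, split_metadata(metadata_parquet))
  )
})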
@@ -599,7 +606,7 @@ job::job({

 tar_make(
   # callr_function = NULL,
-  reporter = "summary",
+  reporter = "verbose_positives",
   script = glue("{result_directory}/_targets.R"),
   store = glue("{result_directory}/_targets")
 )
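The "verbose_positives" reporter behaves like "verbose" but omits messages for skipped targets, which keeps the log readable when most of a large pipeline is already up to date. If that choice should persist across interactive calls, a sketch of one way to record it in the project configuration:

# Sketch: store the reporter in _targets.yaml so plain tar_make() uses it
targets::tar_config_set(reporter_make = "verbose_positives")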
@@ -617,7 +624,7 @@ tar_meta(store = glue("{result_directory}/_targets")) |>
 library(SummarizedExperiment)
 library(tidySummarizedExperiment)

-sccomp_counts = readRDS("/vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/sccomp_on_cellNexus_1_0_0/cell_metadata_1_0_0_sccomp_input_counts.rds")
+sccomp_counts = readRDS("/vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/sccomp_on_cellNexus_1_0_2/cell_metadata_1_0_2_sccomp_input_counts.rds")

 # Save into one SE
 job::job({
@@ -640,7 +647,7 @@ job::job({
   zellkonverter::writeH5AD(
     se,
     verbose = TRUE,
-    file = "/vast/projects/cellxgene_curated/cellNexus/pseudobulk_sample_cell_type_1_0_0.h5ad",
+    file = "/vast/projects/cellxgene_curated/cellNexus/pseudobulk_sample_cell_type_1_0_2.h5ad",
     X_name = "counts_scaled"
   )
 })
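Since X_name = "counts_scaled" names the assay written to the h5ad's X slot, reading the export back should request the same name; a minimal sketch using zellkonverter's reader:

library(zellkonverter)

# Sketch: round-trip the exported pseudobulk as a SingleCellExperiment
se = readH5AD(
  "/vast/projects/cellxgene_curated/cellNexus/pseudobulk_sample_cell_type_1_0_2.h5ad",
  X_name = "counts_scaled",
  verbose = TRUE
)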
@@ -665,11 +672,11 @@ system("~/bin/rclone copy /vast/projects/cellxgene_curated/cellNexus/pseudobulk_

 tar_read_raw("pseudobulk_file_id_quantile_normalised_processed_split_1_HDF5_1b4b19de1079ec95", store = glue("{result_directory}/_targets"))

-# tar_invalidate(pseudobulk_file_id_quantile_normalised_processed_split_2_HDF5, store = glue("{result_directory}/_targets"))
+# tar_invalidate(metadata_split, store = glue("{result_directory}/_targets"))
 # tar_invalidate(pseudobulk_file_id_quantile_normalised_processed_split_1, store = glue("{result_directory}/_targets"))

 tar_workspace(
-  pseudobulk_file_id_quantile_normalised_processed_split_1_HDF5_5f69024c54bd8215,
+  "metadata_split_SMALL",
   script = glue("{result_directory}/_targets.R"),
   store = glue("{result_directory}/_targets")
 )
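tar_workspace() only loads workspaces that were actually saved, for example when the pipeline ran with tar_option_set(workspace_on_error = TRUE); listing what exists first avoids guessing at target names. A small sketch:

library(targets)
library(glue)

# Sketch: list the saved workspaces in this store before loading one
tar_workspaces(store = glue("{result_directory}/_targets"))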
