Skip to content
This repository was archived by the owner on Oct 14, 2025. It is now read-only.

Commit f3b59b4

Browse files
committed
update make pseudobulk
1 parent 626e0a9 commit f3b59b4

File tree

1 file changed

+97
-30
lines changed

1 file changed

+97
-30
lines changed

dev/make_pseudobulk.R

Lines changed: 97 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,9 @@ tar_script(
3434
memory = "transient",
3535
storage = "worker",
3636
retrieval = "worker",
37-
#error = "continue",
37+
error = "continue",
3838
format = "qs",
39+
# cue = tar_cue(mode = "never") ,
3940

4041
#-----------------------#
4142
# SLURM
@@ -56,14 +57,15 @@ tar_script(
5657
slurm_memory_gigabytes_per_cpu = 10,
5758
slurm_cpus_per_task = 1,
5859
workers = 100,
60+
tasks_max = 5,
5961
verbose = T
6062
),
6163
crew_controller_slurm(
6264
name = "slurm_1_20",
6365
slurm_memory_gigabytes_per_cpu = 20,
6466
slurm_cpus_per_task = 1,
65-
workers = 50,
66-
# tasks_max = 5,
67+
workers = 60,
68+
tasks_max = 5,
6769
verbose = T
6870
),
6971
crew_controller_slurm(
@@ -78,7 +80,7 @@ tar_script(
7880
name = "slurm_1_80",
7981
slurm_memory_gigabytes_per_cpu = 80,
8082
slurm_cpus_per_task = 1,
81-
workers = 20,
83+
workers = 5,
8284
tasks_max = 5,
8385
verbose = T
8486
),
@@ -91,11 +93,11 @@ tar_script(
9193
verbose = T
9294
)
9395
),
94-
#debug = "pseudobulk_file_id",
96+
# debug = "pseudobulk_file_id_quantile_normalised_2_7240767f602a5810",
9597

9698
resources = tar_resources(crew = tar_resources_crew("slurm_2_20"))
9799
#, # Set the target you want to debug.
98-
#cue = tar_cue(mode = "never")
100+
#
99101
)
100102

101103
split_metadata = function(){
@@ -108,22 +110,58 @@ tar_script(
108110
# DROP FETAL SCI-SEQ FOR THE MOMENT
109111
filter(collection_id != "c114c20f-1ef4-49a5-9c2e-d965787fb90c") |>
110112
################################
111-
112-
distinct(file_id, sample_, cell_, file_id_db, cell_type_harmonised) |>
113+
114+
# This is because, for the Curated Atlas query, I don't need the other metadata;
115+
# I can add it back later from the full metadata
116+
select(file_id, sample_, cell_, file_id_db, cell_type_harmonised) |>
113117
as_tibble() |>
114118
mutate(chunk = sample_) |>
115119
nest(data = -c(chunk, file_id)) |>
116-
mutate(number_of_cells = map_int(data, nrow))
120+
mutate(number_of_cells = map_int(data, nrow))
117121

122+
123+
# Add a subsampled version of the sci seq
124+
my_sci_seq =
125+
get_metadata(cache_directory = CAQ_directory) |>
126+
127+
# DROP FETAL SCI-SEQ FOR THE MOMENT
128+
filter(collection_id == "c114c20f-1ef4-49a5-9c2e-d965787fb90c") |>
129+
select(file_id, sample_, cell_, file_id_db, cell_type_harmonised) |>
130+
as_tibble() |>
131+
nest(data = -c(file_id, sample_, cell_type_harmonised)) |>
132+
mutate(n = map_int(data, nrow)) |>
133+
mutate(data = map2(
134+
data, n,
135+
~ .x |> slice_sample( n = min(.y, 10000), replace = FALSE)
136+
)) |>
137+
unnest(data) |>
138+
select(-n) |>
139+
mutate(chunk = sample_) |>
140+
nest(data = -c(chunk, file_id)) |>
141+
mutate(number_of_cells = map_int(data, nrow))
142+
143+
my_metadata |> bind_rows(my_sci_seq)
144+
145+
118146
}
119147

120-
get_sce = function(tissue_cell_type_metadata) {
148+
get_sce = function(tissue_cell_type_metadata, sce_rownames) {
149+
150+
cache.path = "/vast/projects/mangiola_immune_map/caq_cloud3"
151+
dir.create(cache.path, recursive = TRUE, showWarnings = FALSE)
121152

122153
tissue_cell_type_metadata |>
123154
mutate(data = pmap(
124155
list(data),
125156
~ ..1 |>
126-
get_single_cell_experiment(cache_directory = "/vast/projects/cellxgene_curated")
157+
get_single_cell_experiment(cache_directory = cache.path)
158+
# |>
159+
#
160+
# # !!!!
161+
# # This is needed because, for some reason, the local data repository still has ENSEMBL genes,
162+
# # while the cloud repository does not
163+
# _[sce_rownames,]
164+
127165
# |>
128166
# mutate(sample_se =
129167
#
@@ -162,17 +200,17 @@ tar_script(
162200
print("Start aggregate")
163201
gc()
164202

165-
se =
203+
166204
se_df |>
167205

168206
# Add columns and filter
169207
mutate(data = map2(
170208
data, file_id,
171209
~ {
172210

173-
## TEMPORARY FIX BECAUSE I FORGOT TO ADD ASSAY NAME
174-
assays(.x) = .x |> assays() |> as.list() |> set_names("counts")
175-
#####
211+
# ## TEMPORARY FIX BECAUSE I FORGOT TO ADD ASSAY NAME
212+
# assays(.x) = .x |> assays() |> as.list() |> set_names("counts")
213+
# #####
176214

177215
# Add columns
178216
se =
@@ -222,7 +260,8 @@ tar_script(
222260
rownames(se) = se_rownames
223261

224262
se
225-
}))
263+
})
264+
)
226265

227266
}
228267

@@ -263,34 +302,62 @@ tar_script(
263302

264303
# Get SCE SMALL
265304
tar_target(
266-
metadata_split_pseudobulk_SMALL,
267-
metadata_split_SMALL |> get_sce() |> get_pseudobulk(),
305+
metadata_split_pseudobulk_SMALL_2,
306+
metadata_split_SMALL |> get_sce(sce_rownames) |> get_pseudobulk(),
268307
pattern = map(metadata_split_SMALL),
269308
resources = tar_resources(crew = tar_resources_crew("slurm_1_20"))
270309
),
271310

272311
# Get SCE BIG
273312
tar_target(
274313
metadata_split_pseudobulk_BIG,
275-
metadata_split_BIG |> get_sce() |> get_pseudobulk(),
314+
metadata_split_BIG |> get_sce(sce_rownames) |> get_pseudobulk(),
276315
pattern = map(metadata_split_BIG),
277316
resources = tar_resources(crew = tar_resources_crew("slurm_1_80"))
278317
),
279318

280319
# Group samples
281320
tarchetypes::tar_group_by(
282-
metadata_grouped_pseudobulk,
283-
metadata_split_pseudobulk_SMALL |> bind_rows(metadata_split_pseudobulk_BIG),
321+
metadata_grouped_pseudobulk_2,
322+
metadata_split_pseudobulk_SMALL_2 |> bind_rows(metadata_split_pseudobulk_BIG),
284323
file_id,
285324
resources = tar_resources(crew = tar_resources_crew("slurm_2_400"))
286-
),
287-
325+
) ,
326+
288327
# Aggregate
289328
tar_target(
290-
pseudobulk_file_id,
291-
metadata_grouped_pseudobulk |> aggregate(sce_rownames) ,
292-
pattern = map(metadata_grouped_pseudobulk),
293-
resources = tar_resources(crew = tar_resources_crew("slurm_1_20"))#, deployment = "main"
329+
pseudobulk_grouped_by_file_id_2,
330+
metadata_grouped_pseudobulk_2 |> aggregate(sce_rownames) ,
331+
pattern = map(metadata_grouped_pseudobulk_2),
332+
resources = tar_resources(crew = tar_resources_crew("slurm_1_20"))
333+
) ,
334+
335+
# Quantile normalisation, using a "good-looking" distribution as the target
336+
tar_target(
337+
the_best_target_distribution,
338+
pseudobulk_grouped_by_file_id_2 |>
339+
filter(file_id== "c0dca32e-fa64-4448-bc12-2b8f16702c29") |>
340+
pull(data) |>
341+
_[[1]] |>
342+
assay( "counts") |>
343+
as.matrix() |>
344+
preprocessCore::normalize.quantiles.determine.target() ,
345+
resources = tar_resources(crew = tar_resources_crew("slurm_1_20"))
346+
),
347+
348+
tar_target(
349+
pseudobulk_file_id_quantile_normalised_3,
350+
pseudobulk_grouped_by_file_id_2 |> mutate(data = map(
351+
data,
352+
~ .x |>
353+
quantile_normalise_abundance(
354+
method="preprocesscore_normalize_quantiles_use_target",
355+
target_distribution = the_best_target_distribution
356+
) |>
357+
select(-counts)
358+
)) ,
359+
pattern = map(pseudobulk_grouped_by_file_id_2),
360+
resources = tar_resources(crew = tar_resources_crew("slurm_1_20"))
294361
)
295362

296363
)
@@ -303,13 +370,13 @@ tar_script(
303370

304371

305372
tar_make(
306-
#callr_function = NULL,
307-
reporter = "verbose_positives",
373+
# callr_function = NULL,
374+
reporter = "summary",
308375
script = glue("{result_directory}/_targets.R"),
309376
store = glue("{result_directory}/_targets")
310377
)
311378

312379

313380

314-
tar_read(pseudobulk_file_id, store = "/vast/projects/cellxgene_curated/pseudobulk_0.2.3.4/_targets")
381+
tar_read(pseudobulk_file_id_quantile_normalised_3, store = "/vast/projects/cellxgene_curated/pseudobulk_0.2.3.4/_targets")
315382

0 commit comments

Comments
 (0)