@@ -34,8 +34,9 @@ tar_script(
   memory = "transient",
   storage = "worker",
   retrieval = "worker",
-  # error = "continue",
+  error = "continue",
   format = "qs",
+  # cue = tar_cue(mode = "never"),

   # -----------------------#
   # SLURM
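The options in this hunk belong to targets' pipeline-wide tar_option_set() call. A minimal sketch of the resulting option block, assuming it sits at the top of the generated _targets.R (the SLURM controller list is elided):

# memory = "transient" drops each target's value from RAM as soon as possible;
# storage/retrieval = "worker" make crew workers save and load targets themselves;
# error = "continue" keeps building the rest of the pipeline when one target errors;
# format = "qs" serialises target values with the fast qs format.
tar_option_set(
  memory = "transient",
  storage = "worker",
  retrieval = "worker",
  error = "continue",
  format = "qs"
  # cue = tar_cue(mode = "never")  # would suppress all rebuilds; kept commented out
)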
@@ -56,14 +57,15 @@ tar_script(
       slurm_memory_gigabytes_per_cpu = 10,
       slurm_cpus_per_task = 1,
       workers = 100,
+      tasks_max = 5,
       verbose = T
     ),
     crew_controller_slurm(
       name = "slurm_1_20",
       slurm_memory_gigabytes_per_cpu = 20,
       slurm_cpus_per_task = 1,
-      workers = 50,
-      # tasks_max = 5,
+      workers = 60,
+      tasks_max = 5,
       verbose = T
     ),
     crew_controller_slurm(
@@ -78,7 +80,7 @@ tar_script(
       name = "slurm_1_80",
       slurm_memory_gigabytes_per_cpu = 80,
       slurm_cpus_per_task = 1,
-      workers = 20,
+      workers = 5,
       tasks_max = 5,
       verbose = T
     ),
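The recurring tasks_max = 5 makes each crew worker retire after running five targets, so the next batch gets a fresh SLURM job with fresh memory; shrinking workers on the high-memory controllers bounds the total RAM requested from the cluster. One controller from the group, sketched in isolation with the values from the hunk above:

# Up to 60 concurrent SLURM jobs, each with 1 CPU and 20 GB per CPU;
# every worker exits after 5 tasks and is relaunched on demand.
crew.cluster::crew_controller_slurm(
  name = "slurm_1_20",
  slurm_memory_gigabytes_per_cpu = 20,
  slurm_cpus_per_task = 1,
  workers = 60,
  tasks_max = 5,
  verbose = TRUE
)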
@@ -91,11 +93,11 @@ tar_script(
       verbose = T
     )
   ),
-  # debug = "pseudobulk_file_id",
+  # debug = "pseudobulk_file_id_quantile_normalised_2_7240767f602a5810",

   resources = tar_resources(crew = tar_resources_crew("slurm_2_20"))
   # , # Set the target you want to debug.
-  # cue = tar_cue(mode = "never")
+  #
 )

 split_metadata = function(){
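The resources line above sets the default controller; individual targets can override it by naming another controller from the group. A minimal sketch, where my_target and heavy_function() are hypothetical:

# Route a single target to the controller named "slurm_1_20" from the group above.
tar_target(
  my_target,         # hypothetical target name
  heavy_function(),  # hypothetical command
  resources = tar_resources(crew = tar_resources_crew(controller = "slurm_1_20"))
)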
@@ -108,22 +110,58 @@ tar_script(
     # DROP FETAL SCI-SEQ FOR THE MOMENT
     filter(collection_id != "c114c20f-1ef4-49a5-9c2e-d965787fb90c") |>
     ################################
-
-    distinct(file_id, sample_, cell_, file_id_db, cell_type_harmonised) |>
+
+    # Only these columns are needed for the CuratedAtlasQuery query;
+    # the rest of the metadata can be joined back later from the full metadata.
+    select(file_id, sample_, cell_, file_id_db, cell_type_harmonised) |>
     as_tibble() |>
     mutate(chunk = sample_) |>
     nest(data = -c(chunk, file_id)) |>
-    mutate(number_of_cells = map_int(data, nrow))
+    mutate(number_of_cells = map_int(data, nrow))

+
+  # Add a subsampled version of the sci-seq
+  my_sci_seq =
+    get_metadata(cache_directory = CAQ_directory) |>
+
+    # KEEP ONLY THE FETAL SCI-SEQ, SUBSAMPLED BELOW
+    filter(collection_id == "c114c20f-1ef4-49a5-9c2e-d965787fb90c") |>
+    select(file_id, sample_, cell_, file_id_db, cell_type_harmonised) |>
+    as_tibble() |>
+    nest(data = -c(file_id, sample_, cell_type_harmonised)) |>
+    mutate(n = map_int(data, nrow)) |>
+    mutate(data = map2(
+      data, n,
+      ~ .x |> slice_sample(n = min(.y, 10000), replace = FALSE)
+    )) |>
+    unnest(data) |>
+    select(-n) |>
+    mutate(chunk = sample_) |>
+    nest(data = -c(chunk, file_id)) |>
+    mutate(number_of_cells = map_int(data, nrow))
+
+  my_metadata |> bind_rows(my_sci_seq)
+
 }

-get_sce = function(tissue_cell_type_metadata) {
+get_sce = function(tissue_cell_type_metadata, sce_rownames) {
+
+  cache.path = "/vast/projects/mangiola_immune_map/caq_cloud3"
+  dir.create(cache.path, recursive = TRUE, showWarnings = FALSE)

   tissue_cell_type_metadata |>
     mutate(data = pmap(
       list(data),
       ~ ..1 |>
-        get_single_cell_experiment(cache_directory = "/vast/projects/cellxgene_curated")
+        get_single_cell_experiment(cache_directory = cache.path)
+        # |>
+        #
+        # # !!!!
+        # # This is needed because, for some reason, the local data repository
+        # # still has Ensembl gene IDs while the cloud repository does not.
+        # _[sce_rownames, ]
+
       # |>
       # mutate(sample_se =
       #
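The new sci-seq branch caps every file/sample/cell-type combination at 10,000 cells before re-chunking by sample. The nest / slice_sample / unnest idiom on its own, with a hypothetical two-column tibble (grouping column g, value column x) and a cap of 2:

library(dplyr)
library(tidyr)
library(purrr)

# Keep at most 2 randomly sampled rows per group g.
tibble(g = rep(c("a", "b"), c(5, 1)), x = 1:6) |>
  nest(data = -g) |>
  mutate(n = map_int(data, nrow)) |>
  mutate(data = map2(
    data, n,
    ~ .x |> slice_sample(n = min(.y, 2), replace = FALSE)
  )) |>
  unnest(data) |>
  select(-n)

Nesting keeps the subsampling explicit per group, and the min() guard covers groups smaller than the cap.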
@@ -162,17 +200,17 @@ tar_script(
   print("Start aggregate")
   gc()

-  se =
+
   se_df |>

     # Add columns and filter
     mutate(data = map2(
       data, file_id,
       ~ {

-        # # TEMPORARY FIX BECAUSE I FORGOT TO ADD ASSAY NAME
-        assays(.x) = .x |> assays() |> as.list() |> set_names("counts")
-        # ####
+        # # # TEMPORARY FIX BECAUSE I FORGOT TO ADD ASSAY NAME
+        # assays(.x) = .x |> assays() |> as.list() |> set_names("counts")
+        # # ####

         # Add columns
         se =
@@ -222,7 +260,8 @@ tar_script(
         rownames(se) = se_rownames

         se
-      }))
+      })
+  )

 }

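The now-commented "temporary fix" gave the single unnamed assay the name "counts". As a standalone snippet on a hypothetical SummarizedExperiment se, the same rename can be done either way:

library(SummarizedExperiment)

# Rewrap the assays as a named list ...
assays(se) <- se |> assays() |> as.list() |> setNames("counts")
# ... or rename the first assay in place.
assayNames(se)[1] <- "counts"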
@@ -263,34 +302,62 @@ tar_script(

     # Get SCE SMALL
     tar_target(
-      metadata_split_pseudobulk_SMALL,
-      metadata_split_SMALL |> get_sce() |> get_pseudobulk(),
+      metadata_split_pseudobulk_SMALL_2,
+      metadata_split_SMALL |> get_sce(sce_rownames) |> get_pseudobulk(),
       pattern = map(metadata_split_SMALL),
       resources = tar_resources(crew = tar_resources_crew("slurm_1_20"))
     ),

     # Get SCE BIG
     tar_target(
       metadata_split_pseudobulk_BIG,
-      metadata_split_BIG |> get_sce() |> get_pseudobulk(),
+      metadata_split_BIG |> get_sce(sce_rownames) |> get_pseudobulk(),
       pattern = map(metadata_split_BIG),
       resources = tar_resources(crew = tar_resources_crew("slurm_1_80"))
     ),

     # Group samples
     tarchetypes::tar_group_by(
-      metadata_grouped_pseudobulk,
-      metadata_split_pseudobulk_SMALL |> bind_rows(metadata_split_pseudobulk_BIG),
+      metadata_grouped_pseudobulk_2,
+      metadata_split_pseudobulk_SMALL_2 |> bind_rows(metadata_split_pseudobulk_BIG),
       file_id,
       resources = tar_resources(crew = tar_resources_crew("slurm_2_400"))
-    ),
-
+    ),
+
     # Aggregate
     tar_target(
-      pseudobulk_file_id,
-      metadata_grouped_pseudobulk |> aggregate(sce_rownames),
-      pattern = map(metadata_grouped_pseudobulk),
-      resources = tar_resources(crew = tar_resources_crew("slurm_1_20")) # , deployment = "main"
+      pseudobulk_grouped_by_file_id_2,
+      metadata_grouped_pseudobulk_2 |> aggregate(sce_rownames),
+      pattern = map(metadata_grouped_pseudobulk_2),
+      resources = tar_resources(crew = tar_resources_crew("slurm_1_20"))
+    ),
+
+    # Quantile normalisation, using one "good-looking" distribution as the target
+    tar_target(
+      the_best_target_distribution,
+      pseudobulk_grouped_by_file_id_2 |>
+        filter(file_id == "c0dca32e-fa64-4448-bc12-2b8f16702c29") |>
+        pull(data) |>
+        _[[1]] |>
+        assay("counts") |>
+        as.matrix() |>
+        preprocessCore::normalize.quantiles.determine.target(),
+      resources = tar_resources(crew = tar_resources_crew("slurm_1_20"))
+    ),
+
+    tar_target(
+      pseudobulk_file_id_quantile_normalised_3,
+      pseudobulk_grouped_by_file_id_2 |> mutate(data = map(
+        data,
+        ~ .x |>
+          quantile_normalise_abundance(
+            method = "preprocesscore_normalize_quantiles_use_target",
+            target_distribution = the_best_target_distribution
+          ) |>
+          select(-counts)
+      )),
+      pattern = map(pseudobulk_grouped_by_file_id_2),
+      resources = tar_resources(crew = tar_resources_crew("slurm_1_20"))
     )

 )
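The two new targets implement quantile normalisation against a fixed reference: the first derives a target distribution from one well-behaved dataset with preprocessCore, the second forces every pseudobulk onto it (here via tidybulk's quantile_normalise_abundance). The underlying preprocessCore calls, sketched on plain matrices, where ref_counts and counts are hypothetical genes-by-samples matrices:

library(preprocessCore)

# Learn the reference distribution from one matrix ...
target <- normalize.quantiles.determine.target(as.matrix(ref_counts))
# ... then map each column of any other matrix onto that same distribution.
normalised <- normalize.quantiles.use.target(as.matrix(counts), target = target)

Normalising every file_id against the same target keeps expression distributions comparable across datasets built in different pipeline branches.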
@@ -303,13 +370,13 @@ tar_script(


 tar_make(
-  # callr_function = NULL,
-  reporter = "verbose_positives",
+  # callr_function = NULL,
+  reporter = "summary",
   script = glue("{result_directory}/_targets.R"),
   store = glue("{result_directory}/_targets")
 )



-tar_read(pseudobulk_file_id, store = "/vast/projects/cellxgene_curated/pseudobulk_0.2.3.4/_targets")
+tar_read(pseudobulk_file_id_quantile_normalised_3, store = "/vast/projects/cellxgene_curated/pseudobulk_0.2.3.4/_targets")