Skip to content
This repository was archived by the owner on Oct 14, 2025. It is now read-only.

Commit 09ad7b3

Browse files
committed
update dev scripts
1 parent 2f66076 commit 09ad7b3

5 files changed

+882
-368
lines changed

dev/cellxgene_to_metadata.R

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -51,23 +51,26 @@ tar_script({
5151
slurm_cpus_per_task = 1,
5252
workers = 200,
5353
tasks_max = 5,
54-
verbose = T
54+
verbose = T,
55+
seconds_idle = 30
5556
),
5657
crew_controller_slurm(
5758
name = "slurm_1_10",
5859
slurm_memory_gigabytes_per_cpu = 10,
5960
slurm_cpus_per_task = 1,
6061
workers = 100,
6162
tasks_max = 5,
62-
verbose = T
63+
verbose = T,
64+
seconds_idle = 30
6365
),
6466
crew_controller_slurm(
6567
name = "slurm_1_20",
6668
slurm_memory_gigabytes_per_cpu = 20,
6769
slurm_cpus_per_task = 1,
6870
workers = 100,
6971
tasks_max = 5,
70-
verbose = T,
72+
verbose = T,
73+
seconds_idle = 30, # comma belongs here: script_directory follows as the next argument
7174
script_directory = paste0("/vast/scratch/users/mangiola.s/cellxgenedp/crew_cluster/", basename(tempdir()))
7275
),
7376
crew_controller_slurm(
@@ -76,23 +79,26 @@ tar_script({
7679
slurm_cpus_per_task = 1,
7780
workers = 50,
7881
tasks_max = 5,
79-
verbose = T
82+
verbose = T,
83+
seconds_idle = 30
8084
),
8185
crew_controller_slurm(
8286
name = "slurm_1_80",
8387
slurm_memory_gigabytes_per_cpu = 80,
8488
slurm_cpus_per_task = 1,
8589
workers = 30,
8690
tasks_max = 5,
87-
verbose = T
91+
verbose = T,
92+
seconds_idle = 30
8893
),
8994
crew_controller_slurm(
9095
name = "slurm_1_200",
9196
slurm_memory_gigabytes_per_cpu = 200,
9297
slurm_cpus_per_task = 1,
9398
workers = 5,
9499
tasks_max = 5,
95-
verbose = T
100+
verbose = T,
101+
seconds_idle = 30
96102
)
97103
),
98104
resources = tar_resources(crew = tar_resources_crew("slurm_1_10"))
@@ -901,7 +907,6 @@ age_days_tbl |>
901907

902908

903909

904-
905910
# #
906911
# cell_ids_for_metadata <- tbl(
907912
# dbConnect(duckdb::duckdb(), dbdir = ":memory:"),

dev/execute_hpcell_on_census_and_defining_data_tranformation.R

Lines changed: 87 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,16 @@
1-
library(glue)
21
library(dplyr)
32
library(tibble)
43
library(glue)
54
library(purrr)
6-
library(targets)
75
library(stringr)
86
library(HPCell)
9-
library(crew.cluster)
107
library(arrow)
118
library(CuratedAtlasQueryR)
129
directory = "/vast/scratch/users/shen.m/Census_rerun/split_h5ad_based_on_sample_id/"
1310
sample_anndata <- dir(glue("{directory}"), full.names = T)
1411
downloaded_samples_tbl <- read_parquet("/vast/scratch/users/shen.m/Census_rerun/census_samples_to_download_groups.parquet")
1512
downloaded_samples_tbl <- downloaded_samples_tbl |>
16-
rename(cell_number = list_length) |>
13+
dplyr::rename(cell_number = list_length) |>
1714
mutate(cell_number = cell_number |> as.integer(),
1815
file_name = glue("{directory}{sample_2}.h5ad") |> as.character(),
1916
tier = case_when(
@@ -104,10 +101,9 @@ job::job({
104101

105102
sample_names |>
106103
initialise_hpc(
104+
store = "/vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store/",
107105
gene_nomenclature = "ensembl",
108106
data_container_type = "anndata",
109-
store = "/vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store",
110-
# store = "/vast/projects/cellxgene_curated/census_hpcell_oct_2024/target_store_1_20",
111107
tier = tiers,
112108
computing_resources = list(
113109
crew_controller_slurm(
@@ -117,7 +113,8 @@ job::job({
117113
workers = 300,
118114
tasks_max = 10,
119115
verbose = T,
120-
launch_max = 5
116+
launch_max = 10,
117+
seconds_idle = 30
121118
),
122119

123120
crew_controller_slurm(
@@ -127,7 +124,8 @@ job::job({
127124
workers = 200,
128125
tasks_max = 10,
129126
verbose = T,
130-
launch_max = 5
127+
launch_max = 10,
128+
seconds_idle = 30
131129
),
132130
crew_controller_slurm(
133131
name = "tier_3",
@@ -136,7 +134,8 @@ job::job({
136134
workers = 100,
137135
tasks_max = 10,
138136
verbose = T,
139-
launch_max = 5
137+
launch_max = 5,
138+
seconds_idle = 30
140139
),
141140
crew_controller_slurm(
142141
name = "tier_4",
@@ -145,18 +144,19 @@ job::job({
145144
workers = 100,
146145
tasks_max = 10,
147146
verbose = T,
148-
launch_max = 5
147+
launch_max = 5,
148+
seconds_idle = 30
149149
)
150150
),
151-
verbosity = "verbose_positives",
151+
verbosity = "summary",
152152
# debug_step = "annotation_tbl_tier_4",
153153
update = "never",
154154
error = "continue",
155155
garbage_collection = 100,
156156
workspace_on_error = TRUE
157157

158158
) |>
159-
tranform_assay(fx = functions, target_output = "sce_transformed") |>
159+
transform_assay(fx = functions, target_output = "sce_transformed") |>
160160

161161
# # Remove empty outliers based on RNA count threshold per cell
162162
remove_empty_threshold(target_input = "sce_transformed", RNA_feature_threshold = 200) |>
@@ -171,7 +171,7 @@ job::job({
171171

172172

173173

174-
tar_meta(starts_with("annotation_tbl_tier_4"), store = "/vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store") |>
174+
tar_meta(starts_with("annotation_tbl_"), store = "/vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store") |>
175175
filter(!data |> is.na()) |> arrange(desc(time)) |> select(error, name)
176176

177177
# I have to check the input of this NULL target
@@ -207,7 +207,6 @@ annotation_label_transfer(sce_transformed_tier_4, empty_droplets_tbl = empty_tbl
207207
#'
208208
#' @example Usage:
209209
#' The pipeline script is saved as `/vast/scratch/users/mangiola.s/lighten_annotation_tbl_target.R` and can be run using `tar_make()`.
210-
211210
tar_script({
212211
library(dplyr)
213212
library(magrittr)
@@ -218,41 +217,44 @@ tar_script({
218217
library(crew.cluster)
219218
tar_option_set(
220219
memory = "transient",
221-
garbage_collection = 1000,
220+
garbage_collection = 100,
222221
storage = "worker",
223222
retrieval = "worker",
224223
error = "continue",
225-
debug = "annotation_tbl_light",
224+
#debug = "annotation_tbl_light",
226225
cue = tar_cue(mode = "never"),
227226
controller = crew_controller_group(
228227
list(
229228
crew_controller_slurm(
230229
name = "tier_1",
231230
script_lines = "#SBATCH --mem 8G",
232231
slurm_cpus_per_task = 1,
233-
workers = 500,
232+
workers = 200,
234233
tasks_max = 10,
235234
verbose = T,
236-
launch_max = 5
235+
launch_max = 5,
236+
seconds_idle = 30
237237
),
238238

239239
crew_controller_slurm(
240240
name = "tier_2",
241241
script_lines = "#SBATCH --mem 10G",
242242
slurm_cpus_per_task = 1,
243-
workers = 300,
243+
workers = 200,
244244
tasks_max = 10,
245245
verbose = T,
246-
launch_max = 5
246+
launch_max = 5,
247+
seconds_idle = 30
247248
),
248249
crew_controller_slurm(
249250
name = "tier_3",
250251
script_lines = "#SBATCH --mem 15G",
251252
slurm_cpus_per_task = 1,
252-
workers = 300,
253+
workers = 200,
253254
tasks_max = 10,
254255
verbose = T,
255-
launch_max = 5
256+
launch_max = 5,
257+
seconds_idle = 30
256258
),
257259
crew_controller_slurm(
258260
name = "tier_4",
@@ -261,16 +263,21 @@ tar_script({
261263
workers = 30,
262264
tasks_max = 10,
263265
verbose = T,
264-
launch_max = 5
266+
launch_max = 5,
267+
seconds_idle = 30
265268
)
266269
)
267270
),
268-
trust_object_timestamps = TRUE
271+
trust_object_timestamps = TRUE,
272+
workspaces = "annotation_tbl_light_ffcd3d5a64bedf1f"
269273
)
270274

271275
lighten_annotation = function(target_name, my_store ){
272276
annotation_tbl = tar_read_raw( target_name, store = my_store )
273-
if(annotation_tbl |> is.null()) return(NULL)
277+
if(annotation_tbl |> is.null()) {
278+
warning("this annotation is null -> ", target_name)
279+
return(NULL)
280+
}
274281

275282
annotation_tbl |>
276283
unnest(blueprint_scores_fine) |>
@@ -314,7 +321,7 @@ job::job({
314321
tar_make(
315322
script = "/vast/scratch/users/mangiola.s/lighten_annotation_tbl_target.R",
316323
store = "/vast/scratch/users/mangiola.s/lighten_annotation_tbl_target",
317-
reporter = "summary", callr_function = NULL
324+
reporter = "summary"
318325
)
319326

320327
})
@@ -324,31 +331,69 @@ library(arrow)
324331
library(dplyr)
325332
library(duckdb)
326333

327-
# Write annotation light
328-
tar_read(annotation_tbl_light, store = "/vast/scratch/users/mangiola.s/lighten_annotation_tbl_target") |>
329-
rename(
330-
blueprint_first_labels_fine = blueprint_first.labels.fine,
331-
monaco_first_labels_fine = monaco_first.labels.fine,
332-
azimuth_predicted_celltype_l2 = azimuth_predicted.celltype.l2
333-
) |>
334-
write_parquet("/vast/projects/cellxgene_curated/metadata_cellxgenedp_Apr_2024/annotation_tbl_light.parquet")
334+
#' Write a table to a Parquet file through a temporary in-memory DuckDB database.
#'
#' The table is registered as the view `data_tbl_view` and exported with
#' DuckDB's `COPY ... TO ... (FORMAT PARQUET)` statement.
#'
#' @param data_tbl Table handed to `duckdb::duckdb_register()`.
#'   NOTE(review): `duckdb_register()` is documented for data frames, while the
#'   call site in this script pipes in a lazy `tbl` -- confirm this is supported.
#' @param output_parquet Path of the Parquet file to write.
#' @param compression Compression codec passed to `COPY` (default "gzip").
#' @return `TRUE`, invisibly (the value the trailing `dbDisconnect()` used to return).
write_parquet_to_parquet = function(data_tbl, output_parquet, compression = "gzip") {
335+
336+
# Establish connection to DuckDB in-memory database
337+
con_write <- dbConnect(duckdb::duckdb(), dbdir = ":memory:")
338+
on.exit(dbDisconnect(con_write, shutdown = TRUE), add = TRUE) # close the connection even if a later step errors
339+
# Register `data_tbl` within the DuckDB connection (this doesn't load it into memory)
340+
duckdb::duckdb_register(con_write, "data_tbl_view", data_tbl)
341+
342+
# Use DuckDB's COPY command to write `data_tbl` directly to Parquet with compression; dbQuoteString() escapes quotes in the path and codec
343+
copy_query <- paste0(
344+
"COPY data_tbl_view TO ", DBI::dbQuoteString(con_write, output_parquet), " (FORMAT PARQUET, COMPRESSION ", DBI::dbQuoteString(con_write, compression), ");"
345+
)
346+
347+
# Execute the COPY command
348+
dbExecute(con_write, copy_query)
349+
350+
# Unregister the temporary view
351+
duckdb::duckdb_unregister(con_write, "data_tbl_view")
352+
353+
# Disconnection is handled by the on.exit() handler registered above
354+
invisible(TRUE)
355+
}
335356

336-
cell_metadata <- tbl(
337-
dbConnect(duckdb::duckdb(), dbdir = ":memory:"),
338-
sql("SELECT * FROM read_parquet('/vast/projects/cellxgene_curated/metadata_cellxgenedp_Apr_2024/cell_metadata.parquet')")
339-
) |>
357+
358+
# Write annotation light
359+
cell_metadata <-
360+
tbl(
361+
dbConnect(duckdb::duckdb(), dbdir = ":memory:"),
362+
sql("SELECT * FROM read_parquet('/vast/projects/cellxgene_curated/metadata_cellxgenedp_Apr_2024/cell_metadata.parquet')")
363+
) |>
340364
mutate(cell_ = paste0(cell_, "___", dataset_id)) |>
341365
select(cell_, observation_joinid, contains("cell_type"), dataset_id, self_reported_ethnicity, tissue, donor_id, sample_id, is_primary_data, assay)
342366

343367

368+
cell_annotation =
369+
tar_read(annotation_tbl_light, store = "/vast/scratch/users/mangiola.s/lighten_annotation_tbl_target") |>
370+
rename(
371+
blueprint_first_labels_fine = blueprint_first.labels.fine,
372+
monaco_first_labels_fine = monaco_first.labels.fine,
373+
azimuth_predicted_celltype_l2 = azimuth_predicted.celltype.l2
374+
)
375+
376+
empty_droplet =
377+
tar_read(empty_tbl_tier_1, store = "/vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store") |>
378+
c(tar_read(empty_tbl_tier_2, store = "/vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store")) |>
379+
c(tar_read(empty_tbl_tier_3, store = "/vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store")) |>
380+
c(tar_read(empty_tbl_tier_4, store = "/vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store")) |>
381+
bind_rows() |>
382+
rename(cell_ = .cell)
383+
344384

345-
tar_read(annotation_tbl_light, store = "/vast/scratch/users/mangiola.s/lighten_annotation_tbl_target") |>
346-
left_join(cell_metadata, copy = TRUE) |>
347-
write_parquet("/vast/projects/cellxgene_curated/metadata_cellxgenedp_Apr_2024/cell_annotation.parquet")
385+
cell_metadata |>
386+
left_join(empty_droplet, copy=TRUE) |>
387+
left_join(cell_annotation, copy=TRUE) |>
388+
write_parquet_to_parquet("/vast/projects/cellxgene_curated/metadata_cellxgenedp_Apr_2024/cell_annotation.parquet")
348389

349390
system("~/bin/rclone copy /vast/projects/cellxgene_curated/metadata_cellxgenedp_Apr_2024/cell_annotation.parquet box_adelaide:/Mangiola_ImmuneAtlas/reannotation_consensus/")
350391

351-
392+
tar_workspace(
393+
"annotation_tbl_light_ffcd3d5a64bedf1f",
394+
script = "/vast/scratch/users/mangiola.s/lighten_annotation_tbl_target.R",
395+
store = "/vast/scratch/users/mangiola.s/lighten_annotation_tbl_target",
396+
)
352397

353398
# tar_workspaces(annotation_tbl_light_c8078b8175604dd3, store = "/vast/scratch/users/mangiola.s/lighten_annotation_tbl_target")
354399

0 commit comments

Comments
 (0)