1- library(glue )
21library(dplyr )
32library(tibble )
43library(glue )
54library(purrr )
6- library(targets )
75library(stringr )
86library(HPCell )
9- library(crew.cluster )
107library(arrow )
118library(CuratedAtlasQueryR )
129directory = " /vast/scratch/users/shen.m/Census_rerun/split_h5ad_based_on_sample_id/"
1310sample_anndata <- dir(glue(" {directory}" ), full.names = T )
1411downloaded_samples_tbl <- read_parquet(" /vast/scratch/users/shen.m/Census_rerun/census_samples_to_download_groups.parquet" )
1512downloaded_samples_tbl <- downloaded_samples_tbl | >
16- rename(cell_number = list_length ) | >
13+ dplyr :: rename(cell_number = list_length ) | >
1714 mutate(cell_number = cell_number | > as.integer(),
1815 file_name = glue(" {directory}{sample_2}.h5ad" ) | > as.character(),
1916 tier = case_when(
@@ -104,10 +101,9 @@ job::job({
104101
105102 sample_names | >
106103 initialise_hpc(
104+ store = " /vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store/" ,
107105 gene_nomenclature = " ensembl" ,
108106 data_container_type = " anndata" ,
109- store = " /vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store" ,
110- # store = "/vast/projects/cellxgene_curated/census_hpcell_oct_2024/target_store_1_20",
111107 tier = tiers ,
112108 computing_resources = list (
113109 crew_controller_slurm(
@@ -117,7 +113,8 @@ job::job({
117113 workers = 300 ,
118114 tasks_max = 10 ,
119115 verbose = T ,
120- launch_max = 5
116+ launch_max = 10 ,
117+ seconds_idle = 30
121118 ),
122119
123120 crew_controller_slurm(
@@ -127,7 +124,8 @@ job::job({
127124 workers = 200 ,
128125 tasks_max = 10 ,
129126 verbose = T ,
130- launch_max = 5
127+ launch_max = 10 ,
128+ seconds_idle = 30
131129 ),
132130 crew_controller_slurm(
133131 name = " tier_3" ,
@@ -136,7 +134,8 @@ job::job({
136134 workers = 100 ,
137135 tasks_max = 10 ,
138136 verbose = T ,
139- launch_max = 5
137+ launch_max = 5 ,
138+ seconds_idle = 30
140139 ),
141140 crew_controller_slurm(
142141 name = " tier_4" ,
@@ -145,18 +144,19 @@ job::job({
145144 workers = 100 ,
146145 tasks_max = 10 ,
147146 verbose = T ,
148- launch_max = 5
147+ launch_max = 5 ,
148+ seconds_idle = 30
149149 )
150150 ),
151- verbosity = " verbose_positives " ,
151+ verbosity = " summary " ,
152152 # debug_step = "annotation_tbl_tier_4",
153153 update = " never" ,
154154 error = " continue" ,
155155 garbage_collection = 100 ,
156156 workspace_on_error = TRUE
157157
158158 ) | >
159- tranform_assay (fx = function s , target_output = " sce_transformed" ) | >
159+ transform_assay (fx = function s , target_output = " sce_transformed" ) | >
160160
161161 # # Remove empty outliers based on RNA count threshold per cell
162162 remove_empty_threshold(target_input = " sce_transformed" , RNA_feature_threshold = 200 ) | >
@@ -171,7 +171,7 @@ job::job({
171171
172172
173173
174- tar_meta(starts_with(" annotation_tbl_tier_4 " ), store = " /vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store" ) | >
174+ tar_meta(starts_with(" annotation_tbl_ " ), store = " /vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store" ) | >
175175 filter(! data | > is.na()) | > arrange(desc(time )) | > select(error , name )
176176
177177# I have to check the input of this NULL target
@@ -207,7 +207,6 @@ annotation_label_transfer(sce_transformed_tier_4, empty_droplets_tbl = empty_tbl
207207# '
208208# ' @example Usage:
209209# ' The pipeline script is saved as `/vast/scratch/users/mangiola.s/lighten_annotation_tbl_target.R` and can be run using `tar_make()`.
210-
211210tar_script({
212211 library(dplyr )
213212 library(magrittr )
@@ -218,41 +217,44 @@ tar_script({
218217 library(crew.cluster )
219218 tar_option_set(
220219 memory = " transient" ,
221- garbage_collection = 1000 ,
220+ garbage_collection = 100 ,
222221 storage = " worker" ,
223222 retrieval = " worker" ,
224223 error = " continue" ,
225- debug = " annotation_tbl_light" ,
224+ # debug = "annotation_tbl_light",
226225 cue = tar_cue(mode = " never" ),
227226 controller = crew_controller_group(
228227 list (
229228 crew_controller_slurm(
230229 name = " tier_1" ,
231230 script_lines = " #SBATCH --mem 8G" ,
232231 slurm_cpus_per_task = 1 ,
233- workers = 500 ,
232+ workers = 200 ,
234233 tasks_max = 10 ,
235234 verbose = T ,
236- launch_max = 5
235+ launch_max = 5 ,
236+ seconds_idle = 30
237237 ),
238238
239239 crew_controller_slurm(
240240 name = " tier_2" ,
241241 script_lines = " #SBATCH --mem 10G" ,
242242 slurm_cpus_per_task = 1 ,
243- workers = 300 ,
243+ workers = 200 ,
244244 tasks_max = 10 ,
245245 verbose = T ,
246- launch_max = 5
246+ launch_max = 5 ,
247+ seconds_idle = 30
247248 ),
248249 crew_controller_slurm(
249250 name = " tier_3" ,
250251 script_lines = " #SBATCH --mem 15G" ,
251252 slurm_cpus_per_task = 1 ,
252- workers = 300 ,
253+ workers = 200 ,
253254 tasks_max = 10 ,
254255 verbose = T ,
255- launch_max = 5
256+ launch_max = 5 ,
257+ seconds_idle = 30
256258 ),
257259 crew_controller_slurm(
258260 name = " tier_4" ,
@@ -261,16 +263,21 @@ tar_script({
261263 workers = 30 ,
262264 tasks_max = 10 ,
263265 verbose = T ,
264- launch_max = 5
266+ launch_max = 5 ,
267+ seconds_idle = 30
265268 )
266269 )
267270 ),
268- trust_object_timestamps = TRUE
271+ trust_object_timestamps = TRUE ,
272+ workspaces = " annotation_tbl_light_ffcd3d5a64bedf1f"
269273 )
270274
271275 lighten_annotation = function (target_name , my_store ){
272276 annotation_tbl = tar_read_raw( target_name , store = my_store )
273- if (annotation_tbl | > is.null()) return (NULL )
277+ if (annotation_tbl | > is.null()) {
278+ warning(" this annotation is null -> " , target_name )
279+ return (NULL )
280+ }
274281
275282 annotation_tbl | >
276283 unnest(blueprint_scores_fine ) | >
@@ -314,7 +321,7 @@ job::job({
314321 tar_make(
315322 script = " /vast/scratch/users/mangiola.s/lighten_annotation_tbl_target.R" ,
316323 store = " /vast/scratch/users/mangiola.s/lighten_annotation_tbl_target" ,
317- reporter = " summary" , callr_function = NULL
324+ reporter = " summary"
318325 )
319326
320327})
@@ -324,31 +331,69 @@ library(arrow)
324331library(dplyr )
325332library(duckdb )
326333
327- # Write annotation light
328- tar_read(annotation_tbl_light , store = " /vast/scratch/users/mangiola.s/lighten_annotation_tbl_target" ) | >
329- rename(
330- blueprint_first_labels_fine = blueprint_first.labels.fine ,
331- monaco_first_labels_fine = monaco_first.labels.fine ,
332- azimuth_predicted_celltype_l2 = azimuth_predicted.celltype.l2
333- ) | >
334- write_parquet(" /vast/projects/cellxgene_curated/metadata_cellxgenedp_Apr_2024/annotation_tbl_light.parquet" )
#' Write a table to a Parquet file through DuckDB's COPY command
#'
#' Opens a throw-away in-memory DuckDB database, registers `data_tbl` in it
#' under the name `data_tbl_view`, and runs
#' `COPY data_tbl_view TO <output_parquet> (FORMAT PARQUET, COMPRESSION <compression>)`.
#' The view is unregistered and the connection shut down when the function
#' exits, whether or not the COPY succeeded.
#'
#' @param data_tbl Table to export; handed unchanged to
#'   `duckdb::duckdb_register()`.
#' @param output_parquet Path of the Parquet file to create (single string).
#' @param compression Parquet compression codec passed to COPY (single
#'   string, default "gzip").
#' @return Invisibly `TRUE`; called for its side effect of writing the file.
write_parquet_to_parquet <- function(data_tbl, output_parquet, compression = "gzip") {
  # Fail early with a clear message instead of producing a malformed COPY query.
  stopifnot(
    is.character(output_parquet), length(output_parquet) == 1L, !is.na(output_parquet),
    is.character(compression), length(compression) == 1L, !is.na(compression)
  )

  # Establish connection to a DuckDB in-memory database and guarantee it is
  # closed even if a later step throws.
  con_write <- dbConnect(duckdb::duckdb(), dbdir = ":memory:")
  on.exit(dbDisconnect(con_write, shutdown = TRUE), add = TRUE)

  # Register `data_tbl` within the DuckDB connection.
  # NOTE(review): the original comment stated this does not load the data into
  # memory; that depends on what `data_tbl` is (a lazy dbplyr table may be
  # collected first) -- confirm against the duckdb_register() documentation.
  duckdb::duckdb_register(con_write, "data_tbl_view", data_tbl)
  # after = FALSE: the view must be unregistered *before* the disconnect above.
  on.exit(
    duckdb::duckdb_unregister(con_write, "data_tbl_view"),
    add = TRUE,
    after = FALSE
  )

  # Build the COPY statement with properly quoted string literals so that a
  # path (or codec name) containing a single quote cannot break the SQL.
  copy_query <- paste0(
    "COPY data_tbl_view TO ", dbQuoteString(con_write, output_parquet),
    " (FORMAT PARQUET, COMPRESSION ", dbQuoteString(con_write, compression), ");"
  )

  # Execute the COPY command; the Parquet file is written by DuckDB itself.
  dbExecute(con_write, copy_query)

  invisible(TRUE)
}
335356
336- cell_metadata <- tbl(
337- dbConnect(duckdb :: duckdb(), dbdir = " :memory:" ),
338- sql(" SELECT * FROM read_parquet('/vast/projects/cellxgene_curated/metadata_cellxgenedp_Apr_2024/cell_metadata.parquet')" )
339- ) | >
357+
358+ # Write annotation light
359+ cell_metadata <-
360+ tbl(
361+ dbConnect(duckdb :: duckdb(), dbdir = " :memory:" ),
362+ sql(" SELECT * FROM read_parquet('/vast/projects/cellxgene_curated/metadata_cellxgenedp_Apr_2024/cell_metadata.parquet')" )
363+ ) | >
340364 mutate(cell_ = paste0(cell_ , " ___" , dataset_id )) | >
341365 select(cell_ , observation_joinid , contains(" cell_type" ), dataset_id , self_reported_ethnicity , tissue , donor_id , sample_id , is_primary_data , assay )
342366
343367
368+ cell_annotation =
369+ tar_read(annotation_tbl_light , store = " /vast/scratch/users/mangiola.s/lighten_annotation_tbl_target" ) | >
370+ rename(
371+ blueprint_first_labels_fine = blueprint_first.labels.fine ,
372+ monaco_first_labels_fine = monaco_first.labels.fine ,
373+ azimuth_predicted_celltype_l2 = azimuth_predicted.celltype.l2
374+ )
375+
376+ empty_droplet =
377+ tar_read(empty_tbl_tier_1 , store = " /vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store" ) | >
378+ c(tar_read(empty_tbl_tier_2 , store = " /vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store" )) | >
379+ c(tar_read(empty_tbl_tier_3 , store = " /vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store" )) | >
380+ c(tar_read(empty_tbl_tier_4 , store = " /vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store" )) | >
381+ bind_rows() | >
382+ rename(cell_ = .cell )
383+
344384
345- tar_read(annotation_tbl_light , store = " /vast/scratch/users/mangiola.s/lighten_annotation_tbl_target" ) | >
346- left_join(cell_metadata , copy = TRUE ) | >
347- write_parquet(" /vast/projects/cellxgene_curated/metadata_cellxgenedp_Apr_2024/cell_annotation.parquet" )
385+ cell_metadata | >
386+ left_join(empty_droplet , copy = TRUE ) | >
387+ left_join(cell_annotation , copy = TRUE ) | >
388+ write_parquet_to_parquet(" /vast/projects/cellxgene_curated/metadata_cellxgenedp_Apr_2024/cell_annotation.parquet" )
348389
349390system(" ~/bin/rclone copy /vast/projects/cellxgene_curated/metadata_cellxgenedp_Apr_2024/cell_annotation.parquet box_adelaide:/Mangiola_ImmuneAtlas/reannotation_consensus/" )
350391
351-
392+ tar_workspace(
393+ " annotation_tbl_light_ffcd3d5a64bedf1f" ,
394+ script = " /vast/scratch/users/mangiola.s/lighten_annotation_tbl_target.R" ,
395+ store = " /vast/scratch/users/mangiola.s/lighten_annotation_tbl_target" ,
396+ )
352397
353398# tar_workspaces(annotation_tbl_light_c8078b8175604dd3, store = "/vast/scratch/users/mangiola.s/lighten_annotation_tbl_target")
354399
0 commit comments