Skip to content
This repository was archived by the owner on Oct 14, 2025. It is now read-only.

Commit 0ccef13

Browse files
committed
update dev steps
1 parent 0fa42be commit 0ccef13

File tree

3 files changed

+80
-83
lines changed

3 files changed

+80
-83
lines changed

dev/2_execute_hpcell_on_census_and_defining_data_tranformation.R

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,6 @@ annotation_label_transfer(sce_transformed_tier_4, empty_droplets_tbl = empty_tbl
182182
#' @function `lighten_annotation`: Processes each annotation table target, unnesting and selecting specific columns to reduce data size.
183183
#'
184184
#' @example Usage:
185-
#' The pipeline script is saved as `/vast/scratch/users/mangiola.s/lighten_annotation_tbl_target.R` and can be run using `tar_make()`.
186185
tar_script({
187186
library(dplyr)
188187
library(magrittr)

dev/3_prepare_local_cache_splitting_du_dataset_and_cell_type.R

Lines changed: 79 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -206,8 +206,8 @@ tar_script({
206206
storage = "worker",
207207
retrieval = "worker",
208208
error = "continue",
209-
# debug = "saved_dataset_cpm_b1de4e660a0a9cf5",
210-
cue = tar_cue(mode = "never"),
209+
debug = "dataset_id_sce",
210+
211211
workspace_on_error = TRUE,
212212
controller = crew_controller_group(
213213
list(
@@ -294,36 +294,36 @@ tar_script({
294294
# cores = as.numeric(Sys.getenv("SLURM_CPUS_PER_TASK", unset = 1))
295295
# bp <- MulticoreParam(workers = cores , progressbar = TRUE) # Adjust the number of workers as needed
296296
#
297-
dataset_id_sce |>
298-
purrr::transpose() |>
299-
lapply(
300-
FUN = function(x) {
301-
302-
.x = x[[2]]
303-
.y = x[[1]]
304-
305-
# Check if the 'sce' has only one cell (column)
306-
if(ncol(assay(.x)) == 1) {
307-
308-
# Duplicate the assay to prevent saving errors due to single-column matrices
309-
my_assay = cbind(assay(.x), assay(.x))
310-
# Rename the second column to distinguish it
311-
colnames(my_assay)[2] = paste0("DUMMY", "___", colnames(my_assay)[2])
312-
313-
cd = colData(.x)
314-
cd = cd |> rbind(cd)
315-
rownames(cd)[2] = paste0("DUMMY", "___", rownames(cd)[2])
316-
317-
318-
319-
.x = SingleCellExperiment(assay = list( counts = my_assay ), colData = cd)
320-
}
321-
322-
323-
# TEMPORARY FOR SOME REASON THE MIN COUNTS IS NOT 0 FOR SOME SAMPLES
324-
.x = HPCell:::check_if_assay_minimum_count_is_zero_and_correct_TEMPORARY(.x, assays(.x) |> names() |> _[1], subset_up_to_number_of_cells = 100)
325-
326-
.x = SingleCellExperiment(assay = list( counts = .x |> assay()), colData = colData(.x))
297+
298+
299+
300+
.x = dataset_id_sce |> pull(sce) |> _[[1]]
301+
.y = dataset_id_sce |> pull(file_id_cellNexus_single_cell) |> _[[1]] |> str_remove("\\.h5ad")
302+
303+
.x |> assays() |> names() = "counts"
304+
305+
# # Check if the 'sce' has only one cell (column)
306+
# if(ncol(assay(.x)) == 1) {
307+
#
308+
# # Duplicate the assay to prevent saving errors due to single-column matrices
309+
# my_assay = cbind(assay(.x), assay(.x))
310+
# # Rename the second column to distinguish it
311+
# colnames(my_assay)[2] = paste0("DUMMY", "___", colnames(my_assay)[2])
312+
#
313+
# cd = colData(.x)
314+
# cd = cd |> rbind(cd)
315+
# rownames(cd)[2] = paste0("DUMMY", "___", rownames(cd)[2])
316+
#
317+
#
318+
#
319+
# .x = SingleCellExperiment(assay = list( counts = my_assay ), colData = cd)
320+
# }
321+
#
322+
#
323+
# # TEMPORARY FOR SOME REASON THE MIN COUNTS IS NOT 0 FOR SOME SAMPLES
324+
# .x = HPCell:::check_if_assay_minimum_count_is_zero_and_correct_TEMPORARY(.x, assays(.x) |> names() |> _[1], subset_up_to_number_of_cells = 100)
325+
#
326+
# .x = SingleCellExperiment(assay = list( counts = .x |> assay()), colData = colData(.x))
327327

328328

329329
# My attempt to save an integer, sparse, delayed matrix (with zellkonverter it is not possible to save integers)
@@ -334,12 +334,7 @@ tar_script({
334334
.x |> save_experiment_data(glue("{cache_directory}/{.y}"))
335335

336336
return(TRUE) # Indicate successful saving
337-
}
338-
#,
339-
#BPPARAM = bp # Use the defined parallel backend
340-
)
341-
342-
return("saved")
337+
343338

344339
}
345340

@@ -489,13 +484,9 @@ tar_script({
489484
# cores = as.numeric(Sys.getenv("SLURM_CPUS_PER_TASK", unset = 1))
490485
# bp <- MulticoreParam(workers = cores , progressbar = TRUE) # Adjust the number of workers as needed
491486
#
492-
dataset_id_sce |>
493-
purrr::transpose() |>
494-
purrr::map(
495-
~ {
496-
x = .x
497-
.x = x[[2]]
498-
.y = x[[1]]
487+
488+
.x = dataset_id_sce |> pull(sce) |> _[[1]]
489+
.y = dataset_id_sce |> pull(file_id_cellNexus_single_cell) |> _[[1]]
499490

500491
# Check if the 'sce' has only one cell (column)
501492
if(ncol(assay(.x)) == 1) {
@@ -541,13 +532,7 @@ tar_script({
541532
.x |> save_experiment_data(glue("{cache_directory}/{.y}"))
542533

543534
return(TRUE) # Indicate successful saving
544-
},
545-
.progress = TRUE
546-
#,
547-
#BPPARAM = bp # Use the defined parallel backend
548-
)
549-
550-
return("saved")
535+
551536

552537

553538

@@ -567,20 +552,26 @@ tar_script({
567552
sql(glue("SELECT * FROM read_parquet('{file_id_db_file}')"))
568553
) |>
569554
filter(dataset_id == my_dataset_id) |>
570-
select(cell_id, sample_id, dataset_id, file_id_cellNexus_single_cell) |>
571-
572-
# Drop extension because it is added later
573-
mutate(file_id_cellNexus_single_cell = file_id_cellNexus_single_cell |> str_remove("\\.h5ad")) |>
574-
as_tibble()
555+
select(cell_id, sample_id, dataset_id, file_id_cellNexus_single_cell)
556+
# |>
557+
#
558+
# # Drop extension because it is added later
559+
# mutate(file_id_cellNexus_single_cell = file_id_cellNexus_single_cell |> str_remove("\\.h5ad")) |>
560+
# as_tibble()
561+
562+
file_id_db =
563+
target_name_grouped_by_dataset_id |>
564+
left_join(file_id_db, copy = TRUE)
565+
575566

576567
# Parallelise
577568
cores = as.numeric(Sys.getenv("SLURM_CPUS_PER_TASK", unset = 1))
578569
bp <- MulticoreParam(workers = cores , progressbar = TRUE) # Adjust the number of workers as needed
579570

580571
# Begin processing the data pipeline from 'file_id_db' (i.e. 'target_name_grouped_by_dataset_id' joined with the per-cell file IDs)
581572
sce_df =
582-
target_name_grouped_by_dataset_id |>
583-
573+
file_id_db |>
574+
nest(cells = cell_id) |>
584575
# Step 1: Read raw data for each 'target_name' and store it in a new column 'sce'
585576
mutate(
586577
sce = bplapply(
@@ -591,7 +582,9 @@ tar_script({
591582
) |>
592583

593584
# This should not be needed, but there are some data sets with zero cells
594-
filter(!map_lgl(sce, is.null))
585+
filter(!map_lgl(sce, is.null)) |>
586+
587+
mutate(sce = map2(sce, cells, ~ .x |> filter(.cell %in% .y$cell_id), .progress = TRUE))
595588

596589

597590

@@ -601,9 +594,7 @@ tar_script({
601594
}
602595

603596
# plan(multisession, workers = 20)
604-
605-
sce_df =
606-
sce_df |>
597+
sce_df |>
607598

608599
# # Step 4: Group the data by 'dataset_id' and 'tar_group' for further summarization
609600
# group_by(dataset_id, tar_group, chunk) |>
@@ -616,18 +607,19 @@ tar_script({
616607
mutate(sce = map(sce, ~ SingleCellExperiment(assay = assays(.x), colData = colData(.x)) )) |>
617608

618609
# Step 5: Combine all 'sce' objects within each group into a single 'sce' object
619-
summarise( sce = list(do.call(cbind, args = sce) ) ) |>
610+
group_by(file_id_cellNexus_single_cell) |>
611+
summarise( sce = list(do.call(cbind, args = sce) ) )
620612

621-
mutate(sce = map(sce,
622-
~ { .x =
623-
.x |>
624-
left_join(file_id_db, by = join_by(.cell==cell_id, dataset_id==dataset_id, sample_id==sample_id))
625-
.x |>
626-
HPCell:::splitColData(colData(.x)$file_id_cellNexus_single_cell) |> # Split 'sce' by 'cell_type'
627-
enframe(name = "file_id_cellNexus_single_cell", value = "sce") # Convert to tibble with 'cell_type' and 'sce' columns
628-
})) |>
613+
# mutate(sce = map(sce,
614+
# ~ { .x =
615+
# .x |>
616+
# left_join(file_id_db, by = join_by(.cell==cell_id, dataset_id==dataset_id, sample_id==sample_id))
617+
# .x |>
618+
# HPCell:::splitColData(colData(.x)$file_id_cellNexus_single_cell) |> # Split 'sce' by 'cell_type'
619+
# enframe(name = "file_id_cellNexus_single_cell", value = "sce") # Convert to tibble with 'cell_type' and 'sce' columns
620+
# })) |>
629621
# Step 8: Unnest the list of 'sce' objects to have one row per 'cell_type'
630-
unnest(sce)
622+
# unnest_single_cell_experiment(sce)
631623

632624

633625
}
@@ -647,7 +639,7 @@ tar_script({
647639
dbConnect(duckdb::duckdb(), dbdir = ":memory:"),
648640
sql(glue("SELECT * FROM read_parquet('{cell_metadata}')"))
649641
) |>
650-
distinct(dataset_id, sample_id, sample_chunk, cell_chunk) |>
642+
distinct(dataset_id, sample_id, sample_chunk, cell_chunk, file_id_cellNexus_single_cell) |>
651643
as_tibble(),
652644
copy=T
653645
)
@@ -692,7 +684,7 @@ tar_script({
692684

693685
# The input DO NOT DELETE
694686
tar_target(my_store, "/vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store", deployment = "main"),
695-
tar_target(cache_directory, "/vast/scratch/users/mangiola.s/cellNexus/cellxgene/26_11_2024", deployment = "main"),
687+
tar_target(cache_directory, "/vast/scratch/users/mangiola.s/cellNexus/cellxgene/18_12_2024", deployment = "main"),
696688
tar_target(
697689
cell_metadata,
698690
"/vast/projects/cellxgene_curated/cellNexus/cell_metadata_cell_type_consensus_v1_0_4.parquet",
@@ -723,7 +715,11 @@ tar_script({
723715
tar_target(
724716
target_name_grouped_by_dataset_id,
725717
create_chunks_for_reading_and_saving(dataset_id_sample_id, cell_metadata) |>
726-
group_by(dataset_id, sample_chunk, cell_chunk) |>
718+
719+
# !!! JUST FOR TESTING !!!
720+
filter(file_id_cellNexus_single_cell == "004e0dd96de6f3091dac2cf8cc64ddc4___1.h5ad") |>
721+
722+
group_by(dataset_id, sample_chunk, cell_chunk, file_id_cellNexus_single_cell) |>
727723
tar_group(),
728724
iteration = "group",
729725
resources = tar_resources(
@@ -794,24 +790,26 @@ job::job({
794790
tar_make(
795791
script = paste0(store_file_cellNexus, "_target_script.R"),
796792
store = store_file_cellNexus,
797-
reporter = "summary" #, callr_function = NULL
793+
reporter = "verbose_positives" #, callr_function = NULL
798794
)
799795

800796
})
801797

802798
tar_make(script = paste0(store_file_cellNexus, "_target_script.R"), store = store_file_cellNexus, callr_function = NULL)
803799

804-
x = tar_read(saved_dataset, store = store_file_cellNexus)
800+
x = tar_read(dataset_id_sample_id, store = store_file_cellNexus)
801+
y = tar_read(target_name_grouped_by_dataset_id, store = store_file_cellNexus)
802+
805803

806804
tar_meta(store = store_file_cellNexus) |>
807805
arrange(desc(time)) |>
808806
filter(!error |> is.na()) |>
809807
select(name, error, warnings, time)
810808

811-
tar_workspace(saved_dataset_rank_912838f659391dd0, store = store_file_cellNexus, script = paste0(store_file_cellNexus, "_target_script.R"))
809+
tar_workspace(save_anndata_86c62bdb3d08d7b6, store = store_file_cellNexus, script = paste0(store_file_cellNexus, "_target_script.R"))
812810

813-
tar_invalidate(saved_dataset, store = store_file_cellNexus)
814-
tar_delete(saved_dataset, , store = store_file_cellNexus)
811+
tar_invalidate(starts_with("save_anndata"), store = store_file_cellNexus)
812+
tar_delete(starts_with("save_anndata"), , store = store_file_cellNexus)
815813

816814
target_name_grouped_by_dataset_id =tar_read(target_name_grouped_by_dataset_id, store = store_file_cellNexus)
817815

dev/4_make_pseudobulk.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -774,7 +774,7 @@ job::job({
774774
se,
775775
verbose = TRUE,
776776
file = "/vast/projects/cellxgene_curated/cellNexus/pseudobulk_sample_cell_type_1_0_4.h5ad",
777-
X_name = "counts_scaled"
777+
X_name = "counts"
778778
)
779779
})
780780

0 commit comments

Comments
 (0)