@@ -206,8 +206,8 @@ tar_script({
206206 storage = " worker" ,
207207 retrieval = " worker" ,
208208 error = " continue" ,
209- # debug = "saved_dataset_cpm_b1de4e660a0a9cf5 ",
210- cue = tar_cue( mode = " never " ),
209+ debug = " dataset_id_sce " ,
210+
211211 workspace_on_error = TRUE ,
212212 controller = crew_controller_group(
213213 list (
@@ -294,36 +294,36 @@ tar_script({
294294 # cores = as.numeric(Sys.getenv("SLURM_CPUS_PER_TASK", unset = 1))
295295 # bp <- MulticoreParam(workers = cores , progressbar = TRUE) # Adjust the number of workers as needed
296296 #
297- dataset_id_sce | >
298- purrr :: transpose() | >
299- lapply(
300- FUN = function ( x ) {
301-
302- .x = x [[ 2 ]]
303- .y = x [[ 1 ]]
304-
305- # Check if the 'sce' has only one cell (column)
306- if (ncol(assay(.x )) == 1 ) {
307-
308- # Duplicate the assay to prevent saving errors due to single-column matrices
309- my_assay = cbind(assay(.x ), assay(.x ))
310- # Rename the second column to distinguish it
311- colnames(my_assay )[2 ] = paste0(" DUMMY" , " ___" , colnames(my_assay )[2 ])
312-
313- cd = colData(.x )
314- cd = cd | > rbind(cd )
315- rownames(cd )[2 ] = paste0(" DUMMY" , " ___" , rownames(cd )[2 ])
316-
317-
318-
319- .x = SingleCellExperiment(assay = list ( counts = my_assay ), colData = cd )
320- }
321-
322-
323- # TEMPORARY FOR SOME REASON THE MIN COUNTS IS NOT 0 FOR SOME SAMPLES
324- .x = HPCell ::: check_if_assay_minimum_count_is_zero_and_correct_TEMPORARY(.x , assays(.x ) | > names() | > _[1 ], subset_up_to_number_of_cells = 100 )
325-
326- .x = SingleCellExperiment(assay = list ( counts = .x | > assay()), colData = colData(.x ))
297+
298+
299+
300+ .x = dataset_id_sce | > pull( sce ) | > _[[ 1 ]]
301+ .y = dataset_id_sce | > pull( file_id_cellNexus_single_cell ) | > _[[ 1 ]] | > str_remove( " \\ .h5ad " )
302+
303+ .x | > assays() | > names() = " counts "
304+
305+ # # Check if the 'sce' has only one cell (column)
306+ # if(ncol(assay(.x)) == 1) {
307+ #
308+ # # Duplicate the assay to prevent saving errors due to single-column matrices
309+ # my_assay = cbind(assay(.x), assay(.x))
310+ # # Rename the second column to distinguish it
311+ # colnames(my_assay)[2] = paste0("DUMMY", "___", colnames(my_assay)[2])
312+ #
313+ # cd = colData(.x)
314+ # cd = cd |> rbind(cd)
315+ # rownames(cd)[2] = paste0("DUMMY", "___", rownames(cd)[2])
316+ #
317+ #
318+ #
319+ # .x = SingleCellExperiment(assay = list( counts = my_assay ), colData = cd)
320+ # }
321+ #
322+ #
323+ # # TEMPORARY FOR SOME REASON THE MIN COUNTS IS NOT 0 FOR SOME SAMPLES
324+ # .x = HPCell:::check_if_assay_minimum_count_is_zero_and_correct_TEMPORARY(.x, assays(.x) |> names() |> _[1], subset_up_to_number_of_cells = 100)
325+ #
326+ # .x = SingleCellExperiment(assay = list( counts = .x |> assay()), colData = colData(.x))
327327
328328
329329 # My attempt to save an integer, sparse, delayed matrix (with zellkonverter it is not possible to save integers)
@@ -334,12 +334,7 @@ tar_script({
334334 .x | > save_experiment_data(glue(" {cache_directory}/{.y}" ))
335335
336336 return (TRUE ) # Indicate successful saving
337- }
338- # ,
339- # BPPARAM = bp # Use the defined parallel backend
340- )
341-
342- return (" saved" )
337+
343338
344339 }
345340
@@ -489,13 +484,9 @@ tar_script({
489484 # cores = as.numeric(Sys.getenv("SLURM_CPUS_PER_TASK", unset = 1))
490485 # bp <- MulticoreParam(workers = cores , progressbar = TRUE) # Adjust the number of workers as needed
491486 #
492- dataset_id_sce | >
493- purrr :: transpose() | >
494- purrr :: map(
495- ~ {
496- x = .x
497- .x = x [[2 ]]
498- .y = x [[1 ]]
487+
488+ .x = dataset_id_sce | > pull(sce ) | > _[[1 ]]
489+ .y = dataset_id_sce | > pull(file_id_cellNexus_single_cell ) | > _[[1 ]]
499490
500491 # Check if the 'sce' has only one cell (column)
501492 if (ncol(assay(.x )) == 1 ) {
@@ -541,13 +532,7 @@ tar_script({
541532 .x | > save_experiment_data(glue(" {cache_directory}/{.y}" ))
542533
543534 return (TRUE ) # Indicate successful saving
544- },
545- .progress = TRUE
546- # ,
547- # BPPARAM = bp # Use the defined parallel backend
548- )
549-
550- return (" saved" )
535+
551536
552537
553538
@@ -567,20 +552,26 @@ tar_script({
567552 sql(glue(" SELECT * FROM read_parquet('{file_id_db_file}')" ))
568553 ) | >
569554 filter(dataset_id == my_dataset_id ) | >
570- select(cell_id , sample_id , dataset_id , file_id_cellNexus_single_cell ) | >
571-
572- # Drop extension because it is added later
573- mutate(file_id_cellNexus_single_cell = file_id_cellNexus_single_cell | > str_remove(" \\ .h5ad" )) | >
574- as_tibble()
555+ select(cell_id , sample_id , dataset_id , file_id_cellNexus_single_cell )
556+ # |>
557+ #
558+ # # Drop extension because it is added later
559+ # mutate(file_id_cellNexus_single_cell = file_id_cellNexus_single_cell |> str_remove("\\.h5ad")) |>
560+ # as_tibble()
561+
562+ file_id_db =
563+ target_name_grouped_by_dataset_id | >
564+ left_join(file_id_db , copy = TRUE )
565+
575566
576567 # Parallelise
577568 cores = as.numeric(Sys.getenv(" SLURM_CPUS_PER_TASK" , unset = 1 ))
578569 bp <- MulticoreParam(workers = cores , progressbar = TRUE ) # Adjust the number of workers as needed
579570
580571 # Begin processing the data pipeline with the initial dataset 'target_name_grouped_by_dataset_id'
581572 sce_df =
582- target_name_grouped_by_dataset_id | >
583-
573+ file_id_db | >
574+ nest( cells = cell_id ) | >
584575 # Step 1: Read raw data for each 'target_name' and store it in a new column 'sce'
585576 mutate(
586577 sce = bplapply(
@@ -591,7 +582,9 @@ tar_script({
591582 ) | >
592583
593584 # This should not be needed, but there are some data sets with zero cells
594- filter(! map_lgl(sce , is.null ))
585+ filter(! map_lgl(sce , is.null )) | >
586+
587+ mutate(sce = map2(sce , cells , ~ .x | > filter(.cell %in% .y $ cell_id ), .progress = TRUE ))
595588
596589
597590
@@ -601,9 +594,7 @@ tar_script({
601594 }
602595
603596 # plan(multisession, workers = 20)
604-
605- sce_df =
606- sce_df | >
597+ sce_df | >
607598
608599 # # Step 4: Group the data by 'dataset_id' and 'tar_group' for further summarization
609600 # group_by(dataset_id, tar_group, chunk) |>
@@ -616,18 +607,19 @@ tar_script({
616607 mutate(sce = map(sce , ~ SingleCellExperiment(assay = assays(.x ), colData = colData(.x )) )) | >
617608
618609 # Step 5: Combine all 'sce' objects within each group into a single 'sce' object
619- summarise( sce = list (do.call(cbind , args = sce ) ) ) | >
610+ group_by(file_id_cellNexus_single_cell ) | >
611+ summarise( sce = list (do.call(cbind , args = sce ) ) )
620612
621- mutate(sce = map(sce ,
622- ~ { .x =
623- .x | >
624- left_join(file_id_db , by = join_by(.cell == cell_id , dataset_id == dataset_id , sample_id == sample_id ))
625- .x | >
626- HPCell ::: splitColData(colData(.x )$ file_id_cellNexus_single_cell ) | > # Split 'sce' by 'cell_type'
627- enframe(name = " file_id_cellNexus_single_cell" , value = " sce" ) # Convert to tibble with 'cell_type' and 'sce' columns
628- })) | >
613+ # mutate(sce = map(sce,
614+ # ~ { .x =
615+ # .x |>
616+ # left_join(file_id_db, by = join_by(.cell==cell_id, dataset_id==dataset_id, sample_id==sample_id))
617+ # .x |>
618+ # HPCell:::splitColData(colData(.x)$file_id_cellNexus_single_cell) |> # Split 'sce' by 'cell_type'
619+ # enframe(name = "file_id_cellNexus_single_cell", value = "sce") # Convert to tibble with 'cell_type' and 'sce' columns
620+ # })) |>
629621 # Step 8: Unnest the list of 'sce' objects to have one row per 'cell_type'
630- unnest (sce )
622+ # unnest_single_cell_experiment (sce)
631623
632624
633625 }
@@ -647,7 +639,7 @@ tar_script({
647639 dbConnect(duckdb :: duckdb(), dbdir = " :memory:" ),
648640 sql(glue(" SELECT * FROM read_parquet('{cell_metadata}')" ))
649641 ) | >
650- distinct(dataset_id , sample_id , sample_chunk , cell_chunk ) | >
642+ distinct(dataset_id , sample_id , sample_chunk , cell_chunk , file_id_cellNexus_single_cell ) | >
651643 as_tibble(),
652644 copy = T
653645 )
@@ -692,7 +684,7 @@ tar_script({
692684
693685 # The input. DO NOT DELETE
694686 tar_target(my_store , " /vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/census_hpcell_oct_2024/target_store" , deployment = " main" ),
695- tar_target(cache_directory , " /vast/scratch/users/mangiola.s/cellNexus/cellxgene/26_11_2024 " , deployment = " main" ),
687+ tar_target(cache_directory , " /vast/scratch/users/mangiola.s/cellNexus/cellxgene/18_12_2024 " , deployment = " main" ),
696688 tar_target(
697689 cell_metadata ,
698690 " /vast/projects/cellxgene_curated/cellNexus/cell_metadata_cell_type_consensus_v1_0_4.parquet" ,
@@ -723,7 +715,11 @@ tar_script({
723715 tar_target(
724716 target_name_grouped_by_dataset_id ,
725717 create_chunks_for_reading_and_saving(dataset_id_sample_id , cell_metadata ) | >
726- group_by(dataset_id , sample_chunk , cell_chunk ) | >
718+
719+ # !!! JUST FOR TESTING !!!
720+ filter(file_id_cellNexus_single_cell == " 004e0dd96de6f3091dac2cf8cc64ddc4___1.h5ad" ) | >
721+
722+ group_by(dataset_id , sample_chunk , cell_chunk , file_id_cellNexus_single_cell ) | >
727723 tar_group(),
728724 iteration = " group" ,
729725 resources = tar_resources(
@@ -794,24 +790,26 @@ job::job({
794790 tar_make(
795791 script = paste0(store_file_cellNexus , " _target_script.R" ),
796792 store = store_file_cellNexus ,
797- reporter = " summary " # , callr_function = NULL
793+ reporter = " verbose_positives " # , callr_function = NULL
798794 )
799795
800796})
801797
802798tar_make(script = paste0(store_file_cellNexus , " _target_script.R" ), store = store_file_cellNexus , callr_function = NULL )
803799
804- x = tar_read(saved_dataset , store = store_file_cellNexus )
800+ x = tar_read(dataset_id_sample_id , store = store_file_cellNexus )
801+ y = tar_read(target_name_grouped_by_dataset_id , store = store_file_cellNexus )
802+
805803
806804tar_meta(store = store_file_cellNexus ) | >
807805 arrange(desc(time )) | >
808806 filter(! error | > is.na()) | >
809807 select(name , error , warnings , time )
810808
811- tar_workspace(saved_dataset_rank_912838f659391dd0 , store = store_file_cellNexus , script = paste0(store_file_cellNexus , " _target_script.R" ))
809+ tar_workspace(save_anndata_86c62bdb3d08d7b6 , store = store_file_cellNexus , script = paste0(store_file_cellNexus , " _target_script.R" ))
812810
813- tar_invalidate(saved_dataset , store = store_file_cellNexus )
814- tar_delete(saved_dataset , , store = store_file_cellNexus )
811+ tar_invalidate(starts_with( " save_anndata " ) , store = store_file_cellNexus )
812+ tar_delete(starts_with( " save_anndata " ) , , store = store_file_cellNexus )
815813
816814target_name_grouped_by_dataset_id = tar_read(target_name_grouped_by_dataset_id , store = store_file_cellNexus )
817815
0 commit comments