@@ -8,11 +8,11 @@ library(duckdb)
 # Get input
 
 
-result_directory = "/vast/scratch/users/mangiola.s/pseudobulk_cellNexus_1_0_0_coarse"
+result_directory = "/vast/scratch/users/mangiola.s/pseudobulk_cellNexus_1_0_2"
 
 tar_script({
 
-  result_directory = "/vast/scratch/users/mangiola.s/pseudobulk_cellNexus_1_0_0_coarse"
+  result_directory = "/vast/scratch/users/mangiola.s/pseudobulk_cellNexus_1_0_2"
 
 
   # -----------------------#
@@ -123,30 +123,30 @@ tar_script({
     #
   )
 
-  split_metadata = function(){
+  split_metadata = function(metadata_parquet){
 
    # CAQ_directory = "/vast/projects/cellxgene_curated/cellNexus"
 
    my_metadata =
      tbl(
        dbConnect(duckdb::duckdb(), dbdir = ":memory:"),
-        sql("SELECT * FROM read_parquet('/vast/projects/cellxgene_curated/cellNexus/cell_metadata_cell_type_consensus_v1_0_0.parquet')")
+        sql(glue("SELECT * FROM read_parquet('{metadata_parquet}')"))
      ) |>
 
      #
 
      # this, because for curated Atlas, query, I don't need of the other meta data,
      # I can add it later from the full meta data
-      select(sample_id, cell_, file_id_cellNexus, cell_type_consensus_harmonised) |>
-      filter(cell_type_consensus_harmonised != "other", !cell_type_consensus_harmonised |> is.na()) |>
-      filter(!file_id_cellNexus |> is.na()) |>
+      select(sample_id, cell_id, file_id_cellNexus_single_Cell, cell_type_unified_ensemble) |>
+      filter(cell_type_unified_ensemble != "other", !cell_type_unified_ensemble |> is.na()) |>
+      filter(!file_id_cellNexus_single_Cell |> is.na()) |>
 
      # THIS SHOULD BE REMOVED IN THE FUTURE
-      mutate(file_id_cellNexus = paste0(file_id_cellNexus, ".h5ad")) |>
+      mutate(file_id_cellNexus_single_Cell = paste0(file_id_cellNexus_single_Cell, ".h5ad")) |>
 
      # NEST
      as_tibble() |>
-      mutate(chunk = file_id_cellNexus) |>
+      mutate(chunk = file_id_cellNexus_single_Cell) |>
      nest(data = -c(chunk)) |>
 
      mutate(number_of_cells = map_int(data, nrow))
@@ -162,19 +162,19 @@ tar_script({
        data, chunk,
        ~ {
 
-          file_id_cellNexus = .y
+          file_id_cellNexus_single_Cell = .y
 
          # THIS IS TEMPORARY BECAUSE I HAVE BRAIN DATASETS WITH 1M CELL THAT ARE HARD TO SAVE IN THE DB
-          if(!file.exists(paste0(cache.path, "/original/", file_id_cellNexus))) {
-            warning(paste0(cache.path, "/original/", file_id_cellNexus, " does not exist in the cache"))
+          if(!file.exists(paste0(cache.path, "/original/", file_id_cellNexus_single_Cell))) {
+            warning(paste0(cache.path, "/original/", file_id_cellNexus_single_Cell, " does not exist in the cache"))
            return(NULL)
          }
 
          .x |>
            CuratedAtlasQueryR:::get_data_container(
              repository = NULL,
              cache_directory = cache.path,
-              grouping_column = "file_id_cellNexus"
+              grouping_column = "file_id_cellNexus_single_Cell"
            )
        }
 
@@ -198,12 +198,12 @@ tar_script({
        pseudobulk =
          aggregateAcrossCells(
            .x,
-            colData(.x)[,c("sample_id", "cell_type_consensus_harmonised")],
+            colData(.x)[,c("sample_id", "cell_type_unified_ensemble")],
            BPPARAM = MulticoreParam(workers = if_else(.y > 50000, 20, 5))
          )
-        colnames(pseudobulk) = paste0(colData(pseudobulk)$sample_id, "___", colData(pseudobulk)$cell_type_consensus_harmonised)
+        colnames(pseudobulk) = paste0(colData(pseudobulk)$sample_id, "___", colData(pseudobulk)$cell_type_unified_ensemble)
 
-        pseudobulk = pseudobulk |> select(.cell, sample_id, file_id_cellNexus, cell_type_consensus_harmonised)
+        pseudobulk = pseudobulk |> select(.cell, sample_id, file_id_cellNexus_single_Cell, cell_type_unified_ensemble)
 
        # Decrease size
        # We will reattach rownames later
@@ -226,7 +226,7 @@ tar_script({
 
    # Add columns and filter
    mutate(data = map2(
-      data, file_id_cellNexus,
+      data, file_id_cellNexus_single_Cell,
      ~ {
 
        # ## TEMPORARY FIX BECAUSE I FORGOT TO ADD ASSAY NAME
@@ -236,8 +236,8 @@ tar_script({
        # Add columns
        se =
          .x |>
-          mutate(file_id_cellNexus = .y) |>
-          select(-any_of(c("file_id_cellNexus", ".cell", "original_cell_id")))
+          mutate(file_id_cellNexus_single_Cell = .y) |>
+          select(-any_of(c("file_id_cellNexus_single_Cell", ".cell", "original_cell_id")))
 
 
        # # Identify samples with many genes
@@ -260,7 +260,7 @@ tar_script({
      .progress = TRUE
    )) |>
 
-    nest(data = -file_id_cellNexus) |>
+    nest(data = -file_id_cellNexus_single_Cell) |>
    mutate(data = map(data,
      ~ {
        se = .x |>
@@ -383,33 +383,40 @@ tar_script({
  # Pipeline
  # -----------------------#
  list(
-    tar_target(cache.path, "/vast/projects/cellxgene_curated/cellNexus/cellxgene/29_10_2024/", deployment = "main"),
-
-    # Get rownames
+    tar_target(cache.path, "/vast/scratch/users/mangiola.s/cellNexus/cellxgene/15_11_2024/", deployment = "main"),
    tar_target(
-      sce_rownames,
-      tbl(
-        dbConnect(duckdb::duckdb(), dbdir = ":memory:"),
-        sql("SELECT * FROM read_parquet('/vast/projects/cellxgene_curated/cellNexus/cell_metadata_cell_type_consensus_v1_0_0.parquet')")
-      ) |>
-        head(1) |>
-        mutate(file_id_cellNexus = paste0(file_id_cellNexus, ".h5ad")) |>
-        CuratedAtlasQueryR:::get_data_container(
-          repository = NULL,
-          cache_directory = cache.path,
-          grouping_column = "file_id_cellNexus"
-        ) |>
-        rownames(),
-      resources = tar_resources(crew = tar_resources_crew("slurm_1_20")),
-      packages = c("arrow", "dplyr", "duckdb")
+      metadata_parquet,
+      "/vast/projects/cellxgene_curated/cellNexus/cell_metadata_cell_type_consensus_v1_0_2.parquet",
+      format = "file",
+      deployment = "main"
    ),
 
+
+    # # Get rownames
+    # tar_target(
+    #   sce_rownames,
+    #   tbl(
+    #     dbConnect(duckdb::duckdb(), dbdir = ":memory:"),
+    #     sql("SELECT * FROM read_parquet('/vast/projects/cellxgene_curated/cellNexus/cell_metadata_cell_type_consensus_v1_0_2.parquet')")
+    #   ) |>
+    #     head(1) |>
+    #     mutate(file_id_cellNexus_single_Cell = paste0(file_id_cellNexus_single_Cell, ".h5ad")) |>
+    #     CuratedAtlasQueryR:::get_data_container(
+    #       repository = NULL,
+    #       cache_directory = cache.path,
+    #       grouping_column = "file_id_cellNexus_single_Cell"
+    #     ) |>
+    #     rownames(),
+    #   resources = tar_resources(crew = tar_resources_crew("slurm_1_20")),
+    #   packages = c("arrow", "dplyr", "duckdb")
+    # ),
+
    # Do metadata
    tar_target(
      metadata_split,
-      split_metadata(),
+      split_metadata(metadata_parquet),
      resources = tar_resources(crew = tar_resources_crew("slurm_1_20")),
-      packages = c("arrow", "dplyr", "duckdb", "tidyr", "dplyr", "purrr")
+      packages = c("arrow", "dplyr", "duckdb", "tidyr", "dplyr", "purrr", "glue")
    ),
 
    # Get SCE SMALL
@@ -599,7 +606,7 @@ job::job({
 
  tar_make(
    # callr_function = NULL,
-    reporter = "summary",
+    reporter = "verbose_positives",
    script = glue("{result_directory}/_targets.R"),
    store = glue("{result_directory}/_targets")
  )
@@ -617,7 +624,7 @@ tar_meta(store = glue("{result_directory}/_targets")) |>
 
 library(SummarizedExperiment)
 library(tidySummarizedExperiment)
 
-sccomp_counts = readRDS("/vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/sccomp_on_cellNexus_1_0_0/cell_metadata_1_0_0_sccomp_input_counts.rds")
+sccomp_counts = readRDS("/vast/projects/mangiola_immune_map/PostDoc/immuneHealthyBodyMap/sccomp_on_cellNexus_1_0_2/cell_metadata_1_0_2_sccomp_input_counts.rds")
 
 # Save into one SE
 job::job({
@@ -640,7 +647,7 @@ job::job({
  zellkonverter::writeH5AD(
    se,
    verbose = TRUE,
-    file = "/vast/projects/cellxgene_curated/cellNexus/pseudobulk_sample_cell_type_1_0_0.h5ad",
+    file = "/vast/projects/cellxgene_curated/cellNexus/pseudobulk_sample_cell_type_1_0_2.h5ad",
    X_name = "counts_scaled"
  )
})
@@ -665,11 +672,11 @@ system("~/bin/rclone copy /vast/projects/cellxgene_curated/cellNexus/pseudobulk_
 
 tar_read_raw("pseudobulk_file_id_quantile_normalised_processed_split_1_HDF5_1b4b19de1079ec95", store = glue("{result_directory}/_targets"))
 
-# tar_invalidate(pseudobulk_file_id_quantile_normalised_processed_split_2_HDF5, store = glue("{result_directory}/_targets"))
+# tar_invalidate(metadata_split, store = glue("{result_directory}/_targets"))
 # tar_invalidate(pseudobulk_file_id_quantile_normalised_processed_split_1, store = glue("{result_directory}/_targets"))
 
 tar_workspace(
-  pseudobulk_file_id_quantile_normalised_processed_split_1_HDF5_5f69024c54bd8215,
+  "metadata_split_SMALL",
  script = glue("{result_directory}/_targets.R"),
  store = glue("{result_directory}/_targets")
)
0 commit comments