Skip to content
This repository was archived by the owner on Oct 14, 2025. It is now read-only.

Commit 8902d86

Browse files
authored
Merge pull request #153 from myushen/master
dev script for connecting cellxgene to census
2 parents 2c59f35 + b4574ba commit 8902d86

File tree

1 file changed

+168
-0
lines changed

1 file changed

+168
-0
lines changed

dev/split-large_samples.R

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
# R Script for Processing Sample Metadata in Cellxgene Data
2+
# This script processes sample metadata related to Cellxgene datasets, focusing on Homo sapiens data.
3+
# It filters datasets based on certain criteria like primary data, accepted assays, and large sample size thresholds.
4+
# Additionally, it modifies cell identifiers and merges this information with related datasets to generate final outputs for further analysis.
5+
# The script employs several R packages like arrow, targets, glue, dplyr, and more for data manipulation and storage operations.
6+
7+
8+
library(arrow)
9+
library(targets)
10+
library(glue)
11+
library(dplyr)
12+
library(cellxgene.census)
13+
library(stringr)
14+
library(purrr)
15+
result_directory = "/vast/projects/cellxgene_curated/metadata_cellxgenedp_Apr_2024"
16+
# # Sample metadata
17+
# sample_meta <- tar_read(metadata_dataset_id_common_sample_columns, store = glue("{result_directory}/_targets"))
18+
# saveRDS(sample_meta, "~/scratch/Census/cellxgene_to_census/sample_meta.rds")
19+
sample_meta <- readRDS("~/scratch/Census/cellxgene_to_census/sample_meta.rds")
20+
21+
# Sample to cell link
22+
sample_to_cell <- tar_read(metadata_dataset_id_cell_to_sample_mapping, store = glue("{result_directory}/_targets"))
23+
sample_to_cell_primary <- sample_to_cell |> filter(is_primary_data == TRUE)
24+
#saveRDS(sample_to_cell_primary, "~/scratch/Census/cellxgene_to_census/sample_to_cell_primary.rds")
25+
sample_to_cell_primary <- readRDS("~/scratch/Census/cellxgene_to_census/sample_to_cell_primary.rds")
26+
27+
sample_to_cell_primary_human <- sample_to_cell_primary |>
28+
left_join(sample_meta, by = c("sample_","dataset_id")) |>
29+
filter(organism == "Homo sapiens") |>
30+
select(observation_joinid, cell_, sample_, donor_id.x, dataset_id, is_primary_data.x,
31+
sample_heuristic.x, organism, tissue, development_stage, assay, collection_id,
32+
sex, self_reported_ethnicity, disease, cell_type)
33+
34+
# accepted_assays from census
35+
accepted_assays <- read.csv("~/projects/CuratedAtlasQueryR/cellxgene-to-census/census_accepted_assays.csv", header=TRUE)
36+
sample_to_cell_primary_human_accepted_assay <- sample_to_cell_primary_human |> filter(assay %in% accepted_assays$assay)
37+
38+
large_samples <- sample_to_cell_primary_human_accepted_assay |>
39+
dplyr::count(sample_, assay, collection_id, dataset_id) |>
40+
mutate(above_threshold = n > 15000)
41+
42+
large_samples_collection_id <- large_samples |> ungroup() |>
43+
dplyr::count(collection_id) |> arrange(desc(n))
44+
45+
# function to discard nucleotide in cell_ ---------------------------------
46+
# cell pattern repeated across samples.
47+
# Decision: use modified_cell and sample_ to split data
48+
49+
# drop cell ID if cell ID is a series of numbers
50+
# ACGT more than 5, drops
51+
# drop cellID if does not have special cahracter : - _
52+
remove_nucleotides_and_separators <- function(x) {
53+
# convert integer cell ID or contain numerics surrounded by special characters to NA
54+
x[str_detect(x, "^[0-9:_\\-*]+$")] <- NA
55+
56+
# drop sequence having a consistent stretch of 5 characters from ACGT
57+
modified <- str_replace_all(x, "[ACGT]{5,}", "")
58+
59+
#remove nucleotides surrounded by optional separators
60+
modified <- str_replace_all(modified, "[:_-]{2,}", "_")
61+
}
62+
63+
# List of collection IDs for sample cells great than 10K
64+
collection_ids <- large_samples_collection_id$collection_id
65+
66+
process_collection <- function(id) {
67+
filtered_data <- sample_to_cell_primary_human_accepted_assay |>
68+
filter(collection_id == id) |>
69+
select(cell_, sample_)
70+
71+
filtered_data$cell_modified <- remove_nucleotides_and_separators(filtered_data$cell_)
72+
filtered_data
73+
}
74+
75+
final_result <- map_df(collection_ids, process_collection)
76+
77+
# conditional generating sample_2 based on whether number of cells > 10K.
78+
sample_to_cell_primary_human_accepted_assay <- sample_to_cell_primary_human_accepted_assay |>
79+
left_join(large_samples, by = c("sample_", "assay","collection_id","dataset_id"))
80+
81+
sample_to_cell_primary_human_accepted_assay_sample_2 <-
82+
sample_to_cell_primary_human_accepted_assay |>
83+
left_join(final_result, by = c("cell_","sample_")) |>
84+
# manual adjust
85+
mutate(
86+
cell_modified = ifelse(dataset_id == "b2dda353-0c96-42df-8dcd-1ea7429a6feb" & sample_ == "5951a81f1d40153bab5d2b808e384f39",
87+
"s14",
88+
cell_modified),
89+
cell_modified = ifelse(dataset_id == "b2dda353-0c96-42df-8dcd-1ea7429a6feb" & sample_ == "7313173de022921da50c34ea2f87c7af",
90+
"s3",
91+
cell_modified)
92+
) |>
93+
mutate(sample_2 = if_else(above_threshold,
94+
paste(sample_, cell_modified, sep = "___"),
95+
sample_)
96+
)
97+
# save result
98+
#sample_to_cell_primary_human_accepted_assay_sample_2 |> arrow::write_parquet("~/scratch/Census_rerun/sample_to_cell_primary_human_accepted_assay_sample_2_modify.parquet")
99+
100+
# Load Census census_version = "2024-07-01"
101+
census <- open_soma(census_version = "stable")
102+
metadata <- census$get("census_data")$get("homo_sapiens")$get("obs")
103+
selected_columns <- c('assay', 'disease', 'donor_id', 'sex', 'self_reported_ethnicity', 'tissue', 'development_stage','is_primary_data','dataset_id','observation_joinid',
104+
"cell_type", "cell_type_ontology_term_id")
105+
samples <- metadata$read(column_names = selected_columns,
106+
value_filter = "is_primary_data == 'TRUE'")$concat()
107+
samples <- samples |> as.data.frame() |> distinct()
108+
#samples |> saveRDS("~/scratch/Census/cellxgene_to_census/census_samples_701.rds")
109+
110+
######## READ
111+
sample_to_cell_primary_human_accepted_assay_sample_2 <- arrow::read_parquet("~/scratch/Census_rerun/sample_to_cell_primary_human_accepted_assay_sample_2.parquet")
112+
samples <- readRDS("~/scratch/Census/cellxgene_to_census/census_samples_701.rds")
113+
114+
census_samples_to_download <- samples |>
115+
left_join(sample_to_cell_primary_human_accepted_assay_sample_2,
116+
by = c("observation_joinid", "dataset_id"),
117+
relationship = "many-to-many") |>
118+
select(-donor_id.x,
119+
-is_primary_data.x,
120+
-tissue.y,
121+
-development_stage.y,
122+
-assay.y,
123+
-sex.y,
124+
-self_reported_ethnicity.y,
125+
-disease.y) |>
126+
rename(assay = assay.x,
127+
disease = disease.x,
128+
sex = sex.x,
129+
self_reported_ethnicity = self_reported_ethnicity.x,
130+
tissue = tissue.x,
131+
development_stage = development_stage.x
132+
) |>
133+
as_tibble() |>
134+
# remove space in the sample_2, as sample_2 will be regarded as filename
135+
mutate(sample_2 = if_else(str_detect(sample_2, " "), str_replace_all(sample_2, " ",""), sample_2))
136+
137+
#census_samples_to_download |> arrow::write_parquet("~/scratch/Census_rerun/census_samples_to_download.parquet")
138+
con <- duckdb::dbConnect(duckdb::duckdb(), dbdir = ":memory:")
139+
parquet_file = "~/scratch/Census_rerun/census_samples_to_download.parquet"
140+
141+
census_samples_to_download <- tbl(con, sql(paste0("SELECT * FROM read_parquet('", parquet_file, "')")))
142+
143+
# This is important: please make sure observation_joinid and cell_ is unique per sample (sample_2) in census_samples_to_download
144+
census_samples_to_download |> dplyr::count(observation_joinid, sample_2) |> dplyr::count(n)
145+
census_samples_to_download |> dplyr::count(cell_, sample_2) |> dplyr::count(n)
146+
147+
148+
census_samples_to_download |> group_by(dataset_id, sample_2) |>
149+
summarise(observation_joinid = list(observation_joinid), .groups = "drop") |> as_tibble() |> mutate(list_length = map_dbl(observation_joinid, length)) |>
150+
filter(list_length >=100) |>
151+
arrow::write_parquet("~/scratch/Census_rerun/census_samples_to_download_groups.parquet")
152+
153+
census_samples_to_download_groups <- arrow::read_parquet("~/scratch/Census_rerun/census_samples_to_download_groups.parquet")
154+
155+
156+
157+
158+
# # census metadata
159+
# files <- readRDS("~/git_control/HPCell/data/files_3.rds")
160+
# metadata <- files |> left_join(census_samples_to_download, by = c("dataset_id", "sample_2")) |> filter(cell_number != 0)
161+
#
162+
# # write to parquet
163+
# metadata |> filter(is_primary_data == TRUE) |> select(-transformation_function) |>
164+
# arrow::write_parquet("~/cellxgene_curated/census_samples/primary_metatadata.parquet")
165+
#
166+
# # Calculate stats
167+
# metadata |> filter(!is.na(is_primary_data), cell_number != 0) |> distinct(dataset_id)
168+

0 commit comments

Comments
 (0)