|
| 1 | +# R Script for Processing Sample Metadata in Cellxgene Data |
| 2 | +# This script processes sample metadata related to Cellxgene datasets, focusing on Homo sapiens data. |
| 3 | +# It filters datasets based on certain criteria like primary data, accepted assays, and large sample size thresholds. |
| 4 | +# Additionally, it modifies cell identifiers and merges this information with related datasets to generate final outputs for further analysis. |
| 5 | +# The script employs several R packages like arrow, targets, glue, dplyr, and more for data manipulation and storage operations. |
| 6 | + |
| 7 | + |
| 8 | +library(arrow) |
| 9 | +library(targets) |
| 10 | +library(glue) |
| 11 | +library(dplyr) |
| 12 | +library(cellxgene.census) |
| 13 | +library(stringr) |
| 14 | +library(purrr) |
| 15 | +result_directory = "/vast/projects/cellxgene_curated/metadata_cellxgenedp_Apr_2024" |
| 16 | +# # Sample metadata |
| 17 | +# sample_meta <- tar_read(metadata_dataset_id_common_sample_columns, store = glue("{result_directory}/_targets")) |
| 18 | +# saveRDS(sample_meta, "~/scratch/Census/cellxgene_to_census/sample_meta.rds") |
| 19 | +sample_meta <- readRDS("~/scratch/Census/cellxgene_to_census/sample_meta.rds") |
| 20 | + |
| 21 | +# Sample to cell link |
| 22 | +sample_to_cell <- tar_read(metadata_dataset_id_cell_to_sample_mapping, store = glue("{result_directory}/_targets")) |
| 23 | +sample_to_cell_primary <- sample_to_cell |> filter(is_primary_data == TRUE) |
| 24 | +#saveRDS(sample_to_cell_primary, "~/scratch/Census/cellxgene_to_census/sample_to_cell_primary.rds") |
| 25 | +sample_to_cell_primary <- readRDS("~/scratch/Census/cellxgene_to_census/sample_to_cell_primary.rds") |
| 26 | + |
| 27 | +sample_to_cell_primary_human <- sample_to_cell_primary |> |
| 28 | + left_join(sample_meta, by = c("sample_","dataset_id")) |> |
| 29 | + filter(organism == "Homo sapiens") |> |
| 30 | + select(observation_joinid, cell_, sample_, donor_id.x, dataset_id, is_primary_data.x, |
| 31 | + sample_heuristic.x, organism, tissue, development_stage, assay, collection_id, |
| 32 | + sex, self_reported_ethnicity, disease, cell_type) |
| 33 | + |
| 34 | +# accepted_assays from census |
| 35 | +accepted_assays <- read.csv("~/projects/CuratedAtlasQueryR/cellxgene-to-census/census_accepted_assays.csv", header=TRUE) |
| 36 | +sample_to_cell_primary_human_accepted_assay <- sample_to_cell_primary_human |> filter(assay %in% accepted_assays$assay) |
| 37 | + |
| 38 | +large_samples <- sample_to_cell_primary_human_accepted_assay |> |
| 39 | + dplyr::count(sample_, assay, collection_id, dataset_id) |> |
| 40 | + mutate(above_threshold = n > 15000) |
| 41 | + |
| 42 | +large_samples_collection_id <- large_samples |> ungroup() |> |
| 43 | + dplyr::count(collection_id) |> arrange(desc(n)) |
| 44 | + |
| 45 | +# function to discard nucleotide in cell_ --------------------------------- |
| 46 | +# cell pattern repeated across samples. |
| 47 | +# Decision: use modified_cell and sample_ to split data |
| 48 | + |
| 49 | +# drop cell ID if cell ID is a series of numbers |
| 50 | +# ACGT more than 5, drops |
| 51 | +# drop cellID if does not have special cahracter : - _ |
| 52 | +remove_nucleotides_and_separators <- function(x) { |
| 53 | + # convert integer cell ID or contain numerics surrounded by special characters to NA |
| 54 | + x[str_detect(x, "^[0-9:_\\-*]+$")] <- NA |
| 55 | + |
| 56 | + # drop sequence having a consistent stretch of 5 characters from ACGT |
| 57 | + modified <- str_replace_all(x, "[ACGT]{5,}", "") |
| 58 | + |
| 59 | + #remove nucleotides surrounded by optional separators |
| 60 | + modified <- str_replace_all(modified, "[:_-]{2,}", "_") |
| 61 | +} |
| 62 | + |
| 63 | +# List of collection IDs for sample cells great than 10K |
| 64 | +collection_ids <- large_samples_collection_id$collection_id |
| 65 | + |
| 66 | +process_collection <- function(id) { |
| 67 | + filtered_data <- sample_to_cell_primary_human_accepted_assay |> |
| 68 | + filter(collection_id == id) |> |
| 69 | + select(cell_, sample_) |
| 70 | + |
| 71 | + filtered_data$cell_modified <- remove_nucleotides_and_separators(filtered_data$cell_) |
| 72 | + filtered_data |
| 73 | +} |
| 74 | + |
| 75 | +final_result <- map_df(collection_ids, process_collection) |
| 76 | + |
| 77 | +# conditional generating sample_2 based on whether number of cells > 10K. |
| 78 | +sample_to_cell_primary_human_accepted_assay <- sample_to_cell_primary_human_accepted_assay |> |
| 79 | + left_join(large_samples, by = c("sample_", "assay","collection_id","dataset_id")) |
| 80 | + |
| 81 | +sample_to_cell_primary_human_accepted_assay_sample_2 <- |
| 82 | + sample_to_cell_primary_human_accepted_assay |> |
| 83 | + left_join(final_result, by = c("cell_","sample_")) |> |
| 84 | + # manual adjust |
| 85 | + mutate( |
| 86 | + cell_modified = ifelse(dataset_id == "b2dda353-0c96-42df-8dcd-1ea7429a6feb" & sample_ == "5951a81f1d40153bab5d2b808e384f39", |
| 87 | + "s14", |
| 88 | + cell_modified), |
| 89 | + cell_modified = ifelse(dataset_id == "b2dda353-0c96-42df-8dcd-1ea7429a6feb" & sample_ == "7313173de022921da50c34ea2f87c7af", |
| 90 | + "s3", |
| 91 | + cell_modified) |
| 92 | + ) |> |
| 93 | + mutate(sample_2 = if_else(above_threshold, |
| 94 | + paste(sample_, cell_modified, sep = "___"), |
| 95 | + sample_) |
| 96 | + ) |
| 97 | +# save result |
| 98 | +#sample_to_cell_primary_human_accepted_assay_sample_2 |> arrow::write_parquet("~/scratch/Census_rerun/sample_to_cell_primary_human_accepted_assay_sample_2_modify.parquet") |
| 99 | + |
| 100 | +# Load Census census_version = "2024-07-01" |
| 101 | +census <- open_soma(census_version = "stable") |
| 102 | +metadata <- census$get("census_data")$get("homo_sapiens")$get("obs") |
| 103 | +selected_columns <- c('assay', 'disease', 'donor_id', 'sex', 'self_reported_ethnicity', 'tissue', 'development_stage','is_primary_data','dataset_id','observation_joinid', |
| 104 | + "cell_type", "cell_type_ontology_term_id") |
| 105 | +samples <- metadata$read(column_names = selected_columns, |
| 106 | + value_filter = "is_primary_data == 'TRUE'")$concat() |
| 107 | +samples <- samples |> as.data.frame() |> distinct() |
| 108 | +#samples |> saveRDS("~/scratch/Census/cellxgene_to_census/census_samples_701.rds") |
| 109 | + |
| 110 | +######## READ |
| 111 | +sample_to_cell_primary_human_accepted_assay_sample_2 <- arrow::read_parquet("~/scratch/Census_rerun/sample_to_cell_primary_human_accepted_assay_sample_2.parquet") |
| 112 | +samples <- readRDS("~/scratch/Census/cellxgene_to_census/census_samples_701.rds") |
| 113 | + |
| 114 | +census_samples_to_download <- samples |> |
| 115 | + left_join(sample_to_cell_primary_human_accepted_assay_sample_2, |
| 116 | + by = c("observation_joinid", "dataset_id"), |
| 117 | + relationship = "many-to-many") |> |
| 118 | + select(-donor_id.x, |
| 119 | + -is_primary_data.x, |
| 120 | + -tissue.y, |
| 121 | + -development_stage.y, |
| 122 | + -assay.y, |
| 123 | + -sex.y, |
| 124 | + -self_reported_ethnicity.y, |
| 125 | + -disease.y) |> |
| 126 | + rename(assay = assay.x, |
| 127 | + disease = disease.x, |
| 128 | + sex = sex.x, |
| 129 | + self_reported_ethnicity = self_reported_ethnicity.x, |
| 130 | + tissue = tissue.x, |
| 131 | + development_stage = development_stage.x |
| 132 | + ) |> |
| 133 | + as_tibble() |> |
| 134 | + # remove space in the sample_2, as sample_2 will be regarded as filename |
| 135 | + mutate(sample_2 = if_else(str_detect(sample_2, " "), str_replace_all(sample_2, " ",""), sample_2)) |
| 136 | + |
| 137 | +#census_samples_to_download |> arrow::write_parquet("~/scratch/Census_rerun/census_samples_to_download.parquet") |
| 138 | +con <- duckdb::dbConnect(duckdb::duckdb(), dbdir = ":memory:") |
| 139 | +parquet_file = "~/scratch/Census_rerun/census_samples_to_download.parquet" |
| 140 | + |
| 141 | +census_samples_to_download <- tbl(con, sql(paste0("SELECT * FROM read_parquet('", parquet_file, "')"))) |
| 142 | + |
| 143 | +# This is important: please make sure observation_joinid and cell_ is unique per sample (sample_2) in census_samples_to_download |
| 144 | +census_samples_to_download |> dplyr::count(observation_joinid, sample_2) |> dplyr::count(n) |
| 145 | +census_samples_to_download |> dplyr::count(cell_, sample_2) |> dplyr::count(n) |
| 146 | + |
| 147 | + |
| 148 | +census_samples_to_download |> group_by(dataset_id, sample_2) |> |
| 149 | + summarise(observation_joinid = list(observation_joinid), .groups = "drop") |> as_tibble() |> mutate(list_length = map_dbl(observation_joinid, length)) |> |
| 150 | + filter(list_length >=100) |> |
| 151 | + arrow::write_parquet("~/scratch/Census_rerun/census_samples_to_download_groups.parquet") |
| 152 | + |
| 153 | +census_samples_to_download_groups <- arrow::read_parquet("~/scratch/Census_rerun/census_samples_to_download_groups.parquet") |
| 154 | + |
| 155 | + |
| 156 | + |
| 157 | + |
| 158 | +# # census metadata |
| 159 | +# files <- readRDS("~/git_control/HPCell/data/files_3.rds") |
| 160 | +# metadata <- files |> left_join(census_samples_to_download, by = c("dataset_id", "sample_2")) |> filter(cell_number != 0) |
| 161 | +# |
| 162 | +# # write to parquet |
| 163 | +# metadata |> filter(is_primary_data == TRUE) |> select(-transformation_function) |> |
| 164 | +# arrow::write_parquet("~/cellxgene_curated/census_samples/primary_metatadata.parquet") |
| 165 | +# |
| 166 | +# # Calculate stats |
| 167 | +# metadata |> filter(!is.na(is_primary_data), cell_number != 0) |> distinct(dataset_id) |
| 168 | + |
0 commit comments