|
library(arrow)
library(dplyr)
library(duckdb)
library(HPCell)

# Build a consensus cell-type label from three reference-based annotations
# (Azimuth, Monaco, Blueprint) and write the annotated cells to Parquet.
#
# The per-cell table stays inside DuckDB as a lazy query. Only the distinct
# combinations of the three reference labels are pulled into R, because
# reference_annotation_to_consensus() is an R function (from HPCell) and
# cannot be translated to SQL.

# Expose a Parquet file as a lazy DuckDB-backed table.
read_parquet_tbl <- function(con, path) {
  # dbQuoteString() produces a correctly escaped SQL string literal, so a
  # path containing a single quote cannot break the statement.
  tbl(
    con,
    sql(paste0("SELECT * FROM read_parquet(", dbQuoteString(con, path), ")"))
  )
}

# Columns holding the three reference annotations; also used as join keys.
reference_columns <- c(
  "azimuth_predicted.celltype.l2",
  "monaco_first.labels.fine",
  "blueprint_first.labels.fine"
)

# Input -----------------------------------------------------------------------

con <- dbConnect(duckdb::duckdb(), dbdir = ":memory:")
parquet_file <-
  "/vast/projects/cellxgene_curated/census_samples/concensus_input.parquet"

data_tbl <- read_parquet_tbl(con, parquet_file)

# Keep the cell identifiers, the original annotation and the three reference
# annotations.
annotation_combination <-
  data_tbl |>
  select(
    cell_,
    dataset_id,
    cell_type,
    cell_type_ontology_term_id,
    azimuth_predicted.celltype.l2,
    monaco_first.labels.fine,
    blueprint_first.labels.fine
  )

# Consensus per label combination ----------------------------------------------

# One row per distinct combination of reference labels, collected into R so
# the HPCell function can be applied to it.
annotation_consensus <-
  annotation_combination |>
  distinct(
    azimuth_predicted.celltype.l2,
    monaco_first.labels.fine,
    blueprint_first.labels.fine
  ) |>
  as_tibble() |>
  mutate(
    reannotation_consensus = reference_annotation_to_consensus(
      azimuth_input = azimuth_predicted.celltype.l2,
      monaco_input = monaco_first.labels.fine,
      blueprint_input = blueprint_first.labels.fine
    )
  )

# Attach the consensus label to every cell. copy = TRUE uploads the local
# lookup table to the DuckDB connection so the join runs in the database.
# na_matches = "na" is required because database joins in dbplyr default to
# "never": without it, any cell with a missing reference label would not match
# its row in the lookup table and would silently get no consensus.
# NOTE(review): this only changes results if the input contains missing
# reference labels - confirm this is the intended handling.
annotation_combination <-
  annotation_combination |>
  left_join(
    annotation_consensus,
    by = reference_columns,
    copy = TRUE,
    na_matches = "na"
  )

# Output ----------------------------------------------------------------------

output_parquet <-
  "/vast/projects/mangiola_immune_map/PostDoc/CuratedAtlasQueryR/dev/consensus_output.parquet"

# Let DuckDB write the result itself with COPY ... TO, so the joined table is
# never collected into R. The lazy query is rendered to SQL and embedded as
# the COPY source.
copy_query <- paste0(
  "COPY (",
  dbplyr::sql_render(annotation_combination),
  ") TO ",
  dbQuoteString(con, output_parquet),
  " (FORMAT PARQUET);"
)

# Execute the COPY command
dbExecute(con, copy_query)

# Disconnect from the database
dbDisconnect(con, shutdown = TRUE)

# Read back -------------------------------------------------------------------

# Fresh in-memory database; the written file is exposed again as a lazy table.
# This connection is left open because data_consensus depends on it.
con <- dbConnect(duckdb::duckdb(), dbdir = ":memory:")
data_consensus <- read_parquet_tbl(con, output_parquet)
0 commit comments