simgen/create_data.R at main · bodkan/simgen · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
# this script creates all data for use in the tidyverse and ggplot2 chapters

library(dplyr)
library(tidyr)
library(readr)

library(sf)


# metadata table ----------------------------------------------------------

df <- read_tsv("../ibdmix-bonus/data/neo.impute.1000g.sampleInfo_clusterInfo.txt")

# get individuals to be removed from metadata and IBD data
to_remove <- filter(
  df,
  sampleId == "K1"  |  # ancient sample with no date
  country == "Europe"  # Europe is not a country, 1000GP
)$sampleId

# remove columns which are not useful, filter individuals who are not useful
df <- df %>%
  select(-c(shape, starts_with("cluster"), color, callset, colorA, shapeA)) %>%
  filter(!sampleId %in% to_remove)

# annotate with continent assigmnent
continents <- list(
  Africa = c("WestAfrica", "EastAfrica", "SouthAfrica", "CentralAfrica", "NorthAfrica"),
  Asia = c("SouthAsia", "SouthEastAsia", "EastAsia", "WesternAsia", "CentralAsia", "NorthAsia"),
  America = c("NorthAmerica", "SouthAmerica"),
  Europe = c("SouthernEurope", "WesternEurope", "NorthernEurope", "CentralEasternEurope"
  )
)
df <- df %>% mutate(continent = case_when(
  region %in% continents$Africa ~ "Africa",
  region %in% continents$Asia ~ "Asia",
  region %in% continents$America ~ "America",
  region %in% continents$Europe ~ "Europe",
), .after = region)

write_tsv(df, "files/tidy/metadata.tsv")

system("git add files/tidy/metadata.tsv")
system("git commit -m 'Update metadata'")
system("git push")


# relatedness information -------------------------------------------------

metadata_all <- read_tsv("files/tidy/metadata.tsv")

rel_df <-
  metadata_all %>%
  filter(flag != "0", !grepl("low|cont|age", flag)) %>%
  select(x = sampleId, flag) %>%
  mutate(y = gsub("^\\dd_rel_(.*$)", "\\1", flag) %>% gsub("^dup_(.*$)", "\\1", .),
         rel = gsub("(^\\dd)_.*", "\\1", flag) %>% gsub("(^dup)_.*", "\\1", .)) %>%
  mutate(
    rel = case_when(
      rel == "1d" ~ "1st degree",
      rel == "2d" ~ "2nd degree",
      rel == "dup" ~ "duplicate"
    )
  ) %>%
  select(x, y, rel)

rel_df <- bind_rows(rel_df, select(rel_df, y = x, x = y, rel))


# IBD table ---------------------------------------------------------------

to_keep <- metadata_all$sampleId

load("~/Desktop/neo.impute.1000g.ibd.filter.strict.RData")
ibd_segments <- dd
ibd_segments <- dplyr::as_tibble(ibd_segments)
ibd_segments <- dplyr::select(ibd_segments, sample1, sample2, chrom = chromosome, start = posCmStart, end = posCmEnd)
ibd_segments <- dplyr::filter(ibd_segments, sample1 %in% to_keep & sample2 %in% to_keep)
ibd_segments <- left_join(ibd_segments, rel_df, by = c("sample1" = "x", "sample2" = "y"))

set.seed(123)
filter(ibd_segments, chrom == 21) %>%
  sample_frac %>%
  readr::write_tsv(here::here("files/tidy/ibd_segments.tsv"), na = "none")

# commit updates for downstream processing --------------------------------

system("git add files/tidy/ibd_segments.tsv.gz")
system("git commit -m 'Add subset of IBD data'")
system("git push")

# big IBD summary table ---------------------------------------------------

# this is a replacement for `process_ibd()`
ibd_segments <- ibd_segments %>% mutate(length = end - start)

# this is a replacement for `process_metadata()`
process_metadata2 <- function(bin_step) {
  metadata_all <- read_tsv("https://tinyurl.com/simgen-metadata", show_col_types = FALSE)

  # select a subset of columns
  metadata <- metadata_all %>%
    select(sampleId, country, continent, ageAverage, coverage, longitude, latitude) %>%
    rename(sample = sampleId, age = ageAverage, lon = longitude, lat = latitude)

  # replace missing ages of present-day individuals with 0
  metadata <- mutate(metadata, age = if_else(is.na(age), 0, age))
  # ignore archaic individuals
  metadata <- filter(metadata, !sample %in% c("Vindija33.19", "AltaiNeandertal", "Denisova"))

  # bin individuals according to their age
  metadata$age_bin <- cut(metadata$age, breaks = seq(0, 50000, by = bin_step), dig.lab = 10)
  bin_levels <- levels(metadata$age_bin)

  metadata <- metadata %>%
    mutate(
      age_bin = as.character(age_bin),
      age_bin = if_else(is.na(age_bin), "present-day", age_bin),
      age_bin = factor(age_bin, levels = c("present-day", bin_levels))
    )

  return(metadata)
}
# this is a replacement for `join_metadata()`
join_metadata2 <- function(ibd, metadata) {
  cat("Joining IBD data and metadata...\n")

  # prepare metadata for IBD annotation
  metadata1 <- select(metadata, -coverage)
  metadata2 <- select(metadata, -coverage)
  colnames(metadata1) <- paste0(colnames(metadata1), "1")
  colnames(metadata2) <- paste0(colnames(metadata2), "2")

  # join based on sample1
  ibd <- inner_join(ibd, metadata1)
  # join based on sample2
  ibd <- inner_join(ibd, metadata2)

  ibd_sp <- filter(ibd, !is.na(lon1), !is.na(lon2), !is.na(lat1), !is.na(lat2)) %>%
    st_as_sf(coords = c("lon1", "lat1"), sf_column_name = "location1") %>%
    as_tibble() %>%
    st_as_sf(coords = c("lon2", "lat2"), sf_column_name = "location2") %>%
    as_tibble() %>%
    mutate(distance = as.numeric(st_distance(location1, location2, by_element = TRUE) / 1e3)) %>%
    select(-c(location1, location2))

  ibd_nonsp <- filter(ibd, is.na(lat1), is.na(lat2), is.na(lon1), is.na(lon2)) %>%
    mutate(distance = NA_real_) %>%
    select(-c(lat1, lat2, lon1, lon2))

  ibd <- rbind(ibd_nonsp, ibd_sp)

  # swap countries to maintain order
  ibd$order_country <- ibd$country1 > ibd$country2
  ibd$xcountry1 <- ifelse(ibd$order_country, ibd$country2, ibd$country1)
  ibd$xcountry2 <- ifelse(ibd$order_country, ibd$country1, ibd$country2)
  ibd$country1 <- ibd$xcountry1; ibd$xcountry1 <- NULL
  ibd$country2 <- ibd$xcountry2; ibd$xcountry2 <- NULL

  # swap regions to maintain order
  ibd$order_continent <- ibd$continent1 > ibd$continent2
  ibd$xcontinent1 <- ifelse(ibd$order_continent, ibd$continent2, ibd$continent1)
  ibd$xcontinent2 <- ifelse(ibd$order_continent, ibd$continent1, ibd$continent2)
  ibd$continent1 <- ibd$xcontinent1; ibd$xcontinent1 <- NULL
  ibd$continent2 <- ibd$xcontinent2; ibd$xcontinent2 <- NULL

  # annotate with new columns indicating a pair of countries or time bins
  ibd <- mutate(ibd,
                country_pair = paste(country1, country2, sep = ":"),
                region_pair = paste(continent1, continent2, sep = ":"),
                time_pair = paste(age_bin1, age_bin2, sep = ":"),
                .before = chrom)

  return(ibd)
}

metadata <- process_metadata2(bin_step = 2500)

# annotate missing geography with country centroid (needed just for present-day
# humans from 1000GP data)
# library(sf)
# library(rnaturalearth)
#
# country_names <- filter(metadata, continent == "Europe") %>% .$country %>% unique
# sf_countries <- ne_countries() %>%
#   filter(admin %in% country_names) %>%
#   select(country = admin, geometry) %>%
#   st_make_valid()
# st_agr(sf_countries) <- "constant"
# sf_centers <- st_centroid(sf_countries)
# metadata <- inner_join(metadata, sf_centers, by = "country")

ibd_segments <- join_metadata2(ibd_segments, metadata)

# only long segments

ibd_sum <-
  ibd_segments %>%
  filter(length > 10 & age_bin1 == age_bin2) %>%
  group_by(sample1, sample2, rel, country_pair, region_pair, time_pair, distance) %>%
  summarize(n_ibd = n(), total_ibd = sum(length))

pryr::object_size(ibd_sum)

write_tsv(ibd_sum, here::here("files/tidy/ibd_sum.tsv"), na = "none")

system("git add files/tidy/ibd_long.tsv")
system("git commit -m 'Add summarized long IBD data'")
system("git push")


# f4-ratio proportions data -----------------------------------------------

f4 <- readRDS("../demografr/inst/examples/grid_data.rds") %>%
  rename(replicate = rep, rate_afr2afr = rate_aa, rate_eur2afr = rate_ea) %>%
  filter(rate_afr2afr == 0.2)

direct_f4 <- unnest(f4, direct) %>% select(-indirect) %>% rename(proportion = alpha) %>% mutate(statistic = "direct f4-ratio")
indirect_f4 <- unnest(f4, indirect) %>% select(-direct) %>% rename(proportion = alpha) %>% mutate(statistic = "indirect f4-ratio", proportion = 1 - proportion)

samples <- tibble(
  X = paste0("eur_", 1:21),
  time = seq(40000, 0, -2000)
)

df_f4 <- rbind(direct_f4, indirect_f4) %>%
  select(-c(A, B, C, O)) %>%
  inner_join(samples, by = "X") %>%
  rename(individual = X) %>%
  select(individual, time, statistic, proportion, -rate_afr2afr, rate_eur2afr, replicate)

write_tsv(df_f4, here::here("files/tidy/f4ratio.tsv"), na = "none")

system("git add files/tidy/f4ratio.tsv")
system("git commit -m 'Add example f4-ratio data set'")
system("git push")