R_scripts/GVIZ_plot_prep_functions.R at main · lhomas/R_scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
# Creating function to generate GRanges object summarising coverage information
## This function takes as input a .tsv file generated from the alignment_stat_gen.sh script
## Specficially it takes a list of coverage values (1 entry per line) for 100bp windows across a specified genomic region
## It outputs GRanges object that can be visualised with GVIZ

coverage_GRange<-function(cov_file, chr = chrom, strt = start, nd = end) {
  # Read in coverage stats file
  cov_scores <- scan(file = cov_file)

  # Generate IRanges object to be used for ranges in GRanges object
  cov_window <- seq(strt, nd-1000, 1000)
  cov_width <- rep(1000, length(cov_window))
  cov_irange <- IRanges(c(cov_window), width=c(cov_width))

  # Create GRanges object
  gr  <- GRanges(
    seqnames = chr,
    ranges = cov_irange, # Using IRanges object created above
    strand = "*", # Strand unimportant, using *
    Cov = cov_scores # Creating the metadata field to be plotted
  )

  # Return create granges obejct
  return(gr)
}


# Creating function to generate GRanges object summarising mapping quality information
## This function takes as input a .tsv file generated from the alignment_stat_gen.sh script
## Specficially it takes a list of average mapping quality values (1 entry per line) for 10000bp windows across a specified genomic region
## Rather tha showoing average mapping quality, this function checks to see if the average value for a window is above or below 57
## Greater than 57 is assigned 1, below is assigned 0
## It outputs GRanges object that can be visualised with GVIZ

# Creating function to generate GRanges object summarising coverage information
mapq_GRange<-function(mapq_file, chr = chrom, strt = start, nd = end) {
  # Read in coverage stats file
  mapq_scores <- scan(file = mapq_file)

  # Generate IRanges object to be used for ranges in GRanges object
  mapq_window <- seq(strt, nd-10000, 10000)
  mapq_width <- rep(10000, length(mapq_window))
  mapq_irange <- IRanges(c(mapq_window), width=c(mapq_width))
  accessability <- ifelse(mapq_scores < 57, 0, 1)

  # Create GRanges object
  gr  <- GRanges(
    seqnames = chr, # All windows are on chr14
    ranges = mapq_irange, # Using IRanges object created above
    strand = "*", # Strand unimportant, using *
    MAPQ = accessability # Creating the metadata field to be plotted
  )

  # Return create granges obejct
  return(gr)
}


# Creating function to generate GRanges object summarising LOD scores
## This function takes as input a number of parametric linkage tables to be combined (merlin output), and an associated map file
## This function only works for a single chromomse (provided as the chrom argument)
## It outputs GRanges object that can be visualised with GVIZ

LOD_grange<-function(..., map, chrom) {

  # Read provided parametric data tables into a list
  path_list <- list(...)

  if (length(path_list) > 1 ){

    # Create list containing data frames housing linkage information
    table_list <- NULL
    for (n in 1:length(path_list)){
      table_list[[n]] <- paste("linkage_table", n, sep = "_")
    }

    # Read linkage information into data.frames stored in table_list
    for (n in 1:length(path_list)){
      dat <- read_tsv(path_list[[n]])
      table_list[[n]] <- dat
    }

    # create variable for number of linkage tables being combined
    table_num <- length(path_list)

    # Create data.frame to store and combine linkage information
    cumulative_LOD <- NULL
    cumulative_LOD <- rbind(cumulative_LOD, table_list[[1]][c("CHR","POS","LABEL","MODEL","LOD")])
    cumulative_LOD$LOD <- as.numeric(as.character(cumulative_LOD$LOD))
    cumulative_names <- c("CHR","POS","LABEL","MODEL","LOD_1")
    for (n in 2:table_num){
      cumulative_names[[4+n]] <- paste("LOD", n, sep = "_")
    }

    # Read LOD score data into cumulative data.frame
    for (n in 2:table_num) {
      table_list[[n]]$LOD <- as.numeric(as.character(table_list[[n]]$LOD))
      cumulative_LOD <- cbind(cumulative_LOD, table_list[[n]]["LOD"])
    }
    colnames(cumulative_LOD) <- cumulative_names

    # This line removes some spurious lines introduced into the data.frame
    cumulative_LOD <- cumulative_LOD[!grepl("linkage", cumulative_LOD$CHR),]

    # Convert any values below 0 to NA
    for (l in 1:table_num) {
      cumulative_LOD[[4+l]] <- replace(cumulative_LOD[[4+l]], which(cumulative_LOD[[4+l]] < 0), NA)
    }

    # At loci where all LOD scores are positive sum all LOD scores and put value in new "Total" column
    cumulative_LOD <- cumulative_LOD %>%
      mutate(LOD = dplyr::select(., -1:-table_num) %>% rowSums(na.rm = TRUE))
    cumulative_LOD$LOD[!complete.cases(cumulative_LOD)] <- NA

    # Save cumulative_LOD as lod_table
    lod_table <- cumulative_LOD
  } else if (length(path_list) == 1) {
    lod_table <- read_tsv(...)
    lod_table <- lod_table[ ,1:5]
    lod_table$LOD <- replace(lod_table$LOD, which(lod_table$LOD < 0), NA)
  }

  # Add additional columns to data.frame for marker information
  lod_table$LABEL2 <- NA
  lod_table$MARKER <- NA
  lod_table$MARKER_POS <- NA
  lod_table$MARKER_COORD <- NA
  for (n in 1:nrow(lod_table)) {
    lod_table$LABEL2[n] <- lod_table$LABEL[n+1]
  }
  lod_table$LABEL2[nrow(lod_table)] <- 999

  # Read in map file for information on SNPs used
  snp_map <- read_tsv(map)

  # Using the map file from merlin to identify markers used in the linkage analysis and assign them to corresponding genomic interval
  for (n in 1:nrow(snp_map)) {
    pos <- snp_map$POSITION[n]
    for (r in 1:nrow(lod_table)) {
      if ( pos >= lod_table$LABEL[r] & pos < lod_table$LABEL2[r]) {
        lod_table$MARKER[r] <- snp_map$MARKER[n]
        lod_table$MARKER_POS[r] <- snp_map$POSITION[n]
      }
    }
  }

  # Create biomart object containing SNP information to get GRCh38 coordinates for used SNPs
  snpmart = useEnsembl(biomart = "snp", dataset="hsapiens_snp")
  rs_list <- snp_map$MARKER
  snp_pos <- getBM(attributes = c("refsnp_id", "chrom_start"),
                   filters = "snp_filter",
                   values = rs_list,
                   mart = snpmart)

  # Add genomic position of markers to cumulative_LOD data.frame
  for (n in 1:nrow(snp_pos)){
    snp_start <- snp_pos$chrom_start[n]
    snp_name <- snp_pos$refsnp_id[n]
    for (r in 1:nrow(lod_table)) {
      if ( snp_name == lod_table$MARKER[r] & !is.na(lod_table$MARKER[r])) {
        lod_table$MARKER_COORD[r] <- snp_start
      }
    }
  }

  # Generate data frame to create GRange object, containing LOD scores and corresponding genomic position.
  LOD_grange_df <- data.frame(lod_table$LOD, lod_table$MARKER_COORD)
  colnames(LOD_grange_df) <- c("LOD", "Marker_Coord")
  LOD_grange_df["LOD"][is.na(LOD_grange_df["LOD"])] <- -3

  # Remove windows with no associated SNPs
  LOD_grange_df <- na.omit(LOD_grange_df)

  # Create objects for input into GRange object
  IRange_start <- LOD_grange_df$Marker_Coord
  LODscores <- LOD_grange_df$LOD
  LOD_IRange <- IRanges(start=IRange_start, width = 1)

  gr <- GRanges(
    seqnames = chrom,
    ranges = LOD_IRange,
    strand = "*",
    LOD = LODscores
  )

  return(gr)
}


# Creating function to read in data from Merlin linkage analysis to generated a plot summarising multiple iterations of linkage analysis in a single plot.
## This function takes as input a number of parametric linkage tables to be combined (Merlin output).

WGLinkage_plot_prep <-function(file_path) {

  # List all parametric.tbl files
  parametric_list <- list.files(path=file_path,
                                pattern="-parametric.tbl",
                                full.names=TRUE)

  # Read each of the listed files into data frame
  linkage_table <- NULL
  for (f in parametric_list) {
    print(f)
    dat <- read_tsv(f)
    linkage_table <- rbind(linkage_table, dat)
  }

  # Sort dataframe based on chromosome
  linkage_table <- linkage_table %>% arrange(CHR)
  return(linkage_table)

}

# Creating function to read in data from a single chromosome from Merlin data linkage analysis to generate a plot showing summed LOD scores for a single chromosome.
## This function takes as input any number of linkage analyses, along with a chromosome of interest.

singleCHR_combined_linkage_plot_prep <- function(..., chrom = 1) {

  # Generate list of input paths
  path_list <- list(...)

  # Initialise list to store iterations of linkage data
  table_list <- NULL

  # Populate list with data.frame names, 1 per linkage iteration
  for (n in 1:length(path_list)){
    table_list[[n]] <- paste("linkage_table", n, sep = "_")
  }

  # Store linkage data within data.frames stored in list
  for (n in 1:length(path_list)){
    # Read contents of '-parametic.tbl' from supplied directories into data.frames
    parametric_table <- list.files(path=path_list[[n]],
                                   pattern="-parametric.tbl",
                                   full.names=TRUE)

    # Store contents of "-parametric.tbl" in data.frames stored in above made list
    for (f in parametric_table) {
      dat <- read_tsv(f)
      #      print(dat)
      table_list[[n]] <- rbind(table_list[[n]], dat)
    }

    # Sort data.frame based on chromosome
    table_list[[n]] <- table_list[[n]] %>% arrange(CHR)
  }

  # Initialise data.frame with LOD scores for all iterations
  cumulative_LOD <- NULL

  # Store genomic position data and LOD scores from first linkage iteration
  cumulative_LOD <- rbind(cumulative_LOD,
                          table_list[[1]][c("CHR","POS","LABEL","MODEL","LOD")])

  # Convert LOD from char to num to allow plotting
  cumulative_LOD$LOD <- as.numeric(as.character(cumulative_LOD$LOD))

  # Generate list to rename final cumulative_LOD data.frame to give LOD columns different names
  cumulative_names <- c("CHR","POS","LABEL","MODEL","LOD")
  for (n in 2:length(path_list)){
    cumulative_names[[4+n]] <- paste("LOD", n, sep = "_")
  }

  # Convert all remaining LOD scores to num and add to cumulative_LOD data.frame
  for (n in 2:length(path_list)) {
    table_list[[n]]$LOD <- as.numeric(as.character(table_list[[n]]$LOD))
    cumulative_LOD <- cbind(cumulative_LOD, table_list[[n]]["LOD"])
  }

  # Rename columns using previously made list
  colnames(cumulative_LOD) <- cumulative_names

  # Somewhere in this code "linkage_table_1" gets included in data.frame, this removes it
  cumulative_LOD <- cumulative_LOD[!grepl("linkage",cumulative_LOD$CHR),]

  # Add all LOD columns together, place output in new "total" column
  cumulative_LOD <-cumulative_LOD %>%
    mutate(Total = select(., -1:-4) %>% rowSums(na.rm = TRUE))

  # Convert CHR column to num, and sort 'cumulative_LOD' numerically (chromosome order)
  cumulative_LOD$CHR <- as.numeric(as.character(cumulative_LOD$CHR))
  cumulative_LOD <- cumulative_LOD %>% arrange(cumulative_LOD$CHR)

  # Downsample to specific chromosome
  singleChr <- subset(cumulative_LOD, CHR == chrom)

  # Setting LOD scores below 0 to NA
  singleChr$LOD <- replace(singleChr$LOD, which(singleChr$LOD < 0), NA)
  singleChr$LOD_2 <- replace(singleChr$LOD_2, which(singleChr$LOD_2 < 0), NA)
  singleChr$LOD_3 <- replace(singleChr$LOD_3, which(singleChr$LOD_3 < 0), NA)
  singleChr$LOD_4 <- replace(singleChr$LOD_4, which(singleChr$LOD_4 < 0), NA)

  # Downsample columns of data table to contain only relevant fields
  keep <- c("CHR","POS","LABEL","MODEL","LOD","LOD_2","LOD_3","LOD_4")

  singleChr <- singleChr[keep]

  singleChr <- singleChr %>%
    mutate(Total = select(., -1:-4) %>% rowSums(na.rm = FALSE))

  singleChr["Total"][is.na(singleChr["Total"])] <- -3

  return(singleChr)
}