ADCP-Data-Cleaning/ADCP_processing.R at master · pkraska/ADCP-Data-Cleaning · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# Function to process ADCP data into two flat files (header/data)

# x <- "vr065_553.adcp"
# x <- 'data.txt'
# unit = 'COERS'
# unit = 'RiverRay'

ADCPproc <- function(x, unit = "RiverRay") {
  # Split an ADCP ASCII export into two flat CSV files written to the current
  # working directory: "ADCP_DATA.csv" (one row per data line, tagged with an
  # integer ensemble UID) and "ADCP_HEADER.csv" (one wide row per ensemble
  # header).
  #
  # Args:
  #   x:    path of the ASCII file to process.
  #   unit: instrument that produced the file, "RiverRay" (default, 6-line
  #         ensemble header) or "COERS" (5-line ensemble header).
  #
  # Returns:
  #   NULL, invisibly (the value of cat()). The function is called for its
  #   side effects: it attaches tidyverse and data.table, writes the two CSV
  #   files and prints a confirmation message.
  #
  # Errors:
  #   Stops when `unit` is not supported, when a required package is missing,
  #   or when no complete ensemble header can be located in `x`.

  # Validate first so that a typo in `unit` fails with a clear message instead
  # of an obscure error further down.
  if (!is.character(unit) || length(unit) != 1L ||
      !unit %in% c("RiverRay", "COERS")) {
    stop("`unit` must be \"RiverRay\" or \"COERS\"", call. = FALSE)
  }

  # library() errors immediately when a package is missing; require() would
  # only warn and let the function fail later.
  library(tidyverse)
  library(data.table)

  # The first three lines of the file are not part of any ensemble and are
  # discarded.
  n.preamble <- 3L
  # Number of header lines that precede the data lines of each ensemble.
  n.header <- if (unit == "COERS") 5L else 6L

  # Read the file once; every line becomes one row that is split on whitespace
  # into 13 columns.
  raw.lines <- readLines(x)
  data <- data.table(read = trimws(raw.lines, which = "both"))
  # SUPPRESSED WARNING ABOUT EXPECTED 13 PIECES: header rows have fewer columns
  # than the data rows, NAs are used to fill the gaps.
  data <- suppressWarnings(
    separate(data, read, into = paste0("V", 1:13), sep = "\\s+")
  )

  # The last line of each ensemble header is the only one containing lower-case
  # letters (the 'cm' / 'BT' labels). Matches inside the discarded preamble are
  # ignored.
  header.end <- grepl("[a-z]", raw.lines)
  header.end[seq_len(min(n.preamble, length(header.end)))] <- FALSE
  end.rows <- which(header.end)
  if (length(end.rows) == 0L) {
    stop("No ensemble header lines were found in the input file",
         call. = FALSE)
  }

  # Flag each matching line together with the (n.header - 1) lines before it,
  # never reaching back into the preamble.
  header.rows <- as.vector(outer(end.rows, seq_len(n.header) - 1L, "-"))
  header.rows <- header.rows[header.rows > n.preamble]
  header <- rep(FALSE, length(raw.lines))
  header[header.rows] <- TRUE
  data$header <- header

  # Number the ensembles: the counter increases at the first row of every run
  # of header lines, so each data row carries the index of the header run that
  # precedes it (0 = before the first header, i.e. discarded).
  block.start <- header & !c(FALSE, head(header, -1))
  ens.id <- cumsum(block.start)

  # NOTE(review): the UID written here is the integer ensemble index, while
  # the UID in ADCP_HEADER.csv is a timestamp string (which also omits
  # `minute`), so the two files cannot be joined on UID as written. Kept
  # unchanged to preserve the existing output format -- confirm intent.
  data %>%
    mutate(UID = ens.id) %>%
    filter(!header, UID > 0) %>%
    select(UID, everything()) %>%
    write_csv("ADCP_DATA.csv")

  ensemble.header.data <- data %>%
    filter(header) %>%
    select(-header)

  if (nrow(ensemble.header.data) %% n.header != 0L) {
    stop("Found ", nrow(ensemble.header.data), " header lines, which is not ",
         "a multiple of the ", n.header, "-line header expected for unit \"",
         unit, "\"", call. = FALSE)
  }

  # Put the n.header lines of every ensemble header side by side: the k-th
  # piece holds the k-th header line of each ensemble.
  header.parts <- lapply(seq_len(n.header), function(k) {
    rows <- seq(from = k, to = nrow(ensemble.header.data), by = n.header)
    slice(ensemble.header.data, rows)
  })
  ensemble.header <- Reduce(cbind, header.parts)

  # The pieces all carry the same V1..V13 names; renumber them v1..vN.
  colnames(ensemble.header) <- paste0("v", seq_len(ncol(ensemble.header)))

  # Remove the blank (padding) columns; their positions depend on the unit.
  if (unit == "COERS") {
    ensemble.header <- ensemble.header %>%
      select(-22:-26, -33:-39, -49:-52, -55:-65)
  } else {
    ensemble.header <- ensemble.header %>%
      select(-22:-26, -32:-39, -45:-52, -58:-65, -72:-78) %>%
      rename(latitude = v40,
             longitude = v41)
  }

  # Rename the first header line to human readable names and write the file.
  ensemble.header %>%
    rename(year = v1,
           month = v2,
           day = v3,
           hour = v4,
           minute = v5,
           second = v6,
           hth_second = v7,
           ens_number = v8,
           ens_in_segment = v9,
           pitch = v10,
           roll = v11,
           cor_heading = v12,
           adcp_temp = v13) %>%
    mutate(UID = paste(year, month, day, hour, second, hth_second,
                       sep = ".")) %>%
    select(UID, everything()) %>%
    write_csv("ADCP_HEADER.csv")

  cat(paste0("Created 'ADCP_DATA.csv' and 'ADCP_HEADER.csv' in ", getwd()))
}