From 46a488d250e9480bbf5bf5efb274b4855dea4473 Mon Sep 17 00:00:00 2001
From: Gene Leynes
Date: Mon, 4 Dec 2017 13:45:27 -0600
Subject: [PATCH 1/5] specifying namespace for file_ext, closes #140

---
 R/RSocrata.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/RSocrata.R b/R/RSocrata.R
index 1966559..5587c11 100644
--- a/R/RSocrata.R
+++ b/R/RSocrata.R
@@ -505,7 +505,7 @@ export.socrata <- function(url, app_token = NULL) {
       content_disposition <- response$headers$`content-disposition`
       default_format_raw <- strsplit(content_disposition, "filename=")[[1]][2]
       default_format_cleaned <- gsub('"', "", default_format_raw)
-      default_format <- file_ext(default_format_cleaned)
+      default_format <- tools::file_ext(default_format_cleaned)
       downloadTimeChr <- gsub('\\s+','_',downloadTime) # Remove spaces and replaces with underscore
       downloadTimeChr <- gsub(':', '', downloadTimeChr) # Removes colon from timestamp to be valid filename
       filename <- httr::parse_url(ls$identifier[i])

From aafcf150fe29147148ceda2eef833aad37122a52 Mon Sep 17 00:00:00 2001
From: Tom Schenk Jr
Date: Fri, 26 Oct 2018 19:58:50 -0500
Subject: [PATCH 2/5] Save data.json to file system; handle non-data files

Save data.json to file system
-----------------------------
A copy of the data.json file taken at the beginning of the download process
is saved alongside the actual downloaded data. Since `export.socrata()` uses
data.json as the index to download data, this will allow users to
cross-reference the downloaded data with other metadata associated with it
that is available through [Project Open Data](https://project-open-data.cio.gov).

Handle non-data files
---------------------
Socrata lists non-data files, such as Socrata Stories--HTML websites that
contain text but no machine-readable data--in the data.json file. This
causes errors when trying to download those sites because they do not have
a "distribution URL". While it's arguable that these "sites" should not be
included in the first place, the script now simply skips those files. Since
a copy of the data.json file is downloaded (see above), users will have
transparency into which URLs were not downloaded.
---
 DESCRIPTION  |  4 ++--
 R/RSocrata.R | 39 ++++++++++++++++++++++++++-------------
 2 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 43b781a..fb92a45 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -10,8 +10,8 @@ Description: Provides easier interaction with
     format and manages throttling by 'Socrata'.
     Users can upload data to Socrata portals
     directly from R.
-Version: 1.8.0-4
-Date: 2017-05-06
+Version: 1.8.0-5
+Date: 2018-10-27
 Author: Hugh Devlin, Ph. D., Tom Schenk, Jr., and John Malc
 Maintainer: "Tom Schenk Jr."
 Depends:
diff --git a/R/RSocrata.R b/R/RSocrata.R
index 5587c11..4715296 100644
--- a/R/RSocrata.R
+++ b/R/RSocrata.R
@@ -465,41 +465,55 @@ write.socrata <- function(dataframe, dataset_json_endpoint, update_mode, email,
 #' will download all CSV files (no other files supported) and saved in
 #' a single directory named after the root URL (e.g., "data.cityofchicago.org/").
 #' Downloaded files are compressed to GZip format and timestamped so the download
-#' time is saved. No data is saved within the R workspace.
+#' time is cataloged. The site's data.json file is downloaded as a canonical index
+#' of data saved from the website. Users can cross-reference the data.json file
+#' by matching the "four-by-four" in data.json with the first nine characters of
+#' GZipped files.
 #' @param url - the base URL of a domain (e.g., "data.cityofchicago.org")
 #' @param app_token - a string; SODA API token used to query the data
 #' portal \url{http://dev.socrata.com/consumers/getting-started.html}
 #' @return a Gzipped file with the four-by-four and timestamp of when the download began in filename
 #' @author Tom Schenk Jr \email{tom.schenk@@cityofchicago.org}
 #' @importFrom httr GET
+#' @importFrom jsonlite write_json
 #' @importFrom utils write.csv
 #' @export
 export.socrata <- function(url, app_token = NULL) {
   dir.create(basename(url), showWarnings = FALSE) # Create directory based on URL
-  ls <- ls.socrata(url = url)
+
+  downloadTime <- Sys.time() # Grab timestamp when data.json was downloaded
+  downloadTz <- Sys.timezone() # Timezone on system that downloaded data.json -- not used
+  ls <- ls.socrata(url = url) # Downloads data.json file
+
+  downloadTimeChr <- gsub('\\s+','_',downloadTime) # Remove spaces and replaces with underscore
+  downloadTimeChr <- gsub(':', '', downloadTimeChr) # Removes colon from timestamp to be valid filename
+  ls_filename <- paste0(basename(url), "/", "data_json", "_", downloadTimeChr, ".json") # Creates path and filename for data.json file
+  jsonlite::write_json(ls, path = ls_filename) # Writes data.json contents to directory
+
   for (i in 1:dim(ls)[1]) {
     # Track timestamp before download
-    downloadTime <- Sys.time()
-    downloadTz <- Sys.timezone()
+    downloadTime <- Sys.time() # Denotes when data began download
+    downloadTz <- Sys.timezone() # Timezone on system that downloaded data.json -- not used
 
     # Download data
     downloadUrl <- ls$distribution[[i]]$downloadURL[1] # Currently grabs CSV, which is the first element
-    if (grepl(".csv", downloadUrl)) {
+    if(is.null(downloadUrl)) { # Skips if not a data file (e.g., Socrata Pages)
+      next
+    } else if (grepl(".csv", downloadUrl)) { # Downloads if it's a CSV
       d <- read.socrata(downloadUrl, app_token)
 
       # Construct the filename output
       default_format <- "csv"
-      downloadTimeChr <- gsub('\\s+','_',downloadTime) # Remove spaces and replaces with underscore
+      downloadTimeChr <- gsub('\\s+','_',downloadTime)  # Remove spaces and replaces with underscore
       downloadTimeChr <- gsub(':', '', downloadTimeChr) # Removes colon from timestamp to be valid filename
-      filename <- httr::parse_url(ls$identifier[i])
-      filename$path <- substr(filename$path, 11, 19)
+      filename <- httr::parse_url(ls$identifier[i]) # Determines four-by-four for file name
+      filename$path <- substr(filename$path, 11, 19) # Determines four-by-four for file name
       filename <- paste0(filename$hostname, "/", filename$path, "_", downloadTimeChr, ".", default_format, ".gz")
 
       # Write file
-      write.csv(d, file = gzfile(filename))
-
+      write.csv(d, file = gzfile(filename)) # Writes g-zipped file
     } else {
-      response <- GET(downloadUrl)
+      response <- GET(downloadUrl) # Downloads non-CSVs
 
       # Construct the filename output
       content_disposition <- response$headers$`content-disposition`
@@ -513,8 +527,7 @@ export.socrata <- function(url, app_token = NULL) {
       filename <- paste0(filename$hostname, "/", filename$path, "_", downloadTimeChr, ".", default_format)
 
       # Write file
-      writeBin(response$content, filename)
+      writeBin(response$content, filename) # Writes non-CSVs to directory
     }
-
   }
 }
\ No newline at end of file

From 8b601c6f441d365940618128e6159315ec914b6f Mon Sep 17 00:00:00 2001
From: Tom Schenk Jr
Date: Sun, 28 Oct 2018 01:04:36 -0500
Subject: [PATCH 3/5] Ignores HTML content

Socrata supports external links that direct to web pages (e.g., HTML).
These would cause an error when `export.socrata()` attempted to download
them. This fix will simply skip those files and proceed to the next file.
---
 DESCRIPTION  | 4 ++--
 R/RSocrata.R | 5 ++++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index fb92a45..26fade7 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -10,8 +10,8 @@ Description: Provides easier interaction with
     format and manages throttling by 'Socrata'.
     Users can upload data to Socrata portals
     directly from R.
-Version: 1.8.0-5
-Date: 2018-10-27
+Version: 1.8.0-6
+Date: 2018-10-28
 Author: Hugh Devlin, Ph. D., Tom Schenk, Jr., and John Malc
 Maintainer: "Tom Schenk Jr."
 Depends:
diff --git a/R/RSocrata.R b/R/RSocrata.R
index 4715296..b1b88d9 100644
--- a/R/RSocrata.R
+++ b/R/RSocrata.R
@@ -499,7 +499,7 @@ export.socrata <- function(url, app_token = NULL) {
     downloadUrl <- ls$distribution[[i]]$downloadURL[1] # Currently grabs CSV, which is the first element
     if(is.null(downloadUrl)) { # Skips if not a data file (e.g., Socrata Pages)
       next
-    } else if (grepl(".csv", downloadUrl)) { # Downloads if it's a CSV
+    } else if(grepl(".csv", downloadUrl)) { # Downloads if it's a CSV
       d <- read.socrata(downloadUrl, app_token)
 
       # Construct the filename output
@@ -516,6 +516,9 @@ export.socrata <- function(url, app_token = NULL) {
       response <- GET(downloadUrl) # Downloads non-CSVs
 
       # Construct the filename output
+      if(is.null(response$headers$`content-disposition`)) {
+        next
+      }
       content_disposition <- response$headers$`content-disposition`
       default_format_raw <- strsplit(content_disposition, "filename=")[[1]][2]
       default_format_cleaned <- gsub('"', "", default_format_raw)

From ccc4c96e9d2a8e3c0ed8078a6a75d6378b04102e Mon Sep 17 00:00:00 2001
From: Tom Schenk Jr
Date: Sun, 5 Jan 2020 11:14:57 -0600
Subject: [PATCH 4/5] Handles non-CSV file types #126

* Ignores HTML files (e.g., Socrata Pages)
* Ignores entries on occasions when there isn't any data
* Will download (uncompressed) PDFs, Word, Excel, PowerPoint, and plain
  text attachments.
---
 DESCRIPTION  |  2 +-
 R/RSocrata.R | 29 ++++++++++++++++-------------
 2 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 26fade7..9f534e4 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -11,7 +11,7 @@ Description: Provides easier interaction with
     Users can upload data to Socrata portals
     directly from R.
 Version: 1.8.0-6
-Date: 2018-10-28
+Date: 2019-01-05
 Author: Hugh Devlin, Ph. D., Tom Schenk, Jr., and John Malc
 Maintainer: "Tom Schenk Jr."
 Depends:
diff --git a/R/RSocrata.R b/R/RSocrata.R
index b1b88d9..a37c250 100644
--- a/R/RSocrata.R
+++ b/R/RSocrata.R
@@ -461,14 +461,14 @@ write.socrata <- function(dataframe, dataset_json_endpoint, update_mode, email,
 
 #' Exports CSVs from Socrata data portals
 #'
-#' Input the URL of a data portal (e.g., "data.cityofchicago.org") and
-#' will download all CSV files (no other files supported) and saved in
-#' a single directory named after the root URL (e.g., "data.cityofchicago.org/").
-#' Downloaded files are compressed to GZip format and timestamped so the download
-#' time is cataloged. The site's data.json file is downloaded as a canonical index
-#' of data saved from the website. Users can cross-reference the data.json file
-#' by matching the "four-by-four" in data.json with the first nine characters of
-#' GZipped files.
+#' Input the base URL of a data portal (e.g., "data.cityofchicago.org") and
+#' will download CSVs, PDFs, Word, Excel, and PowerPoint files contained on
+#' the respective data portal into a single directory named after the root URL.
+#' Downloaded CSV files are compressed to GZip format and each file timestamped
+#' so the download time is cataloged. The site's data.json file is downloaded
+#' as a canonical index of data saved from the website. Users can cross-reference
+#' the data.json file by matching the "four-by-four" in data.json with the first
+#' nine characters of downloaded files.
 #' @param url - the base URL of a domain (e.g., "data.cityofchicago.org")
 #' @param app_token - a string; SODA API token used to query the data
 #' portal \url{http://dev.socrata.com/consumers/getting-started.html}
@@ -478,7 +478,7 @@ write.socrata <- function(dataframe, dataset_json_endpoint, update_mode, email,
 #' @importFrom jsonlite write_json
 #' @importFrom utils write.csv
 #' @export
-export.socrata <- function(url, app_token = NULL) {
+export.socrata <- function(url, path = getwd(), app_token = NULL) {
   dir.create(basename(url), showWarnings = FALSE) # Create directory based on URL
 
   downloadTime <- Sys.time() # Grab timestamp when data.json was downloaded
@@ -497,10 +497,11 @@ export.socrata <- function(url, app_token = NULL) {
 
     # Download data
     downloadUrl <- ls$distribution[[i]]$downloadURL[1] # Currently grabs CSV, which is the first element
-    if(is.null(downloadUrl)) { # Skips if not a data file (e.g., Socrata Pages)
+    mediaType <- ls$distribution[[i]]$mediaType
+    if(is.null(downloadUrl)) { # Skips if there is no data or file
       next
-    } else if(grepl(".csv", downloadUrl)) { # Downloads if it's a CSV
-      d <- read.socrata(downloadUrl, app_token)
+    } else if(mediaType[1] == "text/csv") { # Downloads if it's a CSV
+      d <- RSocrata::read.socrata(downloadUrl, app_token)
 
       # Construct the filename output
       default_format <- "csv"
@@ -512,8 +513,10 @@ export.socrata <- function(url, app_token = NULL) {
 
       # Write file
       write.csv(d, file = gzfile(filename)) # Writes g-zipped file
+    } else if(mediaType == "text/html") { # Skips file if it's an HTML page
+      next
     } else {
-      response <- GET(downloadUrl) # Downloads non-CSVs
+      response <- httr::GET(downloadUrl) # Downloads non-CSVs (e.g. PDF, Word, etc.)
 
       # Construct the filename output
       if(is.null(response$headers$`content-disposition`)) {
         next
       }

From f9ec527a5f80e282f839581e41268265ffee36ad Mon Sep 17 00:00:00 2001
From: Tom Schenk Jr
Date: Sun, 5 Jan 2020 18:54:15 -0600
Subject: [PATCH 5/5] Several clean-up items for `export.socrata()`

* Removed user-defined option for file output (not available yet)
* Clarified documentation on where `export.socrata()` files will be located.
* Fixed incorrect date in `DESCRIPTION` file.
* Iterating build number.
---
 DESCRIPTION  |  4 ++--
 R/RSocrata.R | 16 ++++++++--------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 2fddcb1..c71f8fe 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -10,8 +10,8 @@ Description: Provides easier interaction with
     format and manages throttling by 'Socrata'.
     Users can upload data to 'Socrata' portals
     directly from R.
-Version: 1.8.0-10
-Date: 2019-01-27
+Version: 1.8.0-11
+Date: 2019-01-05
 Author: Hugh Devlin, Ph. D., Tom Schenk, Jr., Gene Leynes, Nick Lucius, John Malc, Mark Silverberg, and Peter Schmeideskamp
 Maintainer: "Tom Schenk Jr."
 Depends:
diff --git a/R/RSocrata.R b/R/RSocrata.R
index 61c34d9..092acf4 100644
--- a/R/RSocrata.R
+++ b/R/RSocrata.R
@@ -523,12 +523,12 @@ write.socrata <- function(dataframe, dataset_json_endpoint, update_mode, email,
 #'
 #' Input the base URL of a data portal (e.g., "data.cityofchicago.org") and
 #' will download CSVs, PDFs, Word, Excel, and PowerPoint files contained on
-#' the respective data portal into a single directory named after the root URL.
-#' Downloaded CSV files are compressed to GZip format and each file timestamped
-#' so the download time is cataloged. The site's data.json file is downloaded
-#' as a canonical index of data saved from the website. Users can cross-reference
-#' the data.json file by matching the "four-by-four" in data.json with the first
-#' nine characters of downloaded files.
+#' the respective data portal into a single directory named after the root URL
+#' in the current working directory. Downloaded CSV files are compressed to GZip
+#' format and each file timestamped so the download time is cataloged. The site's
+#' data.json file is downloaded as a canonical index of data saved from the website.
+#' Users can cross-reference the data.json file by matching the "four-by-four" in
+#' data.json with the first nine characters of downloaded files.
 #' @param url - the base URL of a domain (e.g., "data.cityofchicago.org")
 #' @param app_token - a string; SODA API token used to query the data
 #' portal \url{http://dev.socrata.com/consumers/getting-started.html}
@@ -538,7 +538,7 @@ write.socrata <- function(dataframe, dataset_json_endpoint, update_mode, email,
 #' @importFrom jsonlite write_json
 #' @importFrom utils write.csv
 #' @export
-export.socrata <- function(url, path = getwd(), app_token = NULL) {
+export.socrata <- function(url, app_token = NULL) {
   dir.create(basename(url), showWarnings = FALSE) # Create directory based on URL
 
   downloadTime <- Sys.time() # Grab timestamp when data.json was downloaded
@@ -557,7 +557,7 @@ export.socrata <- function(url, app_token = NULL) {
 
     # Download data
     downloadUrl <- ls$distribution[[i]]$downloadURL[1] # Currently grabs CSV, which is the first element
-    mediaType <- ls$distribution[[i]]$mediaType
+    mediaType <- ls$distribution[[i]]$mediaType[1] # Grabs the first media type
     if(is.null(downloadUrl)) { # Skips if there is no data or file
       next
     } else if(mediaType[1] == "text/csv") { # Downloads if it's a CSV
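
The filename handling that patches 1 and 3 touch can be traced end to end in a
small sketch: for non-CSV attachments, `export.socrata()` derives the file
extension from the Content-Disposition response header, which is why patch 1
qualifies `file_ext()` with `tools::` and why patch 3 skips entries whose
response lacks that header. This is a minimal sketch, not part of the patches;
the header value is illustrative (real values come from `httr::GET(downloadUrl)`):

    content_disposition <- 'attachment; filename="budget_2019.pdf"'  # illustrative value
    default_format_raw <- strsplit(content_disposition, "filename=")[[1]][2]
    default_format_cleaned <- gsub('"', "", default_format_raw)      # "budget_2019.pdf"
    default_format <- tools::file_ext(default_format_cleaned)        # "pdf"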
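
The cross-referencing workflow the commit messages describe can likewise be
sketched. This is a minimal sketch, not part of the patches: it assumes the
directory layout the patched function creates ("<domain>/<four-by-four>_<timestamp>.csv.gz"
alongside "data_json_<timestamp>.json"), and that reading the archived listing
back with `simplifyVector = TRUE` recovers the data frame that
`jsonlite::write_json()` stored; all object names are illustrative.

    domain <- "data.cityofchicago.org"
    RSocrata::export.socrata(domain)  # populates the "data.cityofchicago.org/" directory

    # Read the archived data.json listing back in as a data frame
    index_file <- list.files(domain, pattern = "^data_json_.*\\.json$", full.names = TRUE)[1]
    index <- jsonlite::read_json(index_file, simplifyVector = TRUE)

    # The four-by-four is the tail of each identifier URL and the first
    # nine characters of each downloaded filename
    index_ids <- basename(index$identifier)
    downloads <- list.files(domain, pattern = "\\.csv\\.gz$")
    matched <- downloads[substr(downloads, 1, 9) %in% index_ids]
    skipped <- setdiff(index_ids, substr(downloads, 1, 9))  # entries with no download (e.g., HTML pages)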