diff --git a/DESCRIPTION b/DESCRIPTION index 83b3f2e..c71f8fe 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -10,8 +10,8 @@ Description: Provides easier interaction with format and manages throttling by 'Socrata'. Users can upload data to 'Socrata' portals directly from R. -Version: 1.8.0-10 -Date: 2019-01-27 +Version: 1.8.0-11 +Date: 2019-01-05 Author: Hugh Devlin, Ph. D., Tom Schenk, Jr., Gene Leynes, Nick Lucius, John Malc, Mark Silverberg, and Peter Schmeideskamp Maintainer: "Tom Schenk Jr." Depends: @@ -29,4 +29,4 @@ Suggests: License: MIT + file LICENSE URL: https://github.com/Chicago/RSocrata BugReports: https://github.com/Chicago/RSocrata/issues -RoxygenNote: 6.1.0 +RoxygenNote: 6.1.1 diff --git a/NAMESPACE b/NAMESPACE index 10a0282..dd22b6e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export(export.socrata) export(fieldName) export(isFourByFour) export(ls.socrata) @@ -19,7 +20,9 @@ importFrom(httr,parse_url) importFrom(httr,stop_for_status) importFrom(httr,user_agent) importFrom(jsonlite,fromJSON) +importFrom(jsonlite,write_json) importFrom(mime,guess_type) importFrom(plyr,rbind.fill) importFrom(utils,packageVersion) importFrom(utils,read.csv) +importFrom(utils,write.csv) diff --git a/R/RSocrata.R b/R/RSocrata.R index 1c5698b..092acf4 100644 --- a/R/RSocrata.R +++ b/R/RSocrata.R @@ -518,3 +518,82 @@ write.socrata <- function(dataframe, dataset_json_endpoint, update_mode, email, return(response) } + +#' Exports CSVs from Socrata data portals +#' +#' Input the base URL of a data portal (e.g., "data.cityofchicago.org") and +#' will download CSVs, PDFs, Word, Excel, and PowerPoint files contained on +#' the respective data portal into a single directory named after the root URL +#' in the current working directory. Downloaded CSV files are compressed to GZip +#' format and each file timestamped so the download time is cataloged. 
The site's +#' data.json file is downloaded as a canonical index of data saved from the website. +#' Users can cross-reference the data.json file by matching the "four-by-four" in +#' data.json with the first 9 characters of downloaded files. +#' @param url - the base URL of a domain (e.g., "data.cityofchicago.org") +#' @param app_token - a string; SODA API token used to query the data +#' portal \url{http://dev.socrata.com/consumers/getting-started.html} +#' @return a Gzipped file with the four-by-four and timestamp of when the download began in filename +#' @author Tom Schenk Jr \email{tom.schenk@@cityofchicago.org} +#' @importFrom httr GET +#' @importFrom jsonlite write_json +#' @importFrom utils write.csv +#' @export +export.socrata <- function(url, app_token = NULL) { + dir.create(basename(url), showWarnings = FALSE) # Create directory based on URL + + downloadTime <- Sys.time() # Grab timestamp when data.json was downloaded + downloadTz <- Sys.timezone() # Timezone on system that downloaded data.json -- not used + ls <- ls.socrata(url = url) # Downloads data.json file + + downloadTimeChr <- gsub('\\s+','_',downloadTime) # Remove spaces and replaces with underscore + downloadTimeChr <- gsub(':', '', downloadTimeChr) # Removes colon from timestamp to be valid filename + ls_filename <- paste0(basename(url), "/", "data_json", "_", downloadTimeChr, ".json") # Creates path and filename for data.json file + jsonlite::write_json(ls, path = ls_filename) # Writes data.json contents to directory + + for (i in 1:dim(ls)[1]) { + # Track timestamp before download + downloadTime <- Sys.time() # Denotes when data began download + downloadTz <- Sys.timezone() # Timezone on system that downloaded data.json -- not used + + # Download data + downloadUrl <- ls$distribution[[i]]$downloadURL[1] # Currently grabs CSV, which is the first element + mediaType <- ls$distribution[[i]]$mediaType[1] # Grabs the first + if(is.null(downloadUrl)) { # Skips if there is no data or file + next + } 
else if(mediaType[1] == "text/csv") { # Downloads if it's a CSV + d <- RSocrata::read.socrata(downloadUrl, app_token) + + # Construct the filename output + default_format <- "csv" + downloadTimeChr <- gsub('\\s+','_',downloadTime) # Remove spaces and replaces with underscore + downloadTimeChr <- gsub(':', '', downloadTimeChr) # Removes colon from timestamp to be valid filename + filename <- httr::parse_url(ls$identifier[i]) # Determines four-by-four for file name + filename$path <- substr(filename$path, 11, 19) # Determines four-by-four for file name + filename <- paste0(filename$hostname, "/", filename$path, "_", downloadTimeChr, ".", default_format, ".gz") + + # Write file + write.csv(d, file = gzfile(filename)) # Writes g-zipped file + } else if(mediaType == "text/html") { # Skips file if it's an HTML page + next + } else { + response <- httr::GET(downloadUrl) # Downloads non-CSVs (e.g. PDF, Word, etc.) + + # Construct the filename output + if(is.null(response$headers$`content-disposition`)) { + next + } + content_disposition <- response$headers$`content-disposition` + default_format_raw <- strsplit(content_disposition, "filename=")[[1]][2] + default_format_cleaned <- gsub('"', "", default_format_raw) + default_format <- tools::file_ext(default_format_cleaned) + downloadTimeChr <- gsub('\\s+','_',downloadTime) # Remove spaces and replaces with underscore + downloadTimeChr <- gsub(':', '', downloadTimeChr) # Removes colon from timestamp to be valid filename + filename <- httr::parse_url(ls$identifier[i]) + filename$path <- substr(filename$path, 11, 19) + filename <- paste0(filename$hostname, "/", filename$path, "_", downloadTimeChr, ".", default_format) + + # Write file + writeBin(response$content, filename) # Writes non-CSVs to directory + } + } +} diff --git a/man/export.socrata.Rd b/man/export.socrata.Rd new file mode 100644 index 0000000..dfd8fca --- /dev/null +++ b/man/export.socrata.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please 
edit documentation in R/RSocrata.R +\name{export.socrata} +\alias{export.socrata} +\title{Exports CSVs from Socrata data portals} +\usage{ +export.socrata(url, app_token = NULL) +} +\arguments{ +\item{url}{- the base URL of a domain (e.g., "data.cityofchicago.org")} + +\item{app_token}{- a string; SODA API token used to query the data +portal \url{http://dev.socrata.com/consumers/getting-started.html}} +} +\value{ +a Gzipped file with the four-by-four and timestamp of when the download began in filename +} +\description{ +Input the base URL of a data portal (e.g., "data.cityofchicago.org") and +will download CSVs, PDFs, Word, Excel, and PowerPoint files contained on +the respective data portal into a single directory named after the root URL. +Downloaded CSV files are compressed to GZip format and each file timestamped +so the download time is cataloged. The site's data.json file is downloaded +as a canonical index of data saved from the website. Users can cross-reference +the data.json file by matching the "four-by-four" in data.json with the first +9 characters of downloaded files. +} +\author{ +Tom Schenk Jr \email{tom.schenk@cityofchicago.org} +} diff --git a/man/fieldName.Rd b/man/fieldName.Rd index 33f3d31..b1acb5b 100644 --- a/man/fieldName.Rd +++ b/man/fieldName.Rd @@ -24,4 +24,3 @@ fieldName("Number.of.Stations") # number_of_stations \author{ Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@cityofchicago.org} } - diff --git a/man/isFourByFour.Rd b/man/isFourByFour.Rd index 611a075..ace5723 100644 --- a/man/isFourByFour.Rd +++ b/man/isFourByFour.Rd @@ -21,4 +21,3 @@ the validity of the syntax, but does not check if it actually exists. \author{ Tom Schenk Jr \email{tom.schenk@cityofchicago.org} } - diff --git a/man/posixify.Rd b/man/posixify.Rd index 11c8c47..ddb079c 100644 --- a/man/posixify.Rd +++ b/man/posixify.Rd @@ -18,4 +18,3 @@ Convert Socrata calendar_date string to POSIX \author{ Hugh J. Devlin, Ph. D. 
\email{Hugh.Devlin@cityofchicago.org} } - diff --git a/man/validateUrl.Rd b/man/validateUrl.Rd index 504f0f6..32e4bdd 100644 --- a/man/validateUrl.Rd +++ b/man/validateUrl.Rd @@ -26,4 +26,3 @@ resolve conflicting API token by deferring to original URL. \author{ Tom Schenk Jr \email{tom.schenk@cityofchicago.org} } - diff --git a/man/write.socrata.Rd b/man/write.socrata.Rd index 35de583..ae24b56 100644 --- a/man/write.socrata.Rd +++ b/man/write.socrata.Rd @@ -4,8 +4,8 @@ \alias{write.socrata} \title{Write to a Socrata dataset (full replace or upsert)} \usage{ -write.socrata(dataframe, dataset_json_endpoint, update_mode, email, password, - app_token = NULL) +write.socrata(dataframe, dataset_json_endpoint, update_mode, email, + password, app_token = NULL) } \arguments{ \item{dataframe}{- dataframe to upload to Socrata} @@ -42,4 +42,3 @@ write.socrata(df_in,datasetToAddToUrl,"UPSERT",socrataEmail,socrataPassword) \author{ Mark Silverberg \email{mark.silverberg@socrata.com} } -