From 46a488d250e9480bbf5bf5efb274b4855dea4473 Mon Sep 17 00:00:00 2001
From: Gene Leynes
Date: Mon, 4 Dec 2017 13:45:27 -0600
Subject: [PATCH 1/5] specifying namespace for file_ext, closes #140

---
 R/RSocrata.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/RSocrata.R b/R/RSocrata.R
index 1966559..5587c11 100644
--- a/R/RSocrata.R
+++ b/R/RSocrata.R
@@ -505,7 +505,7 @@ export.socrata <- function(url, app_token = NULL) {
       content_disposition <- response$headers$`content-disposition`
       default_format_raw <- strsplit(content_disposition, "filename=")[[1]][2]
       default_format_cleaned <- gsub('"', "", default_format_raw)
-      default_format <- file_ext(default_format_cleaned)
+      default_format <- tools::file_ext(default_format_cleaned)
       downloadTimeChr <- gsub('\\s+','_',downloadTime) # Remove spaces and replaces with underscore
       downloadTimeChr <- gsub(':', '', downloadTimeChr) # Removes colon from timestamp to be valid filename
       filename <- httr::parse_url(ls$identifier[i])

From aafcf150fe29147148ceda2eef833aad37122a52 Mon Sep 17 00:00:00 2001
From: Tom Schenk Jr
Date: Fri, 26 Oct 2018 19:58:50 -0500
Subject: [PATCH 2/5] Save data.json to file system; handle non-data files

Save data.json to file system
-----------------------------
A copy of the data.json file taken at the beginning of the download process
is saved alongside the actual downloaded data. Since `export.socrata()` uses
data.json as the index to download data, this will allow users to
cross-reference the downloaded data with other metadata associated with it
that is available through [Project Open Data](https://project-open-data.cio.gov).

Handle non-data files
---------------------
Socrata lists non-data files, such as Socrata Stories--HTML websites that
contain text but no machine-readable data--in the data.json file. This
causes errors when trying to download those sites because they do not have
a "distribution URL". While it's arguable that these "sites" should not be
included in the first place, the script now simply skips those files. Since
a copy of the data.json file is downloaded (see above), users will have
transparency into which URLs were not downloaded.
---
 DESCRIPTION  |  4 ++--
 R/RSocrata.R | 39 ++++++++++++++++++++++++++-------------
 2 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 43b781a..fb92a45 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -10,8 +10,8 @@ Description: Provides easier interaction with
     format and manages throttling by 'Socrata'.
     Users can upload data to Socrata portals
     directly from R.
-Version: 1.8.0-4
-Date: 2017-05-06
+Version: 1.8.0-5
+Date: 2018-10-27
 Author: Hugh Devlin, Ph. D., Tom Schenk, Jr., and John Malc
 Maintainer: "Tom Schenk Jr."
 Depends:
diff --git a/R/RSocrata.R b/R/RSocrata.R
index 5587c11..4715296 100644
--- a/R/RSocrata.R
+++ b/R/RSocrata.R
@@ -465,41 +465,55 @@ write.socrata <- function(dataframe, dataset_json_endpoint, update_mode, email,
 #' will download all CSV files (no other files supported) and saved in
 #' a single directory named after the root URL (e.g., "data.cityofchicago.org/").
 #' Downloaded files are compressed to GZip format and timestamped so the download
-#' time is saved. No data is saved within the R workspace.
+#' time is cataloged. The site's data.json file is downloaded as a canonical index
+#' of data saved from the website. Users can cross-reference the data.json file
+#' by matching the "four-by-four" in data.json with the first nine characters of
+#' GZipped files.
 #' @param url - the base URL of a domain (e.g., "data.cityofchicago.org")
 #' @param app_token - a string; SODA API token used to query the data
 #' portal \url{http://dev.socrata.com/consumers/getting-started.html}
 #' @return a Gzipped file with the four-by-four and timestamp of when the download began in filename
 #' @author Tom Schenk Jr \email{tom.schenk@@cityofchicago.org}
 #' @importFrom httr GET
+#' @importFrom jsonlite write_json
 #' @importFrom utils write.csv
 #' @export
 export.socrata <- function(url, app_token = NULL) {
   dir.create(basename(url), showWarnings = FALSE) # Create directory based on URL
-  ls <- ls.socrata(url = url)
+
+  downloadTime <- Sys.time() # Grab timestamp when data.json was downloaded
+  downloadTz <- Sys.timezone() # Timezone on system that downloaded data.json -- not used
+  ls <- ls.socrata(url = url) # Downloads data.json file
+
+  downloadTimeChr <- gsub('\\s+','_',downloadTime) # Remove spaces and replaces with underscore
+  downloadTimeChr <- gsub(':', '', downloadTimeChr) # Removes colon from timestamp to be valid filename
+  ls_filename <- paste0(basename(url), "/", "data_json", "_", downloadTimeChr, ".json") # Creates path and filename for data.json file
+  jsonlite::write_json(ls, path = ls_filename) # Writes data.json contents to directory
+
   for (i in 1:dim(ls)[1]) {
     # Track timestamp before download
-    downloadTime <- Sys.time()
-    downloadTz <- Sys.timezone()
+    downloadTime <- Sys.time() # Denotes when data began download
+    downloadTz <- Sys.timezone() # Timezone on system that downloaded data.json -- not used
 
     # Download data
     downloadUrl <- ls$distribution[[i]]$downloadURL[1] # Currently grabs CSV, which is the first element
-    if (grepl(".csv", downloadUrl)) {
+    if(is.null(downloadUrl)) { # Skips if not a data file (e.g., Socrata Pages)
+      next
+    } else if (grepl(".csv", downloadUrl)) { # Downloads if it's a CSV
       d <- read.socrata(downloadUrl, app_token)
 
       # Construct the filename output
       default_format <- "csv"
-      downloadTimeChr <- gsub('\\s+','_',downloadTime) # Remove spaces and replaces with underscore
+      downloadTimeChr <- gsub('\\s+','_',downloadTime)  # Remove spaces and replaces with underscore
       downloadTimeChr <- gsub(':', '', downloadTimeChr) # Removes colon from timestamp to be valid filename
-      filename <- httr::parse_url(ls$identifier[i])
-      filename$path <- substr(filename$path, 11, 19)
+      filename <- httr::parse_url(ls$identifier[i]) # Determines four-by-four for file name
+      filename$path <- substr(filename$path, 11, 19) # Determines four-by-four for file name
       filename <- paste0(filename$hostname, "/", filename$path, "_", downloadTimeChr, ".", default_format, ".gz")
 
       # Write file
-      write.csv(d, file = gzfile(filename))
-
+      write.csv(d, file = gzfile(filename)) # Writes g-zipped file
     } else {
-      response <- GET(downloadUrl)
+      response <- GET(downloadUrl) # Downloads non-CSVs
 
       # Construct the filename output
       content_disposition <- response$headers$`content-disposition`
@@ -513,8 +527,7 @@ export.socrata <- function(url, app_token = NULL) {
       filename <- paste0(filename$hostname, "/", filename$path, "_", downloadTimeChr, ".", default_format)
 
       # Write file
-      writeBin(response$content, filename)
+      writeBin(response$content, filename) # Writes non-CSVs to directory
     }
-
   }
 }
\ No newline at end of file

From 8b601c6f441d365940618128e6159315ec914b6f Mon Sep 17 00:00:00 2001
From: Tom Schenk Jr
Date: Sun, 28 Oct 2018 01:04:36 -0500
Subject: [PATCH 3/5] Ignores HTML content

Socrata supports external links that direct to web pages (e.g., HTML).
These would cause an error when `export.socrata()` attempted to download
them. This fix will simply skip those files and proceed to the next file.
---
 DESCRIPTION  | 4 ++--
 R/RSocrata.R | 5 ++++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index fb92a45..26fade7 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -10,8 +10,8 @@ Description: Provides easier interaction with
     format and manages throttling by 'Socrata'.
     Users can upload data to Socrata portals
     directly from R.
-Version: 1.8.0-5
-Date: 2018-10-27
+Version: 1.8.0-6
+Date: 2018-10-28
 Author: Hugh Devlin, Ph. D., Tom Schenk, Jr., and John Malc
 Maintainer: "Tom Schenk Jr."
 Depends:
diff --git a/R/RSocrata.R b/R/RSocrata.R
index 4715296..b1b88d9 100644
--- a/R/RSocrata.R
+++ b/R/RSocrata.R
@@ -499,7 +499,7 @@ export.socrata <- function(url, app_token = NULL) {
     downloadUrl <- ls$distribution[[i]]$downloadURL[1] # Currently grabs CSV, which is the first element
     if(is.null(downloadUrl)) { # Skips if not a data file (e.g., Socrata Pages)
       next
-    } else if (grepl(".csv", downloadUrl)) { # Downloads if it's a CSV
+    } else if(grepl(".csv", downloadUrl)) { # Downloads if it's a CSV
       d <- read.socrata(downloadUrl, app_token)
 
       # Construct the filename output
@@ -516,6 +516,9 @@ export.socrata <- function(url, app_token = NULL) {
       response <- GET(downloadUrl) # Downloads non-CSVs
 
       # Construct the filename output
+      if(is.null(response$headers$`content-disposition`)) {
+        next
+      }
       content_disposition <- response$headers$`content-disposition`
       default_format_raw <- strsplit(content_disposition, "filename=")[[1]][2]
       default_format_cleaned <- gsub('"', "", default_format_raw)

From ccc4c96e9d2a8e3c0ed8078a6a75d6378b04102e Mon Sep 17 00:00:00 2001
From: Tom Schenk Jr
Date: Sun, 5 Jan 2020 11:14:57 -0600
Subject: [PATCH 4/5] Handles non-CSV file types #126

* Ignores HTML files (e.g., Socrata Pages)
* Ignores entries on occasions when there isn't any data
* Will download (uncompressed) PDFs, Word, Excel, PowerPoint, and plain
  text attachments.
---
 DESCRIPTION  |  2 +-
 R/RSocrata.R | 29 ++++++++++++++++-------------
 2 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 26fade7..9f534e4 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -11,7 +11,7 @@ Description: Provides easier interaction with
     Users can upload data to Socrata portals
     directly from R.
 Version: 1.8.0-6
-Date: 2018-10-28
+Date: 2019-01-05
 Author: Hugh Devlin, Ph. D., Tom Schenk, Jr., and John Malc
 Maintainer: "Tom Schenk Jr."
 Depends:
diff --git a/R/RSocrata.R b/R/RSocrata.R
index b1b88d9..a37c250 100644
--- a/R/RSocrata.R
+++ b/R/RSocrata.R
@@ -461,14 +461,14 @@ write.socrata <- function(dataframe, dataset_json_endpoint, update_mode, email,
 
 #' Exports CSVs from Socrata data portals
 #'
-#' Input the URL of a data portal (e.g., "data.cityofchicago.org") and
-#' will download all CSV files (no other files supported) and saved in
-#' a single directory named after the root URL (e.g., "data.cityofchicago.org/").
-#' Downloaded files are compressed to GZip format and timestamped so the download
-#' time is cataloged. The site's data.json file is downloaded as a canonical index
-#' of data saved from the website. Users can cross-reference the data.json file
-#' by matching the "four-by-four" in data.json with the first nine characters of
-#' GZipped files.
+#' Input the base URL of a data portal (e.g., "data.cityofchicago.org") and
+#' will download CSVs, PDFs, Word, Excel, and PowerPoint files contained on
+#' the respective data portal into a single directory named after the root URL.
+#' Downloaded CSV files are compressed to GZip format and each file timestamped
+#' so the download time is cataloged. The site's data.json file is downloaded
+#' as a canonical index of data saved from the website. Users can cross-reference
+#' the data.json file by matching the "four-by-four" in data.json with the first
+#' nine characters of downloaded files.
 #' @param url - the base URL of a domain (e.g., "data.cityofchicago.org")
 #' @param app_token - a string; SODA API token used to query the data
 #' portal \url{http://dev.socrata.com/consumers/getting-started.html}
@@ -478,7 +478,7 @@ write.socrata <- function(dataframe, dataset_json_endpoint, update_mode, email,
 #' @importFrom jsonlite write_json
 #' @importFrom utils write.csv
 #' @export
-export.socrata <- function(url, app_token = NULL) {
+export.socrata <- function(url, path = getwd(), app_token = NULL) {
   dir.create(basename(url), showWarnings = FALSE) # Create directory based on URL
 
   downloadTime <- Sys.time() # Grab timestamp when data.json was downloaded
@@ -497,10 +497,11 @@ export.socrata <- function(url, app_token = NULL) {
 
     # Download data
     downloadUrl <- ls$distribution[[i]]$downloadURL[1] # Currently grabs CSV, which is the first element
-    if(is.null(downloadUrl)) { # Skips if not a data file (e.g., Socrata Pages)
+    mediaType <- ls$distribution[[i]]$mediaType
+    if(is.null(downloadUrl)) { # Skips if there is no data or file
       next
-    } else if(grepl(".csv", downloadUrl)) { # Downloads if it's a CSV
-      d <- read.socrata(downloadUrl, app_token)
+    } else if(mediaType[1] == "text/csv") { # Downloads if it's a CSV
+      d <- RSocrata::read.socrata(downloadUrl, app_token)
 
       # Construct the filename output
       default_format <- "csv"
@@ -512,8 +513,10 @@ export.socrata <- function(url, app_token = NULL) {
 
       # Write file
       write.csv(d, file = gzfile(filename)) # Writes g-zipped file
+    } else if(mediaType == "text/html") { # Skips file if it's an HTML page
+      next
     } else {
-      response <- GET(downloadUrl) # Downloads non-CSVs
+      response <- httr::GET(downloadUrl) # Downloads non-CSVs (e.g. PDF, Word, etc.)
 
       # Construct the filename output
       if(is.null(response$headers$`content-disposition`)) {
         next
       }

From f9ec527a5f80e282f839581e41268265ffee36ad Mon Sep 17 00:00:00 2001
From: Tom Schenk Jr
Date: Sun, 5 Jan 2020 18:54:15 -0600
Subject: [PATCH 5/5] Several clean-up items for `export.socrata()`

* Removed user-defined option for file output (not available yet)
* Clarified documentation on where `export.socrata()` files will be located.
* Fixed incorrect date in `DESCRIPTION` file.
* Iterating build number.
---
 DESCRIPTION  |  4 ++--
 R/RSocrata.R | 16 ++++++++--------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 2fddcb1..c71f8fe 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -10,8 +10,8 @@ Description: Provides easier interaction with
     format and manages throttling by 'Socrata'.
     Users can upload data to 'Socrata' portals
     directly from R.
-Version: 1.8.0-10
-Date: 2019-01-27
+Version: 1.8.0-11
+Date: 2019-01-05
 Author: Hugh Devlin, Ph. D., Tom Schenk, Jr., Gene Leynes, Nick Lucius, John Malc, Mark Silverberg, and Peter Schmeideskamp
 Maintainer: "Tom Schenk Jr."
 Depends:
diff --git a/R/RSocrata.R b/R/RSocrata.R
index 61c34d9..092acf4 100644
--- a/R/RSocrata.R
+++ b/R/RSocrata.R
@@ -523,12 +523,12 @@ write.socrata <- function(dataframe, dataset_json_endpoint, update_mode, email,
 #'
 #' Input the base URL of a data portal (e.g., "data.cityofchicago.org") and
 #' will download CSVs, PDFs, Word, Excel, and PowerPoint files contained on
-#' the respective data portal into a single directory named after the root URL.
-#' Downloaded CSV files are compressed to GZip format and each file timestamped
-#' so the download time is cataloged. The site's data.json file is downloaded
-#' as a canonical index of data saved from the website. Users can cross-reference
-#' the data.json file by matching the "four-by-four" in data.json with the first
-#' nine characters of downloaded files.
+#' the respective data portal into a single directory named after the root URL
+#' in the current working directory. Downloaded CSV files are compressed to GZip
+#' format and each file timestamped so the download time is cataloged. The site's
+#' data.json file is downloaded as a canonical index of data saved from the website.
+#' Users can cross-reference the data.json file by matching the "four-by-four" in
+#' data.json with the first nine characters of downloaded files.
 #' @param url - the base URL of a domain (e.g., "data.cityofchicago.org")
 #' @param app_token - a string; SODA API token used to query the data
 #' portal \url{http://dev.socrata.com/consumers/getting-started.html}
@@ -538,7 +538,7 @@ write.socrata <- function(dataframe, dataset_json_endpoint, update_mode, email,
 #' @importFrom jsonlite write_json
 #' @importFrom utils write.csv
 #' @export
-export.socrata <- function(url, path = getwd(), app_token = NULL) {
+export.socrata <- function(url, app_token = NULL) {
   dir.create(basename(url), showWarnings = FALSE) # Create directory based on URL
 
   downloadTime <- Sys.time() # Grab timestamp when data.json was downloaded
@@ -557,7 +557,7 @@ export.socrata <- function(url, app_token = NULL) {
 
     # Download data
     downloadUrl <- ls$distribution[[i]]$downloadURL[1] # Currently grabs CSV, which is the first element
-    mediaType <- ls$distribution[[i]]$mediaType
+    mediaType <- ls$distribution[[i]]$mediaType[1] # Grabs the first media type
     if(is.null(downloadUrl)) { # Skips if there is no data or file
       next
     } else if(mediaType[1] == "text/csv") { # Downloads if it's a CSV
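
The filename handling that patches 1 and 3 touch can be traced end to end in a
small sketch: for non-CSV attachments, `export.socrata()` derives the file
extension from the Content-Disposition response header, which is why patch 1
qualifies `file_ext()` with `tools::` and why patch 3 skips entries whose
response lacks that header. This is a minimal sketch, not part of the patches;
the header value is illustrative (real values come from `httr::GET(downloadUrl)`):

    content_disposition <- 'attachment; filename="budget_2019.pdf"'  # illustrative value
    default_format_raw <- strsplit(content_disposition, "filename=")[[1]][2]
    default_format_cleaned <- gsub('"', "", default_format_raw)      # "budget_2019.pdf"
    default_format <- tools::file_ext(default_format_cleaned)        # "pdf"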
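
The cross-referencing workflow the commit messages describe can likewise be
sketched. This is a minimal sketch, not part of the patches: it assumes the
directory layout the patched function creates ("<domain>/<four-by-four>_<timestamp>.csv.gz"
alongside "data_json_<timestamp>.json"), and that reading the archived listing
back with `simplifyVector = TRUE` recovers the data frame that
`jsonlite::write_json()` stored; all object names are illustrative.

    domain <- "data.cityofchicago.org"
    RSocrata::export.socrata(domain)  # populates the "data.cityofchicago.org/" directory

    # Read the archived data.json listing back in as a data frame
    index_file <- list.files(domain, pattern = "^data_json_.*\\.json$", full.names = TRUE)[1]
    index <- jsonlite::read_json(index_file, simplifyVector = TRUE)

    # The four-by-four is the tail of each identifier URL and the first
    # nine characters of each downloaded filename
    index_ids <- basename(index$identifier)
    downloads <- list.files(domain, pattern = "\\.csv\\.gz$")
    matched <- downloads[substr(downloads, 1, 9) %in% index_ids]
    skipped <- setdiff(index_ids, substr(downloads, 1, 9))  # entries with no download (e.g., HTML pages)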