diff --git a/.Rbuildignore b/.Rbuildignore index e19c31c..90b06bc 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -8,7 +8,9 @@ RSocrata.Rcheck ^.*\.Rproj$ ^\.Rproj\.user$ .DS_Store -README.md ^\.travis\.yml$ appveyor.yml -CONTRIBUTING.md \ No newline at end of file +CONTRIBUTING.md +vignettes/rsconnect +vignettes/bench.rmd +^.*\.o$ diff --git a/.gitignore b/.gitignore index 50471e5..70a3405 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,6 @@ /.README.md.html /.settings /DESCRIPTION.Rcheck -/man /out /RCheck /RSocrata_*.tar.gz @@ -14,3 +13,5 @@ *.Rproj.user *.Rhistory .Rproj.user +inst/doc +/vignettes/rsconnect diff --git a/.travis.yml b/.travis.yml index be609b7..ea090a5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,26 +1,46 @@ -# Sample .travis.yml for R projects. -# -# See README.md for instructions, or for more configuration options, -# see the wiki: -# http://docs.travis-ci.com/user/languages/r/ - -language: R +language: c sudo: required -warnings_are_errors: true +dist: trusty +script: ./travis-tool.sh run_tests + +before_script: + - sudo apt-get --yes --force-yes update -qq + - sudo apt-get install -y gdal-bin libgdal-dev libgdal1-dev netcdf-bin libproj-dev libv8-dev + - curl -OL http://raw.github.com/craigcitro/r-travis/master/scripts/travis-tool.sh + - chmod 755 ./travis-tool.sh + - ./travis-tool.sh bootstrap + - ./travis-tool.sh install_deps r_github_packages: - hadley/httr - jeroenooms/jsonlite - jeroenooms/curl - klutometis/roxygen + - jimhester/covr + - yihui/mime + - ropensci/geojsonio + +after_failure: + - ./travis-tool.sh dump_logs + +env: + global: + - R_LIBS="http://cran.rstudio.com" + - R_BUILD_ARGS="--no-build-vignettes --no-manual" + - R_CHECK_ARGS="--no-build-vignettes --no-manual --as-cran" + - BOOTSTRAP_LATEX="" + +#language: R +#warnings_are_errors: true + +#before_install: +# - sudo apt-get --yes --force-yes update -qq +# - sudo apt-get install -y gdal-bin libgdal-dev libgdal1-dev netcdf-bin libproj-dev libv8-dev -before_install: 
- - Rscript -e "install.packages('roxygen2', repos='http://cran.us.r-project.org'); library(roxygen2); roxygen2::roxygenize(package.dir='.', roclets=c('rd', 'collate', 'namespace'))" - -after_success: - - Rscript -e 'source("R/tests/testRSocrata.R"); runAllTestsCI()' +#after_success: +# - Rscript -e 'library(covr);coveralls()' notifications: email: on_success: change - on_failure: change + on_failure: change \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9737e87..8ef743d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,4 +1,3 @@ - # How to contribute We really appreciate when users [fix bugs](https://github.com/Chicago/RSocrata/pull/25) or [provide new features](https://github.com/Chicago/RSocrata/pull/21). When submitting changes, please read below to help the development team keep on top of issues and changes. @@ -18,8 +17,8 @@ If you have multiple issues, please submit multiple requests. Once you submit yo When you want to make a change, either to fix a bug or introduce a new feature, please follow the instructions below -* Create a branch or fork of the project based off of the `dev` branch. +* Create a branch or fork of the project based off of the **`dev`** branch. * Make commits of logical units * Add unit tests for any new features -* Run all tests in `R/tests/testRSocrata.R` +* Run all tests in `tests/testthat/` (CTRL+SHIFT+T) * Create a pull request with a robust description or [reference the issue number](https://github.com/Chicago/RSocrata/issues) \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 0ba45af..46b1c45 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,25 +1,30 @@ Package: RSocrata +Type: Package Title: Download 'Socrata' Data Sets as R Data Frames Description: Provides easier interaction with Socrata open data portals http://dev.socrata.com. 
Users can provide a 'Socrata' data set resource URL, or a 'Socrata' Open Data API (SoDA) web query, - or a 'Socrata' "human-friendly" URL, - returns an R data frame. - Converts dates to 'POSIX' format. - Manages throttling by 'Socrata'. -Version: 1.6.1-4 -Date: 2015-7-10 + or a 'Socrata' "human-friendly" URL, all of which + return an R data frame. + Additionally, it converts dates to 'POSIX' format, + manages throttling by 'Socrata' and supports geospatial data. +Version: 1.7.0-9 +Date: 2015-10-26 +Author: Hugh Devlin, Ph. D., Tom Schenk, Jr. and John Malc (@dmpe) +Maintainer: "Tom Schenk Jr." +Depends: + R (>= 3.2.2) +Imports: + httr (>= 1.0.0), + jsonlite (>= 0.9.19), + mime (>= 0.4), + geojsonio (>= 0.1.4), + plyr (>= 1.8.3) +Suggests: + testthat (>= 0.11.0), + roxygen2 (>= 5.0.1), +License: MIT + file LICENSE URL: https://github.com/Chicago/RSocrata BugReports: https://github.com/Chicago/RSocrata/issues -Imports: - httr (>= 0.3), - jsonlite (>= 0.9.14), - mime (>= 0.2), -Depends: - curl (>= 0.5) -Suggests: - testthat (>= 0.10.0) -Author: Hugh Devlin, Ph. D. and Tom Schenk, Jr. 
-Maintainer: Tom Schenk Jr -License: MIT + file LICENSE +RoxygenNote: 5.0.1 \ No newline at end of file diff --git a/NAMESPACE b/NAMESPACE index a722644..1132be4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,8 +1,19 @@ -export(fieldName) -export(posixify) -export(read.socrata) +# Generated by roxygen2: do not edit by hand + +export(getMetadata) +export(isFourByFour) export(ls.socrata) -importFrom("httr", "parse_url", "build_url", "http_status", "stop_for_status", "GET", "content") -importFrom("mime", "guess_type") -importFrom("jsonlite", "fromJSON") -import("curl") +export(read.socrata) +export(read.socrataGEO) +export(validateUrl) +importFrom(geojsonio,geojson_read) +importFrom(httr,GET) +importFrom(httr,add_headers) +importFrom(httr,build_url) +importFrom(httr,config) +importFrom(httr,content) +importFrom(httr,parse_url) +importFrom(httr,stop_for_status) +importFrom(jsonlite,fromJSON) +importFrom(mime,guess_type) +importFrom(plyr,rbind.fill) diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000..6e3191d --- /dev/null +++ b/NEWS.md @@ -0,0 +1,48 @@ +### 1.1 +Add check for valid Socrata resource URL. Add check for supported download file format. Add support for Socrata short dates. + +### 1.2 +Use comma-separated file format for Socrata downloads. + +### 1.3 +* Added support for human-readable URL. Users can now copy and paste URLs of Socrata-hosted datasets, which will be transformed into a valid SoDA API web query. + +* Added additional RUnit tests to validate new functionality. + +### 1.4 +Add json file format for Socrata downloads. Switch to `RJSONIO` from `rjson`. + +### 1.5 Several changes: + +* Swapped to ```jsonlite``` from ```RJSONIO``` +* Added handling for long and short dates +* Added unit test for reading private datasets + +### 1.5.1 +Deprecated ```httr::guess_media()``` and implemented ```mime::guess_type()``` + +### 1.6.0 Several changes: + +* New function, ```ls.socrata``` to list all datasets on a Socrata portal. 
+* New optional argument, ```app_token```, which lets users supply an API token while using ```read.socrata()``` to minimize throttling. +* Repairs a bug where ```read.socrata``` failed when reading in a date column that contains null values. +* Minor changes to the DESCRIPTION documentation to point users to GitHub for issues and provides new contact information. + +### 1.6.1 Bug fixes: + +* Resolved potential [name collision issue](https://github.com/Chicago/RSocrata/issues/42) +* Cleaned-up documentation with contributor instructions ([#23](https://github.com/Chicago/RSocrata/issues/23) and [#28](https://github.com/Chicago/RSocrata/issues/28)) +* Moved test coverage in `RUnit` to `testthat` and implemented code coverage monitoring ([#41](https://github.com/Chicago/RSocrata/issues/41)) +* Clean-up DESCRIPTION ([#40](https://github.com/Chicago/RSocrata/issues/40)) +* Add continuous integration for Windows ([#39](https://github.com/Chicago/RSocrata/issues/39)) +* Migrate Travis-CI to "proper" R YAML ([#46](https://github.com/Chicago/RSocrata/issues/46)) + + +### 1.7.0 Several changes, bug fixes and new features: + +* New function, `read.socrataGEO()`, which allows downloading GeoJSON data from Socrata data portals. +* Downloads now default to JSON for any requests for improved speed ([#53](https://github.com/Chicago/RSocrata/pull/53)) +* Add support for a "floating timestamp" +* Improved error handling and warnings +* Fixed errors when "NA" was present in date columns ([#27](https://github.com/Chicago/RSocrata/issues/27) and [#24](https://github.com/Chicago/RSocrata/pull/25)) +* Use `plyr` to improve the speed and performance of downloads [in some cases](https://github.com/Chicago/RSocrata/pull/56). 
\ No newline at end of file diff --git a/R/RSocrata.R b/R/RSocrata.R deleted file mode 100644 index d543b40..0000000 --- a/R/RSocrata.R +++ /dev/null @@ -1,233 +0,0 @@ -# An interface to data hosted online in Socrata data repositories -# -# Author: Hugh J. Devlin, Ph. D. 2013-08-28 -############################################################################### - -# library('httr') # for access to the HTTP header -# library('jsonlite') # for parsing data types from Socrata -# library('mime') # for guessing mime type - -#' Time-stamped message -#' -#' Issue a time-stamped, origin-stamped log message. -#' @param s a string -#' @return None (invisible NULL) as per cat -#' @author Hugh J. Devlin \email{Hugh.Devlin@@cityofchicago.org} -logMsg <- function(s) { - cat(format(Sys.time(), "%Y-%m-%d %H:%M:%OS3 "), as.character(sys.call(-1))[1], ": ", s, '\n', sep='') -} - -#' Checks the validity of the syntax for a potential Socrata dataset Unique Identifier, also known as a 4x4. -#' -#' Will check the validity of a potential dataset unique identifier -#' supported by Socrata. It will provide an exception if the syntax -#' does not align to Socrata unique identifiers. It only checks for -#' the validity of the syntax, but does not check if it actually exists. -#' @param fourByFour a string; character vector of length one -#' @return TRUE if is valid Socrata unique identifier, FALSE otherwise -#' @author Tom Schenk Jr \email{tom.schenk@@cityofchicago.org} -isFourByFour <- function(fourByFour) { - fourByFour <- as.character(fourByFour) - if(nchar(fourByFour) != 9) - return(FALSE) - if(regexpr("[[:alnum:]]{4}-[[:alnum:]]{4}", fourByFour) == -1) - return(FALSE) - TRUE -} - -#' Convert, if necessary, URL to valid REST API URL supported by Socrata. -#' -#' Will convert a human-readable URL to a valid REST API call -#' supported by Socrata. It will accept a valid API URL if provided -#' by users and will also convert a human-readable URL to a valid API -#' URL. 
Will accept queries with optional API token as a separate -#' argument or will also accept API token in the URL query. Will -#' resolve conflicting API token by deferring to original URL. -#' @param url a string; character vector of length one -#' @param app_token a string; SODA API token used to query the data -#' portal \url{http://dev.socrata.com/consumers/getting-started.html} -#' @return a valid Url -#' @author Tom Schenk Jr \email{tom.schenk@@cityofchicago.org} -validateUrl <- function(url, app_token) { - url <- as.character(url) - parsedUrl <- httr::parse_url(url) - if(is.null(parsedUrl$scheme) | is.null(parsedUrl$hostname) | is.null(parsedUrl$path)) - stop(url, " does not appear to be a valid URL.") - if(!is.null(app_token)) { # Handles the addition of API token and resolves invalid uses - if(is.null(parsedUrl$query[["$$app_token"]])) { - token_inclusion <- "valid_use" - } else { - token_inclusion <- "already_included" } - switch(token_inclusion, - "already_included"={ # Token already included in url argument - warning(url, " already contains an API token in url. Ignoring user-defined token.") - }, - "valid_use"={ # app_token argument is used, not duplicative. 
- parsedUrl$query[["app_token"]] <- as.character(paste("%24%24app_token=", app_token, sep="")) - }) - } - if(substr(parsedUrl$path, 1, 9) == 'resource/') { - return(httr::build_url(parsedUrl)) # resource url already - } - fourByFour <- basename(parsedUrl$path) - if(!isFourByFour(fourByFour)) - stop(fourByFour, " is not a valid Socrata dataset unique identifier.") - else { - parsedUrl$path <- paste('resource/', fourByFour, '.csv', sep="") - httr::build_url(parsedUrl) - } -} - -#' Convert Socrata human-readable column name to field name -#' -#' Convert Socrata human-readable column name, -#' as it might appear in the first row of data, -#' to field name as it might appear in the HTTP header; -#' that is, lower case, periods replaced with underscores#' -#' @param humanName a Socrata human-readable column name -#' @return Socrata field name -#' @export -#' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org} -#' @examples -#' #fieldName("Number.of.Stations") # number_of_stations -fieldName <- function(humanName) { - tolower(gsub('\\.', '_', as.character(humanName))) -} - -#' Convert Socrata calendar_date string to POSIX -#' -#' @param x character vector in one of two Socrata calendar_date formats -#' @return a POSIX date -#' @export -#' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org} -posixify <- function(x) { - x <- as.character(x) - if (length(x)==0) return(x) - # Two calendar date formats supplied by Socrata - if(any(regexpr("^[[:digit:]]{1,2}/[[:digit:]]{1,2}/[[:digit:]]{4}$", x[1])[1] == 1)) - strptime(x, format="%m/%d/%Y") # short date format - else - strptime(x, format="%m/%d/%Y %I:%M:%S %p") # long date-time format -} - -# Wrap httr GET in some diagnostics -# -# In case of failure, report error details from Socrata -# -# @param url Socrata Open Data Application Program Interface (SODA) query -# @return httr response object -# @author Hugh J. Devlin, Ph. D. 
\email{Hugh.Devlin@@cityofchicago.org} -getResponse <- function(url) { - response <- httr::GET(url) - status <- httr::http_status(response) - if(response$status_code != 200) { - msg <- paste("Error in httr GET:", response$status_code, response$headers$statusmessage, url) - if(!is.null(response$headers$`content-length`) && (response$headers$`content-length` > 0)) { - details <- httr::content(response) - msg <- paste(msg, details$code[1], details$message[1]) - } - logMsg(msg) - } - httr::stop_for_status(response) - response -} - -# Content parsers -# -# Return a data frame for csv -# -# @author Hugh J. Devlin \email{Hugh.Devlin@@cityofchicago.org} -# @param an httr response object -# @return data frame, possibly empty -getContentAsDataFrame <- function(response) { UseMethod('response') } -getContentAsDataFrame <- function(response) { - mimeType <- response$header$'content-type' - # skip optional parameters - sep <- regexpr(';', mimeType)[1] - if(sep != -1) mimeType <- substr(mimeType, 0, sep[1] - 1) - switch(mimeType, - 'text/csv' = - httr::content(response), # automatic parsing - 'application/json' = - if(httr::content(response, as='text') == "[ ]") # empty json? - data.frame() # empty data frame - else - data.frame(t(sapply(httr::content(response), unlist)), stringsAsFactors=FALSE) - ) # end switch -} - -# Get the SoDA 2 data types -# -# Get the Socrata Open Data Application Program Interface data types from the http response header -# @author Hugh J. Devlin, Ph. D. 
\email{Hugh.Devlin@@cityofchicago.org} -# @param responseHeaders headers attribute from an httr response object -# @return a named vector mapping field names to data types -getSodaTypes <- function(response) { UseMethod('response') } -getSodaTypes <- function(response) { - result <- jsonlite::fromJSON(response$headers[['x-soda2-types']]) - names(result) <- jsonlite::fromJSON(response$headers[['x-soda2-fields']]) - result -} - -#' Get a full Socrata data set as an R data frame -#' -#' Manages throttling and POSIX date-time conversions -#' -#' @param url A Socrata resource URL, -#' or a Socrata "human-friendly" URL, -#' or Socrata Open Data Application Program Interface (SODA) query -#' requesting a comma-separated download format (.csv suffix), -#' May include SoQL parameters, -#' but is assumed to not include a SODA offset parameter -#' @param app_token a string; SODA API token used to query the data -#' portal \url{http://dev.socrata.com/consumers/getting-started.html} -#' @return an R data frame with POSIX dates -#' @export -#' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org} -#' @examples -#' df <- read.socrata("http://soda.demo.socrata.com/resource/4334-bgaj.csv") -read.socrata <- function(url, app_token = NULL) { - validUrl <- validateUrl(url, app_token) # check url syntax, allow human-readable Socrata url - parsedUrl <- httr::parse_url(validUrl) - mimeType <- mime::guess_type(parsedUrl$path) - if(!(mimeType %in% c('text/csv','application/json'))) - stop("Error in read.socrata: ", mimeType, " not a supported data format.") - response <- getResponse(validUrl) - page <- getContentAsDataFrame(response) - result <- page - dataTypes <- getSodaTypes(response) - while (nrow(page) > 0) { # more to come maybe? 
- query <- paste(validUrl, if(is.null(parsedUrl$query)) {'?'} else {"&"}, '$offset=', nrow(result), sep='') - response <- getResponse(query) - page <- getContentAsDataFrame(response) - result <- rbind(result, page) # accumulate - } - # convert Socrata calendar dates to posix format - for(columnName in colnames(page)[!is.na(dataTypes[fieldName(colnames(page))]) & dataTypes[fieldName(colnames(page))] == 'calendar_date']) { - result[[columnName]] <- posixify(result[[columnName]]) - } - result -} - -#' List datasets available from a Socrata domain -#' -#' @param url A Socrata URL. This simply points to the site root. -#' @return an R data frame containing a listing of datasets along with -#' various metadata. -#' @export -#' @author Peter Schmiedeskamp \email{pschmied@@uw.edu} -#' @examples -#' df <- ls.socrata("http://soda.demo.socrata.com") -ls.socrata <- function(url) { - url <- as.character(url) - parsedUrl <- httr::parse_url(url) - if(is.null(parsedUrl$scheme) | is.null(parsedUrl$hostname)) - stop(url, " does not appear to be a valid URL.") - parsedUrl$path <- "data.json" - df <- jsonlite::fromJSON(httr::build_url(parsedUrl)) - df <- as.data.frame(df$dataset) - df$issued <- as.POSIXct(df$issued) - df$modified <- as.POSIXct(df$modified) - df$theme <- as.character(df$theme) - df -} \ No newline at end of file diff --git a/R/errorHandling.R b/R/errorHandling.R new file mode 100644 index 0000000..2a99025 --- /dev/null +++ b/R/errorHandling.R @@ -0,0 +1,45 @@ +# Provides error handling functionality +# +# @description Based on \url{http://dev.socrata.com/docs/response-codes.html} +# +# @section TODO: Add messages that alert the user on the URL being valid, +# but one that is not compatible with RSocrata. 
+# See \url{https://github.com/Chicago/RSocrata/issues/16} +# +# @param url - SOPA url +# @param app_token - token for private data +#' @importFrom httr GET add_headers stop_for_status config +errorHandling <- function(url = "", app_token = NULL) { + rsp <- httr::GET(url, httr::add_headers("X-App-Token" = app_token), config(fresh_connect = 1L)) + + if (rsp$status_code == 200) { + invisible("OK. Your request was successful.") + + } else if (rsp$status_code == 202) { + warning("202 Request processing. You can retry your request, and when it's complete, you'll get a 200 instead.") + + } else if (rsp$status_code == 400) { + stop("400 Bad request. Most probably was your request malformed (e.g URL with ?)") + + } else if (rsp$status_code == 401) { + # only necessary when accessing datasets that have been marked as private or when making write requests (PUT, POST, and DELETE) + stop("Unauthorized. You attempted to authenticate but something went wrong.") + + } else if (rsp$status_code == 403) { + stop("Forbidden. You're not authorized to access this resource. Make sure you authenticate to access private datasets.") + + } else if (rsp$status_code == 404) { + stop("Not found. The resource requested doesn't exist.") + + } else if (rsp$status_code == 429) { + stop("Too Many Requests. Your client is currently being rate limited. Make sure you're using an app token.") + + } else if (rsp$status_code == 500) { + stop("Server error. Try later.") + + } else { + httr::stop_for_status(rsp) + } + + return(rsp) +} \ No newline at end of file diff --git a/R/listDatasets.R b/R/listDatasets.R new file mode 100644 index 0000000..d023261 --- /dev/null +++ b/R/listDatasets.R @@ -0,0 +1,32 @@ +#' List datasets available from a Socrata domain +#' +#' @param url - A Socrata URL. This simply points to the site root. +#' @return an R data frame containing a listing of datasets along with +#' various metadata. 
+#' @author Peter Schmiedeskamp \email{pschmied@@uw.edu} +#' @note URLs such as \code{"soda.demo.socrata.com"} are not supported +#' @examples +#' df <- ls.socrata(url = "http://soda.demo.socrata.com") +#' ## df.ny <- ls.socrata("https://data.ny.gov/") +#' +#' @importFrom jsonlite fromJSON +#' @importFrom httr parse_url build_url +#' +#' @export +ls.socrata <- function(url = "") { + + parsedUrl <- httr::parse_url(url) + + if(is.null(parsedUrl$scheme) | is.null(parsedUrl$hostname)) { + stop(url, " does not appear to be a valid URL.") + } + parsedUrl$path <- "data.json" + + df <- jsonlite::fromJSON(httr::build_url(parsedUrl)) + df <- as.data.frame(df$dataset, stringsAsFactors = FALSE) + df$issued <- as.POSIXct(df$issued) + df$modified <- as.POSIXct(df$modified) + df$theme <- as.character(df$theme) + + return(df) +} \ No newline at end of file diff --git a/R/metadata.R b/R/metadata.R new file mode 100644 index 0000000..6c55cd2 --- /dev/null +++ b/R/metadata.R @@ -0,0 +1,102 @@ +#' Return metadata about a Socrata dataset +#' +#' This function returns metadata about a dataset. Generally, such metadata can be accessed +#' with browser at \code{http://DOMAIN/api/views/FOUR-FOUR/rows.json} or +#' \code{http://DOMAIN/api/views/FOUR-FOUR/columns.json}, which is used here. +#' +#' @param url - A Socrata resource URL, or a Socrata "human-friendly" URL! +#' +#' @source \url{http://stackoverflow.com/a/29782941} +#' +#' @examples +#' \dontrun{ +#' gM1 <- getMetadata(url = "http://data.cityofchicago.org/resource/y93d-d9e3.json") +#' gM3 <- getMetadata(url = "https://data.cityofchicago.org/resource/6zsd-86xi") +#' gM2 <- getMetadata(url = "https://data.cityofboston.gov/resource/awu8-dc52") +#' } +#' +#' @return a list (!) 
containing a number of rows & columns and a data frame of metadata +#' +#' @importFrom jsonlite fromJSON +#' @importFrom httr parse_url build_url +#' +#' @author John Malc \email{cincenko@@outlook.com} +#' +#' @export +getMetadata <- function(url = "") { + + # use function below to get them using =COUNT(*) SODA query + gQRC <- getQueryRowCount(url) + + urlParsedBase <- httr::parse_url(url) + urlParsed <- urlParsedBase + + # create URL for metadata data frame + fourByFour <- substr(basename(urlParsedBase$path), 1, 9) + urlParsed$path <- paste0("api/views/", fourByFour, "/columns.json") + + # execute it + URL <- httr::build_url(urlParsed) + df <- jsonlite::fromJSON(URL) + + # number of rows can be sometimes "cached". If yes, then below we calculate the maximum number of + # rows from all non-null and null fields. + # If not, then it uses "getQueryRowCount" fnct with SODA =COUNT(*) SODA query. + rows <- if (suppressWarnings(max(df$cachedContents$non_null + df$cachedContents$null)) > 0) { + suppressWarnings(max(df$cachedContents$non_null + df$cachedContents$null)) + } else { + # as.numeric(ifelse(is.null(gQRC$count), gQRC$COUNT, gQRC$count)) # the reason + as.numeric(gQRC) + } + + columns <- as.numeric(nrow(df)) + + return(list(rows = rows, cols = columns, df)) +} + +#' Return (always & only) number of rows as specified in the metadata of the data set +#' +#' @source Taken from \link{https://github.com/Chicago/RSocrata/blob/sprint7/R/getQueryRowCount.R} +#' @author Gene Leynes \email{gleynes@@gmail.com} +#' @return number of rows the dataset has +#' +#' @importFrom httr parse_url build_url content +#' @importFrom mime guess_type +#' @noRd +getQueryRowCount <- function(validUrl) { + + urlParsed <- httr::parse_url(validUrl) + + ## Construct the count query based on the URL, + if (is.null(urlParsed[['query']])) { + ## If there is no query at all, create a simple count + + cntQueryText <- "?$SELECT=COUNT(*)" + } else { + ## Otherwise, construct the query text with a COUNT 
command at the beginning of any other + ## limiting commands. Reconstitute the httr url into a string + cntQueryText <- httr::build_url(structure(list(query = urlParsed[['query']]), class = "url")) + ## Add the COUNT command to the beginning of the query + cntQueryText <- gsub(pattern = ".+\\?", replacement = "?$SELECT=COUNT(*)&", cntQueryText) + } + + ## Combine the count query with the rest of the URL + cntUrl <- paste0(urlParsed[[c('scheme')]], "://", urlParsed[[c('hostname')]], "/", + urlParsed[[c('path')]], cntQueryText) + + ## Execute the query to count the rows + totalRowsResult <- errorHandling(cntUrl, app_token = NULL) + + ## Parsing the result depends on the mime type + mimeType <- mime::guess_type(urlParsed$path) + if (mimeType == "application/json") { + totalRows <- httr::content(totalRowsResult)[[1]] + } else { + totalRows <- httr::content(totalRowsResult) + } + + ## Limit the row count to $limit (if the $limit existed). + # totalRows <- min(totalRows, as.numeric(rowLimit)) + + return(totalRows[[1]]) +} \ No newline at end of file diff --git a/R/returnData.R b/R/returnData.R new file mode 100644 index 0000000..3365eff --- /dev/null +++ b/R/returnData.R @@ -0,0 +1,191 @@ +# An interface to data hosted online in Socrata data repositories +# This is the main file which uses other functions to download data from a Socrata repositories +# +# Author: Hugh J. Devlin, Ph. D. et al. +############################################################################### + +#' Converts to data frame even with missing columns +#' +#' @source https://github.com/DASpringate/RSocrata/blob/master/R/RSocrata.R#L130 +#' @source https://github.com/Chicago/RSocrata/pull/3/files +#' +#' If all items are of the same length, just goes ahead and converts to df. +#' If the items are of different lengths, assume the longest has all the columns, +#' fill in the gaps with NA in the other columns and return in the original column order. 
+#' +#' @param con - a list as output by content(response) +#' @return a dataframe +#' @author David A Springate \email{daspringate@@gmail.com} +#' @noRd +content_to_df <- function(con){ + lengths <- sapply(con, length) + if (all(lengths == length(con[[1]]))) { + data.frame(t(sapply(con, unlist)), stringsAsFactors = FALSE) + } else { + all_cols <- names(con[[which(sapply(con, length) == max(sapply(con, length)))[1]]]) + con <- lapply(con, function(x) { + r <- c(x, sapply(all_cols[!all_cols %in% names(x)], function(xx) NA, simplify = FALSE)) + r[all_cols] + }) + data.frame(t(sapply(con, unlist)), stringsAsFactors = FALSE) + } +} + +#' Content parsers +#' +#' Return a data frame for csv or json. GeoJSON is used extra in its own function. +#' +#' @author Hugh J. Devlin \email{Hugh.Devlin@@cityofchicago.org} +#' @importFrom httr content +#' @param response - an httr response object +#' @return data frame, possibly empty +#' @noRd +getContentAsDataFrame <- function(response) { + + mimeType <- response$header$'content-type' + + # skip optional parameters + sep <- regexpr(';', mimeType)[1] + + if (sep != -1) { + mimeType <- substr(mimeType, 0, sep[1] - 1) + } + + switch(mimeType, + "text/csv" = + httr::content(response), # automatic parsing + "application/json" = + if (httr::content(response, as = "text") == "[ ]") { # empty json? + data.frame() # empty data frame + } else { + content_to_df(httr::content(response)) + } + ) + +} + + +#' Get a full Socrata data set as an R data frame +#' +#' @description Manages throttling and POSIX date-time conversions. We support only .json suffix. +#' +#' @param url - A Socrata resource URL, or a Socrata "human-friendly" URL, +#' or Socrata Open Data Application Program Interface (SODA) query +#' requesting a comma-separated download format (.json suffix), +#' May include SoQL parameters, and it is now assumed to include SODA \code{limit} +#' & \code{offset} parameters. 
+#' Either use a compelete URL or use parameters below to construct your URL. +#' @param app_token - a (non-required) string; SODA API token can be used to query the data +#' portal \url{http://dev.socrata.com/consumers/getting-started.html} +#' @param query - Based on query language called the "Socrata Query Language" ("SoQL"), see +#' \url{http://dev.socrata.com/docs/queries.html}. +#' @param limit - defaults to the max of 50000. See \url{http://dev.socrata.com/docs/paging.html}. +#' @param domain - A Socrata domain, e.g \url{http://data.cityofchicago.org} +#' @param fourByFour - a unique 4x4 identifier, e.g. "ydr8-5enu". See more \code{\link{isFourByFour}} +#' +#' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org} +#' +#' @examples +#' \dontrun{ +#' df_1 <- read.socrata(url = "http://soda.demo.socrata.com/resource/4334-bgaj.csv") +#' df_2 <- read.socrata(domain = "http://data.cityofchicago.org/", fourByFour = "ydr8-5enu") +#' df_3 <- read.socrata(url = "http://data.cityofchicago.org/resource/ydr8-5enu.json") +#' } +#' @importFrom httr parse_url +#' @importFrom plyr rbind.fill +#' +#' @export +read.socrata <- function(url = NULL, app_token = NULL, limit = 50000, domain = NULL, fourByFour = NULL, + query = NULL) { + + if (is.null(url) == TRUE) { + buildUrl <- paste0(domain, "/resource/", fourByFour, ".json") + url <- httr::parse_url(buildUrl) + } + + # check url syntax, allow human-readable Socrata url + validUrl <- validateUrl(url) + parsedUrl <- httr::parse_url(validUrl) + + response <- errorHandling(validUrl, app_token) + results <- getContentAsDataFrame(response) + dataTypes <- getSodaTypes(response) + + rowCount <- as.numeric(getQueryRowCount(validUrl)) + + ## More to come? 
Loop over pages implicitly + while (nrow(results) < rowCount) { + query_url <- paste0(validUrl, ifelse(is.null(parsedUrl$query), "?", "&"), "$offset=", nrow(results), "&$limit=", limit) + response <- errorHandling(query_url, app_token) + page <- getContentAsDataFrame(response) + results <- plyr::rbind.fill(results, page) # accumulate data + } + + # Convert Socrata calendar dates to POSIX format + # If sodaTypes are not null, check for column names that are not NA and which dataType + # is a "calendar_date". If there are some, then convert them to POSIX format + if (!is.null(dataTypes)) { + for (columnName in colnames(results)[!is.na(dataTypes[fieldName(colnames(results))]) + & dataTypes[fieldName(colnames(results))] == "calendar_date"]) { + results[[columnName]] <- posixify(results[[columnName]]) + } + } + + return(results) +} + + +#' Download GeoJSON data using geojsonio package +#' +#' @param ... - other arguments from \link{geojsonio} package for geojson_read method +#' @param url - A Socrata resource URL, requiring a .geojson suffix. +#' +#' @return Returns a list, which is the default option here. +#' +#' @examples +#' \dontrun{ +#' df_geo <- read.socrataGEO(url = "https://data.cityofchicago.org/resource/6zsd-86xi.geojson") +#' } +#' +#' @importFrom geojsonio geojson_read +#' @importFrom httr parse_url +#' @importFrom mime guess_type +#' +#' @export +read.socrataGEO <- function(url = "", ...) { + + parseUrl <- httr::parse_url(url) + mimeType <- mime::guess_type(parseUrl$path) + + if (mimeType == "application/vnd.geo+json") { + results <- geojsonio::geojson_read(url, method = "local", what = "list", parse = FALSE, ...) + } + + return(results) +} + +#' Get the SoDA 2 data types +#' +#' Get the Socrata Open Data Application Program Interface data types from the http response header. +#' Used only for CSV and JSON, not GeoJSON +#' +#' @author Hugh J. Devlin, Ph. D. 
\email{Hugh.Devlin@@cityofchicago.org} +#' @param response - headers attribute from an httr response object +#' @return a named vector mapping field names to data types +#' @importFrom jsonlite fromJSON +#' @noRd +getSodaTypes <- function(response) { + + # check if types and fields are not null + if (!is.null(response$headers[['x-soda2-types']]) | !is.null(response$headers[['x-soda2-fields']])) { + + result <- jsonlite::fromJSON(response$headers[['x-soda2-types']]) + names(result) <- jsonlite::fromJSON(response$headers[['x-soda2-fields']]) + return(result) + + } else { + NULL + } + +} + diff --git a/R/utils.R b/R/utils.R new file mode 100644 index 0000000..f3bbdbb --- /dev/null +++ b/R/utils.R @@ -0,0 +1,115 @@ +#' Checks the validity of the syntax for a potential Socrata dataset Unique Identifier, also known as a 4x4. +#' +#' @description Will check the validity of a potential dataset unique identifier +#' supported by Socrata. It will provide an exception if the syntax +#' does not align to Socrata unique identifiers. It only checks for +#' the validity of the syntax, but does not check if it actually exists. +#' +#' @param fourByFour - a string; character vector of length one +#' @return TRUE if is valid Socrata unique identifier, FALSE otherwise +#' @author Tom Schenk Jr \email{tom.schenk@@cityofchicago.org} et al. 
+#' @examples +#' isFourByFour(fourByFour = "4334-bgaj") +#' isFourByFour("433-bgaj") +#' isFourByFour(fourByFour = "4334-!gaj") +#' @export +isFourByFour <- function(fourByFour = "") { + + if (nchar(fourByFour) == 9) { + if (identical(grepl("[[:alnum:]]{4}-[[:alnum:]]{4}", fourByFour), TRUE)) { + return(TRUE) + } else { + return(FALSE) + } + } else { + return(FALSE) + } + +} + +# Convert Socrata human-readable column name to field name +# +# @description Convert Socrata human-readable column name, +# as it might appear in the first row of data, +# to field name as it might appear in the HTTP header; +# that is, lower case, periods replaced with underscores +# +# @param humanName - a Socrata human-readable column name +# @return Socrata field name in lower case +# @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org} +# @examples +# fieldName("Number.of.Stations") # number_of_stations +# @noRd +# @export +fieldName <- function(humanName = "") { + tolower(gsub('\\.', '_', humanName)) +} + +# Convert Socrata calendar_date string to POSIX +# +# @description Datasets will either specify what timezone they should be interpreted in, +# or you can usually assume they are in the timezone of the publisher. See examples below too. +# +# @seealso \url{http://dev.socrata.com/docs/datatypes/floating_timestamp.html} +# @param x - character vector in one of possible Socrata calendar_date formats +# @return a POSIX date +# @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org} et al. +# @examples +# posixify("2014-10-13T23:00:00") +# posixify("09/14/2012 10:38:01 PM") +# posixify("09/14/2012") +# @noRd +# @export +posixify <- function(x = "") { + + # https://github.com/Chicago/RSocrata/issues/24 + # If a query with a date column returns no data (e.g. 
NA), posixify would fail without this + if (length(x) == 0) { + return(x) + } + + # Three calendar date formats supplied by Socrata + # See https://github.com/GregDThomas/jquery-localtime/issues/1 for the floating timestamp regex + + if (any(regexpr("^(-?(?:[1-9][0-9]*)?[0-9]{4})-(1[0-2]|0[1-9])-(3[0-1]|0[1-9]|[1-2][0-9])T(2[0-3]|[0-1][0-9]):([0-5][0-9]):([0-5][0-9])(.[0-9]+)?(Z|[+-](?:2[0-3]|[0-1][0-9]):[0-5][0-9])?$", x)[1] == TRUE)) { + # floating timestamp + strptime(x, format = "%Y-%m-%dT%H:%M:%S") + + } else if (any(regexpr("^[[:digit:]]{1,2}/[[:digit:]]{1,2}/[[:digit:]]{4}$", x)[1] == TRUE)) { + # short date format + strptime(x, format = "%m/%d/%Y") + + } else { + # long date-time format + strptime(x, format = "%m/%d/%Y %I:%M:%S %p") + + } + +} + +# Clean everything after "?", "&" or "." +# +# @source https://stackoverflow.com/questions/5631384/remove-everything-after-a-certain-character +# @source http://rfunction.com/archives/1499 +# +# @examples +# cleanQuest(url = "http://data.cityofchicago.org/resource/y93d-d9e3.csv?%24order=debarment_date&%24limit=50000") +# @returns http://data.cityofchicago.org/resource/y93d-d9e3.csv +# @author John Malc \email{cincenko@@outlook.com} +# @export +cleanQuest <- function(url = "") { + cleanURL <- strsplit(url, "?", fixed = TRUE) + return(cleanURL[[1]][1]) +} + +# @export +cleanAmp <- function(url = "") { + cleanURL <- strsplit(url, "&", fixed = TRUE) + return(cleanURL[[1]][1]) +} + +# @export +cleanDot <- function(url = "") { + cleanURL <- strsplit(url, ".", fixed = TRUE) + return(cleanURL[[1]][1]) +} diff --git a/R/validateURL.R b/R/validateURL.R new file mode 100644 index 0000000..83f45e7 --- /dev/null +++ b/R/validateURL.R @@ -0,0 +1,80 @@ +#' Check if the URL is a valid one and supported by RSocrata. +#' +#' @description Will convert a human-readable URL to a valid REST API call +#' supported by Socrata. 
It will accept a valid API URL if provided +#' by users and will also convert a human-readable URL to a valid API +#' URL. Will accept queries with optional API token as a separate +#' argument or will also accept API token in the URL query. Will +#' resolve conflicting API token by deferring to original URL. +#' +#' @param url - a string; character vector of length one +#' @return a valid URL used for downloading data +#' @author Tom Schenk Jr \email{tom.schenk@@cityofchicago.org} et al. +#' +#' @examples +#' # Returns FALSE +#' ## validateUrl(url = "a.fake.url.being.tested") +#' # Returns TRUE +#' validateUrl(url = "https://soda.demo.socrata.com/resource/4334-bgaj") +#' # Returns TRUE +#' validateUrl(url = "https://soda.demo.socrata.com/resource/4334-bgaj.json") +#' # Returns TRUE +#' validateUrl(url = "https://soda.demo.socrata.com/resource/4334-bgaj.csv") +#' # Returns TRUE +#' validateUrl(url = "http://soda.demo.socrata.com/resource/4334-bgaj.json") +#' # Returns TRUE +#' validateUrl(url = "http://soda.demo.socrata.com/resource/4334-bgaj.xml") +#' # Returns TRUE +#' validateUrl(url = "https://data.cityofchicago.org/resource/6zsd-86xi.geojson") +#' # Returns TRUE +#' validateUrl(url = "http://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj.csv") +#' +#' @importFrom httr parse_url build_url content +#' @importFrom mime guess_type +#' +#' @export +validateUrl <- function(url = "") { + parsedUrl <- httr::parse_url(url) + + if ( is.null(parsedUrl$scheme) | is.null(parsedUrl$hostname) | is.null(parsedUrl$path)) { + stop(url, " does not appear to be a valid URL.") + } + + fourByFour <- basename(parsedUrl$path) + if (!isFourByFour(cleanDot(fourByFour))) { + stop(fourByFour, " is not a valid Socrata dataset unique identifier.") + } + + if ( parsedUrl$scheme == "http") { + parsedUrl$scheme <- "https" + } + + # First, if suffix is CSV/XML, delete it and replace with JSON. + # Later, check if URL doesn't have JSON, i.e. 
has empty suffix, and if it does append JSON. + mimeType <- mime::guess_type(parsedUrl$path) + + if ( mimeType %in% c("text/csv", "application/xml")) { + parsedUrl$path <- substr(parsedUrl$path, 1, nchar(parsedUrl$path) - 4) # delete + parsedUrl$path <- paste0(parsedUrl$path, ".json") # add + message("BEWARE: Your suffix is no longer supported. Thus, we will automatically replace it with JSON.") + + } else if ( mimeType == "application/json") { + # do nothing + } else if ( mimeType == "application/vnd.geo+json") { + message("For GeoJSON, you must use a new method: read.socrataGEO") + + } else if ( mimeType == "text/plain") { + parsedUrl$path <- paste0(parsedUrl$path, ".json") + + } else { + stop(mimeType, " has never been supported. Use JSON instead. For GeoJSON use a new method: read.socrataGEO") + } + + if ( substr(parsedUrl$path, 1, 9) == "resource/") { + return(httr::build_url(parsedUrl)) # resource url already + } else { + parsedUrl$path <- paste0("resource/", cleanDot(fourByFour), ".json") + return(httr::build_url(parsedUrl)) # resource url already + } + +} diff --git a/README.md b/README.md index 64ecfec..09da00b 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,11 @@ RSocrata **Master** -[![Linux build - Master](https://img.shields.io/travis/Chicago/RSocrata/master.svg?style=flat-square&label=Linux build)](https://travis-ci.org/Chicago/RSocrata)[![Windows build - Master](https://img.shields.io/appveyor/ci/tomschenkjr/RSocrata/master.svg?style=flat-square&label=Windows build)](https://ci.appveyor.com/project/tomschenkjr/rsocrata/branch/master) +[![Linux build - Master](https://img.shields.io/travis/Chicago/RSocrata/master.svg?style=flat-square&label=Linux build)](https://travis-ci.org/Chicago/RSocrata)[![Windows build - Master](https://img.shields.io/appveyor/ci/tomschenkjr/RSocrata/master.svg?style=flat-square&label=Windows build)](https://ci.appveyor.com/project/tomschenkjr/rsocrata/branch/master)[![Coverage - 
Master](https://img.shields.io/coveralls/Chicago/RSocrata/master.svg?style=flat-square&label=Coverage - Master)](https://coveralls.io/r/Chicago/RSocrata?branch=master) **Dev** -[![Linux build - Dev](https://img.shields.io/travis/Chicago/RSocrata/dev.svg?style=flat-square&label=Linux build)](https://travis-ci.org/Chicago/RSocrata)[![Windows build - Dev](https://img.shields.io/appveyor/ci/tomschenkjr/RSocrata/dev.svg?style=flat-square&label=Windows build)](https://ci.appveyor.com/project/tomschenkjr/rsocrata/branch/dev) +[![Linux build - Dev](https://img.shields.io/travis/Chicago/RSocrata/dev.svg?style=flat-square&label=Linux build)](https://travis-ci.org/Chicago/RSocrata)[![Windows build - Dev](https://img.shields.io/appveyor/ci/tomschenkjr/RSocrata/dev.svg?style=flat-square&label=Windows build)](https://ci.appveyor.com/project/tomschenkjr/rsocrata/branch/dev)[![Coverage - Dev](https://img.shields.io/coveralls/Chicago/RSocrata/dev.svg?style=flat-square&label=Coverage - Dev)](https://coveralls.io/r/Chicago/RSocrata?branch=dev) A tool for downloading Socrata datasets as R data frames -------------------------------------------------------- @@ -23,66 +23,46 @@ Supports [SoDA query parameters](http://dev.socrata.com/docs/queries.html) in th Use ```ls.socrata()``` to list all datasets available on a Socrata webserver. -[RUnit](http://cran.r-project.org/web/packages/RUnit/index.html) test coverage. +This package uses [`testthat`](http://cran.r-project.org/package=testthat) test coverage. 
-### Example: Reading SoDA valid URLs -```r -earthquakesDataFrame <- read.socrata("http://soda.demo.socrata.com/resource/4334-bgaj.csv") -nrow(earthquakesDataFrame) # 1007 (two "pages") -class(earthquakesDataFrame$Datetime[1]) # POSIXlt -``` +### Installation -### Example: Reading "human-readable" URLs -```r -earthquakesDataFrame <- read.socrata("https://soda.demo.socrata.com/dataset/USGS-Earthquakes-for-2012-11-01-API-School-Demo/4334-bgaj") -nrow(earthquakesDataFrame) # 1007 (two "pages") -class(earthquakesDataFrame$Datetime[1]) # POSIXlt -``` +Use `devtools` to install the latest version from Github: -### Example: Using API key to read datasets -```r -token <- "ew2rEMuESuzWPqMkyPfOSGJgE" -earthquakesDataFrame <- read.socrata("http://soda.demo.socrata.com/resource/4334-bgaj.csv", app_token = token) -nrow(earthquakesDataFrame) ``` - -### Example: List all datasets on portal -```r -allSitesDataFrame <- ls.socrata("https://soda.demo.socrata.com") -nrow(allSitesDataFrame) # Number of datasets -allSitesDataFrame$title # Names of each dataset +library(devtools) +devtools::install_github("Chicago/RSocrata") ``` -### Issues +**OR** -Please report issues, request enhancements or fork us at the [City of Chicago github](https://github.com/Chicago/RSocrata/issues). +from [CRAN](http://cran.r-project.org/package=RSocrata): -### Contributing +``` +install.packages("RSocrata") +``` + +**Beware**: -If you would like to contribute to this project, please see the [contributing documentation](CONTRIBUTING.md) +In order to support `GeoJSON` (which is semi-optional), it is necessary to install [geojsonio](https://github.com/ropensci/geojsonio) correctly! +This depends on packages such as `rgdal` & `rgeos` (both on CRAN), which additionally on Linux you will need to install through `apt-get`: -### Change log +`sudo apt-get install libgdal1-dev libgdal-dev libgeos-c1 libproj-dev` -1.1 Add check for valid Socrata resource URL. Add check for supported download file format. 
Add support for Socrata short dates. +Then install both CRAN packages using: -1.2 Use comma-separated file format for Socrata downloads. +``` +install.packages(c("rgdal", "rgeos")) +``` -1.3 Added support for human-readable URL. +### Examples & Chanelog -1.4 Add json file format for Socrata downloads. Switch to RJSONIO rom rjson. +Look for examples in the [`vignette` folder](https://github.com/Chicago/RSocrata/blob/dev/vignettes/Examples.Rmd) and see `NEWS` in the root of this repository. -1.5 Several changes: -* Swapped ```jsonlite``` to ```RJSONIO``` -* Added handling for long and short dates -* Added unit test for reading private datasets +### Issues -1.5.1 Deprecated ```httr::guess_media()``` and implemented ```httr::guess_type()``` +**Please report issues**, request enhancements or fork us at the [City of Chicago github](https://github.com/Chicago/RSocrata/issues). -1.6.0 Several changes: -* New function, ```ls.socrata``` to list all datasets on a Socrata portal. -* New optional argument, ```app_token```, which lets users supply an API token while using ```read.socrata()``` to minimize throttling. -* Repairs a bug where ```read.socrata``` failed when reading in a date with a column, but there are null values in that column. +### Contributing -1.6.1 Bug fixes: -* Resolved potential [name collision issue](https://github.com/Chicago/RSocrata/issues/42) -* Cleaned-up documentation with contributor instructions +If you would like to contribute to this project, please see the [contributing documentation](CONTRIBUTING.md) and the [product roadmap](https://github.com/Chicago/RSocrata/wiki/Roadmap#planned-releases). 
diff --git a/RSocrata.Rproj b/RSocrata.Rproj index c3e4e80..c9f9fb2 100644 --- a/RSocrata.Rproj +++ b/RSocrata.Rproj @@ -9,10 +9,11 @@ UseSpacesForTab: Yes NumSpacesForTab: 2 Encoding: UTF-8 -RnwWeave: Sweave +RnwWeave: knitr LaTeX: pdfLaTeX BuildType: Package +PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source PackageCheckArgs: --as-cran PackageRoxygenize: rd,collate,namespace diff --git a/appveyor.yml b/appveyor.yml index 00265ab..acf5dcc 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -15,7 +15,6 @@ build_script: test_script: - Rscript -e "install.packages('roxygen2', repos='http://cran.us.r-project.org'); library(roxygen2); roxygen2::roxygenize(package.dir='.', roclets=c('rd', 'collate', 'namespace'))" - travis-tool.sh run_tests - - Rscript -e "source('R/tests/testRSocrata.R'); runAllTestsCI()" on_failure: - travis-tool.sh dump_logs diff --git a/man/getMetadata.Rd b/man/getMetadata.Rd new file mode 100644 index 0000000..8fdc2f4 --- /dev/null +++ b/man/getMetadata.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/metadata.R +\name{getMetadata} +\alias{getMetadata} +\title{Return metadata about a Socrata dataset} +\source{ +\url{http://stackoverflow.com/a/29782941} +} +\usage{ +getMetadata(url = "") +} +\arguments{ +\item{url}{- A Socrata resource URL, or a Socrata "human-friendly" URL!} +} +\value{ +a list (!) containing a number of rows & columns and a data frame of metadata +} +\description{ +This function returns metadata about a dataset. Generally, such metadata can be accessed +with browser at \code{http://DOMAIN/api/views/FOUR-FOUR/rows.json} or +\code{http://DOMAIN/api/views/FOUR-FOUR/columns.json}, which is used here. 
+} +\examples{ +\dontrun{ +gM1 <- getMetadata(url = "http://data.cityofchicago.org/resource/y93d-d9e3.json") +gM3 <- getMetadata(url = "https://data.cityofchicago.org/resource/6zsd-86xi") +gM2 <- getMetadata(url = "https://data.cityofboston.gov/resource/awu8-dc52") +} + +} +\author{ +John Malc \email{cincenko@outlook.com} +} + diff --git a/man/isFourByFour.Rd b/man/isFourByFour.Rd new file mode 100644 index 0000000..9ff3dc3 --- /dev/null +++ b/man/isFourByFour.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{isFourByFour} +\alias{isFourByFour} +\title{Checks the validity of the syntax for a potential Socrata dataset Unique Identifier, also known as a 4x4.} +\usage{ +isFourByFour(fourByFour = "") +} +\arguments{ +\item{fourByFour}{- a string; character vector of length one} +} +\value{ +TRUE if is valid Socrata unique identifier, FALSE otherwise +} +\description{ +Will check the validity of a potential dataset unique identifier +supported by Socrata. It will provide an exception if the syntax +does not align to Socrata unique identifiers. It only checks for +the validity of the syntax, but does not check if it actually exists. +} +\examples{ +isFourByFour(fourByFour = "4334-bgaj") +isFourByFour("433-bgaj") +isFourByFour(fourByFour = "4334-!gaj") +} +\author{ +Tom Schenk Jr \email{tom.schenk@cityofchicago.org} et al. +} + diff --git a/man/ls.socrata.Rd b/man/ls.socrata.Rd new file mode 100644 index 0000000..6e26ca8 --- /dev/null +++ b/man/ls.socrata.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/listDatasets.R +\name{ls.socrata} +\alias{ls.socrata} +\title{List datasets available from a Socrata domain} +\usage{ +ls.socrata(url = "") +} +\arguments{ +\item{url}{- A Socrata URL. This simply points to the site root.} +} +\value{ +an R data frame containing a listing of datasets along with +various metadata. 
+} +\description{ +List datasets available from a Socrata domain +} +\note{ +URLs such as \code{"soda.demo.socrata.com"} are not supported +} +\examples{ +df <- ls.socrata(url = "http://soda.demo.socrata.com") +## df.ny <- ls.socrata("https://data.ny.gov/") + +} +\author{ +Peter Schmiedeskamp \email{pschmied@uw.edu} +} + diff --git a/man/read.socrata.Rd b/man/read.socrata.Rd new file mode 100644 index 0000000..dafb166 --- /dev/null +++ b/man/read.socrata.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/returnData.R +\name{read.socrata} +\alias{read.socrata} +\title{Get a full Socrata data set as an R data frame} +\usage{ +read.socrata(url = NULL, app_token = NULL, limit = 50000, domain = NULL, + fourByFour = NULL, query = NULL) +} +\arguments{ +\item{url}{- A Socrata resource URL, or a Socrata "human-friendly" URL, +or Socrata Open Data Application Program Interface (SODA) query +requesting a comma-separated download format (.json suffix), +May include SoQL parameters, and it is now assumed to include SODA \code{limit} +& \code{offset} parameters. +Either use a compelete URL or use parameters below to construct your URL.} + +\item{app_token}{- a (non-required) string; SODA API token can be used to query the data +portal \url{http://dev.socrata.com/consumers/getting-started.html}} + +\item{limit}{- defaults to the max of 50000. See \url{http://dev.socrata.com/docs/paging.html}.} + +\item{domain}{- A Socrata domain, e.g \url{http://data.cityofchicago.org}} + +\item{fourByFour}{- a unique 4x4 identifier, e.g. "ydr8-5enu". See more \code{\link{isFourByFour}}} + +\item{query}{- Based on query language called the "Socrata Query Language" ("SoQL"), see +\url{http://dev.socrata.com/docs/queries.html}.} +} +\description{ +Manages throttling and POSIX date-time conversions. We support only .json suffix. 
+} +\examples{ +\dontrun{ +df_1 <- read.socrata(url = "http://soda.demo.socrata.com/resource/4334-bgaj.csv") +df_2 <- read.socrata(domain = "http://data.cityofchicago.org/", fourByFour = "ydr8-5enu") +df_3 <- read.socrata(url = "http://data.cityofchicago.org/resource/ydr8-5enu.json") +} +} +\author{ +Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@cityofchicago.org} +} + diff --git a/man/read.socrataGEO.Rd b/man/read.socrataGEO.Rd new file mode 100644 index 0000000..b5d1899 --- /dev/null +++ b/man/read.socrataGEO.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/returnData.R +\name{read.socrataGEO} +\alias{read.socrataGEO} +\title{Download GeoJSON data using geojsonio package} +\usage{ +read.socrataGEO(url = "", ...) +} +\arguments{ +\item{url}{- A Socrata resource URL, requiring a .geojson suffix.} + +\item{...}{- other arguments from \link{geojsonio} package for geojson_read method} +} +\value{ +Returns a list, which is the default option here. +} +\description{ +Download GeoJSON data using geojsonio package +} +\examples{ +\dontrun{ +df_geo <- read.socrataGEO(url = "https://data.cityofchicago.org/resource/6zsd-86xi.geojson") +} + +} + diff --git a/man/validateUrl.Rd b/man/validateUrl.Rd new file mode 100644 index 0000000..62ed2d7 --- /dev/null +++ b/man/validateUrl.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validateURL.R +\name{validateUrl} +\alias{validateUrl} +\title{Check if the URL is a valid one and supported by RSocrata.} +\usage{ +validateUrl(url = "") +} +\arguments{ +\item{url}{- a string; character vector of length one} +} +\value{ +a valid URL used for downloading data +} +\description{ +Will convert a human-readable URL to a valid REST API call +supported by Socrata. It will accept a valid API URL if provided +by users and will also convert a human-readable URL to a valid API +URL. 
Will accept queries with optional API token as a separate +argument or will also accept API token in the URL query. Will +resolve conflicting API token by deferring to original URL. +} +\examples{ +# Returns FALSE +## validateUrl(url = "a.fake.url.being.tested") +# Returns TRUE +validateUrl(url = "https://soda.demo.socrata.com/resource/4334-bgaj") +# Returns TRUE +validateUrl(url = "https://soda.demo.socrata.com/resource/4334-bgaj.json") +# Returns TRUE +validateUrl(url = "https://soda.demo.socrata.com/resource/4334-bgaj.csv") +# Returns TRUE +validateUrl(url = "http://soda.demo.socrata.com/resource/4334-bgaj.json") +# Returns TRUE +validateUrl(url = "http://soda.demo.socrata.com/resource/4334-bgaj.xml") +# Returns TRUE +validateUrl(url = "https://data.cityofchicago.org/resource/6zsd-86xi.geojson") +# Returns TRUE +validateUrl(url = "http://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj.csv") + +} +\author{ +Tom Schenk Jr \email{tom.schenk@cityofchicago.org} et al. +} + diff --git a/tests/testthat.R b/tests/testthat.R index 8604f6c..189e139 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,4 +1,4 @@ library(testthat) library(RSocrata) -test_check("RSocrata") +test_check("RSocrata") \ No newline at end of file diff --git a/tests/testthat/test-all.R b/tests/testthat/test-all.R index add785d..5d8f84d 100644 --- a/tests/testthat/test-all.R +++ b/tests/testthat/test-all.R @@ -1,203 +1,74 @@ library(testthat) library(RSocrata) -library(curl) -library(httr) -library(jsonlite) -library(mime) - -context("posixify function") - -test_that("posixify returns Long format", { - dt <- posixify("09/14/2012 10:38:01 PM") - expect_equal("POSIXlt", class(dt)[1], "first data type of a date") - expect_equal(2012, dt$year + 1900, "year") - expect_equal(9, dt$mon + 1, "month") - expect_equal(14, dt$mday, "day") - expect_equal(22, dt$hour, "hours") - expect_equal(38, dt$min, "minutes") - expect_equal(1, dt$sec, "seconds") -}) - - -test_that("posixify returns Short 
format", { - dt <- posixify("09/14/2012") - expect_equal("POSIXlt", class(dt)[1], "first data type of a date") - expect_equal(2012, dt$year + 1900, "year") - expect_equal(9, dt$mon + 1, "month") - expect_equal(14, dt$mday, "day") - expect_equal(0, dt$hour, "hours") - expect_equal(0, dt$min, "minutes") - expect_equal(0, dt$sec, "seconds") -}) context("read Socrata") test_that("read Socrata CSV", { - df <- read.socrata('https://soda.demo.socrata.com/resource/4334-bgaj.csv') - expect_equal(1007, nrow(df), "rows") - expect_equal(9, ncol(df), "columns") + df <- read.socrata("https://soda.demo.socrata.com/resource/4334-bgaj.csv") + expect_equal(1007, nrow(df), label="rows") + expect_equal(11, ncol(df), label="columns") }) -test_that("read Socrata JSON", { - df <- read.socrata('https://soda.demo.socrata.com/resource/4334-bgaj.json') - expect_equal(1007, nrow(df), "rows") - expect_equal(11, ncol(df), "columns") +test_that("read Socrata JSON with HTTP", { + df <- read.socrata(url = "https://soda.demo.socrata.com/resource/4334-bgaj.json") + expect_equal(1007, nrow(df), label="rows") + expect_equal(11, ncol(df), label="columns") }) test_that("read Socrata No Scheme", { - expect_error(read.socrata('soda.demo.socrata.com/resource/4334-bgaj.csv')) + expect_error(read.socrata("soda.demo.socrata.com/resource/4334-bgaj.csv")) }) -test_that("readSoQL", { - df <- read.socrata('http://soda.demo.socrata.com/resource/4334-bgaj.csv?$select=region') - expect_equal(1007, nrow(df), "rows") - expect_equal(1, ncol(df), "columns") +test_that("read SoQL", { + df <- read.socrata("http://soda.demo.socrata.com/resource/4334-bgaj.json", query = "region") + expect_equal(1007, nrow(df), label="rows") + expect_equal(11, ncol(df), label="columns") }) -test_that("readSoQLColumnNotFound (will fail)", { +test_that("read SoQL Column Not Found (will fail)", { # SoQL API uses field names, not human names - expect_error(read.socrata('http://soda.demo.socrata.com/resource/4334-bgaj.csv?$select=Region')) + 
expect_message(expect_error(read.socrata("http://soda.demo.socrata.com/resource/4334-bgaj.csv?$select=Region"))) }) test_that("URL is private (Unauthorized) (will fail)", { - expect_error(read.socrata('http://data.cityofchicago.org/resource/j8vp-2qpg.json')) + expect_error(read.socrata("http://data.cityofchicago.org/resource/j8vp-2qpg.json")) }) -test_that("readSocrataHumanReadable", { +test_that("read human-readable Socrata URL", { df <- read.socrata('https://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj') - expect_equal(1007, nrow(df), "rows") - expect_equal(9, ncol(df), "columns") + expect_equal(1007, nrow(df), label="rows") + expect_equal(11, ncol(df), label="columns") }) test_that("format is not supported", { # Unsupported data formats - expect_error(read.socrata('http://soda.demo.socrata.com/resource/4334-bgaj.xml')) -}) - -context("Socrata Calendar") - -test_that("Calendar Date Long", { - df <- read.socrata('http://soda.demo.socrata.com/resource/4334-bgaj.csv') - dt <- df$Datetime[1] # "2012-09-14 22:38:01" - expect_equal("POSIXlt", class(dt)[1], "data type of a date") - expect_equal(2012, dt$year + 1900, "year") - expect_equal(9, dt$mon + 1, "month") - expect_equal(14, dt$mday, "day") - expect_equal(22, dt$hour, "hours") - expect_equal(38, dt$min, "minutes") - expect_equal(1, dt$sec, "seconds") -}) - -test_that("Calendar Date Short", { - df <- read.socrata('http://data.cityofchicago.org/resource/y93d-d9e3.csv?$order=debarment_date') - dt <- df$DEBARMENT.DATE[1] # "05/21/1981" - expect_equal("POSIXlt", class(dt)[1], "data type of a date") - expect_equal(81, dt$year, "year") - expect_equal(5, dt$mon + 1, "month") - expect_equal(21, dt$mday, "day") - expect_equal(0, dt$hour, "hours") - expect_equal(0, dt$min, "minutes") - expect_equal(0, dt$sec, "seconds") -}) - -context("Checks the validity of 4x4") - -test_that("is 4x4", { - expect_true(isFourByFour("4334-bgaj"), "ok") - expect_false(isFourByFour("4334c-bgajc"), "11 characters instead of 
9") - expect_false(isFourByFour("433-bga"), "7 characters instead of 9") - expect_false(isFourByFour("433-bgaj"), "3 characters before dash instead of 4") - expect_false(isFourByFour("4334-!gaj"), "non-alphanumeric character") -}) - - -test_that("is 4x4 URL", { - expect_error(read.socrata("https://soda.demo.socrata.com/api/views/4334c-bgajc"), "11 characters instead of 9") - expect_error(read.socrata("https://soda.demo.socrata.com/api/views/433-bga"), "7 characters instead of 9") - expect_error(read.socrata("https://soda.demo.socrata.com/api/views/433-bgaj"), "3 characters before dash instead of 4") - expect_error(read.socrata("https://soda.demo.socrata.com/api/views/4334-!gaj"), "non-alphanumeric character") -}) - -test_that("Invalid URL", { - expect_error(read.socrata("a.fake.url.being.tested"), "invalid url") -}) - -context("Test Socrata with Token") - -test_that("CSV with Token", { - df <- read.socrata('https://soda.demo.socrata.com/resource/4334-bgaj.csv', app_token="ew2rEMuESuzWPqMkyPfOSGJgE") - expect_equal(1007, nrow(df), "rows") - expect_equal(9, ncol(df), "columns") -}) - - -test_that("readSocrataHumanReadableToken", { - df <- read.socrata('https://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj', app_token="ew2rEMuESuzWPqMkyPfOSGJgE") - expect_equal(1007, nrow(df), "rows") - expect_equal(9, ncol(df), "columns") -}) - -test_that("API Conflict", { - df <- read.socrata('https://soda.demo.socrata.com/resource/4334-bgaj.csv?$$app_token=ew2rEMuESuzWPqMkyPfOSGJgE', app_token="ew2rEMuESuzWPqMkyPfOSUSER") - expect_equal(1007, nrow(df), "rows") - expect_equal(9, ncol(df), "columns") - # Check that function is calling the API token specified in url - expect_true(substr(validateUrl('https://soda.demo.socrata.com/resource/4334-bgaj.csv?$$app_token=ew2rEMuESuzWPqMkyPfOSGJgE', app_token="ew2rEMuESuzWPqMkyPfOSUSER"), 70, 94)=="ew2rEMuESuzWPqMkyPfOSGJgE") -}) - -test_that("readAPIConflictHumanReadable", { - df <- 
read.socrata('https://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj?$$app_token=ew2rEMuESuzWPqMkyPfOSGJgE', app_token="ew2rEMuESuzWPqMkyPfOSUSER") - expect_equal(1007, nrow(df), "rows") - expect_equal(9, ncol(df), "columns") - # Check that function is calling the API token specified in url - expect_true(substr(validateUrl('https://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj?$$app_token=ew2rEMuESuzWPqMkyPfOSGJgE', app_token="ew2rEMuESuzWPqMkyPfOSUSER"), 70, 94)=="ew2rEMuESuzWPqMkyPfOSGJgE") -}) - -test_that("incorrect API Query", { - # The query below is missing a $ before app_token. - expect_error(read.socrata("https://soda.demo.socrata.com/resource/4334-bgaj.csv?$app_token=ew2rEMuESuzWPqMkyPfOSGJgE")) - # Check that it was only because of missing $ - df <- read.socrata("https://soda.demo.socrata.com/resource/4334-bgaj.csv?$$app_token=ew2rEMuESuzWPqMkyPfOSGJgE") - expect_equal(1007, nrow(df), "rows") - expect_equal(9, ncol(df), "columns") -}) - - -test_that("incorrect API Query Human Readable", { - # The query below is missing a $ before app_token. 
- expect_error(read.socrata("https://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj?$app_token=ew2rEMuESuzWPqMkyPfOSGJgE")) - # Check that it was only because of missing $ - df <- read.socrata("https://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj?$$app_token=ew2rEMuESuzWPqMkyPfOSGJgE") - expect_equal(1007, nrow(df), "rows") - expect_equal(9, ncol(df), "columns") -}) - -test_that("List datasets available from a Socrata domain", { - # Makes some potentially erroneous assumptions about availability - # of soda.demo.socrata.com - df <- ls.socrata("https://soda.demo.socrata.com") - expect_equal(TRUE, nrow(df) > 0) - # Test comparing columns against data.json specifications: - # https://project-open-data.cio.gov/v1.1/schema/ - core_names <- as.character(c("issued", "modified", "keyword", "landingPage", "theme", - "title", "accessLevel", "distribution", "description", - "identifier", "publisher", "contactPoint", "license")) - expect_equal(as.logical(rep(TRUE, length(core_names))), core_names %in% names(df)) - # Check that all names in data.json are accounted for in ls.socrata return - expect_equal(as.logical(rep(TRUE, length(names(df)))), names(df) %in% c(core_names)) -}) - - -test_that("Invalid Socrata domain", { - expect_error(read.socrata("a.fake.url.being.tested"), "invalid url") -}) - - - - - - - - - - + expect_message(read.socrata(url="http://soda.demo.socrata.com/resource/4334-bgaj.xml"), + "BEWARE: Your suffix is no longer supported. 
Thus, we will automatically replace it with JSON.") +}) + +# https://github.com/Chicago/RSocrata/issues/19 +test_that("A JSON test with uneven row lengths", { + data <- read.socrata(url = "https://data.cityofchicago.org/resource/kn9c-c2s2.json") + awqe <- read.socrata(url = "http://data.ny.gov/resource/eda3-in2f.json") + # df_manual3 <- read.socrata(url="http://data.cityofchicago.org/resource/ydr8-5enu.json") + expect_more_than(ncol(awqe), 26) + expect_more_than(ncol(data), 8) +}) + +# https://github.com/Chicago/RSocrata/issues/14 +test_that("RSocrata hangs when passing along SoDA queries with small number of results ", { + skip_on_cran() + skip_on_travis() + skip_on_appveyor() + skip("Test works, but is just to large & long to run it") + + df500 <- read.socrata(url = "https://data.cityofchicago.org/resource/xzkq-xp2w.json", limit =500) + df250 <- read.socrata("https://data.cityofchicago.org/resource/xzkq-xp2w.json", limit =250) + df100 <- read.socrata("https://data.cityofchicago.org/resource/xzkq-xp2w.json", limit =100) + df50 <- read.socrata("https://data.cityofchicago.org/resource/xzkq-xp2w.json", limit =50) + df25 <- read.socrata("https://data.cityofchicago.org/resource/xzkq-xp2w.json", limit =25) + df10 <- read.socrata("https://data.cityofchicago.org/resource/xzkq-xp2w.json", limit =10) + df5 <- read.socrata("https://data.cityofchicago.org/resource/xzkq-xp2w.json", limit =5) + df1 <- read.socrata("https://data.cityofchicago.org/resource/xzkq-xp2w.json", limit =1) + df <- read.socrata("https://data.cityofchicago.org/resource/xzkq-xp2w.json") +}) \ No newline at end of file diff --git a/tests/testthat/test-dateTime.R b/tests/testthat/test-dateTime.R new file mode 100644 index 0000000..5bad71b --- /dev/null +++ b/tests/testthat/test-dateTime.R @@ -0,0 +1,76 @@ +library(testthat) +library(RSocrata) + +# http://www.noamross.net/blog/2014/2/10/using-times-and-dates-in-r---presentation-code.html + +context("Test posixify function") + +test_that("posixify returns Long 
format", { + dt <- posixify("09/14/2012 10:38:01 PM") + expect_equal("POSIXlt", class(dt)[1], label = "first data type of a date") + expect_equal(2012, dt$year + 1900, label = "year") + expect_equal(9, dt$mon + 1, label = "month") + expect_equal(14, dt$mday, label = "day") + expect_equal(22, dt$hour, label = "hours") + expect_equal(38, dt$min, label = "minutes") + expect_equal(1, dt$sec, label = "seconds") +}) + +test_that("posixify returns Short format", { + dt <- posixify("09/14/2012") + expect_equal("POSIXlt", class(dt)[1], label = "first data type of a date") + expect_equal(2012, dt$year + 1900, label = "year") + expect_equal(9, dt$mon + 1, label = "month") + expect_equal(14, dt$mday, label = "day") + expect_equal(0, dt$hour, label = "hours") + expect_equal(0, dt$min, label = "minutes") + expect_equal(0, dt$sec, label = "seconds") +}) + +test_that("posixify new Floating Timestamp format", { + dt <- posixify("2014-10-13T23:25:47") + expect_equal("POSIXlt", class(dt)[1], label = "first data type of a date") + expect_equal(2014, dt$year + 1900, label = "year") + expect_equal(25, dt$min, label = "minutes") + expect_equal(47, dt$sec, label = "seconds") +}) + +test_that("NA datetime in source (JSON)", { + # https://github.com/Chicago/RSocrata/issues/24 + # https://github.com/Chicago/RSocrata/issues/27 + skip("Dataframe is just to big, over 600k. 
rows") + skip_on_cran() + skip_on_travis() + df <- read.socrata(url = "https://data.cityofboston.gov/resource/awu8-dc52.json") + df_met <- getMetadata("https://data.cityofboston.gov/City-Services/311-Service-Requests/awu8-dc52") + # expect_equal(sum(is.na(df$target_dt)), 194) + expect_more_than(ncol(df_met[[2]]), 10) +}) + + +context("Socrata Calendar") + +test_that("Calendar Date Long", { + df <- read.socrata(url = "https://soda.demo.socrata.com/resource/4334-bgaj.json") + dt <- df$datetime[1] # "2012-09-14 22:38:01" + expect_equal("POSIXlt", class(dt)[1], label = "data type of a date") + expect_equal(2012, dt$year + 1900, label = "year") + expect_equal(9, dt$mon + 1, label = "month") + expect_equal(14, dt$mday, label = "day") + expect_equal(22, dt$hour, label = "hours") + expect_equal(38, dt$min, label = "minutes") + expect_equal(1, dt$sec, label = "seconds") +}) + +test_that("Calendar Date Short", { + df <- read.socrata(url = "http://data.cityofchicago.org/resource/y93d-d9e3.json") + df <- df[with(df, order(debarment_date)), ] + dt <- df$debarment_date[1] # "05/21/1981" + expect_equal("POSIXlt", class(dt)[1], label = "data type of a date") + expect_equal(81, dt$year, label = "year") + expect_equal(5, dt$mon + 1, label = "month") + expect_equal(21, dt$mday, label = "day") + expect_equal(0, dt$hour, label = "hours") + expect_equal(0, dt$min, label = "minutes") + expect_equal(0, dt$sec, label = "seconds") +}) \ No newline at end of file diff --git a/tests/testthat/test-fourByFour.R b/tests/testthat/test-fourByFour.R new file mode 100644 index 0000000..e8148d3 --- /dev/null +++ b/tests/testthat/test-fourByFour.R @@ -0,0 +1,20 @@ +library(testthat) +library(RSocrata) + +context("Checks the validity of 4x4") + +test_that("is 4x4", { + expect_true(isFourByFour("4334-bgaj"), label="ok") + expect_false(isFourByFour("4334c-bgajc"), label="11 characters instead of 9") + expect_false(isFourByFour("433-bga"), label="7 characters instead of 9") + 
expect_false(isFourByFour("433-bgaj"), label="3 characters before dash instead of 4") + expect_false(isFourByFour("4334-!gaj"), label="non-alphanumeric character") +}) + + +test_that("URLs contain 4x4 format", { + expect_error(read.socrata("https://soda.demo.socrata.com/api/views/4334c-bgajc")) + expect_error(read.socrata("https://soda.demo.socrata.com/api/views/433-bga")) + expect_error(read.socrata("https://soda.demo.socrata.com/api/views/433-bgaj")) + expect_error(read.socrata("https://soda.demo.socrata.com/api/views/4334-!gaj")) +}) diff --git a/tests/testthat/test-geo.R b/tests/testthat/test-geo.R new file mode 100644 index 0000000..25be62a --- /dev/null +++ b/tests/testthat/test-geo.R @@ -0,0 +1,9 @@ +library(testthat) +library(RSocrata) + +context("Geospatial JSON") + +test_that("fetches GeoJSON data", { + geodf <- read.socrataGEO("https://data.cityofchicago.org/resource/6zsd-86xi.geojson") + expect_equal(geodf$type, "FeatureCollection") +}) \ No newline at end of file diff --git a/tests/testthat/test-listDatasets.R b/tests/testthat/test-listDatasets.R new file mode 100644 index 0000000..d89efde --- /dev/null +++ b/tests/testthat/test-listDatasets.R @@ -0,0 +1,28 @@ +library(testthat) +library(RSocrata) + +context("List datasets available from a Socrata domain") + +test_that("More than 0 datasets are available from a Socrata domain to download", { + # Makes some potentially erroneous assumptions about availability of soda.demo.socrata.com + + df <- ls.socrata("https://soda.demo.socrata.com") + df.ny <- ls.socrata("https://data.ny.gov/") + + expect_true(nrow(df) > 0) + expect_true(nrow(df.ny) > 10) +}) + + +test_that("Test comparing columns against data.json specifications", { + # https://project-open-data.cio.gov/v1.1/schema/ + + df <- ls.socrata("https://soda.demo.socrata.com") + core_names <- c("issued", "modified", "keyword", "landingPage", "theme", "title", + "accessLevel", "distribution", "description", "identifier", + "publisher", "contactPoint", 
"license") + + expect_equal(rep(TRUE, length(core_names)), core_names %in% names(df)) + # Check that all names in data.json are accounted for in ls.socrata return + expect_equal(rep(TRUE, length(names(df))), names(df) %in% c(core_names)) +}) diff --git a/tests/testthat/test-metadata.R b/tests/testthat/test-metadata.R new file mode 100644 index 0000000..d160ab2 --- /dev/null +++ b/tests/testthat/test-metadata.R @@ -0,0 +1,14 @@ +library(testthat) +library(RSocrata) + +context("Checks metadata") + +test_that("it returns some number of rows", { + nr <- getMetadata(url = "http://data.cityofchicago.org/resource/y93d-d9e3.json") + expect_more_than(nr[[1]], 141) + nr2 <- getMetadata(url = "https://data.cityofchicago.org/resource/6zsd-86xi.json") + expect_more_than(nr2[[1]], 5878398) +}) + + + diff --git a/tests/testthat/test-token.R b/tests/testthat/test-token.R new file mode 100644 index 0000000..fb0bca5 --- /dev/null +++ b/tests/testthat/test-token.R @@ -0,0 +1,56 @@ +library(testthat) +library(RSocrata) + +context("Test Socrata with Token") + +test_that("CSV with Token", { + df <- read.socrata(url = "https://soda.demo.socrata.com/resource/4334-bgaj.csv", + app_token="ew2rEMuESuzWPqMkyPfOSGJgE") + + expect_equal(1007, nrow(df), label="rows") + expect_equal(11, ncol(df), label="columns") +}) + + +test_that("it will read Socrata Human Readable URL with Token", { + df <- read.socrata("https://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj", + app_token="ew2rEMuESuzWPqMkyPfOSGJgE") + + expect_equal(1007, nrow(df), label="rows") + expect_equal(11, ncol(df), label="columns") +}) + +test_that("API Conflict", { + expect_error(read.socrata("https://soda.demo.socrata.com/resource/4334-bgaj.csv?$$app_token=ew2rEMuESuzWPqMkyPfOSGJgE", + app_token="ew2rEMuESuzWPqMkyPfOSUSER")) + + # expect_equal(1007, nrow(df), label="rows") + # expect_equal(9, ncol(df), label="columns") +}) + +test_that("read API Conflict HumanReadable", { + 
expect_error(read.socrata("https://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj?$$app_token=ew2rEMuESuzWPqMkyPfOSGJgE", + app_token="ew2rEMuESuzWPqMkyPfOSUSER")) + + # expect_equal(1007, nrow(df), label="rows") + # expect_equal(11, ncol(df), label="columns") +}) + +test_that("incorrect API Query", { + # The query below is missing a $ before app_token. + expect_error(read.socrata("https://soda.demo.socrata.com/resource/4334-bgaj.csv?$app_token=ew2rEMuESuzWPqMkyPfOSGJgE")) + + df <- read.socrata("https://soda.demo.socrata.com/resource/4334-bgaj.csv", app_token= "ew2rEMuESuzWPqMkyPfOSGJgE") + expect_equal(1007, nrow(df), label="rows") + expect_equal(11, ncol(df), label="columns") +}) + + +test_that("incorrect API Query Human Readable", { + # The query below is missing a $ before app_token. + expect_error(read.socrata("https://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj?$app_token=ew2rEMuESuzWPqMkyPfOSGJgE")) + + df <- read.socrata("https://soda.demo.socrata.com/resource/4334-bgaj", app_token= "ew2rEMuESuzWPqMkyPfOSGJgE") + expect_equal(1007, nrow(df), label = "rows") + expect_equal(11, ncol(df), label = "columns") +}) \ No newline at end of file diff --git a/tests/testthat/test-urlBreakout.R b/tests/testthat/test-urlBreakout.R new file mode 100644 index 0000000..4f95b79 --- /dev/null +++ b/tests/testthat/test-urlBreakout.R @@ -0,0 +1,33 @@ +library(testthat) +library(RSocrata) + +context("Parsimonious URL calls") + +test_that("read from Socrata domain", { + df <- read.socrata(domain = "http://soda.demo.socrata.com", + fourByFour = "4334-bgaj") + expect_equal(1007, nrow(df), label="rows") + expect_equal(11, ncol(df), label="columns") +}) + +test_that("read from Socrata domain, trailing slash", { + df <- read.socrata(domain = "http://soda.demo.socrata.com/", + fourByFour = "4334-bgaj") + expect_equal(1007, nrow(df), label="rows") + expect_equal(11, ncol(df), label="columns") +}) + +test_that("read from Socrata domain, SSL", { + df 
<- read.socrata(domain = "https://soda.demo.socrata.com", + fourByFour = "4334-bgaj") + expect_equal(1007, nrow(df), label="rows") + expect_equal(11, ncol(df), label="columns") +}) + +test_that("read from Socrata domain, SSL with trailing slash", { + # Must use https:// here; the plain-http trailing-slash case is covered by its own test. + df <- read.socrata(domain = "https://soda.demo.socrata.com/", + fourByFour = "4334-bgaj") + expect_equal(1007, nrow(df), label="rows") + expect_equal(11, ncol(df), label="columns") +}) \ No newline at end of file diff --git a/tests/testthat/test-validURL.R b/tests/testthat/test-validURL.R new file mode 100644 index 0000000..3626a91 --- /dev/null +++ b/tests/testthat/test-validURL.R @@ -0,0 +1,33 @@ +library(testthat) +library(RSocrata) + +context("Validate URL") + +test_that("Invalid URL", { + expect_error(read.socrata("a.fake.url.being.tested")) +}) + +test_that("human readable URLs are not supported", { + expect_output(validateUrl("https://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj"), + "https://soda.demo.socrata.com/resource/4334-bgaj.json") +}) + +test_that("http will get replaced with HTTPS and JSON", { + expect_output(validateUrl("http://soda.demo.socrata.com/resource/4334-bgaj.csv"), + "https://soda.demo.socrata.com/resource/4334-bgaj.json") +}) + +test_that("URL with no suffix will get JSON one", { + expect_output(validateUrl(url = "https://soda.demo.socrata.com/resource/4334-bgaj"), + "https://soda.demo.socrata.com/resource/4334-bgaj.json") +}) + +test_that("nothing happens with URL", { + expect_output(validateUrl(url = "https://soda.demo.socrata.com/resource/4334-bgaj.json"), + "https://soda.demo.socrata.com/resource/4334-bgaj.json") +}) + +test_that("CSV will get replaced with JSON", { + expect_output(validateUrl(url = "https://soda.demo.socrata.com/resource/4334-bgaj.csv"), + "https://soda.demo.socrata.com/resource/4334-bgaj.json") +}) \ No newline at end of file