From a577adf9eb14a86c29c1ea274b1022f2022c3c13 Mon Sep 17 00:00:00 2001 From: Malcolm Morgan Date: Sat, 9 Sep 2023 00:25:12 +0100 Subject: [PATCH 1/3] memory tweaks tweaks to reduce memory use, still WIP --- R/batch_read.R | 27 +++++++++++++-------------- R/json2sf_cs.R | 14 ++++++++++---- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/R/batch_read.R b/R/batch_read.R index c0b13c5..c39381e 100644 --- a/R/batch_read.R +++ b/R/batch_read.R @@ -12,13 +12,12 @@ batch_read = function( cols_to_keep = c( "name", # not used currently but could be handy "distances", - "gradient_smooth", + "elevations", "quietness" ) ) { message("Reading in the following file:\n", file) res = readr::read_csv(file, show_col_types = FALSE) - res$route_number = seq(nrow(res)) n_char = nchar(res$json) n_char[is.na(n_char)] = 0 if(all(n_char == 0)) { @@ -31,8 +30,8 @@ batch_read = function( res = res[-which_min_ncar, ] } - res_df = json2sf_cs(results_raw = res$json, - id = res$route_number, + res = json2sf_cs(results_raw = res$json, + id = seq(nrow(res)), segments = segments, cols_to_keep = cols_to_keep ) @@ -52,33 +51,33 @@ batch_read = function( } for(i in seq(1, length(nms))){ - if(nms[i] %in% names(res_df$routes)){ - res_df$routes[[nms[i]]] = as.numeric(res_df$routes[[nms[i]]]) + if(nms[i] %in% names(res$routes)){ + res$routes[[nms[i]]] = as.numeric(res$routes[[nms[i]]]) } } - names(res_df$routes)[names(res_df$routes) == "id"] = "route_number" + names(res$routes)[names(res$routes) == "id"] = "route_number" for(i in seq(1, length(nms))){ - if(nms[i] %in% names(res_df$segments)){ - res_df$segments[[nms[i]]] = as.numeric(res_df$segments[[nms[i]]]) + if(nms[i] %in% names(res$segments)){ + res$segments[[nms[i]]] = as.numeric(res$segments[[nms[i]]]) } } - names(res_df$segments)[names(res_df$segments) == "id"] = "route_number" + names(res$segments)[names(res$segments) == "id"] = "route_number" } else { for(i in seq(1, length(nms))){ - if(nms[i] %in% names(res_df)){ - res_df[[nms[i]]] = as.numeric(res_df[[nms[i]]]) + if(nms[i] %in% names(res)){ + res[[nms[i]]] = as.numeric(res[[nms[i]]]) } } - names(res_df)[names(res_df) == "id"] = "route_number" + names(res)[names(res) == "id"] = "route_number" } - res_df + res } diff --git a/R/json2sf_cs.R b/R/json2sf_cs.R index 4ca0839..2dd8cb3 100644 --- a/R/json2sf_cs.R +++ b/R/json2sf_cs.R @@ -61,8 +61,7 @@ json2sf_cs = function( "start_latitude", "finish_longitude", "finish_latitude", "crow_fly_distance", "event", "whence", "speed", "itinerary", "plan", "note", "length", "west", "south", "east", "north", "leaving", "arriving", "grammesCO2saved", - "calories", "edition", "gradient_segment", "elevation_change", - "gradient_smooth") + "calories", "edition") ){ # Support both @@ -81,6 +80,7 @@ json2sf_cs = function( # browser() results = RcppSimdJson::fparse(results_raw, query = "/marker", query_error_ok = TRUE, always_list = TRUE) results_error = RcppSimdJson::fparse(results_raw, query = "/error", query_error_ok = TRUE, always_list = TRUE) + rm(results_raw) results_error = unlist(results_error, use.names = FALSE) if(length(results_error) > 0){ message(length(results_error)," routes returned errors. Unique error messages are:\n") @@ -96,8 +96,14 @@ json2sf_cs = function( if(!is.null(id)){ names(results) = as.character(id) } - # TODO: subset to keep only columns of relevance - results = lapply(results, data.table::rbindlist, fill = TRUE) + + cols_to_keep2 = unique(c(cols_to_keep,"type","start","points")) + + results = lapply(results, function(x){ + x = lapply(x, function(y){y[cols_to_keep2]}) + data.table::rbindlist(x, fill = TRUE) + }) + results = data.table::rbindlist(results, idcol = "id", fill = TRUE) if(nrow(results) == 0){ stop("No valid results returned") From 56c5d15e29048d52d72082ac18d14e43742a6d16 Mon Sep 17 00:00:00 2001 From: Malcolm Morgan Date: Sun, 10 Sep 2023 10:10:31 +0100 Subject: [PATCH 2/3] bug fixes and speedup --- R/batch_read.R | 12 +++++++----- R/json2sf_cs.R | 14 +++++++------- R/utils.R | 4 ++-- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/R/batch_read.R b/R/batch_read.R index c39381e..2236fb8 100644 --- a/R/batch_read.R +++ b/R/batch_read.R @@ -17,8 +17,10 @@ batch_read = function( ) ) { message("Reading in the following file:\n", file) - res = readr::read_csv(file, show_col_types = FALSE) - n_char = nchar(res$json) + + res = data.table::fread(file, select = "json") + res = stringi::stri_replace_all_fixed(res$json, '""', '"', vectorize_all = FALSE) + n_char = nchar(res) n_char[is.na(n_char)] = 0 if(all(n_char == 0)) { stop("No routes returned: does CycleStreets operate where you requested data?") @@ -27,11 +29,11 @@ batch_read = function( if(min_nchar == 0) { which_min_ncar = which(n_char == 0) message("Removing NA routes: ", paste(which_min_ncar, collapse = " ")) - res = res[-which_min_ncar, ] + res = res[-which_min_ncar] } - res = json2sf_cs(results_raw = res$json, - id = seq(nrow(res)), + res = json2sf_cs(results_raw = res, + id = seq(length(res)), segments = segments, cols_to_keep = cols_to_keep ) diff --git a/R/json2sf_cs.R b/R/json2sf_cs.R index 2dd8cb3..667c46f 100644 --- a/R/json2sf_cs.R +++ b/R/json2sf_cs.R @@ -78,9 +78,7 @@ json2sf_cs = function( } # browser() - results = RcppSimdJson::fparse(results_raw, query = "/marker", query_error_ok = TRUE, always_list = TRUE) results_error = RcppSimdJson::fparse(results_raw, query = "/error", query_error_ok = TRUE, always_list = TRUE) - rm(results_raw) results_error = unlist(results_error, use.names = FALSE) if(length(results_error) > 0){ message(length(results_error)," routes returned errors. Unique error messages are:\n") @@ -90,17 +88,18 @@ json2sf_cs = function( message(results_error$Freq[msgs],'x messages: "',results_error$results_error[msgs],'"\n') } } - + results = RcppSimdJson::fparse(results_raw, query = "/marker", query_error_ok = TRUE, always_list = TRUE) + #rm(results_raw) # Process Marker results = lapply(results, `[[`, "@attributes") if(!is.null(id)){ names(results) = as.character(id) } - cols_to_keep2 = unique(c(cols_to_keep,"type","start","points")) + cols_to_keep2 = unique(c(cols_to_keep,"type","start","points","coordinates")) results = lapply(results, function(x){ - x = lapply(x, function(y){y[cols_to_keep2]}) + x = lapply(x, function(y){y[names(y) %in% cols_to_keep2]}) data.table::rbindlist(x, fill = TRUE) }) @@ -147,6 +146,7 @@ cleanup_results <- function(x, cols_to_keep){ x = add_columns(x) x = sf::st_as_sf(x) x$SPECIALIDFORINTERNAL2 <- NULL - cols = cols_to_keep %in% names(x) - x[cols_to_keep] + cols_to_keep3 = unique(c(cols_to_keep,"gradient_segment","elevation_change","gradient_smooth")) + cols = cols_to_keep3 %in% names(x) + x[cols] } diff --git a/R/utils.R b/R/utils.R index 4e44144..57895d3 100644 --- a/R/utils.R +++ b/R/utils.R @@ -44,10 +44,10 @@ route_rolling_average = function(x, n = 3) { get_values = function(v, fun) { - sapply(v, function(x) fun(as.numeric(x))) + vapply(v, function(x) fun(as.numeric(x)), 1) } -extract_values = function(x) stringr::str_split(x, pattern = ",") +extract_values = function(x) stringi::stri_split_fixed(x, pattern = ",") get_mean = function(v) get_values(v, fun = mean) get_sum = function(v) get_values(v, fun = sum) get_min = function(v) get_values(v, fun = min) From f3f5571550824d4f875eff340e88113e150528f8 Mon Sep 17 00:00:00 2001 From: Malcolm Morgan Date: Sun, 10 Sep 2023 10:42:56 +0100 Subject: [PATCH 3/3] Fix warnings --- DESCRIPTION | 1 - R/json2sf_cs.R | 2 +- man/batch.Rd | 4 +++- man/json2sf_cs.Rd | 4 +--- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 53f9ecd..56eef7c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -39,7 +39,6 @@ Imports: magrittr, progressr, RcppSimdJson, - readr, sf, stringr, stringi diff --git a/R/json2sf_cs.R b/R/json2sf_cs.R index 667c46f..26a5696 100644 --- a/R/json2sf_cs.R +++ b/R/json2sf_cs.R @@ -96,7 +96,7 @@ json2sf_cs = function( names(results) = as.character(id) } - cols_to_keep2 = unique(c(cols_to_keep,"type","start","points","coordinates")) + cols_to_keep2 = unique(c(cols_to_keep,"type","start","points","coordinates", "distances","elevations")) results = lapply(results, function(x){ x = lapply(x, function(y){y[names(y) %in% cols_to_keep2]}) diff --git a/man/batch.Rd b/man/batch.Rd index 42bd264..97c5c54 100644 --- a/man/batch.Rd +++ b/man/batch.Rd @@ -128,7 +128,9 @@ routes_id = batch(desire_lines_huge, username = "robinlovelace", wait = FALSE) names(routes) plot(routes$geometry) plot(desire_lines$geometry, add = TRUE, col = "red") -routes = batch(desire_lines, username = "robinlovelace", wait_time = 5) +routes = batch(desire_lines, username = "robinlovelace", wait_time = 5, segments = FALSE) +segments = batch(desire_lines, username = "robinlovelace", wait_time = 5, segments = TRUE) +both = batch(desire_lines, username = "robinlovelace", wait_time = 5, segments = "both") # profvis::profvis(batch_read("test-data.csv.gz")) } } diff --git a/man/json2sf_cs.Rd b/man/json2sf_cs.Rd index 398045b..95f1c0c 100644 --- a/man/json2sf_cs.Rd +++ b/man/json2sf_cs.Rd @@ -18,9 +18,7 @@ json2sf_cs( "start_longitude", "start_latitude", "finish_longitude", "finish_latitude", "crow_fly_distance", "event", "whence", "speed", "itinerary", "plan", "note", "length", "west", "south", "east", "north", "leaving", "arriving", "grammesCO2saved", - "calories", "edition", "gradient_segment", - "elevation_change", - "gradient_smooth") + "calories", "edition") ) } \arguments{