From a577adf9eb14a86c29c1ea274b1022f2022c3c13 Mon Sep 17 00:00:00 2001
From: Malcolm Morgan <mem48@users.noreply.github.com>
Date: Sat, 9 Sep 2023 00:25:12 +0100
Subject: [PATCH 1/3] memory tweaks

tweaks to reduce memory use, still WIP
---
 R/batch_read.R | 27 +++++++++++++--------------
 R/json2sf_cs.R | 14 ++++++++++----
 2 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/R/batch_read.R b/R/batch_read.R
index c0b13c5..c39381e 100644
--- a/R/batch_read.R
+++ b/R/batch_read.R
@@ -12,13 +12,12 @@ batch_read = function(
     cols_to_keep = c(
       "name", # not used currently but could be handy
       "distances",
-      "gradient_smooth",
+      "elevations",
       "quietness"
     )
     ) {
   message("Reading in the following file:\n", file)
   res = readr::read_csv(file, show_col_types = FALSE)
-  res$route_number = seq(nrow(res))
   n_char = nchar(res$json)
   n_char[is.na(n_char)] = 0
   if(all(n_char == 0)) {
@@ -31,8 +30,8 @@ batch_read = function(
     res = res[-which_min_ncar, ]
   }
 
-  res_df = json2sf_cs(results_raw = res$json,
-                       id = res$route_number,
+  res = json2sf_cs(results_raw = res$json,
+                       id = seq(nrow(res)),
                        segments = segments,
                       cols_to_keep = cols_to_keep
                       )
@@ -52,33 +51,33 @@ batch_read = function(
     }
 
     for(i in seq(1, length(nms))){
-      if(nms[i] %in% names(res_df$routes)){
-        res_df$routes[[nms[i]]] = as.numeric(res_df$routes[[nms[i]]])
+      if(nms[i] %in% names(res$routes)){
+        res$routes[[nms[i]]] = as.numeric(res$routes[[nms[i]]])
       }
     }
-    names(res_df$routes)[names(res_df$routes) == "id"] = "route_number"
+    names(res$routes)[names(res$routes) == "id"] = "route_number"
 
     for(i in seq(1, length(nms))){
-      if(nms[i] %in% names(res_df$segments)){
-        res_df$segments[[nms[i]]] = as.numeric(res_df$segments[[nms[i]]])
+      if(nms[i] %in% names(res$segments)){
+        res$segments[[nms[i]]] = as.numeric(res$segments[[nms[i]]])
       }
     }
-    names(res_df$segments)[names(res_df$segments) == "id"] = "route_number"
+    names(res$segments)[names(res$segments) == "id"] = "route_number"
 
   } else {
 
     for(i in seq(1, length(nms))){
-      if(nms[i] %in% names(res_df)){
-        res_df[[nms[i]]] = as.numeric(res_df[[nms[i]]])
+      if(nms[i] %in% names(res)){
+        res[[nms[i]]] = as.numeric(res[[nms[i]]])
       }
     }
 
-    names(res_df)[names(res_df) == "id"] = "route_number"
+    names(res)[names(res) == "id"] = "route_number"
 
 
   }
 
-  res_df
+  res
 
 }
 
diff --git a/R/json2sf_cs.R b/R/json2sf_cs.R
index 4ca0839..2dd8cb3 100644
--- a/R/json2sf_cs.R
+++ b/R/json2sf_cs.R
@@ -61,8 +61,7 @@ json2sf_cs = function(
                      "start_latitude", "finish_longitude", "finish_latitude", "crow_fly_distance",
                      "event", "whence", "speed", "itinerary", "plan", "note", "length",
                      "west", "south", "east", "north", "leaving", "arriving", "grammesCO2saved",
-                     "calories", "edition", "gradient_segment", "elevation_change",
-                     "gradient_smooth")
+                     "calories", "edition")
 ){
 
   # Support both
@@ -81,6 +80,7 @@ json2sf_cs = function(
   # browser()
   results = RcppSimdJson::fparse(results_raw, query = "/marker", query_error_ok = TRUE, always_list = TRUE)
   results_error = RcppSimdJson::fparse(results_raw, query = "/error", query_error_ok = TRUE, always_list = TRUE)
+  rm(results_raw)
   results_error = unlist(results_error, use.names = FALSE)
   if(length(results_error) > 0){
     message(length(results_error)," routes returned errors. Unique error messages are:\n")
@@ -96,8 +96,14 @@ json2sf_cs = function(
   if(!is.null(id)){
     names(results) = as.character(id)
   }
-  # TODO: subset to keep only columns of relevance
-  results = lapply(results, data.table::rbindlist, fill = TRUE)
+
+  cols_to_keep2 = unique(c(cols_to_keep,"type","start","points"))
+
+  results = lapply(results, function(x){
+    x = lapply(x, function(y){y[cols_to_keep2]})
+    data.table::rbindlist(x, fill = TRUE)
+  })
+
   results = data.table::rbindlist(results, idcol = "id", fill = TRUE)
   if(nrow(results) == 0){
     stop("No valid results returned")

From 56c5d15e29048d52d72082ac18d14e43742a6d16 Mon Sep 17 00:00:00 2001
From: Malcolm Morgan <mem48@users.noreply.github.com>
Date: Sun, 10 Sep 2023 10:10:31 +0100
Subject: [PATCH 2/3] bug fixes and speedup

---
 R/batch_read.R | 12 +++++++-----
 R/json2sf_cs.R | 14 +++++++-------
 R/utils.R      |  4 ++--
 3 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/R/batch_read.R b/R/batch_read.R
index c39381e..2236fb8 100644
--- a/R/batch_read.R
+++ b/R/batch_read.R
@@ -17,8 +17,10 @@ batch_read = function(
     )
     ) {
   message("Reading in the following file:\n", file)
-  res = readr::read_csv(file, show_col_types = FALSE)
-  n_char = nchar(res$json)
+
+  res = data.table::fread(file, select = "json")
+  res = stringi::stri_replace_all_fixed(res$json, '""', '"', vectorize_all = FALSE)
+  n_char = nchar(res)
   n_char[is.na(n_char)] = 0
   if(all(n_char == 0)) {
     stop("No routes returned: does CycleStreets operate where you requested data?")
@@ -27,11 +29,11 @@ batch_read = function(
   if(min_nchar == 0) {
     which_min_ncar = which(n_char == 0)
     message("Removing NA routes: ", paste(which_min_ncar, collapse = " "))
-    res = res[-which_min_ncar, ]
+    res = res[-which_min_ncar]
   }
 
-  res = json2sf_cs(results_raw = res$json,
-                       id = seq(nrow(res)),
+  res = json2sf_cs(results_raw = res,
+                       id = seq(length(res)),
                        segments = segments,
                       cols_to_keep = cols_to_keep
                       )
diff --git a/R/json2sf_cs.R b/R/json2sf_cs.R
index 2dd8cb3..667c46f 100644
--- a/R/json2sf_cs.R
+++ b/R/json2sf_cs.R
@@ -78,9 +78,7 @@ json2sf_cs = function(
   }
 
   # browser()
-  results = RcppSimdJson::fparse(results_raw, query = "/marker", query_error_ok = TRUE, always_list = TRUE)
   results_error = RcppSimdJson::fparse(results_raw, query = "/error", query_error_ok = TRUE, always_list = TRUE)
-  rm(results_raw)
   results_error = unlist(results_error, use.names = FALSE)
   if(length(results_error) > 0){
     message(length(results_error)," routes returned errors. Unique error messages are:\n")
@@ -90,17 +88,18 @@ json2sf_cs = function(
       message(results_error$Freq[msgs],'x messages: "',results_error$results_error[msgs],'"\n')
     }
   }
-
+  results = RcppSimdJson::fparse(results_raw, query = "/marker", query_error_ok = TRUE, always_list = TRUE)
+  #rm(results_raw)
   # Process Marker
   results = lapply(results, `[[`, "@attributes")
   if(!is.null(id)){
     names(results) = as.character(id)
   }
 
-  cols_to_keep2 = unique(c(cols_to_keep,"type","start","points"))
+  cols_to_keep2 = unique(c(cols_to_keep,"type","start","points","coordinates"))
 
   results = lapply(results, function(x){
-    x = lapply(x, function(y){y[cols_to_keep2]})
+    x = lapply(x, function(y){y[names(y) %in% cols_to_keep2]})
     data.table::rbindlist(x, fill = TRUE)
   })
 
@@ -147,6 +146,7 @@ cleanup_results <- function(x, cols_to_keep){
   x = add_columns(x)
   x = sf::st_as_sf(x)
   x$SPECIALIDFORINTERNAL2 <- NULL
-  cols = cols_to_keep %in% names(x)
-  x[cols_to_keep]
+  cols_to_keep3 = unique(c(cols_to_keep,"gradient_segment","elevation_change","gradient_smooth"))
+  cols = cols_to_keep3 %in% names(x)
+  x[cols]
 }
diff --git a/R/utils.R b/R/utils.R
index 4e44144..57895d3 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -44,10 +44,10 @@ route_rolling_average = function(x, n = 3) {
 
 
 get_values = function(v, fun) {
-  sapply(v, function(x) fun(as.numeric(x)))
+  vapply(v, function(x) fun(as.numeric(x)), 1)
 }
 
-extract_values = function(x) stringr::str_split(x, pattern = ",")
+extract_values = function(x) stringi::stri_split_fixed(x, pattern = ",")
 get_mean = function(v) get_values(v, fun = mean)
 get_sum = function(v) get_values(v, fun = sum)
 get_min = function(v) get_values(v, fun = min)

From f3f5571550824d4f875eff340e88113e150528f8 Mon Sep 17 00:00:00 2001
From: Malcolm Morgan <mem48@users.noreply.github.com>
Date: Sun, 10 Sep 2023 10:42:56 +0100
Subject: [PATCH 3/3] Fix warnings

---
 DESCRIPTION       | 1 -
 R/json2sf_cs.R    | 2 +-
 man/batch.Rd      | 4 +++-
 man/json2sf_cs.Rd | 4 +---
 4 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 53f9ecd..56eef7c 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -39,7 +39,6 @@ Imports:
     magrittr,
     progressr,
     RcppSimdJson,
-    readr,
     sf,
     stringr,
     stringi
diff --git a/R/json2sf_cs.R b/R/json2sf_cs.R
index 667c46f..26a5696 100644
--- a/R/json2sf_cs.R
+++ b/R/json2sf_cs.R
@@ -96,7 +96,7 @@ json2sf_cs = function(
     names(results) = as.character(id)
   }
 
-  cols_to_keep2 = unique(c(cols_to_keep,"type","start","points","coordinates"))
+  cols_to_keep2 = unique(c(cols_to_keep,"type","start","points","coordinates", "distances","elevations"))
 
   results = lapply(results, function(x){
     x = lapply(x, function(y){y[names(y) %in% cols_to_keep2]})
diff --git a/man/batch.Rd b/man/batch.Rd
index 42bd264..97c5c54 100644
--- a/man/batch.Rd
+++ b/man/batch.Rd
@@ -128,7 +128,9 @@ routes_id = batch(desire_lines_huge, username = "robinlovelace", wait = FALSE)
 names(routes)
 plot(routes$geometry)
 plot(desire_lines$geometry, add = TRUE, col = "red")
-routes = batch(desire_lines, username = "robinlovelace", wait_time = 5)
+routes = batch(desire_lines, username = "robinlovelace", wait_time = 5, segments = FALSE)
+segments = batch(desire_lines, username = "robinlovelace", wait_time = 5, segments = TRUE)
+both = batch(desire_lines, username = "robinlovelace", wait_time = 5, segments = "both")
 # profvis::profvis(batch_read("test-data.csv.gz"))
 }
 }
diff --git a/man/json2sf_cs.Rd b/man/json2sf_cs.Rd
index 398045b..95f1c0c 100644
--- a/man/json2sf_cs.Rd
+++ b/man/json2sf_cs.Rd
@@ -18,9 +18,7 @@ json2sf_cs(
     "start_longitude", "start_latitude", "finish_longitude", "finish_latitude",
     "crow_fly_distance", "event", "whence", "speed", "itinerary", "plan", "note",
     "length", "west", "south", "east", "north", "leaving", "arriving", "grammesCO2saved",
-    "calories", "edition", "gradient_segment", 
-     "elevation_change",
-    "gradient_smooth")
+    "calories", "edition")
 )
 }
 \arguments{