From 865ff6599b093cd6f2c6845ec09924131f8e864f Mon Sep 17 00:00:00 2001 From: Simon Brooke Date: Sat, 23 Mar 2019 13:09:44 +0000 Subject: [PATCH 1/3] Works: satisfied so far I think the underlying implementation could be improved; it eats stack. I'm going to see if I can fix that. --- .gitignore | 3 +- README.md | 34 ++++++++++-- project.clj | 36 +++++++------ src/clojure_csv/core.clj | 35 ++++++++++--- src/clojure_csv/data_cleaning.clj | 54 +++++++++++++++++++ test/clojure_csv/test/core.clj | 38 ++++++++++++++ test/clojure_csv/test/data_cleaning.clj | 70 +++++++++++++++++++++++++ 7 files changed, 239 insertions(+), 31 deletions(-) create mode 100644 src/clojure_csv/data_cleaning.clj create mode 100644 test/clojure_csv/test/data_cleaning.clj diff --git a/.gitignore b/.gitignore index 655764e..9e655b8 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,5 @@ target .lein-failures .lein-deps-sum .lein-repl-history -benchmarks/data \ No newline at end of file +benchmarks/data +*.csv diff --git a/README.md b/README.md index 68c1485..509b8b5 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,11 @@ The API has changed in the 2.0 series; see below for details. Recent Updates -------------- +* Updated to `2.0.3-SNAPSHOT`, with +1. Optional recognition of numbers in data; +2. Optional recognition of dates/times in data; +3. Optional recognition of first row as field names; +4. Option to supply field names. * Updated library to 2.0.2, with a bug fix for malformed input by [attil-io](https://github.com/attil-io). * Updated library to 2.0.1, which adds the :force-quote option to write-csv. @@ -52,7 +57,7 @@ Recent Updates * Now has support for Clojure 1.3. * Some speed improvements to take advantage of Clojure 1.3. Nearly twice as fast in my tests. -* Updated library to 1.2.4. +* Updated library to 1.2.4. * Added the char-seq multimethod, which provides a variety of implementations for easily creating the char seqs that parse-csv uses on input from various similar objects. 
Big thanks to [Slawek Gwizdowski](https://github.com/i0cus) @@ -98,7 +103,7 @@ A character that contains the cell separator for each column in a row. #### :end-of-line A string containing the end-of-line character for reading CSV files. If this setting is nil then \\n and \\r\\n are both -accepted. +accepted. ##### Default value: nil #### :quote-char A character that is used to begin and end a quoted cell. @@ -106,8 +111,27 @@ A character that is used to begin and end a quoted cell. #### :strict If this variable is true, the parser will throw an exception on parse errors that are recoverable but not to spec or otherwise -nonsensical. +nonsensical. ##### Default value: false +#### :numbers +Optional; if non `nil`, fields which are numbers will be returned as numbers, +not strings. +#### :date-format +Optional; if a valid value as specified below, fields which are dates/times +will be returned as `org.joda.time.DateTime` objects, not strings. + +A valid value is one of: + 1. A string in the format understood by `clj-time.format/formatter`; + 2. A keyword naming one of the built-in formatters in `clj-time.format/formatters`; or + 3. A custom formatter as constructed by `clj-time.format/formatter`. +##### Default value: nil +#### :field-names +Optional; + 1. if `true`, the first row of the input will be treated as field names (and +read as keywords); + 2. if a list or vector, the value will be used as field names. +In either case, rows will be returned as `map`s, not `list`s. +##### Default value: nil ### write-csv Takes a sequence of sequences of strings, basically a table of strings, @@ -116,10 +140,10 @@ call this function repeatedly row-by-row and concatenate the results yourself. Takes the following keyword arguments to change the written file: #### :delimiter -A character that contains the cell separator for each column in a row. +A character that contains the cell separator for each column in a row. 
##### Default value: \\, #### :end-of-line -A string containing the end-of-line character for writing CSV files. +A string containing the end-of-line character for writing CSV files. ##### Default value: \\n #### :quote-char A character that is used to begin and end a quoted cell. diff --git a/project.clj b/project.clj index d4e4b6f..fe16454 100644 --- a/project.clj +++ b/project.clj @@ -1,19 +1,21 @@ -(defproject clojure-csv "2.0.2" +(defproject clojure-csv "2.0.3-SNAPSHOT" :description "A simple library to read and write CSV files." - :dependencies [[org.clojure/clojure "1.3.0"]] + :dependencies [[org.clojure/clojure "1.8.0"] + [clj-time "0.15.0"]] :plugins [[perforate "0.3.2"]] - :jvm-opts ["-Xmx1g"] - :profiles {:current {:source-paths ["src/"]} - :clj1.4 {:dependencies [[org.clojure/clojure "1.4.0-beta5"]]} - :clj1.3 {:dependencies [[org.clojure/clojure "1.3.0"]]} - :csv1.3 {:dependencies [[clojure-csv "1.3.0"]]} - :csv2.0 {:dependencies [[clojure-csv "2.0.0-alpha1"]]}} - :perforate {:environments [{:name :clojure-csv2 - :profiles [:clj1.3 :csv2.0] - :namespaces [csv.benchmarks.core]} - {:name :clojure-csv1 - :profiles [:clj1.3 :csv1.3] - :namespaces [csv.benchmarks.core]} - {:name :current - :profiles [:clj1.4 :current] - :namespaces [csv.benchmarks.core]}]}) + :jvm-opts ["-Xmx1g"]) +;; :profiles {:current {:source-paths ["src/"]} +;; :clj1.8 {:dependencies [[org.clojure/clojure "1.8.0"]]} +;; :clj1.4 {:dependencies [[org.clojure/clojure "1.4.0-beta5"]]} +;; :clj1.3 {:dependencies [[org.clojure/clojure "1.3.0"]]} +;; :csv1.3 {:dependencies [[clojure-csv "1.3.0"]]} +;; :csv2.0 {:dependencies [[clojure-csv "2.0.0-alpha1"]]}} +;; :perforate {:environments [{:name :clojure-csv2 +;; :profiles [:clj1.3 :csv2.0] +;; :namespaces [csv.benchmarks.core]} +;; {:name :clojure-csv1 +;; :profiles [:clj1.3 :csv1.3] +;; :namespaces [csv.benchmarks.core]} +;; {:name :current +;; :profiles [:clj1.8 :current] +;; :namespaces [csv.benchmarks.core]}]}) diff --git 
a/src/clojure_csv/core.clj b/src/clojure_csv/core.clj index 96c39ad..cb72a5f 100644 --- a/src/clojure_csv/core.clj +++ b/src/clojure_csv/core.clj @@ -4,7 +4,8 @@ It correctly handles common CSV edge-cases, such as embedded newlines, commas, and quotes. The main functions are parse-csv and write-csv."} clojure-csv.core - (:require [clojure.string :as string]) + (:require [clojure.string :as string] + [clojure-csv.data-cleaning :refer [dates-as-dates numbers-as-numbers]]) (:import [java.io Reader StringReader])) @@ -185,16 +186,24 @@ and quotes. The main functions are parse-csv and write-csv."} (throw (Exception. (str "Unexpected character found: " look-ahead))))))) (defn- parse-csv-with-options - ([csv-reader {:keys [delimiter quote-char strict end-of-line]}] + ([csv-reader {:keys [delimiter quote-char strict end-of-line date-format numbers field-names]}] + (let [fields (cond + (true? field-names) + (map keyword (parse-csv-line csv-reader delimiter quote-char + strict end-of-line)) + (or (list? field-names) (vector? 
field-names)) field-names)] (parse-csv-with-options csv-reader delimiter quote-char - strict end-of-line)) - ([csv-reader delimiter quote-char strict end-of-line] + strict end-of-line date-format numbers fields))) + ([csv-reader delimiter quote-char strict end-of-line date-format numbers fields] (lazy-seq (when (not (== -1 (reader-peek csv-reader))) - (let [row (parse-csv-line csv-reader delimiter quote-char - strict end-of-line)] + (let [raw (parse-csv-line csv-reader delimiter quote-char + strict end-of-line) + with-numbers (if numbers (numbers-as-numbers raw) raw) + with-dates (if date-format (dates-as-dates with-numbers date-format) with-numbers) + row (if fields (apply hash-map (interleave fields with-dates)) with-dates)] (cons row (parse-csv-with-options csv-reader delimiter quote-char - strict end-of-line))))))) + strict end-of-line date-format numbers fields))))))) (defn parse-csv "Takes a CSV as a string or Reader and returns a seq of the parsed CSV rows, @@ -211,7 +220,17 @@ and quotes. The main functions are parse-csv and write-csv."} Default value: \\\" :strict - If this variable is true, the parser will throw an exception on parse errors that are recoverable but - not to spec or otherwise nonsensical. Default value: false" + not to spec or otherwise nonsensical. Default value: false + :date-format - if provided, and value is a string, keyword or `clj-time` + formatter, recognise dates having the specified format and + return them as `org.joda.time.DateTime` objects. Default value: + nil + :numbers - if provided and value is non-nil, recognise numbers (integers, + floats and rationals, but TODO: not yet bignums) and return them + as numbers. + :field-names - if provided and value is true, treats the first row as + field names; if provided and value is a sequence, treats that sequence as + field names. In either case returns a list of maps, not lists." ([csv & {:as opts}] (let [csv-reader (if (string? csv) (StringReader. 
csv) csv)] (parse-csv-with-options csv-reader (merge {:strict false diff --git a/src/clojure_csv/data_cleaning.clj b/src/clojure_csv/data_cleaning.clj new file mode 100644 index 0000000..e6da4c1 --- /dev/null +++ b/src/clojure_csv/data_cleaning.clj @@ -0,0 +1,54 @@ +(ns + ^{:author "Simon Brooke", + :doc "Recognise numbers as numbers, and (#TODO) + dates/times as dates times, etc"} + clojure-csv.data-cleaning + (:require [clj-time.core :as t] + [clj-time.format :as f])) + +(defn number-as-number + "if `o` is a string which, as a whole, is the representation of a number, + return that number; else return `o`." + [o] + (if + (string? o) + (try + (let [n (when (re-matches #"\s*[-+]?\d[\w./+-]*\s*" o) (binding [*read-eval* false] (read-string o)))] + (if (number? n) n o)) + (catch Exception e o)) + o)) + +(defmacro numbers-as-numbers + "Return a list like the sequence `l`, but with all those elements + which are string representations of numbers replaced with numbers." + [l] + `(map number-as-number ~l)) + +(defn date-as-date + "if `o` is the string representation of a date or timestamp conforming to + `date-format`, return that timestamp; else return `o`. `date-format` is + expected to be either + 1. A string in the format understood by `clj-time.format/formatter`, or + 2. A keyword naming one of the built-in `clj-time.format/formatters`, or + 3. A custom formatter as constructed by `clj-time.format/formatter`" + [o date-format] + (if + (string? o) + (try + (let [f (cond + (string? date-format) (f/formatter date-format) + (keyword? date-format) (f/formatters date-format) + (= + (type date-format) + org.joda.time.format.DateTimeFormatter) date-format)] + (f/parse f o)) + (catch Exception e + o)) + o)) + +(defmacro dates-as-dates + "Return a list like the sequence `l`, but with all those elements + which are strings conforming to `date-format` replaced with timestamps." 
+ [l date-format] + `(map #(date-as-date % ~date-format) ~l)) + diff --git a/test/clojure_csv/test/core.clj b/test/clojure_csv/test/core.clj index 4251f6f..5801f76 100644 --- a/test/clojure_csv/test/core.clj +++ b/test/clojure_csv/test/core.clj @@ -129,3 +129,41 @@ :end-of-line "HELLO"))) (is (= [["a" "b\r"] ["c" "d"]] (parse-csv "a,|b\r|\rc,d" :end-of-line "\r" :quote-char \|)))) + +(deftest data-cleansing + (let [data "Name;MP;Area;County;Electorate;CON;LAB;LIB;UKIP;Green;NAT;MIN;OTH + Aldershot;Leo Docherty;12;Hampshire;76205;26955;15477;3637;1796;1090;0;0;0 + Aldridge-Brownhills;Wendy Morton;7;Black Country;60363;26317;12010;1343;0;0;0;0;565 + Altrincham and Sale West;Graham Brady;4;Central Manchester;73220;26933;20507;4051;0;1000;0;0;299 + Amber Valley;Nigel Mills;8;Derbyshire;68065;25905;17605;1100;0;650;0;0;551 + Arundel and South Downs;Nick Herbert;12;West Sussex;80766;37573;13690;4783;1668;2542;0;0;0 + Ashfield;Gloria De Piero;8;Nottinghamshire;78099;20844;21285;969;1885;398;0;4612;0 + Ashford;Damian Green;12;Kent;87396;35318;17840;3101;2218;1402;0;0;0"] + (testing "number recognition" + (let [expected "76205" + actual (nth (nth (parse-csv data :delimiter \;) 1) 4)] + (is (= actual expected) "Number recognition off")) + (let [expected 76205 + actual (nth (nth (parse-csv data :delimiter \; :numbers true) 1) 4)] + (is (= actual expected) "Number recognition on"))) + (testing "field names" + (let [expected 76205 + actual (:Electorate (first (parse-csv data + :delimiter \; + :numbers true + :field-names true)))] + (is (= actual expected) "Field names from first row")) + (let [expected 76205 + actual (:e (nth (parse-csv data + :delimiter \; + :numbers true + :field-names [:a :b :c :d :e]) 1))] + (is (= actual expected) "Field names passed as vector")) + (let [expected 60363 + actual (:e (nth (parse-csv data + :delimiter \; + :numbers true + :field-names '(:a :b :c :d :e)) 2))] + (is (= actual expected) "Field names passed as list"))))) + + diff --git 
a/test/clojure_csv/test/data_cleaning.clj b/test/clojure_csv/test/data_cleaning.clj new file mode 100644 index 0000000..999c1f1 --- /dev/null +++ b/test/clojure_csv/test/data_cleaning.clj @@ -0,0 +1,70 @@ +(ns clojure-csv.test.data-cleaning + (:require [clojure.test :refer :all] + [clojure-csv.data-cleaning :refer :all] + [clj-time.core :as t] + [clj-time.format :as f])) + + +(deftest number-recognition + (testing "Recognition of integers" + (let [expected 123456 + actual (number-as-number "123456")] + (is (= actual expected) "integer 123456")) + (let [expected -1 + actual (number-as-number "-1")] + (is (= actual expected) "integer negative one"))) + (testing "Recognition of floats" + (let [expected 0.1 + actual (number-as-number "0.1")] + (is (= actual expected) "float zero point one")) + (let [expected -0.1 + actual (number-as-number "-0.1")] + (is (= actual expected) "float negative zero point one")) + (let [expected 3.142857 + actual (number-as-number "3.142857")] + (is (= actual expected) "float approximation of π"))) + (testing "Recognition of rationals" + (let [expected 22/7 + actual (number-as-number "22/7")] + (is (= actual expected) "rational approximation of π")) + (let [expected 1/4 + actual (number-as-number "2/8")] + (is (= actual expected) "two eighths -> one quarter"))) + (testing "Recognition of numbers" + (let [expected '("Fred" "2019-03-23" 22/7 3.142857 123456 -8) + actual (numbers-as-numbers + '("Fred" "2019-03-23" "22/7" "3.142857" "123456" "-8"))] + (is (= actual expected) "List including numbers in various formats")))) + +(deftest date-recognition + (testing "recognition of dates; format is string" + (let [expected "class org.joda.time.DateTime" + actual (str (type (date-as-date "2019-03-23" "yyyy-MM-dd")))] + (is (= actual expected) "format is string; match expected")) + (let [expected "class java.lang.String" + actual (str (type (date-as-date "2019/03/23" "yyyy-MM-dd")))] + (is (= actual expected) "format is string; match not expected"))) 
+ (testing "recognition of dates; format is keyword" + (let [expected "class org.joda.time.DateTime" + actual (str (type (date-as-date "2019-03-23" :date)))] + (is (= actual expected) "format is keyword; match expected")) + (let [expected "class java.lang.String" + actual (str (type (date-as-date "2019/03/23" :date)))] + (is (= actual expected) "format is keyword; match not expected"))) + (testing "recognition of dates; format is formatter" + (let [expected "class org.joda.time.DateTime" + actual (str (type (date-as-date "2019-03-23" (f/formatter "yyyy-MM-dd" ))))] + (is (= actual expected) "format is formatter; match expected")) + (let [expected "class java.lang.String" + actual (str (type (date-as-date "2019/03/23" (f/formatter "yyyy-MM-dd" ))))] + (is (= actual expected) "format is formatter; match not expected")) + (let [expected "class org.joda.time.DateTime" + actual (str + (type + (date-as-date + "2019/03/23" + (f/formatter + (t/default-time-zone) + "YYYY-MM-dd" + "YYYY/MM/dd"))))] + (is (= actual expected) "format is composite formatter; match expected")))) From 129b11d26ac0d61e3adec55bc5e50ea43dc7647e Mon Sep 17 00:00:00 2001 From: Simon Brooke Date: Sat, 23 Mar 2019 13:22:32 +0000 Subject: [PATCH 2/3] Uncommented the bits I'd commented out of the project file --- .gitignore | 2 ++ project.clj | 32 ++++++++++++++++---------------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/.gitignore b/.gitignore index 9e655b8..d5c34a0 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ target .lein-repl-history benchmarks/data *.csv + +\.nrepl-port diff --git a/project.clj b/project.clj index fe16454..91c5149 100644 --- a/project.clj +++ b/project.clj @@ -3,19 +3,19 @@ :dependencies [[org.clojure/clojure "1.8.0"] [clj-time "0.15.0"]] :plugins [[perforate "0.3.2"]] - :jvm-opts ["-Xmx1g"]) -;; :profiles {:current {:source-paths ["src/"]} -;; :clj1.8 {:dependencies [[org.clojure/clojure "1.8.0"]]} -;; :clj1.4 {:dependencies [[org.clojure/clojure 
"1.4.0-beta5"]]} -;; :clj1.3 {:dependencies [[org.clojure/clojure "1.3.0"]]} -;; :csv1.3 {:dependencies [[clojure-csv "1.3.0"]]} -;; :csv2.0 {:dependencies [[clojure-csv "2.0.0-alpha1"]]}} -;; :perforate {:environments [{:name :clojure-csv2 -;; :profiles [:clj1.3 :csv2.0] -;; :namespaces [csv.benchmarks.core]} -;; {:name :clojure-csv1 -;; :profiles [:clj1.3 :csv1.3] -;; :namespaces [csv.benchmarks.core]} -;; {:name :current -;; :profiles [:clj1.8 :current] -;; :namespaces [csv.benchmarks.core]}]}) + :jvm-opts ["-Xmx1g"] + :profiles {:current {:source-paths ["src/"]} + :clj1.8 {:dependencies [[org.clojure/clojure "1.8.0"]]} + :clj1.4 {:dependencies [[org.clojure/clojure "1.4.0-beta5"]]} + :clj1.3 {:dependencies [[org.clojure/clojure "1.3.0"]]} + :csv1.3 {:dependencies [[clojure-csv "1.3.0"]]} + :csv2.0 {:dependencies [[clojure-csv "2.0.0-alpha1"]]}} + :perforate {:environments [{:name :clojure-csv2 + :profiles [:clj1.3 :csv2.0] + :namespaces [csv.benchmarks.core]} + {:name :clojure-csv1 + :profiles [:clj1.3 :csv1.3] + :namespaces [csv.benchmarks.core]} + {:name :current + :profiles [:clj1.8 :current] + :namespaces [csv.benchmarks.core]}]}) From c6a027451b5002325043d6e8d2593d216c8f16f2 Mon Sep 17 00:00:00 2001 From: Simon Brooke Date: Sat, 23 Mar 2019 18:41:06 +0000 Subject: [PATCH 3/3] Minor documentation changes --- README.md | 1 + src/clojure_csv/data_cleaning.clj | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2340b1f..ff16ce3 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,7 @@ Recent Updates 2. Optional recognition of dates/times in data; 3. Optional recognition of first row as field names; 4. Option to supply field names. +* Now has support for Clojure 1.8. * Updated library to 2.0.2, with a bug fix for malformed input by [attil-io](https://github.com/attil-io). * Updated library to 2.0.1, which adds the :force-quote option to write-csv. 
diff --git a/src/clojure_csv/data_cleaning.clj b/src/clojure_csv/data_cleaning.clj index e6da4c1..41a71b4 100644 --- a/src/clojure_csv/data_cleaning.clj +++ b/src/clojure_csv/data_cleaning.clj @@ -1,7 +1,7 @@ (ns ^{:author "Simon Brooke", - :doc "Recognise numbers as numbers, and (#TODO) - dates/times as dates times, etc"} + :doc "Recognise numbers as numbers, and + dates/times as dates/times"} clojure-csv.data-cleaning (:require [clj-time.core :as t] [clj-time.format :as f]))