diff --git a/.github/workflows/clojure-master.yml b/.github/workflows/clojure-master.yml index e6d0a17..d388140 100644 --- a/.github/workflows/clojure-master.yml +++ b/.github/workflows/clojure-master.yml @@ -2,18 +2,39 @@ name: Clojure CI for master on: push: - branches: [ master ] + branches: [ trunk ] pull_request: - branches: [ master ] + branches: [ trunk ] jobs: build: - runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - name: Install dependencies - run: lein deps - - name: Run tests - run: lein test + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Java + uses: actions/setup-java@v4 + with: + distribution: 'temurin' + java-version: '17' + + - name: Install Leiningen + run: | + sudo apt-get update + sudo apt-get install -y leiningen + + - name: Cache Maven dependencies + uses: actions/cache@v4 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-lein-${{ hashFiles('project.clj') }} + restore-keys: | + ${{ runner.os }}-lein- + + - name: Install dependencies + run: lein deps + + - name: Run tests + run: lein test diff --git a/.github/workflows/clojure-trunk.yml b/.github/workflows/clojure-trunk.yml index 64d450e..280f0bd 100644 --- a/.github/workflows/clojure-trunk.yml +++ b/.github/workflows/clojure-trunk.yml @@ -8,12 +8,33 @@ on: jobs: build: - runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - name: Install dependencies - run: lein deps - - name: Run tests - run: lein test + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Java + uses: actions/setup-java@v4 + with: + distribution: 'temurin' + java-version: '21' + + - name: Install Leiningen + run: | + sudo apt-get update + sudo apt-get install -y leiningen + + - name: Cache Maven dependencies + uses: actions/cache@v4 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-lein-${{ hashFiles('project.clj') }} + restore-keys: | + ${{ runner.os }}-lein- + + - name: Install dependencies + run: lein deps + + - name: Run tests + 
run: lein test diff --git a/README.adoc b/README.adoc index d05fad9..680ae0d 100644 --- a/README.adoc +++ b/README.adoc @@ -96,6 +96,33 @@ CAUTION: From version v0.3.2 and upward the library may require Java class versi => 152961502 ---- +=== Streaming API for Large Data + +For processing large files without loading them entirely into memory: + +[source,clojure] +---- +;; Streaming compression example +(require '[clojure.java.io :as io]) +(require '[zlib-tiny.core :as z]) + +;; Compress a large file +(with-open [input (io/input-stream "large-data.txt") + output (io/output-stream "large-data.gz")] + (z/copy-compress input output z/gzip-stream)) + +;; Decompress a large file +(with-open [input (io/input-stream "large-data.gz") + output (io/output-stream "large-data-decompressed.txt")] + (z/copy-decompress input output z/gunzip-stream)) + +;; Direct stream creation for custom processing +(with-open [input (io/input-stream "data.txt") + compressed (z/deflate-stream input)] + ;; Process compressed stream + ) +---- + ==== Digests [source,shell] @@ -152,7 +179,11 @@ CRC64 checks: lein test zlib-tiny.compress -Ran 3 tests containing 13 assertions. +lein test zlib-tiny.performance +... + +Ran 4 tests containing 14 assertions. +... 
---- == Manual Build @@ -164,7 +195,7 @@ $ lein install == License -Copyright © 2017-2023 +Copyright © 2017-2025 Distributed under the http://www.apache.org/licenses/LICENSE-2.0[Apache License v 2.0] diff --git a/profiles.clj b/profiles.clj index b5993a7..bb7126f 100644 --- a/profiles.clj +++ b/profiles.clj @@ -5,12 +5,12 @@ :plugins []} - :provided {:dependencies [[org.clojure/clojure "1.11.1"]] + :provided {:dependencies [[org.clojure/clojure "1.12.1"]] :source-paths #{"src-clj"} :java-source-paths #{"src-java"} :resource-paths ["resources"] - :javac-options ["-source" "9" "-target" "9" "-g:none"] + :javac-options ["--release" "9" "-g:none"] :jar-exclusions [#"\.java"]} diff --git a/project.clj b/project.clj index c5d18c2..583d7c3 100644 --- a/project.clj +++ b/project.clj @@ -1,6 +1,6 @@ -(defproject net.tbt-post/zlib-tiny "0.5.2" +(defproject net.tbt-post/zlib-tiny "0.6.0" :description "Tiny Clojure ZLib helper" :url "https://github.com/source-c/zlib-tiny" :license {:name "Apache License v2.0" :url "http://www.apache.org/licenses/LICENSE-2.0"} - :dependencies [[commons-io "2.15.1"]]) + :dependencies [[commons-io "2.20.0"]]) diff --git a/src-clj/zlib_tiny/core.clj b/src-clj/zlib_tiny/core.clj index 9d83b1e..ecd084a 100644 --- a/src-clj/zlib_tiny/core.clj +++ b/src-clj/zlib_tiny/core.clj @@ -16,17 +16,30 @@ BufferedInputStream InputStream))) +(def ^:private ^:const STREAM_MARK_LIMIT 512) +(def ^:private ^:const DEFAULT_BUFFER_SIZE 8192) + +(def ^:private ^ThreadLocal buffer-pool + (proxy [ThreadLocal] [] + (initialValue [] + (byte-array DEFAULT_BUFFER_SIZE)))) + +(defn- get-buffer + "Gets a reusable buffer from the thread-local pool" + [] + (.get buffer-pool)) + (defn str->bytes "Returns the encoding's bytes corresponding to the given string. If no encoding is specified, UTF-8 is used." 
[^String s & [^String encoding]] - (.getBytes s (or encoding "UTF-8"))) + (.getBytes s ^String (or encoding "UTF-8"))) (defn bytes->str "Returns the String corresponding to the given encoding's decoding of the given bytes. If no encoding is specified, UTF-8 is used." [^bytes b & [^String encoding]] - (String. b (or encoding "UTF-8"))) + (String. b ^String (or encoding "UTF-8"))) (defn gunzip "Returns a gunzip'd version of the given byte array." @@ -43,8 +56,14 @@ [b] (when b (let [baos (ByteArrayOutputStream.) - gos (GZIPOutputStream. baos)] - (IOUtils/copy (ByteArrayInputStream. b) gos) + gos (GZIPOutputStream. baos ^int DEFAULT_BUFFER_SIZE) + buffer (get-buffer) + bis (ByteArrayInputStream. b)] + (loop [] + (let [n (.read bis buffer 0 DEFAULT_BUFFER_SIZE)] + (when (pos? n) + (.write gos buffer 0 n) + (recur)))) (.close gos) (.toByteArray baos)))) @@ -66,7 +85,7 @@ (let [stream (BufferedInputStream. (if (instance? InputStream b) b (ByteArrayInputStream. b))) - _ (.mark stream 512) + _ (.mark stream STREAM_MARK_LIMIT) iis (InflaterInputStream. stream) readable? (try (.read iis) true (catch ZipException _ false))] @@ -138,3 +157,78 @@ (defn sha-512 ^bytes [^bytes b] (wrap-digest "SHA-512" b)) + +;; Streaming API for large data + +(defn deflate-stream + "Returns a DeflaterInputStream for streaming deflation. + Useful for large files that shouldn't be loaded entirely into memory." + ([^InputStream input-stream] + (DeflaterInputStream. input-stream)) + ([^InputStream input-stream level] + (DeflaterInputStream. input-stream (Deflater. level)))) + +(defn inflate-stream + "Returns an InflaterInputStream for streaming inflation. + Useful for large files that shouldn't be loaded entirely into memory." + [^InputStream input-stream] + (let [stream (BufferedInputStream. input-stream) + _ (.mark stream STREAM_MARK_LIMIT) + iis (InflaterInputStream. stream) + readable? (try (.read iis) true + (catch ZipException _ false))] + (.reset stream) + (if readable? 
+ (InflaterInputStream. stream) + (InflaterInputStream. stream (Inflater. true))))) + +(defn gzip-stream + "Returns a GZIPOutputStream for streaming gzip compression. + Useful for large files that shouldn't be loaded entirely into memory." + ^GZIPOutputStream + ([^java.io.OutputStream output-stream] + (GZIPOutputStream. output-stream ^int DEFAULT_BUFFER_SIZE)) + ([^java.io.OutputStream output-stream ^Integer buffer-size] + (GZIPOutputStream. output-stream ^int buffer-size))) + +(defn gunzip-stream + "Returns a GZIPInputStream for streaming gzip decompression. + Useful for large files that shouldn't be loaded entirely into memory." + ([^InputStream input-stream] + (GZIPInputStream. input-stream ^int DEFAULT_BUFFER_SIZE)) + ([^InputStream input-stream buffer-size] + (GZIPInputStream. input-stream ^int buffer-size))) + +(defn copy-compress + "Copies data from input-stream to output-stream with compression. + Returns the number of uncompressed bytes read from input-stream (not the compressed size)." + ^long [^InputStream input-stream ^java.io.OutputStream output-stream compress-fn] + (let [^java.io.OutputStream compressed-stream (compress-fn output-stream) + ^bytes buffer (get-buffer)] + (try + (loop [total (long 0)] + (let [n (.read input-stream buffer 0 DEFAULT_BUFFER_SIZE)] + (if (pos? n) + (do + (.write compressed-stream buffer 0 n) + (recur (+ total n))) + total))) + (finally + (.close compressed-stream))))) + +(defn copy-decompress + "Copies data from input-stream to output-stream with decompression. + Returns the number of decompressed bytes written to output-stream." + ^long [^InputStream input-stream ^java.io.OutputStream output-stream decompress-fn] + (let [^InputStream decompressed-stream (decompress-fn input-stream) + ^bytes buffer (get-buffer)] + (try + (loop [total (long 0)] + (let [n (.read decompressed-stream buffer 0 DEFAULT_BUFFER_SIZE)] + (if (pos? 
n) + (do + (.write output-stream buffer 0 n) + (recur (+ total n))) + total))) + (finally + (.close decompressed-stream))))) diff --git a/src-java/CRC32C.java b/src-java/CRC32C.java index 2bc28ce..9f9380c 100644 --- a/src-java/CRC32C.java +++ b/src-java/CRC32C.java @@ -106,9 +106,25 @@ public void update(int b) { @Override public void update(byte[] bArray, int off, int len) { long newCrc = crc ^ LONG_MASK; - for (int i = off; i < off + len; i++) { + int end = off + len; + + // Process 8 bytes at a time for better performance + int fastEnd = end - 7; + int i = off; + while (i < fastEnd) { + // Process a block of 8 bytes using inner loop + for (int j = 0; j < 8; j++) { + newCrc = updateByte(bArray[i + j], newCrc); + } + i += 8; + } + + // Process remaining bytes + while (i < end) { newCrc = updateByte(bArray[i], newCrc); + i++; } + crc = newCrc ^ LONG_MASK; } diff --git a/src-java/CRC64.java b/src-java/CRC64.java index 2ea424b..006f238 100644 --- a/src-java/CRC64.java +++ b/src-java/CRC64.java @@ -34,9 +34,21 @@ public void update(byte[] buf) { public void update(byte[] buf, int off, int len) { int end = off + len; - - while (off < end) + + // Process 8 bytes at a time for better performance + int fastEnd = end - 7; + while (off < fastEnd) { + // Process a block of 8 bytes using inner loop + for (int j = 0; j < 8; j++) { + crc = crcTable[(buf[off + j] ^ (int) crc) & 0xFF] ^ (crc >>> 8); + } + off += 8; + } + + // Process remaining bytes + while (off < end) { crc = crcTable[(buf[off++] ^ (int) crc) & 0xFF] ^ (crc >>> 8); + } } public long getValue() { diff --git a/test/zlib_tiny/performance.clj b/test/zlib_tiny/performance.clj new file mode 100644 index 0000000..743c372 --- /dev/null +++ b/test/zlib_tiny/performance.clj @@ -0,0 +1,43 @@ +(ns zlib-tiny.performance + (:require [clojure.test :refer :all] + [zlib-tiny.core :refer :all]) + (:import [java.io ByteArrayInputStream ByteArrayOutputStream])) + +(defn ^"[B" generate-test-data + "Generate test data of specified 
size" + [size] + (byte-array (repeatedly size #(rand-int 256)))) + +(deftest performance-test + (testing "CRC32C performance with unrolled loop" + (let [small-data (generate-test-data 100) + medium-data (generate-test-data 10000) + large-data (generate-test-data 1000000)] + (println "\nCRC32C Performance:") + (println "Small data (100 bytes):" (time (crc32c small-data))) + (println "Medium data (10KB):" (time (crc32c medium-data))) + (println "Large data (1MB):" (time (crc32c large-data))))) + + (testing "CRC64 performance with unrolled loop" + (let [small-data (generate-test-data 100) + medium-data (generate-test-data 10000) + large-data (generate-test-data 1000000)] + (println "\nCRC64 Performance:") + (println "Small data (100 bytes):" (time (crc64 small-data))) + (println "Medium data (10KB):" (time (crc64 medium-data))) + (println "Large data (1MB):" (time (crc64 large-data))))) + + (testing "Streaming API for large data" + (let [test-data (generate-test-data 100000) + bais (ByteArrayInputStream. test-data) + baos (ByteArrayOutputStream.)] + (println "\nStreaming API Performance (100KB):") + (time (copy-compress bais baos gzip-stream)) + (println "Compressed size:" (.size baos)) + + (let [compressed-data (.toByteArray baos) + bais2 (ByteArrayInputStream. compressed-data) + baos2 (ByteArrayOutputStream.)] + (time (copy-decompress bais2 baos2 gunzip-stream)) + (println "Decompressed size:" (.size baos2)) + (is (= (alength test-data) (.size baos2)) "Decompressed size should match original"))))) \ No newline at end of file