fix error when multi-byte character is split across reads (#29)

eli-darkly · web-flow · commit e527c202116c · 2021-12-29T16:18:53.000-08:00
diff --git a/lib/ld-eventsource/client.rb b/lib/ld-eventsource/client.rb
@@ -306,6 +306,9 @@ def read_stream(cxn)
           else
             begin
               data = cxn.readpartial
+              # readpartial gives us a string, which may not be a valid UTF-8 string because a
+              # multi-byte character might not yet have been fully read, but BufferedLineReader
+              # will handle that.
             rescue HTTP::TimeoutError 
               # For historical reasons, we rethrow this as our own type
               raise Errors::ReadTimeoutError.new(@read_timeout)
diff --git a/lib/ld-eventsource/impl/buffered_line_reader.rb b/lib/ld-eventsource/impl/buffered_line_reader.rb
@@ -9,17 +9,21 @@ class BufferedLineReader
       # input data runs out, the output enumerator ends and does not include any partially
       # completed line.
       #
-      # @param [Enumerator] chunks  an enumerator that will yield strings from a stream
-      # @return [Enumerator]  an enumerator that will yield one line at a time
+      # @param [Enumerator] chunks  an enumerator that will yield strings from a stream -
+      #  these are treated as raw UTF-8 bytes, regardless of the string's declared encoding
+      #  (so it is OK if a multi-byte character is split across chunks); if the declared
+      #  encoding of the chunk is not ASCII-8BIT, it will be changed to ASCII-8BIT in place
+      # @return [Enumerator]  an enumerator that will yield one line at a time in UTF-8
       #
       def self.lines_from(chunks)
-        buffer = ""
+        buffer = "".b
         position = 0
         line_start = 0
         last_char_was_cr = false
 
         Enumerator.new do |gen|
           chunks.each do |chunk|
+            chunk.force_encoding("ASCII-8BIT")
             buffer << chunk
 
             loop do
@@ -47,7 +51,12 @@ def self.lines_from(chunks)
                 next
               end
 
-              line = buffer[line_start, i - line_start]
+              line = buffer[line_start, i - line_start].force_encoding("UTF-8")
+              # Calling force_encoding just declares that we believe the encoding of this string to be
+              # UTF-8 (which is the only encoding allowed in the SSE protocol); it doesn't cause any
+              # re-decoding of the string. The previous line-parsing steps were done on raw 8-bit
+              # strings so that no UTF-8 decoding is attempted on intermediate slices.
+
               last_char_was_cr = false
               i += 1
               if ch == "\r"
diff --git a/spec/buffered_line_reader_spec.rb b/spec/buffered_line_reader_spec.rb
@@ -74,4 +74,24 @@ def tests_for_terminator(term, desc)
       "fourth line", "", "last"]
     expect(subject.lines_from(chunks).to_a).to eq(expected)
   end
+
+  it "decodes from UTF-8" do
+    text = "abc€豆腐xyz"
+    chunks = [(text + "\n").encode("UTF-8").b]
+    expected = [text]
+    expect(subject.lines_from(chunks).to_a).to eq(expected)
+  end
+
+  it "decodes from UTF-8 when multi-byte characters are split across chunks" do
+    text = "abc€豆腐xyz"
+    raw = (text + "\n").encode("UTF-8").b
+    chunks = raw.bytes.to_a.map{ |byte| byte.chr.force_encoding("UTF-8") }
+    # Calling force_encoding("UTF-8") here simulates the behavior of the http gem's
+    # readpartial method. It actually returns undecoded bytes that might include an
+    # incomplete multi-byte character, but the string's encoding could still be
+    # declared as UTF-8. So we are making sure that BufferedLineReader correctly
+    # handles such a case.
+    expected = [text]
+    expect(subject.lines_from(chunks).to_a).to eq(expected)
+  end
 end