File tree Expand file tree Collapse file tree 3 files changed +36
-4
lines changed Expand file tree Collapse file tree 3 files changed +36
-4
lines changed Original file line number Diff line number Diff line change @@ -306,6 +306,9 @@ def read_stream(cxn)
306306 else
307307 begin
308308 data = cxn . readpartial
309+ # readpartial gives us a string, which may not be a valid UTF-8 string because a
310+ # multi-byte character might not yet have been fully read, but BufferedLineReader
311+ # will handle that.
309312 rescue HTTP ::TimeoutError
310313 # For historical reasons, we rethrow this as our own type
311314 raise Errors ::ReadTimeoutError . new ( @read_timeout )
Original file line number Diff line number Diff line change @@ -9,17 +9,21 @@ class BufferedLineReader
99 # input data runs out, the output enumerator ends and does not include any partially
1010 # completed line.
1111 #
12- # @param [Enumerator] chunks an enumerator that will yield strings from a stream
13- # @return [Enumerator] an enumerator that will yield one line at a time
12+ # @param [Enumerator] chunks an enumerator that will yield strings from a stream -
13+ # these are treated as raw UTF-8 bytes, regardless of the string's declared encoding
14+ # (so it is OK if a multi-byte character is split across chunks); if the declared
15+ # encoding of the chunk is not ASCII-8BIT, it will be changed to ASCII-8BIT in place
16+ # @return [Enumerator] an enumerator that will yield one line at a time in UTF-8
1417 #
1518 def self . lines_from ( chunks )
16- buffer = ""
19+ buffer = "" . b
1720 position = 0
1821 line_start = 0
1922 last_char_was_cr = false
2023
2124 Enumerator . new do |gen |
2225 chunks . each do |chunk |
26+ chunk . force_encoding ( "ASCII-8BIT" )
2327 buffer << chunk
2428
2529 loop do
@@ -47,7 +51,12 @@ def self.lines_from(chunks)
4751 next
4852 end
4953
50- line = buffer [ line_start , i - line_start ]
54+ line = buffer [ line_start , i - line_start ] . force_encoding ( "UTF-8" )
55+ # Calling force_encoding just declares that we believe the encoding of this string to be
56+ # UTF-8 (which is the only encoding allowed in the SSE protocol); it doesn't cause any
57+ # re-decoding of the string. The previous line-parsing steps were done on raw 8-bit
58+ # strings so that Ruby won't try to do any UTF-8 decoding on intermediate slices.
59+
5160 last_char_was_cr = false
5261 i += 1
5362 if ch == "\r "
Original file line number Diff line number Diff line change @@ -74,4 +74,24 @@ def tests_for_terminator(term, desc)
7474 "fourth line" , "" , "last" ]
7575 expect ( subject . lines_from ( chunks ) . to_a ) . to eq ( expected )
7676 end
77+
78+ it "decodes from UTF-8" do
79+ text = "abc€豆腐xyz"
80+ chunks = [ ( text + "\n " ) . encode ( "UTF-8" ) . b ]
81+ expected = [ text ]
82+ expect ( subject . lines_from ( chunks ) . to_a ) . to eq ( expected )
83+ end
84+
85+ it "decodes from UTF-8 when multi-byte characters are split across chunks" do
86+ text = "abc€豆腐xyz"
87+ raw = ( text + "\n " ) . encode ( "UTF-8" ) . b
88+ chunks = raw . bytes . to_a . map { |byte | byte . chr . force_encoding ( "UTF-8" ) }
89+ # Calling force_encoding("UTF-8") here simulates the behavior of the http gem's
90+ # readpartial method. It actually returns undecoded bytes that might include an
91+ # incomplete multi-byte character, but the string's encoding could still be
92+ # declared as UTF-8. So we are making sure that BufferedLineReader correctly
93+ # handles such a case.
94+ expected = [ text ]
95+ expect ( subject . lines_from ( chunks ) . to_a ) . to eq ( expected )
96+ end
7797end
You can’t perform that action at this time.
0 commit comments