Skip to content

Commit efa5464

Browse files
authored
fix #679 - defer printing of partial characters at end of buffer when flushing text PyIO (#688)
Co-authored-by: Christopher Doris <github.com/cjdoris>
1 parent e358a52 commit efa5464

File tree

2 files changed

+59
-4
lines changed

2 files changed

+59
-4
lines changed

src/Wrap/PyIO.jl

Lines changed: 45 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,51 @@ end
3434
# Return how many bytes at the end of `buf` belong to a UTF-8 character whose
# remaining bytes are not yet in `buf` (0 if `buf` ends on a character boundary).
#
# Only the last three bytes are inspected, since a UTF-8 character is at most four
# bytes long. `buf` is expected to hold `UInt8` values and to be 1-based.
function _partial_utf8_tail(buf)
    n = length(buf)
    for k in 1:min(3, n)
        c = buf[n-k+1]
        # 10xxxxxx => continuation byte, keep looking backwards for the lead byte
        (c & 0xC0) == 0x80 && continue
        # Total length of the character announced by this (lead or ASCII) byte:
        # 1111xxxx => 4, 111xxxxx => 3, 11xxxxxx => 2, otherwise 1.
        need = c >= 0xF0 ? 4 : c >= 0xE0 ? 3 : c >= 0xC0 ? 2 : 1
        # Only the last k bytes of that character are present.
        return need > k ? k : 0
    end
    # Empty buffer, or three trailing continuation bytes (the end of a complete
    # 4-byte character): nothing to hold back.
    return 0
end

# If obuf is non-empty, write it to the underlying stream.
#
# In text mode (`io.text`) the buffer is decoded as UTF-8 into a Python `str`. A
# partial character at the end of obuf cannot be decoded, so those bytes are not
# written; they stay in obuf until a later call finds the character completed.
# In binary mode the whole buffer is written as Python `bytes`.
function putobuf(io::PyIO)
    isempty(io.obuf) && return
    if io.text
        n = length(io.obuf)
        nskip = _partial_utf8_tail(io.obuf)
        nwrite = n - nskip
        # The buffer holds nothing but an incomplete character: there is nothing to
        # write yet, so skip the call into Python entirely.
        nwrite == 0 && return
        if nskip == 0
            data = pystr_fromUTF8(io.obuf)
        else
            data = pystr_fromUTF8(view(io.obuf, 1:nwrite))
        end
        pydel!(@py io.write(data))
        pydel!(data)
        if nskip == 0
            empty!(io.obuf)
        else
            # Drop only what was written; the partial character stays buffered.
            deleteat!(io.obuf, 1:nwrite)
        end
    else
        data = pybytes(io.obuf)
        pydel!(@py io.write(data))
        pydel!(data)
        empty!(io.obuf)
    end
    return
end

test/Wrap.jl

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,20 @@ end
270270
@test !isopen(b)
271271
@test !isopen(s)
272272
end
273+
@testset "flush partial characters (issue 679)" begin
    # "touché!" is 8 bytes long in UTF-8 because 'é' occupies 2 of them. A PyIO
    # with buflen=6 therefore attempts a flush after 6 bytes, i.e. in the middle
    # of 'é'. Decoding those 6 bytes used to raise a UnicodeDecodeError because
    # the last character is incomplete; printing of incomplete characters is now
    # deferred, so the full text must arrive in the underlying stream intact.
    text = "touché!"
    pyio = pyimport("io")
    sink = pyio.StringIO()
    wrapped = PyIO(sink; buflen=6)
    @test wrapped.text
    @test write(wrapped, text) == 8
    flush(wrapped)
    # Rewind so that read() returns everything written so far.
    sink.seek(0)
    @test pyeq(Bool, sink.read(), text)
end
273287
end
274288

275289
@testitem "PyIterable" begin

0 commit comments

Comments
 (0)