about summary refs log tree commit diff
path: root/pypy
diff options
context:
space:
mode:
author fijal <unknown> 2017-11-23 18:30:30 +0100
committer fijal <unknown> 2017-11-23 18:30:30 +0100
commit 1ad2f30f600acf1da4f7b784cdac51edb0fb543a (patch)
tree ed6310dff1f1f973b34ca7bddd410e43a9807738 /pypy
parent merge default (diff)
download pypy-1ad2f30f600acf1da4f7b784cdac51edb0fb543a.tar.gz
pypy-1ad2f30f600acf1da4f7b784cdac51edb0fb543a.tar.bz2
pypy-1ad2f30f600acf1da4f7b784cdac51edb0fb543a.zip
fix multibytecodec
Diffstat (limited to 'pypy')
-rw-r--r-- pypy/module/_multibytecodec/c_codecs.py 35
-rw-r--r-- pypy/module/_multibytecodec/interp_incremental.py 35
-rw-r--r-- pypy/module/_multibytecodec/interp_multibytecodec.py 12
-rw-r--r-- pypy/module/_multibytecodec/test/test_c_codecs.py 37
-rw-r--r-- pypy/module/_multibytecodec/test/test_translation.py 4
5 files changed, 70 insertions, 53 deletions
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
index 2691df66db..23aab301f3 100644
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -197,19 +197,21 @@ pypy_cjk_enc_getcodec = llexternal('pypy_cjk_enc_getcodec',
MBENC_FLUSH = 1
MBENC_RESET = 2
-def encode(codec, unicodedata, errors="strict", errorcb=None, namecb=None):
+def encode(codec, unicodedata, length, errors="strict", errorcb=None,
+ namecb=None):
encodebuf = pypy_cjk_enc_new(codec)
if not encodebuf:
raise MemoryError
try:
- return encodeex(encodebuf, unicodedata, errors, errorcb, namecb)
+ return encodeex(encodebuf, unicodedata, length, errors, errorcb, namecb)
finally:
pypy_cjk_enc_free(encodebuf)
-def encodeex(encodebuf, unicodedata, errors="strict", errorcb=None,
+def encodeex(encodebuf, utf8data, length, errors="strict", errorcb=None,
namecb=None, ignore_error=0):
- inleft = len(unicodedata)
- with rffi.scoped_nonmoving_unicodebuffer(unicodedata) as inbuf:
+ inleft = length
+ inbuf = rffi.utf82wcharp(utf8data, length)
+ try:
if pypy_cjk_enc_init(encodebuf, inbuf, inleft) < 0:
raise MemoryError
if ignore_error == 0:
@@ -221,16 +223,18 @@ def encodeex(encodebuf, unicodedata, errors="strict", errorcb=None,
if r == 0 or r == ignore_error:
break
multibytecodec_encerror(encodebuf, r, errors,
- errorcb, namecb, unicodedata)
+ errorcb, namecb, utf8data)
while flags & MBENC_RESET:
r = pypy_cjk_enc_reset(encodebuf)
if r == 0:
break
multibytecodec_encerror(encodebuf, r, errors,
- errorcb, namecb, unicodedata)
+ errorcb, namecb, utf8data)
src = pypy_cjk_enc_outbuf(encodebuf)
length = pypy_cjk_enc_outlen(encodebuf)
return rffi.charpsize2str(src, length)
+ finally:
+ lltype.free(inbuf, flavor='raw')
def multibytecodec_encerror(encodebuf, e, errors,
errorcb, namecb, unicodedata):
@@ -256,21 +260,16 @@ def multibytecodec_encerror(encodebuf, e, errors,
elif errors == "replace":
codec = pypy_cjk_enc_getcodec(encodebuf)
try:
- replace = encode(codec, u"?")
+ replace = encode(codec, "?", 1)
except EncodeDecodeError:
replace = "?"
else:
assert errorcb
- XXX
- retu, rets, end = errorcb(errors, namecb, reason,
- unicodedata.encode("utf8"), start, end)
- if rets is not None:
- # py3k only
- replace = rets
- else:
- assert retu is not None
- codec = pypy_cjk_enc_getcodec(encodebuf)
- replace = encode(codec, retu, "strict", errorcb, namecb)
+ rets, end = errorcb(errors, namecb, reason,
+ unicodedata, start, end)
+ codec = pypy_cjk_enc_getcodec(encodebuf)
+ lgt, _ = rutf8.get_utf8_length_flag(rets)
+ replace = encode(codec, rets, lgt, "strict", errorcb, namecb)
with rffi.scoped_nonmovingbuffer(replace) as inbuf:
r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end)
if r == MBERR_NOMEMORY:
diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py
index 5cbc263563..5fffb32a99 100644
--- a/pypy/module/_multibytecodec/interp_incremental.py
+++ b/pypy/module/_multibytecodec/interp_incremental.py
@@ -1,4 +1,5 @@
from rpython.rtyper.lltypesystem import lltype
+from rpython.rlib import rutf8
from pypy.module._multibytecodec import c_codecs
from pypy.module._multibytecodec.interp_multibytecodec import (
MultibyteCodec, wrap_unicodedecodeerror, wrap_runtimeerror,
@@ -65,7 +66,8 @@ class MultibyteIncrementalDecoder(MultibyteIncrementalBase):
pos = c_codecs.pypy_cjk_dec_inbuf_consumed(self.decodebuf)
assert 0 <= pos <= len(object)
self.pending = object[pos:]
- return space.newunicode(output)
+ lgt, flag = rutf8.get_utf8_length_flag(output)
+ return space.newutf8(output, lgt, flag)
@unwrap_spec(errors="text_or_none")
@@ -88,7 +90,8 @@ class MultibyteIncrementalEncoder(MultibyteIncrementalBase):
def _initialize(self):
self.encodebuf = c_codecs.pypy_cjk_enc_new(self.codec)
- self.pending = u""
+ self.pending = ""
+ self.pending_len = 0
def _free(self):
self.pending = None
@@ -96,25 +99,37 @@ class MultibyteIncrementalEncoder(MultibyteIncrementalBase):
c_codecs.pypy_cjk_enc_free(self.encodebuf)
self.encodebuf = lltype.nullptr(c_codecs.ENCODEBUF_P.TO)
- @unwrap_spec(object='utf8', final=bool)
- def encode_w(self, object, final=False):
- u_object = object.decode('utf8')
+ @unwrap_spec(final=bool)
+ def encode_w(self, space, w_object, final=False):
+ utf8data, length = space.utf8_len_w(w_object)
space = self.space
state = space.fromcache(CodecState)
if len(self.pending) > 0:
- u_object = self.pending + u_object
+ utf8data = self.pending + utf8data
+ length += self.pending_len
try:
- output = c_codecs.encodeex(self.encodebuf, u_object, self.errors,
+ output = c_codecs.encodeex(self.encodebuf, utf8data, length,
+ self.errors,
state.encode_error_handler, self.name,
get_ignore_error(final))
except c_codecs.EncodeDecodeError as e:
- raise wrap_unicodeencodeerror(space, e, object, len(u_object),
+ raise wrap_unicodeencodeerror(space, e, utf8data, length,
self.name)
except RuntimeError:
raise wrap_runtimeerror(space)
pos = c_codecs.pypy_cjk_enc_inbuf_consumed(self.encodebuf)
- assert 0 <= pos <= len(u_object)
- self.pending = u_object[pos:]
+ assert 0 <= pos <= length
+ # scan the utf8 string until we hit pos
+ i = 0
+ stop = length - pos
+ self.pending_len = stop
+ if stop > 0:
+ while pos > 0:
+ i = rutf8.next_codepoint_pos(utf8data, i)
+ pos -= 1
+ self.pending = utf8data[i:]
+ else:
+ self.pending = ""
return space.newbytes(output)
diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py
index 0728761d6c..f6d16c8141 100644
--- a/pypy/module/_multibytecodec/interp_multibytecodec.py
+++ b/pypy/module/_multibytecodec/interp_multibytecodec.py
@@ -31,23 +31,23 @@ class MultibyteCodec(W_Root):
return space.newtuple([space.newutf8(utf8_output, lgt, flag),
space.newint(len(input))])
- @unwrap_spec(input='utf8', errors="text_or_none")
- def encode(self, space, input, errors=None):
+ @unwrap_spec(errors="text_or_none")
+ def encode(self, space, w_input, errors=None):
if errors is None:
errors = 'strict'
state = space.fromcache(CodecState)
+ input, length = space.utf8_len_w(w_input)
#
- u_input = input.decode('utf8')
try:
- output = c_codecs.encode(self.codec, u_input, errors,
+ output = c_codecs.encode(self.codec, input, length, errors,
state.encode_error_handler, self.name)
except c_codecs.EncodeDecodeError as e:
- raise wrap_unicodeencodeerror(space, e, input, len(u_input),
+ raise wrap_unicodeencodeerror(space, e, input, length,
self.name)
except RuntimeError:
raise wrap_runtimeerror(space)
return space.newtuple([space.newbytes(output),
- space.newint(len(u_input))])
+ space.newint(length)])
MultibyteCodec.typedef = TypeDef(
diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py b/pypy/module/_multibytecodec/test/test_c_codecs.py
index 6fab3635ad..7c9178d7cb 100644
--- a/pypy/module/_multibytecodec/test/test_c_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_c_codecs.py
@@ -14,27 +14,27 @@ def test_codecs_existence():
def test_decode_gbk():
c = getcodec("gbk")
u = decode(c, "\xA1\xAA")
- assert u == unichr(0x2014)
+ assert u == unichr(0x2014).encode('utf8')
u = decode(c, "foobar")
- assert u == u"foobar"
+ assert u == "foobar"
def test_decode_hz():
# stateful
c = getcodec("hz")
u = decode(c, "~{abc}")
- assert u == u'\u5f95\u6cef'
+ assert u == u'\u5f95\u6cef'.encode('utf8')
u = decode(c, "~{")
- assert u == u''
+ assert u == ''
def test_decodeex_hz():
c = getcodec("hz")
decodebuf = c_codecs.pypy_cjk_dec_new(c)
u = c_codecs.decodeex(decodebuf, "~{abcd~}")
- assert u == u'\u5f95\u6c85'
+ assert u == u'\u5f95\u6c85'.encode('utf8')
u = c_codecs.decodeex(decodebuf, "~{efgh~}")
- assert u == u'\u5f50\u73b7'
+ assert u == u'\u5f50\u73b7'.encode('utf8')
u = c_codecs.decodeex(decodebuf, "!~{abcd~}xyz~{efgh")
- assert u == u'!\u5f95\u6c85xyz\u5f50\u73b7'
+ assert u == u'!\u5f95\u6c85xyz\u5f50\u73b7'.encode('utf8')
c_codecs.pypy_cjk_dec_free(decodebuf)
def test_decodeex_hz_incomplete():
@@ -64,7 +64,7 @@ def test_decodeex_hz_incomplete():
buf += c
u = c_codecs.decodeex(decodebuf, buf,
ignore_error = c_codecs.MBERR_TOOFEW)
- assert u == output
+ assert u == output.encode('utf8')
incompletepos = c_codecs.pypy_cjk_dec_inbuf_consumed(decodebuf)
buf = buf[incompletepos:]
assert buf == ''
@@ -86,46 +86,47 @@ def test_decode_hz_error():
def test_decode_hz_ignore():
c = getcodec("hz")
u = decode(c, 'def~{}abc', 'ignore')
- assert u == u'def\u5fcf'
+ assert u == u'def\u5fcf'.encode('utf8')
def test_decode_hz_replace():
c = getcodec("hz")
u = decode(c, 'def~{}abc', 'replace')
- assert u == u'def\ufffd\u5fcf'
+ assert u == u'def\ufffd\u5fcf'.encode('utf8')
def test_encode_hz():
c = getcodec("hz")
- s = encode(c, u'foobar')
+ s = encode(c, u'foobar'.encode('utf8'), 6)
assert s == 'foobar' and type(s) is str
- s = encode(c, u'\u5f95\u6cef')
+ s = encode(c, u'\u5f95\u6cef'.encode('utf8'), 2)
assert s == '~{abc}~}'
def test_encode_hz_error():
# error
c = getcodec("hz")
- e = py.test.raises(EncodeDecodeError, encode, c, u'abc\u1234def').value
+ e = py.test.raises(EncodeDecodeError, encode, c, u'abc\u1234def'.encode('utf8'), 7).value
assert e.start == 3
assert e.end == 4
assert e.reason == "illegal multibyte sequence"
def test_encode_hz_ignore():
c = getcodec("hz")
- s = encode(c, u'abc\u1234def', 'ignore')
+ s = encode(c, u'abc\u1234def'.encode('utf8'), 7, 'ignore')
assert s == 'abcdef'
def test_encode_hz_replace():
c = getcodec("hz")
- s = encode(c, u'abc\u1234def', 'replace')
+ s = encode(c, u'abc\u1234def'.encode('utf8'), 7, 'replace')
assert s == 'abc?def'
def test_encode_jisx0208():
c = getcodec('iso2022_jp')
- s = encode(c, u'\u83ca\u5730\u6642\u592b')
+ s = encode(c, u'\u83ca\u5730\u6642\u592b'.encode('utf8'), 4)
assert s == '\x1b$B5FCO;~IW\x1b(B' and type(s) is str
def test_encode_custom_error_handler_bytes():
+ py.test.skip("needs revamping in py3k")
c = getcodec("hz")
def errorhandler(errors, enc, msg, t, startingpos, endingpos):
- return None, '\xc3', endingpos
- s = encode(c, u'abc\u1234def', 'foo', errorhandler)
+ return u'\xc3'.encode('utf8'), endingpos
+ s = encode(c, u'abc\u1234def'.encode('utf8'), 7, 'foo', errorhandler)
assert '\xc3' in s
diff --git a/pypy/module/_multibytecodec/test/test_translation.py b/pypy/module/_multibytecodec/test/test_translation.py
index 4ee42b37d7..2c655536a6 100644
--- a/pypy/module/_multibytecodec/test/test_translation.py
+++ b/pypy/module/_multibytecodec/test/test_translation.py
@@ -1,6 +1,7 @@
from pypy.module._multibytecodec import c_codecs
from rpython.translator.c.test import test_standalone
from rpython.config.translationoption import get_combined_translation_config
+from rpython.rlib import rutf8
class TestTranslation(test_standalone.StandaloneTests):
@@ -13,7 +14,8 @@ class TestTranslation(test_standalone.StandaloneTests):
codecname, string = argv[1], argv[2]
c = c_codecs.getcodec(codecname)
u = c_codecs.decode(c, string)
- r = c_codecs.encode(c, u)
+ lgt, _ = rutf8.get_utf8_length_flag(u)
+ r = c_codecs.encode(c, u, lgt)
print r
return 0
#