aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Armin Rigo <arigo@tunes.org> 2020-11-24 11:53:44 +0000
committer Armin Rigo <arigo@tunes.org> 2020-11-24 11:53:44 +0000
commit 2798aa697ef635ce0b85d486dbb6b24b833d9e9e (patch)
tree ac017e4259c44f3ee9110ae57d67c0df5dab4e09 /rpython/rlib
parent update how-to-release document (diff)
download pypy-2798aa697ef635ce0b85d486dbb6b24b833d9e9e.tar.gz
pypy-2798aa697ef635ce0b85d486dbb6b24b833d9e9e.tar.bz2
pypy-2798aa697ef635ce0b85d486dbb6b24b833d9e9e.zip
back-port the rpython bits of "py3.7-rsre"
Diffstat (limited to 'rpython/rlib')
-rw-r--r-- rpython/rlib/rsre/rpy/_sre.py 3
-rw-r--r-- rpython/rlib/rsre/rpy/sre_constants.py 40
-rw-r--r-- rpython/rlib/rsre/rsre_char.py 70
-rw-r--r-- rpython/rlib/rsre/rsre_constants.py 66
-rw-r--r-- rpython/rlib/rsre/rsre_core.py 251
-rw-r--r-- rpython/rlib/rsre/rsre_utf8.py 4
-rw-r--r-- rpython/rlib/rsre/test/test_char.py 12
-rw-r--r-- rpython/rlib/rsre/test/test_match.py 10
8 files changed, 327 insertions, 129 deletions
diff --git a/rpython/rlib/rsre/rpy/_sre.py b/rpython/rlib/rsre/rpy/_sre.py
index 617345483a..70d7737297 100644
--- a/rpython/rlib/rsre/rpy/_sre.py
+++ b/rpython/rlib/rsre/rpy/_sre.py
@@ -22,6 +22,9 @@ def get_code(regexp, flags=0, allargs=False):
"""NOT_RPYTHON: you can't compile new regexps in an RPython program,
you can only use precompiled ones"""
from . import sre_compile
+ if rsre_constants.V37:
+ import pytest
+ pytest.skip("This test cannot run in a 3.7 branch of pypy")
try:
sre_compile.compile(regexp, flags)
except GotIt as e:
diff --git a/rpython/rlib/rsre/rpy/sre_constants.py b/rpython/rlib/rsre/rpy/sre_constants.py
index 89cbdb0d5f..4b9deac743 100644
--- a/rpython/rlib/rsre/rpy/sre_constants.py
+++ b/rpython/rlib/rsre/rpy/sre_constants.py
@@ -94,35 +94,17 @@ CATEGORY_UNI_NOT_WORD = "category_uni_not_word"
CATEGORY_UNI_LINEBREAK = "category_uni_linebreak"
CATEGORY_UNI_NOT_LINEBREAK = "category_uni_not_linebreak"
-OPCODES = [
-
- # failure=0 success=1 (just because it looks better that way :-)
- FAILURE, SUCCESS,
-
- ANY, ANY_ALL,
- ASSERT, ASSERT_NOT,
- AT,
- BRANCH,
- CALL,
- CATEGORY,
- CHARSET, BIGCHARSET,
- GROUPREF, GROUPREF_EXISTS, GROUPREF_IGNORE,
- IN, IN_IGNORE,
- INFO,
- JUMP,
- LITERAL, LITERAL_IGNORE,
- MARK,
- MAX_UNTIL,
- MIN_UNTIL,
- NOT_LITERAL, NOT_LITERAL_IGNORE,
- NEGATE,
- RANGE,
- REPEAT,
- REPEAT_ONE,
- SUBPATTERN,
- MIN_REPEAT_ONE,
- RANGE_IGNORE,
-]
+def _rpython_opcodes():
+ from rpython.rlib.rsre import rsre_constants as consts
+ mapping = {}
+ for name, value in consts.__dict__.items():
+ if name.startswith('OPCODE') and isinstance(value, int) and value < 70:
+ name = name[6:].lstrip('012346789_').lower()
+ mapping[value] = name
+ # check that there are no holes
+ assert sorted(mapping.keys()) == range(len(mapping))
+ return [name for value, name in sorted(mapping.items())]
+OPCODES = _rpython_opcodes()
ATCODES = [
AT_BEGINNING, AT_BEGINNING_LINE, AT_BEGINNING_STRING, AT_BOUNDARY,
diff --git a/rpython/rlib/rsre/rsre_char.py b/rpython/rlib/rsre/rsre_char.py
index 1680d2973d..d5595d01db 100644
--- a/rpython/rlib/rsre/rsre_char.py
+++ b/rpython/rlib/rsre/rsre_char.py
@@ -45,10 +45,10 @@ else:
# codesize. But sre_compile will compile some stuff differently depending on the
# codesize (e.g., charsets).
from rpython.rlib.runicode import MAXUNICODE
-if MAXUNICODE == 65535:
+if MAXUNICODE == 65535 and not consts.V37:
CODESIZE = 2
else:
- CODESIZE = 4
+ CODESIZE = 4 # always 4 from py3.7
copyright = "_sre.py 2.4 Copyright 2005 by Nik Haldimann"
@@ -57,16 +57,22 @@ BIG_ENDIAN = sys.byteorder == "big"
def getlower_ascii(char_ord):
return char_ord + int_between(ord('A'), char_ord, ord('Z') + 1) * (ord('a') - ord('A'))
+def getlower_locale(char_ord):
+ if char_ord < 256: # cheating! Well, CPython does too.
+ char_ord = tolower(char_ord)
+ return char_ord
+
+def getlower_unicode(char_ord):
+ if char_ord < 128: # shortcut for ascii
+ return getlower_ascii(char_ord)
+ assert unicodedb is not None
+ return unicodedb.tolower(char_ord)
+
def getlower(char_ord, flags):
if flags & consts.SRE_FLAG_LOCALE:
- if char_ord < 256: # cheating! Well, CPython does too.
- char_ord = tolower(char_ord)
- return char_ord
+ char_ord = getlower_locale(char_ord)
elif flags & consts.SRE_FLAG_UNICODE:
- if char_ord < 128: # shortcut for ascii
- return getlower_ascii(char_ord)
- assert unicodedb is not None
- char_ord = unicodedb.tolower(char_ord)
+ char_ord = getlower_unicode(char_ord)
else:
char_ord = getlower_ascii(char_ord)
return char_ord
@@ -74,20 +80,38 @@ def getlower(char_ord, flags):
def getupper_ascii(char_ord):
return char_ord - int_between(ord('a'), char_ord, ord('z') + 1) * (ord('a') - ord('A'))
+def getupper_locale(char_ord):
+ if char_ord < 256: # cheating! Well, CPython does too.
+ char_ord = toupper(char_ord)
+ return char_ord
+
+def getupper_unicode(char_ord):
+ if char_ord < 128: # shortcut for ascii
+ return getupper_ascii(char_ord)
+ assert unicodedb is not None
+ return unicodedb.toupper(char_ord)
+
def getupper(char_ord, flags):
if flags & consts.SRE_FLAG_LOCALE:
- if char_ord < 256: # cheating! Well, CPython does too.
- char_ord = toupper(char_ord)
- return char_ord
+ char_ord = getupper_locale(char_ord)
elif flags & consts.SRE_FLAG_UNICODE:
- if char_ord < 128: # shortcut for ascii
- return getupper_ascii(char_ord)
- assert unicodedb is not None
- char_ord = unicodedb.toupper(char_ord)
+ char_ord = getupper_unicode(char_ord)
else:
char_ord = getupper_ascii(char_ord)
return char_ord
+def iscased_ascii(char_ord): # used by py3.7
+ upper = int_between(ord('A'), char_ord, ord('Z')+1)
+ lower = int_between(ord('a'), char_ord, ord('z')+1)
+ return upper | lower
+
+def iscased_unicode(char_ord): # used by py3.7
+ # NOTE: this is not unicodedb.iscased(). As per CPython 3.7, it is
+ # something different which---as far as I can tell---doesn't really
+ # have a meaning on its own, but well.
+ return (char_ord != getlower_unicode(char_ord) or
+ char_ord != getupper_unicode(char_ord))
+
#### Category helpers
is_a_word = [(chr(i).isalnum() or chr(i) == '_') for i in range(256)]
@@ -223,12 +247,22 @@ def set_range(ctx, pattern, index, char_code):
def set_range_ignore(ctx, pattern, index, char_code):
# <RANGE_IGNORE> <lower> <upper>
# the char_code is already lower cased
+ assert not consts.V37
lower = pattern.pattern[index + 1]
upper = pattern.pattern[index + 2]
match1 = int_between(lower, char_code, upper + 1)
match2 = int_between(lower, getupper(char_code, pattern.flags), upper + 1)
return match1 | match2, index + 3
+def set_range_uni_ignore(ctx, pattern, index, char_code):
+ # <RANGE_UNI_IGNORE> <lower> <upper>
+ # the char_code is already lower cased
+ lower = pattern.pattern[index + 1]
+ upper = pattern.pattern[index + 2]
+ match1 = int_between(lower, char_code, upper + 1)
+ match2 = int_between(lower, getupper_unicode(char_code), upper + 1)
+ return match1 | match2, index + 3
+
def set_bigcharset(ctx, pattern, index, char_code):
# <BIGCHARSET> <blockcount> <256 blockindices> <blocks>
count = pattern.pattern[index+1]
@@ -300,7 +334,9 @@ set_dispatch_table = {
consts.OPCODE_BIGCHARSET: set_bigcharset,
consts.OPCODE_LITERAL: set_literal,
consts.OPCODE_RANGE: set_range,
- consts.OPCODE_RANGE_IGNORE: set_range_ignore,
+ consts.OPCODE27_RANGE_IGNORE: set_range_ignore,
+ consts.OPCODE37_RANGE_UNI_IGNORE: set_range_uni_ignore,
consts.OPCODE_UNICODE_GENERAL_CATEGORY: set_unicode_general_category,
}
+set_dispatch_table.pop(None, None) # remove the OPCODE27_* or OPCODE37_*
set_dispatch_unroll = unrolling_iterable(sorted(set_dispatch_table.items()))
diff --git a/rpython/rlib/rsre/rsre_constants.py b/rpython/rlib/rsre/rsre_constants.py
index 9af708532a..9994db7b05 100644
--- a/rpython/rlib/rsre/rsre_constants.py
+++ b/rpython/rlib/rsre/rsre_constants.py
@@ -1,3 +1,15 @@
+# Horrible import-time hack.
+# Blame CPython for renumbering these OPCODE_* at some point.
+from rpython.rlib.objectmodel import specialize
+try:
+ import pypy.module.sys.version
+ V37 = pypy.module.sys.version.CPYTHON_VERSION >= (3, 7)
+except ImportError:
+ raise ImportError("Cannot import pypy.module.sys.version. You can safely "
+ "remove this 'raise' line if you are not interested in "
+ "PyPy but only RPython.")
+ V37 = False
+
OPCODE_FAILURE = 0
OPCODE_SUCCESS = 1
OPCODE_ANY = 2
@@ -6,35 +18,49 @@ OPCODE_ASSERT = 4
OPCODE_ASSERT_NOT = 5
OPCODE_AT = 6
OPCODE_BRANCH = 7
-#OPCODE_CALL = 8
+OPCODE_CALL = 8 # not used
OPCODE_CATEGORY = 9
OPCODE_CHARSET = 10
OPCODE_BIGCHARSET = 11
OPCODE_GROUPREF = 12
OPCODE_GROUPREF_EXISTS = 13
-OPCODE_GROUPREF_IGNORE = 14
-OPCODE_IN = 15
-OPCODE_IN_IGNORE = 16
-OPCODE_INFO = 17
-OPCODE_JUMP = 18
-OPCODE_LITERAL = 19
-OPCODE_LITERAL_IGNORE = 20
-OPCODE_MARK = 21
-OPCODE_MAX_UNTIL = 22
-OPCODE_MIN_UNTIL = 23
-OPCODE_NOT_LITERAL = 24
-OPCODE_NOT_LITERAL_IGNORE = 25
-OPCODE_NEGATE = 26
-OPCODE_RANGE = 27
-OPCODE_REPEAT = 28
-OPCODE_REPEAT_ONE = 29
-#OPCODE_SUBPATTERN = 30
-OPCODE_MIN_REPEAT_ONE = 31
-OPCODE_RANGE_IGNORE = 32
+OPCODE_GROUPREF_IGNORE = 28 if V37 else 14
+OPCODE_IN = 14 if V37 else 15
+OPCODE_IN_IGNORE = 29 if V37 else 16
+OPCODE_INFO = 15 if V37 else 17
+OPCODE_JUMP = 16 if V37 else 18
+OPCODE_LITERAL = 17 if V37 else 19
+OPCODE_LITERAL_IGNORE = 30 if V37 else 20
+OPCODE_MARK = 18 if V37 else 21
+OPCODE_MAX_UNTIL = 19 if V37 else 22
+OPCODE_MIN_UNTIL = 20 if V37 else 23
+OPCODE_NOT_LITERAL = 21 if V37 else 24
+OPCODE_NOT_LITERAL_IGNORE = 31 if V37 else 25
+OPCODE_NEGATE = 22 if V37 else 26
+OPCODE_RANGE = 23 if V37 else 27
+OPCODE_REPEAT = 24 if V37 else 28
+OPCODE_REPEAT_ONE = 25 if V37 else 29
+OPCODE_SUBPATTERN = 26 if V37 else 30 # not used
+OPCODE_MIN_REPEAT_ONE = 27 if V37 else 31
+OPCODE27_RANGE_IGNORE = None if V37 else 32
+
+OPCODE37_GROUPREF_LOC_IGNORE = 32 if V37 else None
+OPCODE37_IN_LOC_IGNORE = 33 if V37 else None
+OPCODE37_LITERAL_LOC_IGNORE = 34 if V37 else None
+OPCODE37_NOT_LITERAL_LOC_IGNORE = 35 if V37 else None
+OPCODE37_GROUPREF_UNI_IGNORE = 36 if V37 else None
+OPCODE37_IN_UNI_IGNORE = 37 if V37 else None
+OPCODE37_LITERAL_UNI_IGNORE = 38 if V37 else None
+OPCODE37_NOT_LITERAL_UNI_IGNORE = 39 if V37 else None
+OPCODE37_RANGE_UNI_IGNORE = 40 if V37 else None
# not used by Python itself
OPCODE_UNICODE_GENERAL_CATEGORY = 70
+@specialize.argtype(1)
+def eq(op, const):
+ return const is not None and op == const
+
AT_BEGINNING = 0
AT_BEGINNING_LINE = 1
diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
index 489636b783..3ce901c46a 100644
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -55,7 +55,8 @@ class CompiledPattern(object):
def __init__(self, pattern, flags):
self.pattern = pattern
- self.flags = flags
+ if not consts.V37: # 'flags' is ignored in >=3.7 mode
+ self.flags = flags
# check we don't get the old value of MAXREPEAT
# during the untranslated tests.
# On python3, MAXCODE can appear in patterns. It will be 65535
@@ -63,6 +64,29 @@ class CompiledPattern(object):
if not we_are_translated() and rsre_char.CODESIZE != 2:
assert 65535 not in pattern
+ def lowa(self, char_ord):
+ """Pre-3.7: uses getlower(flags).
+ Post-3.7: this is always getlower_ascii().
+ """
+ if not consts.V37:
+ return rsre_char.getlower(char_ord, self.flags)
+ else:
+ return rsre_char.getlower_ascii(char_ord)
+
+ def char_loc_ignore(self, index, char_ord):
+ assert consts.V37
+ pattern = self.pat(index)
+ return (char_ord == pattern or
+ rsre_char.getlower_locale(char_ord) == pattern or
+ rsre_char.getupper_locale(char_ord) == pattern)
+
+ def charset_loc_ignore(self, ctx, ppos, char_ord):
+ lo = rsre_char.getlower_locale(char_ord)
+ if rsre_char.check_charset(ctx, self, ppos, lo):
+ return True
+ up = rsre_char.getupper_locale(char_ord)
+ return up != lo and rsre_char.check_charset(ctx, self, ppos, up)
+
def pat(self, index):
jit.promote(self)
check_nonneg(index)
@@ -74,6 +98,10 @@ class CompiledPattern(object):
assert result >= 0
return result
+MODE_ANY = '\x00' # an empty match is fine
+MODE_NONEMPTY = '\x01' # must have a non-empty match
+MODE_FULL = '\x02' # must match the whole string
+
class AbstractMatchContext(object):
"""Abstract base class"""
_immutable_fields_ = ['end']
@@ -81,7 +109,7 @@ class AbstractMatchContext(object):
match_end = 0
match_marks = None
match_marks_flat = None
- fullmatch_only = False
+ match_mode = MODE_ANY
def __init__(self, match_start, end):
# 'match_start' and 'end' must be known to be non-negative
@@ -91,25 +119,30 @@ class AbstractMatchContext(object):
self.match_start = match_start
self.end = end
- def reset(self, start):
+ def reset(self, start, must_advance=False):
self.match_start = start
self.match_marks = None
self.match_marks_flat = None
+ #
+ assert MODE_ANY == chr(False)
+ assert MODE_NONEMPTY == chr(True)
+ self.match_mode = chr(must_advance)
+
+ @not_rpython
+ def _fullmatch_only(self, x=None):
+ raise Exception("'ctx.fullmatch_only' was replaced with"
+ " 'ctx.match_mode'")
+ fullmatch_only = property(_fullmatch_only, _fullmatch_only)
@not_rpython
def str(self, index):
"""Must be overridden in a concrete subclass.
- The tag ^^^ here is used to generate a translation-time crash
+ The @not_rpython is used to generate a translation-time crash
if there is a call to str() that is indirect. All calls must
be direct for performance reasons; you need to specialize the
caller with @specializectx."""
raise NotImplementedError
- @not_rpython
- def lowstr(self, index, flags):
- """Similar to str()."""
- raise NotImplementedError
-
# The following methods are provided to be overriden in
# Utf8MatchContext. The non-utf8 implementation is provided
# by the FixedMatchContext abstract subclass, in order to use
@@ -236,10 +269,6 @@ class BufMatchContext(FixedMatchContext):
check_nonneg(index)
return ord(self._buffer.getitem(index))
- def lowstr(self, index, flags):
- c = self.str(index)
- return rsre_char.getlower(c, flags)
-
def fresh_copy(self, start):
return BufMatchContext(self._buffer, start,
self.end)
@@ -261,10 +290,6 @@ class StrMatchContext(FixedMatchContext):
check_nonneg(index)
return ord(self._string[index])
- def lowstr(self, index, flags):
- c = self.str(index)
- return rsre_char.getlower(c, flags)
-
def fresh_copy(self, start):
return StrMatchContext(self._string, start,
self.end)
@@ -289,10 +314,6 @@ class UnicodeMatchContext(FixedMatchContext):
check_nonneg(index)
return ord(self._unicodestr[index])
- def lowstr(self, index, flags):
- c = self.str(index)
- return rsre_char.getlower(c, flags)
-
def fresh_copy(self, start):
return UnicodeMatchContext(self._unicodestr, start,
self.end)
@@ -599,9 +620,13 @@ def sre_match(ctx, pattern, ppos, ptr, marks):
return
elif op == consts.OPCODE_SUCCESS:
- if ctx.fullmatch_only:
+ mode = ctx.match_mode
+ if mode == MODE_FULL:
if ptr != ctx.end:
return # not a full match
+ elif mode == MODE_NONEMPTY:
+ if ptr == ctx.match_start:
+ return # empty match
ctx.match_end = ptr
ctx.match_marks = marks
return MATCHED_OK
@@ -633,10 +658,10 @@ def sre_match(ctx, pattern, ppos, ptr, marks):
ptr1 = ctx.prev_n(ptr, pattern.pat(ppos+1), ctx.ZERO)
except EndOfString:
return
- saved = ctx.fullmatch_only
- ctx.fullmatch_only = False
+ saved = ctx.match_mode
+ ctx.match_mode = MODE_ANY
stop = sre_match(ctx, pattern, ppos + 2, ptr1, marks) is None
- ctx.fullmatch_only = saved
+ ctx.match_mode = saved
if stop:
return
marks = ctx.match_marks
@@ -651,10 +676,10 @@ def sre_match(ctx, pattern, ppos, ptr, marks):
except EndOfString:
pass
else:
- saved = ctx.fullmatch_only
- ctx.fullmatch_only = False
+ saved = ctx.match_mode
+ ctx.match_mode = MODE_ANY
stop = sre_match(ctx, pattern, ppos + 2, ptr1, marks) is not None
- ctx.fullmatch_only = saved
+ ctx.match_mode = saved
if stop:
return
ppos += pattern.pat(ppos)
@@ -699,7 +724,29 @@ def sre_match(ctx, pattern, ppos, ptr, marks):
startptr, length_bytes = get_group_ref(ctx, marks, pattern.pat(ppos))
if length_bytes < 0:
return # group was not previously defined
- ptr = match_repeated_ignore(ctx, ptr, startptr, length_bytes, pattern.flags)
+ ptr = match_repeated_ignore(ctx, ptr, startptr, length_bytes, pattern)
+ if ptr < ctx.ZERO:
+ return # no match
+ ppos += 1
+
+ elif consts.eq(op, consts.OPCODE37_GROUPREF_UNI_IGNORE):
+ # unicode version of OPCODE_GROUPREF_IGNORE
+ # <GROUPREF> <groupnum>
+ startptr, length_bytes = get_group_ref(ctx, marks, pattern.pat(ppos))
+ if length_bytes < 0:
+ return # group was not previously defined
+ ptr = match_repeated_uni_ignore(ctx, ptr, startptr, length_bytes)
+ if ptr < ctx.ZERO:
+ return # no match
+ ppos += 1
+
+ elif consts.eq(op, consts.OPCODE37_GROUPREF_LOC_IGNORE):
+ # locale version of OPCODE_GROUPREF_IGNORE
+ # <GROUPREF> <groupnum>
+ startptr, length_bytes = get_group_ref(ctx, marks, pattern.pat(ppos))
+ if length_bytes < 0:
+ return # group was not previously defined
+ ptr = match_repeated_loc_ignore(ctx, ptr, startptr, length_bytes)
if ptr < ctx.ZERO:
return # no match
ppos += 1
@@ -726,7 +773,25 @@ def sre_match(ctx, pattern, ppos, ptr, marks):
# match set member (or non_member), ignoring case
# <IN> <skip> <set>
if ptr >= ctx.end or not rsre_char.check_charset(ctx, pattern, ppos+1,
- ctx.lowstr(ptr, pattern.flags)):
+ pattern.lowa(ctx.str(ptr))):
+ return
+ ppos += pattern.pat(ppos)
+ ptr = ctx.next(ptr)
+
+ elif consts.eq(op, consts.OPCODE37_IN_UNI_IGNORE):
+ # match set member (or non_member), ignoring case, unicode mode
+ # <IN> <skip> <set>
+ if ptr >= ctx.end or not rsre_char.check_charset(ctx, pattern, ppos+1,
+ rsre_char.getlower_unicode(ctx.str(ptr))):
+ return
+ ppos += pattern.pat(ppos)
+ ptr = ctx.next(ptr)
+
+ elif consts.eq(op, consts.OPCODE37_IN_LOC_IGNORE):
+ # match set member (or non_member), ignoring case, locale mode
+ # <IN> <skip> <set>
+ if ptr >= ctx.end or not pattern.charset_loc_ignore(ctx, ppos+1,
+ ctx.str(ptr)):
return
ppos += pattern.pat(ppos)
ptr = ctx.next(ptr)
@@ -752,7 +817,23 @@ def sre_match(ctx, pattern, ppos, ptr, marks):
elif op == consts.OPCODE_LITERAL_IGNORE:
# match literal string, ignoring case
# <LITERAL_IGNORE> <code>
- if ptr >= ctx.end or ctx.lowstr(ptr, pattern.flags) != pattern.pat(ppos):
+ if ptr >= ctx.end or pattern.lowa(ctx.str(ptr)) != pattern.pat(ppos):
+ return
+ ppos += 1
+ ptr = ctx.next(ptr)
+
+ elif consts.eq(op, consts.OPCODE37_LITERAL_UNI_IGNORE):
+ # match literal string, ignoring case, unicode mode
+ # <LITERAL_IGNORE> <code>
+ if ptr >= ctx.end or rsre_char.getlower_unicode(ctx.str(ptr)) != pattern.pat(ppos):
+ return
+ ppos += 1
+ ptr = ctx.next(ptr)
+
+ elif consts.eq(op, consts.OPCODE37_LITERAL_LOC_IGNORE):
+ # match literal string, ignoring case, locale mode
+ # <LITERAL_IGNORE> <code>
+ if ptr >= ctx.end or not pattern.char_loc_ignore(ppos, ctx.str(ptr)):
return
ppos += 1
ptr = ctx.next(ptr)
@@ -775,7 +856,23 @@ def sre_match(ctx, pattern, ppos, ptr, marks):
elif op == consts.OPCODE_NOT_LITERAL_IGNORE:
# match if it's not a literal string, ignoring case
# <NOT_LITERAL> <code>
- if ptr >= ctx.end or ctx.lowstr(ptr, pattern.flags) == pattern.pat(ppos):
+ if ptr >= ctx.end or pattern.lowa(ctx.str(ptr)) == pattern.pat(ppos):
+ return
+ ppos += 1
+ ptr = ctx.next(ptr)
+
+ elif consts.eq(op, consts.OPCODE37_NOT_LITERAL_UNI_IGNORE):
+ # match if it's not a literal string, ignoring case, unicode mode
+ # <NOT_LITERAL> <code>
+ if ptr >= ctx.end or rsre_char.getlower_unicode(ctx.str(ptr)) == pattern.pat(ppos):
+ return
+ ppos += 1
+ ptr = ctx.next(ptr)
+
+ elif consts.eq(op, consts.OPCODE37_NOT_LITERAL_LOC_IGNORE):
+ # match if it's not a literal string, ignoring case, locale mode
+ # <NOT_LITERAL> <code>
+ if ptr >= ctx.end or pattern.char_loc_ignore(ppos, ctx.str(ptr)):
return
ppos += 1
ptr = ctx.next(ptr)
@@ -883,12 +980,36 @@ def match_repeated(ctx, ptr, oldptr, length_bytes):
return True
@specializectx
-def match_repeated_ignore(ctx, ptr, oldptr, length_bytes, flags):
+def match_repeated_ignore(ctx, ptr, oldptr, length_bytes, pattern):
+ oldend = ctx.go_forward_by_bytes(oldptr, length_bytes)
+ while oldptr < oldend:
+ if ptr >= ctx.end:
+ return -1
+ if pattern.lowa(ctx.str(ptr)) != pattern.lowa(ctx.str(oldptr)):
+ return -1
+ ptr = ctx.next(ptr)
+ oldptr = ctx.next(oldptr)
+ return ptr
+
+@specializectx
+def match_repeated_uni_ignore(ctx, ptr, oldptr, length_bytes):
+ oldend = ctx.go_forward_by_bytes(oldptr, length_bytes)
+ while oldptr < oldend:
+ if ptr >= ctx.end:
+ return -1
+ if rsre_char.getlower_unicode(ctx.str(ptr)) != rsre_char.getlower_unicode(ctx.str(oldptr)):
+ return -1
+ ptr = ctx.next(ptr)
+ oldptr = ctx.next(oldptr)
+ return ptr
+
+@specializectx
+def match_repeated_loc_ignore(ctx, ptr, oldptr, length_bytes):
oldend = ctx.go_forward_by_bytes(oldptr, length_bytes)
while oldptr < oldend:
if ptr >= ctx.end:
return -1
- if ctx.lowstr(ptr, flags) != ctx.lowstr(oldptr, flags):
+ if rsre_char.getlower_locale(ctx.str(ptr)) != rsre_char.getlower_locale(ctx.str(oldptr)):
return -1
ptr = ctx.next(ptr)
oldptr = ctx.next(oldptr)
@@ -955,54 +1076,63 @@ def match_IN(ctx, pattern, ptr, ppos):
return rsre_char.check_charset(ctx, pattern, ppos+2, ctx.str(ptr))
@specializectx
def match_IN_IGNORE(ctx, pattern, ptr, ppos):
- return rsre_char.check_charset(ctx, pattern, ppos+2, ctx.lowstr(ptr, pattern.flags))
+ return rsre_char.check_charset(ctx, pattern, ppos+2, pattern.lowa(ctx.str(ptr)))
+@specializectx
+def match_IN_UNI_IGNORE(ctx, pattern, ptr, ppos):
+ return rsre_char.check_charset(ctx, pattern, ppos+2, rsre_char.getlower_unicode(ctx.str(ptr)))
+@specializectx
+def match_IN_LOC_IGNORE(ctx, pattern, ptr, ppos):
+ return pattern.charset_loc_ignore(ctx, ppos+2, ctx.str(ptr))
@specializectx
def match_LITERAL(ctx, pattern, ptr, ppos):
return ctx.str(ptr) == pattern.pat(ppos+1)
@specializectx
def match_LITERAL_IGNORE(ctx, pattern, ptr, ppos):
- return ctx.lowstr(ptr, pattern.flags) == pattern.pat(ppos+1)
+ return pattern.lowa(ctx.str(ptr)) == pattern.pat(ppos+1)
+@specializectx
+def match_LITERAL_UNI_IGNORE(ctx, pattern, ptr, ppos):
+ return rsre_char.getlower_unicode(ctx.str(ptr)) == pattern.pat(ppos+1)
+@specializectx
+def match_LITERAL_LOC_IGNORE(ctx, pattern, ptr, ppos):
+ return pattern.char_loc_ignore(ppos+1, ctx.str(ptr))
@specializectx
def match_NOT_LITERAL(ctx, pattern, ptr, ppos):
return ctx.str(ptr) != pattern.pat(ppos+1)
@specializectx
def match_NOT_LITERAL_IGNORE(ctx, pattern, ptr, ppos):
- return ctx.lowstr(ptr, pattern.flags) != pattern.pat(ppos+1)
+ return pattern.lowa(ctx.str(ptr)) != pattern.pat(ppos+1)
+@specializectx
+def match_NOT_LITERAL_UNI_IGNORE(ctx, pattern, ptr, ppos):
+ return rsre_char.getlower_unicode(ctx.str(ptr)) != pattern.pat(ppos+1)
+@specializectx
+def match_NOT_LITERAL_LOC_IGNORE(ctx, pattern, ptr, ppos):
+ return not pattern.char_loc_ignore(ppos+1, ctx.str(ptr))
def _make_fre(checkerfn):
if checkerfn == match_ANY_ALL:
def fre(ctx, pattern, ptr, end, ppos):
return end
- elif checkerfn == match_IN:
- install_jitdriver_spec('MatchIn',
+ elif checkerfn in (match_IN, match_IN_IGNORE, match_IN_UNI_IGNORE):
+ # produces three jitdrivers:
+ # MatchIn
+ # MatchInIgnore
+ # MatchInUniIgnore
+ name = checkerfn.__name__.title().replace('_', '')
+ method_name = "jitdriver_" + name
+ install_jitdriver_spec(name,
greens=['ppos', 'pattern'],
reds=['ptr', 'end', 'ctx'],
debugprint=(1, 0))
@specializectx
def fre(ctx, pattern, ptr, end, ppos):
while True:
- ctx.jitdriver_MatchIn.jit_merge_point(ctx=ctx, ptr=ptr,
+ getattr(ctx, method_name).jit_merge_point(ctx=ctx, ptr=ptr,
end=end, ppos=ppos,
pattern=pattern)
if ptr < end and checkerfn(ctx, pattern, ptr, ppos):
ptr = ctx.next(ptr)
else:
return ptr
- elif checkerfn == match_IN_IGNORE:
- install_jitdriver_spec('MatchInIgnore',
- greens=['ppos', 'pattern'],
- reds=['ptr', 'end', 'ctx'],
- debugprint=(1, 0))
- @specializectx
- def fre(ctx, pattern, ptr, end, ppos):
- while True:
- ctx.jitdriver_MatchInIgnore.jit_merge_point(ctx=ctx, ptr=ptr,
- end=end, ppos=ppos,
- pattern=pattern)
- if ptr < end and checkerfn(ctx, pattern, ptr, ppos):
- ptr = ctx.next(ptr)
- else:
- return ptr
else:
# in the other cases, the fre() function is not JITted at all
# and is present as a residual call.
@@ -1019,11 +1149,19 @@ unroll_char_checker = [
(consts.OPCODE_ANY_ALL, match_ANY_ALL),
(consts.OPCODE_IN, match_IN),
(consts.OPCODE_IN_IGNORE, match_IN_IGNORE),
+ (consts.OPCODE37_IN_UNI_IGNORE, match_IN_UNI_IGNORE),
+ (consts.OPCODE37_IN_LOC_IGNORE, match_IN_LOC_IGNORE),
(consts.OPCODE_LITERAL, match_LITERAL),
(consts.OPCODE_LITERAL_IGNORE, match_LITERAL_IGNORE),
+ (consts.OPCODE37_LITERAL_UNI_IGNORE, match_LITERAL_UNI_IGNORE),
+ (consts.OPCODE37_LITERAL_LOC_IGNORE, match_LITERAL_LOC_IGNORE),
(consts.OPCODE_NOT_LITERAL, match_NOT_LITERAL),
(consts.OPCODE_NOT_LITERAL_IGNORE, match_NOT_LITERAL_IGNORE),
+ (consts.OPCODE37_NOT_LITERAL_UNI_IGNORE, match_NOT_LITERAL_UNI_IGNORE),
+ (consts.OPCODE37_NOT_LITERAL_LOC_IGNORE, match_NOT_LITERAL_LOC_IGNORE),
]
+unroll_char_checker = [(_op, _fn) for (_op, _fn) in unroll_char_checker
+ if _op is not None] # possibly removes the OPCODE37_*
unroll_fre_checker = [(_op, _make_fre(_fn))
for (_op, _fn) in unroll_char_checker]
@@ -1119,7 +1257,8 @@ def match(pattern, string, start=0, end=sys.maxint, fullmatch=False):
assert isinstance(pattern, CompiledPattern)
start, end = _adjust(start, end, len(string))
ctx = StrMatchContext(string, start, end)
- ctx.fullmatch_only = fullmatch
+ if fullmatch:
+ ctx.match_mode = MODE_FULL
if match_context(ctx, pattern):
return ctx
else:
diff --git a/rpython/rlib/rsre/rsre_utf8.py b/rpython/rlib/rsre/rsre_utf8.py
index 834748ebaa..aaac302ac0 100644
--- a/rpython/rlib/rsre/rsre_utf8.py
+++ b/rpython/rlib/rsre/rsre_utf8.py
@@ -20,10 +20,6 @@ class Utf8MatchContext(AbstractMatchContext):
check_nonneg(index)
return rutf8.codepoint_at_pos(self._utf8, index)
- def lowstr(self, index, flags):
- c = self.str(index)
- return rsre_char.getlower(c, flags)
-
def get_single_byte(self, base_position, index):
return self._utf8[base_position + index]
diff --git a/rpython/rlib/rsre/test/test_char.py b/rpython/rlib/rsre/test/test_char.py
index bd3a6f2936..ef6a573b9e 100644
--- a/rpython/rlib/rsre/test/test_char.py
+++ b/rpython/rlib/rsre/test/test_char.py
@@ -204,3 +204,15 @@ def test_general_category():
assert check_charset(pat, 0, 99) # Lcheck_charset(pat, 0, 453) # Lt
assert not check_charset(pat, 0, 688) # Lm
assert not check_charset(pat, 0, 5870) # Nl
+
+def test_iscased():
+ assert rsre_char.iscased_ascii(65)
+ assert rsre_char.iscased_ascii(100)
+ assert rsre_char.iscased_ascii(64) is False
+ assert rsre_char.iscased_ascii(126) is False
+ assert rsre_char.iscased_ascii(1260) is False
+ assert rsre_char.iscased_ascii(12600) is False
+ for i in range(65, 10000, 33):
+ assert rsre_char.iscased_unicode(i) == (
+ rsre_char.getlower_unicode(i) != i or
+ rsre_char.getupper_unicode(i) != i)
diff --git a/rpython/rlib/rsre/test/test_match.py b/rpython/rlib/rsre/test/test_match.py
index c832244b11..758c015f7a 100644
--- a/rpython/rlib/rsre/test/test_match.py
+++ b/rpython/rlib/rsre/test/test_match.py
@@ -1,5 +1,5 @@
import re, random, py
-from rpython.rlib.rsre import rsre_char
+from rpython.rlib.rsre import rsre_char, rsre_constants
from rpython.rlib.rsre.rpy import get_code, VERSION
from rpython.rlib.rsre.test.support import match, fullmatch, Position as P
@@ -306,6 +306,10 @@ class TestMatch:
rsre_char.set_unicode_db(unicodedb)
#
r = get_code(u"[\U00010428-\U0001044f]", re.I)
- assert r.pattern.count(27) == 1 # OPCODE_RANGE
- r.pattern[r.pattern.index(27)] = 32 # => OPCODE_RANGE_IGNORE
+ assert r.pattern.count(rsre_constants.OPCODE_RANGE) == 1
+ if rsre_constants.V37:
+ repl = rsre_constants.OPCODE37_RANGE_UNI_IGNORE
+ else:
+ repl = rsre_constants.OPCODE27_RANGE_IGNORE
+ r.pattern[r.pattern.index(rsre_constants.OPCODE_RANGE)] = repl
assert match(r, u"\U00010428")