back-port the rpython bits of "py3.7-rsre"

author: Armin Rigo <arigo@tunes.org> 2020-11-24 11:53:44 +0000
committer: Armin Rigo <arigo@tunes.org> 2020-11-24 11:53:44 +0000
commit: 2798aa697ef635ce0b85d486dbb6b24b833d9e9e (patch)
tree: ac017e4259c44f3ee9110ae57d67c0df5dab4e09 /rpython/rlib
parent: update how-to-release document (diff)
download: pypy-2798aa697ef635ce0b85d486dbb6b24b833d9e9e.tar.gz
pypy-2798aa697ef635ce0b85d486dbb6b24b833d9e9e.tar.bz2
pypy-2798aa697ef635ce0b85d486dbb6b24b833d9e9e.zip
8 files changed, 327 insertions, 129 deletions
diff --git a/rpython/rlib/rsre/rpy/_sre.py b/rpython/rlib/rsre/rpy/_sre.py
index 617345483a..70d7737297 100644
--- a/rpython/rlib/rsre/rpy/_sre.py
+++ b/rpython/rlib/rsre/rpy/_sre.py
@@ -22,6 +22,9 @@ def get_code(regexp, flags=0, allargs=False):
     """NOT_RPYTHON: you can't compile new regexps in an RPython program,
     you can only use precompiled ones"""
     from . import sre_compile
+    if rsre_constants.V37:
+        import pytest
+        pytest.skip("This test cannot run in a 3.7 branch of pypy")
     try:
         sre_compile.compile(regexp, flags)
     except GotIt as e:
diff --git a/rpython/rlib/rsre/rpy/sre_constants.py b/rpython/rlib/rsre/rpy/sre_constants.py
index 89cbdb0d5f..4b9deac743 100644
--- a/rpython/rlib/rsre/rpy/sre_constants.py
+++ b/rpython/rlib/rsre/rpy/sre_constants.py
@@ -94,35 +94,17 @@ CATEGORY_UNI_NOT_WORD = "category_uni_not_word"
 CATEGORY_UNI_LINEBREAK = "category_uni_linebreak"
 CATEGORY_UNI_NOT_LINEBREAK = "category_uni_not_linebreak"
 
-OPCODES = [
-
-    # failure=0 success=1 (just because it looks better that way :-)
-    FAILURE, SUCCESS,
-
-    ANY, ANY_ALL,
-    ASSERT, ASSERT_NOT,
-    AT,
-    BRANCH,
-    CALL,
-    CATEGORY,
-    CHARSET, BIGCHARSET,
-    GROUPREF, GROUPREF_EXISTS, GROUPREF_IGNORE,
-    IN, IN_IGNORE,
-    INFO,
-    JUMP,
-    LITERAL, LITERAL_IGNORE,
-    MARK,
-    MAX_UNTIL,
-    MIN_UNTIL,
-    NOT_LITERAL, NOT_LITERAL_IGNORE,
-    NEGATE,
-    RANGE,
-    REPEAT,
-    REPEAT_ONE,
-    SUBPATTERN,
-    MIN_REPEAT_ONE,
-    RANGE_IGNORE,
-]
+def _rpython_opcodes():
+    from rpython.rlib.rsre import rsre_constants as consts
+    mapping = {}
+    for name, value in consts.__dict__.items():
+        if name.startswith('OPCODE') and isinstance(value, int) and value < 70:
+            name = name[6:].lstrip('012346789_').lower()
+            mapping[value] = name
+    # check that there are no holes
+    assert sorted(mapping.keys()) == range(len(mapping))
+    return [name for value, name in sorted(mapping.items())]
+OPCODES = _rpython_opcodes()
 
 ATCODES = [
     AT_BEGINNING, AT_BEGINNING_LINE, AT_BEGINNING_STRING, AT_BOUNDARY,
diff --git a/rpython/rlib/rsre/rsre_char.py b/rpython/rlib/rsre/rsre_char.py
index 1680d2973d..d5595d01db 100644
--- a/rpython/rlib/rsre/rsre_char.py
+++ b/rpython/rlib/rsre/rsre_char.py
@@ -45,10 +45,10 @@ else:
 # codesize. But sre_compile will compile some stuff differently depending on the
 # codesize (e.g., charsets).
 from rpython.rlib.runicode import MAXUNICODE
-if MAXUNICODE == 65535:
+if MAXUNICODE == 65535 and not consts.V37:
     CODESIZE = 2
 else:
-    CODESIZE = 4
+    CODESIZE = 4        # always 4 from py3.7
 
 copyright = "_sre.py 2.4 Copyright 2005 by Nik Haldimann"
 
@@ -57,16 +57,22 @@ BIG_ENDIAN = sys.byteorder == "big"
 def getlower_ascii(char_ord):
     return char_ord + int_between(ord('A'), char_ord, ord('Z') + 1) * (ord('a') - ord('A'))
 
+def getlower_locale(char_ord):
+    if char_ord < 256:      # cheating!  Well, CPython does too.
+        char_ord = tolower(char_ord)
+    return char_ord
+
+def getlower_unicode(char_ord):
+    if char_ord < 128: # shortcut for ascii
+        return getlower_ascii(char_ord)
+    assert unicodedb is not None
+    return unicodedb.tolower(char_ord)
+
 def getlower(char_ord, flags):
     if flags & consts.SRE_FLAG_LOCALE:
-        if char_ord < 256:      # cheating!  Well, CPython does too.
-            char_ord = tolower(char_ord)
-        return char_ord
+        char_ord = getlower_locale(char_ord)
     elif flags & consts.SRE_FLAG_UNICODE:
-        if char_ord < 128: # shortcut for ascii
-            return getlower_ascii(char_ord)
-        assert unicodedb is not None
-        char_ord = unicodedb.tolower(char_ord)
+        char_ord = getlower_unicode(char_ord)
     else:
         char_ord = getlower_ascii(char_ord)
     return char_ord
@@ -74,20 +80,38 @@ def getlower(char_ord, flags):
 def getupper_ascii(char_ord):
     return char_ord - int_between(ord('a'), char_ord, ord('z') + 1) * (ord('a') - ord('A'))
 
+def getupper_locale(char_ord):
+    if char_ord < 256:      # cheating!  Well, CPython does too.
+        char_ord = toupper(char_ord)
+    return char_ord
+
+def getupper_unicode(char_ord):
+    if char_ord < 128: # shortcut for ascii
+        return getupper_ascii(char_ord)
+    assert unicodedb is not None
+    return unicodedb.toupper(char_ord)
+
 def getupper(char_ord, flags):
     if flags & consts.SRE_FLAG_LOCALE:
-        if char_ord < 256:      # cheating!  Well, CPython does too.
-            char_ord = toupper(char_ord)
-        return char_ord
+        char_ord = getupper_locale(char_ord)
     elif flags & consts.SRE_FLAG_UNICODE:
-        if char_ord < 128: # shortcut for ascii
-            return getupper_ascii(char_ord)
-        assert unicodedb is not None
-        char_ord = unicodedb.toupper(char_ord)
+        char_ord = getupper_unicode(char_ord)
     else:
         char_ord = getupper_ascii(char_ord)
     return char_ord
 
+def iscased_ascii(char_ord):   # used by py3.7
+    upper = int_between(ord('A'), char_ord, ord('Z')+1)
+    lower = int_between(ord('a'), char_ord, ord('z')+1)
+    return upper | lower
+
+def iscased_unicode(char_ord):   # used by py3.7
+    # NOTE: this is not unicodedb.iscased().  As per CPython 3.7, it is
+    # something different which---as far as I can tell---doesn't really
+    # have a meaning on its own, but well.
+    return (char_ord != getlower_unicode(char_ord) or
+            char_ord != getupper_unicode(char_ord))
+
 #### Category helpers
 
 is_a_word = [(chr(i).isalnum() or chr(i) == '_') for i in range(256)]
@@ -223,12 +247,22 @@ def set_range(ctx, pattern, index, char_code):
 def set_range_ignore(ctx, pattern, index, char_code):
     # <RANGE_IGNORE> <lower> <upper>
     # the char_code is already lower cased
+    assert not consts.V37
     lower = pattern.pattern[index + 1]
     upper = pattern.pattern[index + 2]
     match1 = int_between(lower, char_code, upper + 1)
     match2 = int_between(lower, getupper(char_code, pattern.flags), upper + 1)
     return match1 | match2, index + 3
 
+def set_range_uni_ignore(ctx, pattern, index, char_code):
+    # <RANGE_UNI_IGNORE> <lower> <upper>
+    # the char_code is already lower cased
+    lower = pattern.pattern[index + 1]
+    upper = pattern.pattern[index + 2]
+    match1 = int_between(lower, char_code, upper + 1)
+    match2 = int_between(lower, getupper_unicode(char_code), upper + 1)
+    return match1 | match2, index + 3
+
 def set_bigcharset(ctx, pattern, index, char_code):
     # <BIGCHARSET> <blockcount> <256 blockindices> <blocks>
     count = pattern.pattern[index+1]
@@ -300,7 +334,9 @@ set_dispatch_table = {
     consts.OPCODE_BIGCHARSET: set_bigcharset,
     consts.OPCODE_LITERAL: set_literal,
     consts.OPCODE_RANGE: set_range,
-    consts.OPCODE_RANGE_IGNORE: set_range_ignore,
+    consts.OPCODE27_RANGE_IGNORE: set_range_ignore,
+    consts.OPCODE37_RANGE_UNI_IGNORE: set_range_uni_ignore,
     consts.OPCODE_UNICODE_GENERAL_CATEGORY: set_unicode_general_category,
 }
+set_dispatch_table.pop(None, None)   # remove the OPCODE27_* or OPCODE37_*
 set_dispatch_unroll = unrolling_iterable(sorted(set_dispatch_table.items()))
diff --git a/rpython/rlib/rsre/rsre_constants.py b/rpython/rlib/rsre/rsre_constants.py
index 9af708532a..9994db7b05 100644
--- a/rpython/rlib/rsre/rsre_constants.py
+++ b/rpython/rlib/rsre/rsre_constants.py
@@ -1,3 +1,15 @@
+# Horrible import-time hack.
+# Blame CPython for renumbering these OPCODE_* at some point.
+from rpython.rlib.objectmodel import specialize
+try:
+    import pypy.module.sys.version
+    V37 = pypy.module.sys.version.CPYTHON_VERSION >= (3, 7)
+except ImportError:
+    raise ImportError("Cannot import pypy.module.sys.version. You can safely "
+                      "remove this 'raise' line if you are not interested in "
+                      "PyPy but only RPython.")
+    V37 = False
+
 OPCODE_FAILURE            = 0
 OPCODE_SUCCESS            = 1
 OPCODE_ANY                = 2
@@ -6,35 +18,49 @@ OPCODE_ASSERT             = 4
 OPCODE_ASSERT_NOT         = 5
 OPCODE_AT                 = 6
 OPCODE_BRANCH             = 7
-#OPCODE_CALL              = 8
+OPCODE_CALL               = 8                    # not used
 OPCODE_CATEGORY           = 9
 OPCODE_CHARSET            = 10
 OPCODE_BIGCHARSET         = 11
 OPCODE_GROUPREF           = 12
 OPCODE_GROUPREF_EXISTS    = 13
-OPCODE_GROUPREF_IGNORE    = 14
-OPCODE_IN                 = 15
-OPCODE_IN_IGNORE          = 16
-OPCODE_INFO               = 17
-OPCODE_JUMP               = 18
-OPCODE_LITERAL            = 19
-OPCODE_LITERAL_IGNORE     = 20
-OPCODE_MARK               = 21
-OPCODE_MAX_UNTIL          = 22
-OPCODE_MIN_UNTIL          = 23
-OPCODE_NOT_LITERAL        = 24
-OPCODE_NOT_LITERAL_IGNORE = 25
-OPCODE_NEGATE             = 26
-OPCODE_RANGE              = 27
-OPCODE_REPEAT             = 28
-OPCODE_REPEAT_ONE         = 29
-#OPCODE_SUBPATTERN        = 30
-OPCODE_MIN_REPEAT_ONE     = 31
-OPCODE_RANGE_IGNORE       = 32
+OPCODE_GROUPREF_IGNORE    = 28 if V37 else 14
+OPCODE_IN                 = 14 if V37 else 15
+OPCODE_IN_IGNORE          = 29 if V37 else 16
+OPCODE_INFO               = 15 if V37 else 17
+OPCODE_JUMP               = 16 if V37 else 18
+OPCODE_LITERAL            = 17 if V37 else 19
+OPCODE_LITERAL_IGNORE     = 30 if V37 else 20
+OPCODE_MARK               = 18 if V37 else 21
+OPCODE_MAX_UNTIL          = 19 if V37 else 22
+OPCODE_MIN_UNTIL          = 20 if V37 else 23
+OPCODE_NOT_LITERAL        = 21 if V37 else 24
+OPCODE_NOT_LITERAL_IGNORE = 31 if V37 else 25
+OPCODE_NEGATE             = 22 if V37 else 26
+OPCODE_RANGE              = 23 if V37 else 27
+OPCODE_REPEAT             = 24 if V37 else 28
+OPCODE_REPEAT_ONE         = 25 if V37 else 29
+OPCODE_SUBPATTERN         = 26 if V37 else 30    # not used
+OPCODE_MIN_REPEAT_ONE     = 27 if V37 else 31
+OPCODE27_RANGE_IGNORE     = None if V37 else 32
+
+OPCODE37_GROUPREF_LOC_IGNORE      = 32 if V37 else None
+OPCODE37_IN_LOC_IGNORE            = 33 if V37 else None
+OPCODE37_LITERAL_LOC_IGNORE       = 34 if V37 else None
+OPCODE37_NOT_LITERAL_LOC_IGNORE   = 35 if V37 else None
+OPCODE37_GROUPREF_UNI_IGNORE      = 36 if V37 else None
+OPCODE37_IN_UNI_IGNORE            = 37 if V37 else None
+OPCODE37_LITERAL_UNI_IGNORE       = 38 if V37 else None
+OPCODE37_NOT_LITERAL_UNI_IGNORE   = 39 if V37 else None
+OPCODE37_RANGE_UNI_IGNORE         = 40 if V37 else None
 
 # not used by Python itself
 OPCODE_UNICODE_GENERAL_CATEGORY = 70
 
+@specialize.argtype(1)
+def eq(op, const):
+    return const is not None and op == const
+
 
 AT_BEGINNING = 0
 AT_BEGINNING_LINE = 1
diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
index 489636b783..3ce901c46a 100644
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -55,7 +55,8 @@ class CompiledPattern(object):
 
     def __init__(self, pattern, flags):
         self.pattern = pattern
-        self.flags = flags
+        if not consts.V37:      # 'flags' is ignored in >=3.7 mode
+            self.flags = flags
         # check we don't get the old value of MAXREPEAT
         # during the untranslated tests. 
         # On python3, MAXCODE can appear in patterns. It will be 65535
@@ -63,6 +64,29 @@ class CompiledPattern(object):
         if not we_are_translated() and rsre_char.CODESIZE != 2:
             assert 65535 not in pattern
 
+    def lowa(self, char_ord):
+        """Pre-3.7: uses getlower(flags).
+           3.7 and later: this is always getlower_ascii().
+        """
+        if not consts.V37:
+            return rsre_char.getlower(char_ord, self.flags)
+        else:
+            return rsre_char.getlower_ascii(char_ord)
+
+    def char_loc_ignore(self, index, char_ord):
+        assert consts.V37
+        pattern = self.pat(index)
+        return (char_ord == pattern or
+                rsre_char.getlower_locale(char_ord) == pattern or
+                rsre_char.getupper_locale(char_ord) == pattern)
+
+    def charset_loc_ignore(self, ctx, ppos, char_ord):
+        lo = rsre_char.getlower_locale(char_ord)
+        if rsre_char.check_charset(ctx, self, ppos, lo):
+            return True
+        up = rsre_char.getupper_locale(char_ord)
+        return up != lo and rsre_char.check_charset(ctx, self, ppos, up)
+
     def pat(self, index):
         jit.promote(self)
         check_nonneg(index)
@@ -74,6 +98,10 @@ class CompiledPattern(object):
         assert result >= 0
         return result
 
+MODE_ANY = '\x00'         # an empty match is fine
+MODE_NONEMPTY = '\x01'    # must have a non-empty match
+MODE_FULL = '\x02'        # must match the whole string
+
 class AbstractMatchContext(object):
     """Abstract base class"""
     _immutable_fields_ = ['end']
@@ -81,7 +109,7 @@ class AbstractMatchContext(object):
     match_end = 0
     match_marks = None
     match_marks_flat = None
-    fullmatch_only = False
+    match_mode = MODE_ANY
 
     def __init__(self, match_start, end):
         # 'match_start' and 'end' must be known to be non-negative
@@ -91,25 +119,30 @@ class AbstractMatchContext(object):
         self.match_start = match_start
         self.end = end
 
-    def reset(self, start):
+    def reset(self, start, must_advance=False):
         self.match_start = start
         self.match_marks = None
         self.match_marks_flat = None
+        #
+        assert MODE_ANY == chr(False)
+        assert MODE_NONEMPTY == chr(True)
+        self.match_mode = chr(must_advance)
+
+    @not_rpython
+    def _fullmatch_only(self, x=None):
+        raise Exception("'ctx.fullmatch_only' was replaced with"
+                        " 'ctx.match_mode'")
+    fullmatch_only = property(_fullmatch_only, _fullmatch_only)
 
     @not_rpython
     def str(self, index):
         """Must be overridden in a concrete subclass.
-        The tag ^^^ here is used to generate a translation-time crash
+        The @not_rpython is used to generate a translation-time crash
         if there is a call to str() that is indirect.  All calls must
         be direct for performance reasons; you need to specialize the
         caller with @specializectx."""
         raise NotImplementedError
 
-    @not_rpython
-    def lowstr(self, index, flags):
-        """Similar to str()."""
-        raise NotImplementedError
-
     # The following methods are provided to be overriden in
     # Utf8MatchContext.  The non-utf8 implementation is provided
     # by the FixedMatchContext abstract subclass, in order to use
@@ -236,10 +269,6 @@ class BufMatchContext(FixedMatchContext):
         check_nonneg(index)
         return ord(self._buffer.getitem(index))
 
-    def lowstr(self, index, flags):
-        c = self.str(index)
-        return rsre_char.getlower(c, flags)
-
     def fresh_copy(self, start):
         return BufMatchContext(self._buffer, start,
                                self.end)
@@ -261,10 +290,6 @@ class StrMatchContext(FixedMatchContext):
         check_nonneg(index)
         return ord(self._string[index])
 
-    def lowstr(self, index, flags):
-        c = self.str(index)
-        return rsre_char.getlower(c, flags)
-
     def fresh_copy(self, start):
         return StrMatchContext(self._string, start,
                                self.end)
@@ -289,10 +314,6 @@ class UnicodeMatchContext(FixedMatchContext):
         check_nonneg(index)
         return ord(self._unicodestr[index])
 
-    def lowstr(self, index, flags):
-        c = self.str(index)
-        return rsre_char.getlower(c, flags)
-
     def fresh_copy(self, start):
         return UnicodeMatchContext(self._unicodestr, start,
                                    self.end)
@@ -599,9 +620,13 @@ def sre_match(ctx, pattern, ppos, ptr, marks):
             return
 
         elif op == consts.OPCODE_SUCCESS:
-            if ctx.fullmatch_only:
+            mode = ctx.match_mode
+            if mode == MODE_FULL:
                 if ptr != ctx.end:
                     return     # not a full match
+            elif mode == MODE_NONEMPTY:
+                if ptr == ctx.match_start:
+                    return     # empty match
             ctx.match_end = ptr
             ctx.match_marks = marks
             return MATCHED_OK
@@ -633,10 +658,10 @@ def sre_match(ctx, pattern, ppos, ptr, marks):
                 ptr1 = ctx.prev_n(ptr, pattern.pat(ppos+1), ctx.ZERO)
             except EndOfString:
                 return
-            saved = ctx.fullmatch_only
-            ctx.fullmatch_only = False
+            saved = ctx.match_mode
+            ctx.match_mode = MODE_ANY
             stop = sre_match(ctx, pattern, ppos + 2, ptr1, marks) is None
-            ctx.fullmatch_only = saved
+            ctx.match_mode = saved
             if stop:
                 return
             marks = ctx.match_marks
@@ -651,10 +676,10 @@ def sre_match(ctx, pattern, ppos, ptr, marks):
             except EndOfString:
                 pass
             else:
-                saved = ctx.fullmatch_only
-                ctx.fullmatch_only = False
+                saved = ctx.match_mode
+                ctx.match_mode = MODE_ANY
                 stop = sre_match(ctx, pattern, ppos + 2, ptr1, marks) is not None
-                ctx.fullmatch_only = saved
+                ctx.match_mode = saved
                 if stop:
                     return
             ppos += pattern.pat(ppos)
@@ -699,7 +724,29 @@ def sre_match(ctx, pattern, ppos, ptr, marks):
             startptr, length_bytes = get_group_ref(ctx, marks, pattern.pat(ppos))
             if length_bytes < 0:
                 return     # group was not previously defined
-            ptr = match_repeated_ignore(ctx, ptr, startptr, length_bytes, pattern.flags)
+            ptr = match_repeated_ignore(ctx, ptr, startptr, length_bytes, pattern)
+            if ptr < ctx.ZERO:
+                return     # no match
+            ppos += 1
+
+        elif consts.eq(op, consts.OPCODE37_GROUPREF_UNI_IGNORE):
+            # unicode version of OPCODE_GROUPREF_IGNORE
+            # <GROUPREF> <groupnum>
+            startptr, length_bytes = get_group_ref(ctx, marks, pattern.pat(ppos))
+            if length_bytes < 0:
+                return     # group was not previously defined
+            ptr = match_repeated_uni_ignore(ctx, ptr, startptr, length_bytes)
+            if ptr < ctx.ZERO:
+                return     # no match
+            ppos += 1
+
+        elif consts.eq(op, consts.OPCODE37_GROUPREF_LOC_IGNORE):
+            # locale version of OPCODE_GROUPREF_IGNORE
+            # <GROUPREF> <groupnum>
+            startptr, length_bytes = get_group_ref(ctx, marks, pattern.pat(ppos))
+            if length_bytes < 0:
+                return     # group was not previously defined
+            ptr = match_repeated_loc_ignore(ctx, ptr, startptr, length_bytes)
             if ptr < ctx.ZERO:
                 return     # no match
             ppos += 1
@@ -726,7 +773,25 @@ def sre_match(ctx, pattern, ppos, ptr, marks):
             # match set member (or non_member), ignoring case
             # <IN> <skip> <set>
             if ptr >= ctx.end or not rsre_char.check_charset(ctx, pattern, ppos+1,
-                                                             ctx.lowstr(ptr, pattern.flags)):
+                                                             pattern.lowa(ctx.str(ptr))):
+                return
+            ppos += pattern.pat(ppos)
+            ptr = ctx.next(ptr)
+
+        elif consts.eq(op, consts.OPCODE37_IN_UNI_IGNORE):
+            # match set member (or non_member), ignoring case, unicode mode
+            # <IN> <skip> <set>
+            if ptr >= ctx.end or not rsre_char.check_charset(ctx, pattern, ppos+1,
+                                                             rsre_char.getlower_unicode(ctx.str(ptr))):
+                return
+            ppos += pattern.pat(ppos)
+            ptr = ctx.next(ptr)
+
+        elif consts.eq(op, consts.OPCODE37_IN_LOC_IGNORE):
+            # match set member (or non_member), ignoring case, locale mode
+            # <IN> <skip> <set>
+            if ptr >= ctx.end or not pattern.charset_loc_ignore(ctx, ppos+1,
+                                                                ctx.str(ptr)):
                 return
             ppos += pattern.pat(ppos)
             ptr = ctx.next(ptr)
@@ -752,7 +817,23 @@ def sre_match(ctx, pattern, ppos, ptr, marks):
         elif op == consts.OPCODE_LITERAL_IGNORE:
             # match literal string, ignoring case
             # <LITERAL_IGNORE> <code>
-            if ptr >= ctx.end or ctx.lowstr(ptr, pattern.flags) != pattern.pat(ppos):
+            if ptr >= ctx.end or pattern.lowa(ctx.str(ptr)) != pattern.pat(ppos):
+                return
+            ppos += 1
+            ptr = ctx.next(ptr)
+
+        elif consts.eq(op, consts.OPCODE37_LITERAL_UNI_IGNORE):
+            # match literal string, ignoring case, unicode mode
+            # <LITERAL_IGNORE> <code>
+            if ptr >= ctx.end or rsre_char.getlower_unicode(ctx.str(ptr)) != pattern.pat(ppos):
+                return
+            ppos += 1
+            ptr = ctx.next(ptr)
+
+        elif consts.eq(op, consts.OPCODE37_LITERAL_LOC_IGNORE):
+            # match literal string, ignoring case, locale mode
+            # <LITERAL_IGNORE> <code>
+            if ptr >= ctx.end or not pattern.char_loc_ignore(ppos, ctx.str(ptr)):
                 return
             ppos += 1
             ptr = ctx.next(ptr)
@@ -775,7 +856,23 @@ def sre_match(ctx, pattern, ppos, ptr, marks):
         elif op == consts.OPCODE_NOT_LITERAL_IGNORE:
             # match if it's not a literal string, ignoring case
             # <NOT_LITERAL> <code>
-            if ptr >= ctx.end or ctx.lowstr(ptr, pattern.flags) == pattern.pat(ppos):
+            if ptr >= ctx.end or pattern.lowa(ctx.str(ptr)) == pattern.pat(ppos):
+                return
+            ppos += 1
+            ptr = ctx.next(ptr)
+
+        elif consts.eq(op, consts.OPCODE37_NOT_LITERAL_UNI_IGNORE):
+            # match if it's not a literal string, ignoring case, unicode mode
+            # <NOT_LITERAL> <code>
+            if ptr >= ctx.end or rsre_char.getlower_unicode(ctx.str(ptr)) == pattern.pat(ppos):
+                return
+            ppos += 1
+            ptr = ctx.next(ptr)
+
+        elif consts.eq(op, consts.OPCODE37_NOT_LITERAL_LOC_IGNORE):
+            # match if it's not a literal string, ignoring case, locale mode
+            # <NOT_LITERAL> <code>
+            if ptr >= ctx.end or pattern.char_loc_ignore(ppos, ctx.str(ptr)):
                 return
             ppos += 1
             ptr = ctx.next(ptr)
@@ -883,12 +980,36 @@ def match_repeated(ctx, ptr, oldptr, length_bytes):
     return True
 
 @specializectx
-def match_repeated_ignore(ctx, ptr, oldptr, length_bytes, flags):
+def match_repeated_ignore(ctx, ptr, oldptr, length_bytes, pattern):
+    oldend = ctx.go_forward_by_bytes(oldptr, length_bytes)
+    while oldptr < oldend:
+        if ptr >= ctx.end:
+            return -1
+        if pattern.lowa(ctx.str(ptr)) != pattern.lowa(ctx.str(oldptr)):
+            return -1
+        ptr = ctx.next(ptr)
+        oldptr = ctx.next(oldptr)
+    return ptr
+
+@specializectx
+def match_repeated_uni_ignore(ctx, ptr, oldptr, length_bytes):
+    oldend = ctx.go_forward_by_bytes(oldptr, length_bytes)
+    while oldptr < oldend:
+        if ptr >= ctx.end:
+            return -1
+        if rsre_char.getlower_unicode(ctx.str(ptr)) != rsre_char.getlower_unicode(ctx.str(oldptr)):
+            return -1
+        ptr = ctx.next(ptr)
+        oldptr = ctx.next(oldptr)
+    return ptr
+
+@specializectx
+def match_repeated_loc_ignore(ctx, ptr, oldptr, length_bytes):
     oldend = ctx.go_forward_by_bytes(oldptr, length_bytes)
     while oldptr < oldend:
         if ptr >= ctx.end:
             return -1
-        if ctx.lowstr(ptr, flags) != ctx.lowstr(oldptr, flags):
+        if rsre_char.getlower_locale(ctx.str(ptr)) != rsre_char.getlower_locale(ctx.str(oldptr)):
             return -1
         ptr = ctx.next(ptr)
         oldptr = ctx.next(oldptr)
@@ -955,54 +1076,63 @@ def match_IN(ctx, pattern, ptr, ppos):
     return rsre_char.check_charset(ctx, pattern, ppos+2, ctx.str(ptr))
 @specializectx
 def match_IN_IGNORE(ctx, pattern, ptr, ppos):
-    return rsre_char.check_charset(ctx, pattern, ppos+2, ctx.lowstr(ptr, pattern.flags))
+    return rsre_char.check_charset(ctx, pattern, ppos+2, pattern.lowa(ctx.str(ptr)))
+@specializectx
+def match_IN_UNI_IGNORE(ctx, pattern, ptr, ppos):
+    return rsre_char.check_charset(ctx, pattern, ppos+2, rsre_char.getlower_unicode(ctx.str(ptr)))
+@specializectx
+def match_IN_LOC_IGNORE(ctx, pattern, ptr, ppos):
+    return pattern.charset_loc_ignore(ctx, ppos+2, ctx.str(ptr))
 @specializectx
 def match_LITERAL(ctx, pattern, ptr, ppos):
     return ctx.str(ptr) == pattern.pat(ppos+1)
 @specializectx
 def match_LITERAL_IGNORE(ctx, pattern, ptr, ppos):
-    return ctx.lowstr(ptr, pattern.flags) == pattern.pat(ppos+1)
+    return pattern.lowa(ctx.str(ptr)) == pattern.pat(ppos+1)
+@specializectx
+def match_LITERAL_UNI_IGNORE(ctx, pattern, ptr, ppos):
+    return rsre_char.getlower_unicode(ctx.str(ptr)) == pattern.pat(ppos+1)
+@specializectx
+def match_LITERAL_LOC_IGNORE(ctx, pattern, ptr, ppos):
+    return pattern.char_loc_ignore(ppos+1, ctx.str(ptr))
 @specializectx
 def match_NOT_LITERAL(ctx, pattern, ptr, ppos):
     return ctx.str(ptr) != pattern.pat(ppos+1)
 @specializectx
 def match_NOT_LITERAL_IGNORE(ctx, pattern, ptr, ppos):
-    return ctx.lowstr(ptr, pattern.flags) != pattern.pat(ppos+1)
+    return pattern.lowa(ctx.str(ptr)) != pattern.pat(ppos+1)
+@specializectx
+def match_NOT_LITERAL_UNI_IGNORE(ctx, pattern, ptr, ppos):
+    return rsre_char.getlower_unicode(ctx.str(ptr)) != pattern.pat(ppos+1)
+@specializectx
+def match_NOT_LITERAL_LOC_IGNORE(ctx, pattern, ptr, ppos):
+    return not pattern.char_loc_ignore(ppos+1, ctx.str(ptr))
 
 def _make_fre(checkerfn):
     if checkerfn == match_ANY_ALL:
         def fre(ctx, pattern, ptr, end, ppos):
             return end
-    elif checkerfn == match_IN:
-        install_jitdriver_spec('MatchIn',
+    elif checkerfn in (match_IN, match_IN_IGNORE, match_IN_UNI_IGNORE):
+        # each matching checker installs its own jitdriver, one of:
+        #     MatchIn
+        #     MatchInIgnore
+        #     MatchInUniIgnore
+        name = checkerfn.__name__.title().replace('_', '')
+        method_name = "jitdriver_" + name
+        install_jitdriver_spec(name,
                                greens=['ppos', 'pattern'],
                                reds=['ptr', 'end', 'ctx'],
                                debugprint=(1, 0))
         @specializectx
         def fre(ctx, pattern, ptr, end, ppos):
             while True:
-                ctx.jitdriver_MatchIn.jit_merge_point(ctx=ctx, ptr=ptr,
+                getattr(ctx, method_name).jit_merge_point(ctx=ctx, ptr=ptr,
                                                       end=end, ppos=ppos,
                                                       pattern=pattern)
                 if ptr < end and checkerfn(ctx, pattern, ptr, ppos):
                     ptr = ctx.next(ptr)
                 else:
                     return ptr
-    elif checkerfn == match_IN_IGNORE:
-        install_jitdriver_spec('MatchInIgnore',
-                               greens=['ppos', 'pattern'],
-                               reds=['ptr', 'end', 'ctx'],
-                               debugprint=(1, 0))
-        @specializectx
-        def fre(ctx, pattern, ptr, end, ppos):
-            while True:
-                ctx.jitdriver_MatchInIgnore.jit_merge_point(ctx=ctx, ptr=ptr,
-                                                            end=end, ppos=ppos,
-                                                            pattern=pattern)
-                if ptr < end and checkerfn(ctx, pattern, ptr, ppos):
-                    ptr = ctx.next(ptr)
-                else:
-                    return ptr
     else:
         # in the other cases, the fre() function is not JITted at all
         # and is present as a residual call.
@@ -1019,11 +1149,19 @@ unroll_char_checker = [
     (consts.OPCODE_ANY_ALL,            match_ANY_ALL),
     (consts.OPCODE_IN,                 match_IN),
     (consts.OPCODE_IN_IGNORE,          match_IN_IGNORE),
+    (consts.OPCODE37_IN_UNI_IGNORE,           match_IN_UNI_IGNORE),
+    (consts.OPCODE37_IN_LOC_IGNORE,           match_IN_LOC_IGNORE),
     (consts.OPCODE_LITERAL,            match_LITERAL),
     (consts.OPCODE_LITERAL_IGNORE,     match_LITERAL_IGNORE),
+    (consts.OPCODE37_LITERAL_UNI_IGNORE,      match_LITERAL_UNI_IGNORE),
+    (consts.OPCODE37_LITERAL_LOC_IGNORE,      match_LITERAL_LOC_IGNORE),
     (consts.OPCODE_NOT_LITERAL,        match_NOT_LITERAL),
     (consts.OPCODE_NOT_LITERAL_IGNORE, match_NOT_LITERAL_IGNORE),
+    (consts.OPCODE37_NOT_LITERAL_UNI_IGNORE,  match_NOT_LITERAL_UNI_IGNORE),
+    (consts.OPCODE37_NOT_LITERAL_LOC_IGNORE,  match_NOT_LITERAL_LOC_IGNORE),
     ]
+unroll_char_checker = [(_op, _fn) for (_op, _fn) in unroll_char_checker
+                       if _op is not None]   # possibly removes the OPCODE37_*
 unroll_fre_checker = [(_op, _make_fre(_fn))
                       for (_op, _fn) in unroll_char_checker]
 
@@ -1119,7 +1257,8 @@ def match(pattern, string, start=0, end=sys.maxint, fullmatch=False):
     assert isinstance(pattern, CompiledPattern)
     start, end = _adjust(start, end, len(string))
     ctx = StrMatchContext(string, start, end)
-    ctx.fullmatch_only = fullmatch
+    if fullmatch:
+        ctx.match_mode = MODE_FULL
     if match_context(ctx, pattern):
         return ctx
     else:
diff --git a/rpython/rlib/rsre/rsre_utf8.py b/rpython/rlib/rsre/rsre_utf8.py
index 834748ebaa..aaac302ac0 100644
--- a/rpython/rlib/rsre/rsre_utf8.py
+++ b/rpython/rlib/rsre/rsre_utf8.py
@@ -20,10 +20,6 @@ class Utf8MatchContext(AbstractMatchContext):
         check_nonneg(index)
         return rutf8.codepoint_at_pos(self._utf8, index)
 
-    def lowstr(self, index, flags):
-        c = self.str(index)
-        return rsre_char.getlower(c, flags)
-
     def get_single_byte(self, base_position, index):
         return self._utf8[base_position + index]
 
diff --git a/rpython/rlib/rsre/test/test_char.py b/rpython/rlib/rsre/test/test_char.py
index bd3a6f2936..ef6a573b9e 100644
--- a/rpython/rlib/rsre/test/test_char.py
+++ b/rpython/rlib/rsre/test/test_char.py
@@ -204,3 +204,15 @@ def test_general_category():
     assert check_charset(pat, 0, 99)    # Lcheck_charset(pat, 0, 453)   # Lt
     assert not check_charset(pat, 0, 688)    # Lm
     assert not check_charset(pat, 0, 5870)   # Nl
+
+def test_iscased():
+    assert rsre_char.iscased_ascii(65)
+    assert rsre_char.iscased_ascii(100)
+    assert rsre_char.iscased_ascii(64) is False
+    assert rsre_char.iscased_ascii(126) is False
+    assert rsre_char.iscased_ascii(1260) is False
+    assert rsre_char.iscased_ascii(12600) is False
+    for i in range(65, 10000, 33):
+        assert rsre_char.iscased_unicode(i) == (
+            rsre_char.getlower_unicode(i) != i or
+            rsre_char.getupper_unicode(i) != i)
diff --git a/rpython/rlib/rsre/test/test_match.py b/rpython/rlib/rsre/test/test_match.py
index c832244b11..758c015f7a 100644
--- a/rpython/rlib/rsre/test/test_match.py
+++ b/rpython/rlib/rsre/test/test_match.py
@@ -1,5 +1,5 @@
 import re, random, py
-from rpython.rlib.rsre import rsre_char
+from rpython.rlib.rsre import rsre_char, rsre_constants
 from rpython.rlib.rsre.rpy import get_code, VERSION
 from rpython.rlib.rsre.test.support import match, fullmatch, Position as P
 
@@ -306,6 +306,10 @@ class TestMatch:
         rsre_char.set_unicode_db(unicodedb)
         #
         r = get_code(u"[\U00010428-\U0001044f]", re.I)
-        assert r.pattern.count(27) == 1       # OPCODE_RANGE
-        r.pattern[r.pattern.index(27)] = 32   # => OPCODE_RANGE_IGNORE
+        assert r.pattern.count(rsre_constants.OPCODE_RANGE) == 1
+        if rsre_constants.V37:
+            repl = rsre_constants.OPCODE37_RANGE_UNI_IGNORE
+        else:
+            repl = rsre_constants.OPCODE27_RANGE_IGNORE
+        r.pattern[r.pattern.index(rsre_constants.OPCODE_RANGE)] = repl
         assert match(r, u"\U00010428")
author	Armin Rigo <arigo@tunes.org>	2020-11-24 11:53:44 +0000
committer	Armin Rigo <arigo@tunes.org>	2020-11-24 11:53:44 +0000
commit	2798aa697ef635ce0b85d486dbb6b24b833d9e9e (patch)
tree	ac017e4259c44f3ee9110ae57d67c0df5dab4e09 /rpython/rlib
parent	update how-to-release document (diff)
download	pypy-2798aa697ef635ce0b85d486dbb6b24b833d9e9e.tar.gz pypy-2798aa697ef635ce0b85d486dbb6b24b833d9e9e.tar.bz2 pypy-2798aa697ef635ce0b85d486dbb6b24b833d9e9e.zip