diff options
author | Armin Rigo <arigo@tunes.org> | 2020-11-24 11:53:44 +0000 |
---|---|---|
committer | Armin Rigo <arigo@tunes.org> | 2020-11-24 11:53:44 +0000 |
commit | 2798aa697ef635ce0b85d486dbb6b24b833d9e9e (patch) | |
tree | ac017e4259c44f3ee9110ae57d67c0df5dab4e09 /rpython/rlib | |
parent | update how-to-release document (diff) | |
download | pypy-2798aa697ef635ce0b85d486dbb6b24b833d9e9e.tar.gz pypy-2798aa697ef635ce0b85d486dbb6b24b833d9e9e.tar.bz2 pypy-2798aa697ef635ce0b85d486dbb6b24b833d9e9e.zip |
back-port the rpython bits of "py3.7-rsre"
Diffstat (limited to 'rpython/rlib')
-rw-r--r-- | rpython/rlib/rsre/rpy/_sre.py | 3 | ||||
-rw-r--r-- | rpython/rlib/rsre/rpy/sre_constants.py | 40 | ||||
-rw-r--r-- | rpython/rlib/rsre/rsre_char.py | 70 | ||||
-rw-r--r-- | rpython/rlib/rsre/rsre_constants.py | 66 | ||||
-rw-r--r-- | rpython/rlib/rsre/rsre_core.py | 251 | ||||
-rw-r--r-- | rpython/rlib/rsre/rsre_utf8.py | 4 | ||||
-rw-r--r-- | rpython/rlib/rsre/test/test_char.py | 12 | ||||
-rw-r--r-- | rpython/rlib/rsre/test/test_match.py | 10 |
8 files changed, 327 insertions, 129 deletions
diff --git a/rpython/rlib/rsre/rpy/_sre.py b/rpython/rlib/rsre/rpy/_sre.py index 617345483a..70d7737297 100644 --- a/rpython/rlib/rsre/rpy/_sre.py +++ b/rpython/rlib/rsre/rpy/_sre.py @@ -22,6 +22,9 @@ def get_code(regexp, flags=0, allargs=False): """NOT_RPYTHON: you can't compile new regexps in an RPython program, you can only use precompiled ones""" from . import sre_compile + if rsre_constants.V37: + import pytest + pytest.skip("This test cannot run in a 3.7 branch of pypy") try: sre_compile.compile(regexp, flags) except GotIt as e: diff --git a/rpython/rlib/rsre/rpy/sre_constants.py b/rpython/rlib/rsre/rpy/sre_constants.py index 89cbdb0d5f..4b9deac743 100644 --- a/rpython/rlib/rsre/rpy/sre_constants.py +++ b/rpython/rlib/rsre/rpy/sre_constants.py @@ -94,35 +94,17 @@ CATEGORY_UNI_NOT_WORD = "category_uni_not_word" CATEGORY_UNI_LINEBREAK = "category_uni_linebreak" CATEGORY_UNI_NOT_LINEBREAK = "category_uni_not_linebreak" -OPCODES = [ - - # failure=0 success=1 (just because it looks better that way :-) - FAILURE, SUCCESS, - - ANY, ANY_ALL, - ASSERT, ASSERT_NOT, - AT, - BRANCH, - CALL, - CATEGORY, - CHARSET, BIGCHARSET, - GROUPREF, GROUPREF_EXISTS, GROUPREF_IGNORE, - IN, IN_IGNORE, - INFO, - JUMP, - LITERAL, LITERAL_IGNORE, - MARK, - MAX_UNTIL, - MIN_UNTIL, - NOT_LITERAL, NOT_LITERAL_IGNORE, - NEGATE, - RANGE, - REPEAT, - REPEAT_ONE, - SUBPATTERN, - MIN_REPEAT_ONE, - RANGE_IGNORE, -] +def _rpython_opcodes(): + from rpython.rlib.rsre import rsre_constants as consts + mapping = {} + for name, value in consts.__dict__.items(): + if name.startswith('OPCODE') and isinstance(value, int) and value < 70: + name = name[6:].lstrip('012346789_').lower() + mapping[value] = name + # check that there are no holes + assert sorted(mapping.keys()) == range(len(mapping)) + return [name for value, name in sorted(mapping.items())] +OPCODES = _rpython_opcodes() ATCODES = [ AT_BEGINNING, AT_BEGINNING_LINE, AT_BEGINNING_STRING, AT_BOUNDARY, diff --git a/rpython/rlib/rsre/rsre_char.py b/rpython/rlib/rsre/rsre_char.py index 1680d2973d..d5595d01db 100644 --- a/rpython/rlib/rsre/rsre_char.py +++ b/rpython/rlib/rsre/rsre_char.py @@ -45,10 +45,10 @@ else: # codesize. But sre_compile will compile some stuff differently depending on the # codesize (e.g., charsets). from rpython.rlib.runicode import MAXUNICODE -if MAXUNICODE == 65535: +if MAXUNICODE == 65535 and not consts.V37: CODESIZE = 2 else: - CODESIZE = 4 + CODESIZE = 4 # always 4 from py3.7 copyright = "_sre.py 2.4 Copyright 2005 by Nik Haldimann" @@ -57,16 +57,22 @@ BIG_ENDIAN = sys.byteorder == "big" def getlower_ascii(char_ord): return char_ord + int_between(ord('A'), char_ord, ord('Z') + 1) * (ord('a') - ord('A')) +def getlower_locale(char_ord): + if char_ord < 256: # cheating! Well, CPython does too. + char_ord = tolower(char_ord) + return char_ord + +def getlower_unicode(char_ord): + if char_ord < 128: # shortcut for ascii + return getlower_ascii(char_ord) + assert unicodedb is not None + return unicodedb.tolower(char_ord) + def getlower(char_ord, flags): if flags & consts.SRE_FLAG_LOCALE: - if char_ord < 256: # cheating! Well, CPython does too. - char_ord = tolower(char_ord) - return char_ord + char_ord = getlower_locale(char_ord) elif flags & consts.SRE_FLAG_UNICODE: - if char_ord < 128: # shortcut for ascii - return getlower_ascii(char_ord) - assert unicodedb is not None - char_ord = unicodedb.tolower(char_ord) + char_ord = getlower_unicode(char_ord) else: char_ord = getlower_ascii(char_ord) return char_ord @@ -74,20 +80,38 @@ def getlower(char_ord, flags): def getupper_ascii(char_ord): return char_ord - int_between(ord('a'), char_ord, ord('z') + 1) * (ord('a') - ord('A')) +def getupper_locale(char_ord): + if char_ord < 256: # cheating! Well, CPython does too. + char_ord = toupper(char_ord) + return char_ord + +def getupper_unicode(char_ord): + if char_ord < 128: # shortcut for ascii + return getupper_ascii(char_ord) + assert unicodedb is not None + return unicodedb.toupper(char_ord) + def getupper(char_ord, flags): if flags & consts.SRE_FLAG_LOCALE: - if char_ord < 256: # cheating! Well, CPython does too. - char_ord = toupper(char_ord) - return char_ord + char_ord = getupper_locale(char_ord) elif flags & consts.SRE_FLAG_UNICODE: - if char_ord < 128: # shortcut for ascii - return getupper_ascii(char_ord) - assert unicodedb is not None - char_ord = unicodedb.toupper(char_ord) + char_ord = getupper_unicode(char_ord) else: char_ord = getupper_ascii(char_ord) return char_ord +def iscased_ascii(char_ord): # used by py3.7 + upper = int_between(ord('A'), char_ord, ord('Z')+1) + lower = int_between(ord('a'), char_ord, ord('z')+1) + return upper | lower + +def iscased_unicode(char_ord): # used by py3.7 + # NOTE: this is not unicodedb.iscased(). As per CPython 3.7, it is + # something different which---as far as I can tell---doesn't really + # have a meaning on its own, but well. + return (char_ord != getlower_unicode(char_ord) or + char_ord != getupper_unicode(char_ord)) + #### Category helpers is_a_word = [(chr(i).isalnum() or chr(i) == '_') for i in range(256)] @@ -223,12 +247,22 @@ def set_range(ctx, pattern, index, char_code): def set_range_ignore(ctx, pattern, index, char_code): # <RANGE_IGNORE> <lower> <upper> # the char_code is already lower cased + assert not consts.V37 lower = pattern.pattern[index + 1] upper = pattern.pattern[index + 2] match1 = int_between(lower, char_code, upper + 1) match2 = int_between(lower, getupper(char_code, pattern.flags), upper + 1) return match1 | match2, index + 3 +def set_range_uni_ignore(ctx, pattern, index, char_code): + # <RANGE_UNI_IGNORE> <lower> <upper> + # the char_code is already lower cased + lower = pattern.pattern[index + 1] + upper = pattern.pattern[index + 2] + match1 = int_between(lower, char_code, upper + 1) + match2 = int_between(lower, getupper_unicode(char_code), upper + 1) + return match1 | match2, index + 3 + def set_bigcharset(ctx, pattern, index, char_code): # <BIGCHARSET> <blockcount> <256 blockindices> <blocks> count = pattern.pattern[index+1] @@ -300,7 +334,9 @@ set_dispatch_table = { consts.OPCODE_BIGCHARSET: set_bigcharset, consts.OPCODE_LITERAL: set_literal, consts.OPCODE_RANGE: set_range, - consts.OPCODE_RANGE_IGNORE: set_range_ignore, + consts.OPCODE27_RANGE_IGNORE: set_range_ignore, + consts.OPCODE37_RANGE_UNI_IGNORE: set_range_uni_ignore, consts.OPCODE_UNICODE_GENERAL_CATEGORY: set_unicode_general_category, } +set_dispatch_table.pop(None, None) # remove the OPCODE27_* or OPCODE37_* set_dispatch_unroll = unrolling_iterable(sorted(set_dispatch_table.items())) diff --git a/rpython/rlib/rsre/rsre_constants.py b/rpython/rlib/rsre/rsre_constants.py index 9af708532a..9994db7b05 100644 --- a/rpython/rlib/rsre/rsre_constants.py +++ b/rpython/rlib/rsre/rsre_constants.py @@ -1,3 +1,15 @@ +# Horrible import-time hack. +# Blame CPython for renumbering these OPCODE_* at some point. +from rpython.rlib.objectmodel import specialize +try: + import pypy.module.sys.version + V37 = pypy.module.sys.version.CPYTHON_VERSION >= (3, 7) +except ImportError: + raise ImportError("Cannot import pypy.module.sys.version. You can safely " + "remove this 'raise' line if you are not interested in " + "PyPy but only RPython.") + V37 = False + OPCODE_FAILURE = 0 OPCODE_SUCCESS = 1 OPCODE_ANY = 2 @@ -6,35 +18,49 @@ OPCODE_ASSERT = 4 OPCODE_ASSERT_NOT = 5 OPCODE_AT = 6 OPCODE_BRANCH = 7 -#OPCODE_CALL = 8 +OPCODE_CALL = 8 # not used OPCODE_CATEGORY = 9 OPCODE_CHARSET = 10 OPCODE_BIGCHARSET = 11 OPCODE_GROUPREF = 12 OPCODE_GROUPREF_EXISTS = 13 -OPCODE_GROUPREF_IGNORE = 14 -OPCODE_IN = 15 -OPCODE_IN_IGNORE = 16 -OPCODE_INFO = 17 -OPCODE_JUMP = 18 -OPCODE_LITERAL = 19 -OPCODE_LITERAL_IGNORE = 20 -OPCODE_MARK = 21 -OPCODE_MAX_UNTIL = 22 -OPCODE_MIN_UNTIL = 23 -OPCODE_NOT_LITERAL = 24 -OPCODE_NOT_LITERAL_IGNORE = 25 -OPCODE_NEGATE = 26 -OPCODE_RANGE = 27 -OPCODE_REPEAT = 28 -OPCODE_REPEAT_ONE = 29 -#OPCODE_SUBPATTERN = 30 -OPCODE_MIN_REPEAT_ONE = 31 -OPCODE_RANGE_IGNORE = 32 +OPCODE_GROUPREF_IGNORE = 28 if V37 else 14 +OPCODE_IN = 14 if V37 else 15 +OPCODE_IN_IGNORE = 29 if V37 else 16 +OPCODE_INFO = 15 if V37 else 17 +OPCODE_JUMP = 16 if V37 else 18 +OPCODE_LITERAL = 17 if V37 else 19 +OPCODE_LITERAL_IGNORE = 30 if V37 else 20 +OPCODE_MARK = 18 if V37 else 21 +OPCODE_MAX_UNTIL = 19 if V37 else 22 +OPCODE_MIN_UNTIL = 20 if V37 else 23 +OPCODE_NOT_LITERAL = 21 if V37 else 24 +OPCODE_NOT_LITERAL_IGNORE = 31 if V37 else 25 +OPCODE_NEGATE = 22 if V37 else 26 +OPCODE_RANGE = 23 if V37 else 27 +OPCODE_REPEAT = 24 if V37 else 28 +OPCODE_REPEAT_ONE = 25 if V37 else 29 +OPCODE_SUBPATTERN = 26 if V37 else 30 # not used +OPCODE_MIN_REPEAT_ONE = 27 if V37 else 31 +OPCODE27_RANGE_IGNORE = None if V37 else 32 + +OPCODE37_GROUPREF_LOC_IGNORE = 32 if V37 else None +OPCODE37_IN_LOC_IGNORE = 33 if V37 else None +OPCODE37_LITERAL_LOC_IGNORE = 34 if V37 else None +OPCODE37_NOT_LITERAL_LOC_IGNORE = 35 if V37 else None +OPCODE37_GROUPREF_UNI_IGNORE = 36 if V37 else None +OPCODE37_IN_UNI_IGNORE = 37 if V37 else None +OPCODE37_LITERAL_UNI_IGNORE = 38 if V37 else None +OPCODE37_NOT_LITERAL_UNI_IGNORE = 39 if V37 else None +OPCODE37_RANGE_UNI_IGNORE = 40 if V37 else None # not used by Python itself OPCODE_UNICODE_GENERAL_CATEGORY = 70 +@specialize.argtype(1) +def eq(op, const): + return const is not None and op == const + AT_BEGINNING = 0 AT_BEGINNING_LINE = 1 diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py index 489636b783..3ce901c46a 100644 --- a/rpython/rlib/rsre/rsre_core.py +++ b/rpython/rlib/rsre/rsre_core.py @@ -55,7 +55,8 @@ class CompiledPattern(object): def __init__(self, pattern, flags): self.pattern = pattern - self.flags = flags + if not consts.V37: # 'flags' is ignored in >=3.7 mode + self.flags = flags # check we don't get the old value of MAXREPEAT # during the untranslated tests. # On python3, MAXCODE can appear in patterns. It will be 65535 @@ -63,6 +64,29 @@ class CompiledPattern(object): if not we_are_translated() and rsre_char.CODESIZE != 2: assert 65535 not in pattern + def lowa(self, char_ord): + """Pre-3.7: uses getlower(flags). + Post-3.7: this is always getlower_ascii(). + """ + if not consts.V37: + return rsre_char.getlower(char_ord, self.flags) + else: + return rsre_char.getlower_ascii(char_ord) + + def char_loc_ignore(self, index, char_ord): + assert consts.V37 + pattern = self.pat(index) + return (char_ord == pattern or + rsre_char.getlower_locale(char_ord) == pattern or + rsre_char.getupper_locale(char_ord) == pattern) + + def charset_loc_ignore(self, ctx, ppos, char_ord): + lo = rsre_char.getlower_locale(char_ord) + if rsre_char.check_charset(ctx, self, ppos, lo): + return True + up = rsre_char.getupper_locale(char_ord) + return up != lo and rsre_char.check_charset(ctx, self, ppos, up) + def pat(self, index): jit.promote(self) check_nonneg(index) @@ -74,6 +98,10 @@ class CompiledPattern(object): assert result >= 0 return result +MODE_ANY = '\x00' # an empty match is fine +MODE_NONEMPTY = '\x01' # must have a non-empty match +MODE_FULL = '\x02' # must match the whole string + class AbstractMatchContext(object): """Abstract base class""" _immutable_fields_ = ['end'] @@ -81,7 +109,7 @@ class AbstractMatchContext(object): match_end = 0 match_marks = None match_marks_flat = None - fullmatch_only = False + match_mode = MODE_ANY def __init__(self, match_start, end): # 'match_start' and 'end' must be known to be non-negative @@ -91,25 +119,30 @@ class AbstractMatchContext(object): self.match_start = match_start self.end = end - def reset(self, start): + def reset(self, start, must_advance=False): self.match_start = start self.match_marks = None self.match_marks_flat = None + # + assert MODE_ANY == chr(False) + assert MODE_NONEMPTY == chr(True) + self.match_mode = chr(must_advance) + + @not_rpython + def _fullmatch_only(self, x=None): + raise Exception("'ctx.fullmatch_only' was replaced with" + " 'ctx.match_mode'") + fullmatch_only = property(_fullmatch_only, _fullmatch_only) @not_rpython def str(self, index): """Must be overridden in a concrete subclass. - The tag ^^^ here is used to generate a translation-time crash + The @not_rpython is used to generate a translation-time crash if there is a call to str() that is indirect. All calls must be direct for performance reasons; you need to specialize the caller with @specializectx.""" raise NotImplementedError - @not_rpython - def lowstr(self, index, flags): - """Similar to str().""" - raise NotImplementedError - # The following methods are provided to be overriden in # Utf8MatchContext. The non-utf8 implementation is provided # by the FixedMatchContext abstract subclass, in order to use @@ -236,10 +269,6 @@ class BufMatchContext(FixedMatchContext): check_nonneg(index) return ord(self._buffer.getitem(index)) - def lowstr(self, index, flags): - c = self.str(index) - return rsre_char.getlower(c, flags) - def fresh_copy(self, start): return BufMatchContext(self._buffer, start, self.end) @@ -261,10 +290,6 @@ class StrMatchContext(FixedMatchContext): check_nonneg(index) return ord(self._string[index]) - def lowstr(self, index, flags): - c = self.str(index) - return rsre_char.getlower(c, flags) - def fresh_copy(self, start): return StrMatchContext(self._string, start, self.end) @@ -289,10 +314,6 @@ class UnicodeMatchContext(FixedMatchContext): check_nonneg(index) return ord(self._unicodestr[index]) - def lowstr(self, index, flags): - c = self.str(index) - return rsre_char.getlower(c, flags) - def fresh_copy(self, start): return UnicodeMatchContext(self._unicodestr, start, self.end) @@ -599,9 +620,13 @@ def sre_match(ctx, pattern, ppos, ptr, marks): return elif op == consts.OPCODE_SUCCESS: - if ctx.fullmatch_only: + mode = ctx.match_mode + if mode == MODE_FULL: if ptr != ctx.end: return # not a full match + elif mode == MODE_NONEMPTY: + if ptr == ctx.match_start: + return # empty match ctx.match_end = ptr ctx.match_marks = marks return MATCHED_OK @@ -633,10 +658,10 @@ def sre_match(ctx, pattern, ppos, ptr, marks): ptr1 = ctx.prev_n(ptr, pattern.pat(ppos+1), ctx.ZERO) except EndOfString: return - saved = ctx.fullmatch_only - ctx.fullmatch_only = False + saved = ctx.match_mode + ctx.match_mode = MODE_ANY stop = sre_match(ctx, pattern, ppos + 2, ptr1, marks) is None - ctx.fullmatch_only = saved + ctx.match_mode = saved if stop: return marks = ctx.match_marks @@ -651,10 +676,10 @@ def sre_match(ctx, pattern, ppos, ptr, marks): except EndOfString: pass else: - saved = ctx.fullmatch_only - ctx.fullmatch_only = False + saved = ctx.match_mode + ctx.match_mode = MODE_ANY stop = sre_match(ctx, pattern, ppos + 2, ptr1, marks) is not None - ctx.fullmatch_only = saved + ctx.match_mode = saved if stop: return ppos += pattern.pat(ppos) @@ -699,7 +724,29 @@ def sre_match(ctx, pattern, ppos, ptr, marks): startptr, length_bytes = get_group_ref(ctx, marks, pattern.pat(ppos)) if length_bytes < 0: return # group was not previously defined - ptr = match_repeated_ignore(ctx, ptr, startptr, length_bytes, pattern.flags) + ptr = match_repeated_ignore(ctx, ptr, startptr, length_bytes, pattern) + if ptr < ctx.ZERO: + return # no match + ppos += 1 + + elif consts.eq(op, consts.OPCODE37_GROUPREF_UNI_IGNORE): + # unicode version of OPCODE_GROUPREF_IGNORE + # <GROUPREF> <groupnum> + startptr, length_bytes = get_group_ref(ctx, marks, pattern.pat(ppos)) + if length_bytes < 0: + return # group was not previously defined + ptr = match_repeated_uni_ignore(ctx, ptr, startptr, length_bytes) + if ptr < ctx.ZERO: + return # no match + ppos += 1 + + elif consts.eq(op, consts.OPCODE37_GROUPREF_LOC_IGNORE): + # locale version of OPCODE_GROUPREF_IGNORE + # <GROUPREF> <groupnum> + startptr, length_bytes = get_group_ref(ctx, marks, pattern.pat(ppos)) + if length_bytes < 0: + return # group was not previously defined + ptr = match_repeated_loc_ignore(ctx, ptr, startptr, length_bytes) if ptr < ctx.ZERO: return # no match ppos += 1 @@ -726,7 +773,25 @@ def sre_match(ctx, pattern, ppos, ptr, marks): # match set member (or non_member), ignoring case # <IN> <skip> <set> if ptr >= ctx.end or not rsre_char.check_charset(ctx, pattern, ppos+1, - ctx.lowstr(ptr, pattern.flags)): + pattern.lowa(ctx.str(ptr))): + return + ppos += pattern.pat(ppos) + ptr = ctx.next(ptr) + + elif consts.eq(op, consts.OPCODE37_IN_UNI_IGNORE): + # match set member (or non_member), ignoring case, unicode mode + # <IN> <skip> <set> + if ptr >= ctx.end or not rsre_char.check_charset(ctx, pattern, ppos+1, + rsre_char.getlower_unicode(ctx.str(ptr))): + return + ppos += pattern.pat(ppos) + ptr = ctx.next(ptr) + + elif consts.eq(op, consts.OPCODE37_IN_LOC_IGNORE): + # match set member (or non_member), ignoring case, locale mode + # <IN> <skip> <set> + if ptr >= ctx.end or not pattern.charset_loc_ignore(ctx, ppos+1, + ctx.str(ptr)): return ppos += pattern.pat(ppos) ptr = ctx.next(ptr) @@ -752,7 +817,23 @@ def sre_match(ctx, pattern, ppos, ptr, marks): elif op == consts.OPCODE_LITERAL_IGNORE: # match literal string, ignoring case # <LITERAL_IGNORE> <code> - if ptr >= ctx.end or ctx.lowstr(ptr, pattern.flags) != pattern.pat(ppos): + if ptr >= ctx.end or pattern.lowa(ctx.str(ptr)) != pattern.pat(ppos): + return + ppos += 1 + ptr = ctx.next(ptr) + + elif consts.eq(op, consts.OPCODE37_LITERAL_UNI_IGNORE): + # match literal string, ignoring case, unicode mode + # <LITERAL_IGNORE> <code> + if ptr >= ctx.end or rsre_char.getlower_unicode(ctx.str(ptr)) != pattern.pat(ppos): + return + ppos += 1 + ptr = ctx.next(ptr) + + elif consts.eq(op, consts.OPCODE37_LITERAL_LOC_IGNORE): + # match literal string, ignoring case, locale mode + # <LITERAL_IGNORE> <code> + if ptr >= ctx.end or not pattern.char_loc_ignore(ppos, ctx.str(ptr)): return ppos += 1 ptr = ctx.next(ptr) @@ -775,7 +856,23 @@ def sre_match(ctx, pattern, ppos, ptr, marks): elif op == consts.OPCODE_NOT_LITERAL_IGNORE: # match if it's not a literal string, ignoring case # <NOT_LITERAL> <code> - if ptr >= ctx.end or ctx.lowstr(ptr, pattern.flags) == pattern.pat(ppos): + if ptr >= ctx.end or pattern.lowa(ctx.str(ptr)) == pattern.pat(ppos): + return + ppos += 1 + ptr = ctx.next(ptr) + + elif consts.eq(op, consts.OPCODE37_NOT_LITERAL_UNI_IGNORE): + # match if it's not a literal string, ignoring case, unicode mode + # <NOT_LITERAL> <code> + if ptr >= ctx.end or rsre_char.getlower_unicode(ctx.str(ptr)) == pattern.pat(ppos): + return + ppos += 1 + ptr = ctx.next(ptr) + + elif consts.eq(op, consts.OPCODE37_NOT_LITERAL_LOC_IGNORE): + # match if it's not a literal string, ignoring case, locale mode + # <NOT_LITERAL> <code> + if ptr >= ctx.end or pattern.char_loc_ignore(ppos, ctx.str(ptr)): return ppos += 1 ptr = ctx.next(ptr) @@ -883,12 +980,36 @@ def match_repeated(ctx, ptr, oldptr, length_bytes): return True @specializectx -def match_repeated_ignore(ctx, ptr, oldptr, length_bytes, flags): +def match_repeated_ignore(ctx, ptr, oldptr, length_bytes, pattern): + oldend = ctx.go_forward_by_bytes(oldptr, length_bytes) + while oldptr < oldend: + if ptr >= ctx.end: + return -1 + if pattern.lowa(ctx.str(ptr)) != pattern.lowa(ctx.str(oldptr)): + return -1 + ptr = ctx.next(ptr) + oldptr = ctx.next(oldptr) + return ptr + +@specializectx +def match_repeated_uni_ignore(ctx, ptr, oldptr, length_bytes): + oldend = ctx.go_forward_by_bytes(oldptr, length_bytes) + while oldptr < oldend: + if ptr >= ctx.end: + return -1 + if rsre_char.getlower_unicode(ctx.str(ptr)) != rsre_char.getlower_unicode(ctx.str(oldptr)): + return -1 + ptr = ctx.next(ptr) + oldptr = ctx.next(oldptr) + return ptr + +@specializectx +def match_repeated_loc_ignore(ctx, ptr, oldptr, length_bytes): oldend = ctx.go_forward_by_bytes(oldptr, length_bytes) while oldptr < oldend: if ptr >= ctx.end: return -1 - if ctx.lowstr(ptr, flags) != ctx.lowstr(oldptr, flags): + if rsre_char.getlower_locale(ctx.str(ptr)) != rsre_char.getlower_locale(ctx.str(oldptr)): return -1 ptr = ctx.next(ptr) oldptr = ctx.next(oldptr) @@ -955,54 +1076,63 @@ def match_IN(ctx, pattern, ptr, ppos): return rsre_char.check_charset(ctx, pattern, ppos+2, ctx.str(ptr)) @specializectx def match_IN_IGNORE(ctx, pattern, ptr, ppos): - return rsre_char.check_charset(ctx, pattern, ppos+2, ctx.lowstr(ptr, pattern.flags)) + return rsre_char.check_charset(ctx, pattern, ppos+2, pattern.lowa(ctx.str(ptr))) +@specializectx +def match_IN_UNI_IGNORE(ctx, pattern, ptr, ppos): + return rsre_char.check_charset(ctx, pattern, ppos+2, rsre_char.getlower_unicode(ctx.str(ptr))) +@specializectx +def match_IN_LOC_IGNORE(ctx, pattern, ptr, ppos): + return pattern.charset_loc_ignore(ctx, ppos+2, ctx.str(ptr)) @specializectx def match_LITERAL(ctx, pattern, ptr, ppos): return ctx.str(ptr) == pattern.pat(ppos+1) @specializectx def match_LITERAL_IGNORE(ctx, pattern, ptr, ppos): - return ctx.lowstr(ptr, pattern.flags) == pattern.pat(ppos+1) + return pattern.lowa(ctx.str(ptr)) == pattern.pat(ppos+1) +@specializectx +def match_LITERAL_UNI_IGNORE(ctx, pattern, ptr, ppos): + return rsre_char.getlower_unicode(ctx.str(ptr)) == pattern.pat(ppos+1) +@specializectx +def match_LITERAL_LOC_IGNORE(ctx, pattern, ptr, ppos): + return pattern.char_loc_ignore(ppos+1, ctx.str(ptr)) @specializectx def match_NOT_LITERAL(ctx, pattern, ptr, ppos): return ctx.str(ptr) != pattern.pat(ppos+1) @specializectx def match_NOT_LITERAL_IGNORE(ctx, pattern, ptr, ppos): - return ctx.lowstr(ptr, pattern.flags) != pattern.pat(ppos+1) + return pattern.lowa(ctx.str(ptr)) != pattern.pat(ppos+1) +@specializectx +def match_NOT_LITERAL_UNI_IGNORE(ctx, pattern, ptr, ppos): + return rsre_char.getlower_unicode(ctx.str(ptr)) != pattern.pat(ppos+1) +@specializectx +def match_NOT_LITERAL_LOC_IGNORE(ctx, pattern, ptr, ppos): + return not pattern.char_loc_ignore(ppos+1, ctx.str(ptr)) def _make_fre(checkerfn): if checkerfn == match_ANY_ALL: def fre(ctx, pattern, ptr, end, ppos): return end - elif checkerfn == match_IN: - install_jitdriver_spec('MatchIn', + elif checkerfn in (match_IN, match_IN_IGNORE, match_IN_UNI_IGNORE): + # produces three jitdrivers: + # MatchIn + # MatchInIgnore + # MatchInUniIgnore + name = checkerfn.__name__.title().replace('_', '') + method_name = "jitdriver_" + name + install_jitdriver_spec(name, greens=['ppos', 'pattern'], reds=['ptr', 'end', 'ctx'], debugprint=(1, 0)) @specializectx def fre(ctx, pattern, ptr, end, ppos): while True: - ctx.jitdriver_MatchIn.jit_merge_point(ctx=ctx, ptr=ptr, + getattr(ctx, method_name).jit_merge_point(ctx=ctx, ptr=ptr, end=end, ppos=ppos, pattern=pattern) if ptr < end and checkerfn(ctx, pattern, ptr, ppos): ptr = ctx.next(ptr) else: return ptr - elif checkerfn == match_IN_IGNORE: - install_jitdriver_spec('MatchInIgnore', - greens=['ppos', 'pattern'], - reds=['ptr', 'end', 'ctx'], - debugprint=(1, 0)) - @specializectx - def fre(ctx, pattern, ptr, end, ppos): - while True: - ctx.jitdriver_MatchInIgnore.jit_merge_point(ctx=ctx, ptr=ptr, - end=end, ppos=ppos, - pattern=pattern) - if ptr < end and checkerfn(ctx, pattern, ptr, ppos): - ptr = ctx.next(ptr) - else: - return ptr else: # in the other cases, the fre() function is not JITted at all # and is present as a residual call. @@ -1019,11 +1149,19 @@ unroll_char_checker = [ (consts.OPCODE_ANY_ALL, match_ANY_ALL), (consts.OPCODE_IN, match_IN), (consts.OPCODE_IN_IGNORE, match_IN_IGNORE), + (consts.OPCODE37_IN_UNI_IGNORE, match_IN_UNI_IGNORE), + (consts.OPCODE37_IN_LOC_IGNORE, match_IN_LOC_IGNORE), (consts.OPCODE_LITERAL, match_LITERAL), (consts.OPCODE_LITERAL_IGNORE, match_LITERAL_IGNORE), + (consts.OPCODE37_LITERAL_UNI_IGNORE, match_LITERAL_UNI_IGNORE), + (consts.OPCODE37_LITERAL_LOC_IGNORE, match_LITERAL_LOC_IGNORE), (consts.OPCODE_NOT_LITERAL, match_NOT_LITERAL), (consts.OPCODE_NOT_LITERAL_IGNORE, match_NOT_LITERAL_IGNORE), + (consts.OPCODE37_NOT_LITERAL_UNI_IGNORE, match_NOT_LITERAL_UNI_IGNORE), + (consts.OPCODE37_NOT_LITERAL_LOC_IGNORE, match_NOT_LITERAL_LOC_IGNORE), ] +unroll_char_checker = [(_op, _fn) for (_op, _fn) in unroll_char_checker + if _op is not None] # possibly removes the OPCODE37_* unroll_fre_checker = [(_op, _make_fre(_fn)) for (_op, _fn) in unroll_char_checker] @@ -1119,7 +1257,8 @@ def match(pattern, string, start=0, end=sys.maxint, fullmatch=False): assert isinstance(pattern, CompiledPattern) start, end = _adjust(start, end, len(string)) ctx = StrMatchContext(string, start, end) - ctx.fullmatch_only = fullmatch + if fullmatch: + ctx.match_mode = MODE_FULL if match_context(ctx, pattern): return ctx else: diff --git a/rpython/rlib/rsre/rsre_utf8.py b/rpython/rlib/rsre/rsre_utf8.py index 834748ebaa..aaac302ac0 100644 --- a/rpython/rlib/rsre/rsre_utf8.py +++ b/rpython/rlib/rsre/rsre_utf8.py @@ -20,10 +20,6 @@ class Utf8MatchContext(AbstractMatchContext): check_nonneg(index) return rutf8.codepoint_at_pos(self._utf8, index) - def lowstr(self, index, flags): - c = self.str(index) - return rsre_char.getlower(c, flags) - def get_single_byte(self, base_position, index): return self._utf8[base_position + index] diff --git a/rpython/rlib/rsre/test/test_char.py b/rpython/rlib/rsre/test/test_char.py index bd3a6f2936..ef6a573b9e 100644 --- a/rpython/rlib/rsre/test/test_char.py +++ b/rpython/rlib/rsre/test/test_char.py @@ -204,3 +204,15 @@ def test_general_category(): assert check_charset(pat, 0, 99) # Lcheck_charset(pat, 0, 453) # Lt assert not check_charset(pat, 0, 688) # Lm assert not check_charset(pat, 0, 5870) # Nl + +def test_iscased(): + assert rsre_char.iscased_ascii(65) + assert rsre_char.iscased_ascii(100) + assert rsre_char.iscased_ascii(64) is False + assert rsre_char.iscased_ascii(126) is False + assert rsre_char.iscased_ascii(1260) is False + assert rsre_char.iscased_ascii(12600) is False + for i in range(65, 10000, 33): + assert rsre_char.iscased_unicode(i) == ( + rsre_char.getlower_unicode(i) != i or + rsre_char.getupper_unicode(i) != i) diff --git a/rpython/rlib/rsre/test/test_match.py b/rpython/rlib/rsre/test/test_match.py index c832244b11..758c015f7a 100644 --- a/rpython/rlib/rsre/test/test_match.py +++ b/rpython/rlib/rsre/test/test_match.py @@ -1,5 +1,5 @@ import re, random, py -from rpython.rlib.rsre import rsre_char +from rpython.rlib.rsre import rsre_char, rsre_constants from rpython.rlib.rsre.rpy import get_code, VERSION from rpython.rlib.rsre.test.support import match, fullmatch, Position as P @@ -306,6 +306,10 @@ class TestMatch: rsre_char.set_unicode_db(unicodedb) # r = get_code(u"[\U00010428-\U0001044f]", re.I) - assert r.pattern.count(27) == 1 # OPCODE_RANGE - r.pattern[r.pattern.index(27)] = 32 # => OPCODE_RANGE_IGNORE + assert r.pattern.count(rsre_constants.OPCODE_RANGE) == 1 + if rsre_constants.V37: + repl = rsre_constants.OPCODE37_RANGE_UNI_IGNORE + else: + repl = rsre_constants.OPCODE27_RANGE_IGNORE + r.pattern[r.pattern.index(rsre_constants.OPCODE_RANGE)] = repl assert match(r, u"\U00010428") |