diff options
author | Ronan Lamy <ronan.lamy@gmail.com> | 2020-11-25 20:25:40 +0000 |
---|---|---|
committer | Ronan Lamy <ronan.lamy@gmail.com> | 2020-11-25 20:25:40 +0000 |
commit | 3c2bacc346bf0557e146c909b1723568ac2f02d2 (patch) | |
tree | f83daf6cb1c7449b78984e9a9c2ee358b67fa4ac | |
parent | hg merge default (diff) | |
parent | fix for a case where the difference shows up with recent unicode databases only (diff) | |
download | pypy-3c2bacc346bf0557e146c909b1723568ac2f02d2.tar.gz pypy-3c2bacc346bf0557e146c909b1723568ac2f02d2.tar.bz2 pypy-3c2bacc346bf0557e146c909b1723568ac2f02d2.zip |
hg merge default
-rw-r--r-- | .gitlab-ci.yml | 7 | ||||
-rw-r--r-- | .gitlab-ci/Dockerfile | 28 | ||||
-rw-r--r-- | .gitlab-ci/build.sh | 10 | ||||
-rw-r--r-- | .hgtags | 6 | ||||
-rw-r--r-- | lib-python/2.7/test/test_xml_etree.py | 20 | ||||
-rw-r--r-- | lib-python/2.7/xml/etree/ElementTree.py | 2 | ||||
-rw-r--r-- | pypy/doc/contributing.rst | 31 | ||||
-rw-r--r-- | pypy/doc/how-to-release.rst | 8 | ||||
-rw-r--r-- | pypy/doc/project-ideas.rst | 34 | ||||
-rw-r--r-- | pypy/doc/release-v7.3.3.rst | 27 | ||||
-rw-r--r-- | pypy/doc/whatsnew-head.rst | 11 | ||||
-rwxr-xr-x | pypy/tool/release/repackage.sh | 8 | ||||
-rw-r--r-- | pypy/tool/test/test_tab.py | 2 | ||||
-rw-r--r-- | rpython/rlib/rsre/rpy/_sre.py | 3 | ||||
-rw-r--r-- | rpython/rlib/rsre/rpy/sre_constants.py | 40 | ||||
-rw-r--r-- | rpython/rlib/rsre/rsre_char.py | 76 | ||||
-rw-r--r-- | rpython/rlib/rsre/rsre_constants.py | 66 | ||||
-rw-r--r-- | rpython/rlib/rsre/rsre_core.py | 251 | ||||
-rw-r--r-- | rpython/rlib/rsre/rsre_utf8.py | 9 | ||||
-rw-r--r-- | rpython/rlib/rsre/test/support.py | 4 | ||||
-rw-r--r-- | rpython/rlib/rsre/test/test_char.py | 12 | ||||
-rw-r--r-- | rpython/rlib/rsre/test/test_match.py | 10 | ||||
-rw-r--r-- | rpython/rlib/rwinreg.py | 40 |
23 files changed, 492 insertions, 213 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b5964fff08..7ac5eb3c13 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,4 +1,5 @@ check_rpython_annotations: - image: octobus/ci-for-pypy - script: - - (cd pypy/goal; python2 ../../rpython/bin/rpython --batch -O2 --annotate targetpypystandalone) + image: registry.heptapod.net/pypy/pypy/ci:v1 + script: | + cd pypy/goal + python2 ../../rpython/bin/rpython --batch -O2 --annotate targetpypystandalone diff --git a/.gitlab-ci/Dockerfile b/.gitlab-ci/Dockerfile new file mode 100644 index 0000000000..a2cfc4cc22 --- /dev/null +++ b/.gitlab-ci/Dockerfile @@ -0,0 +1,28 @@ +FROM debian:buster + +RUN apt-get update && \ + apt-get install -y \ + build-essential \ + gcc \ + libbz2-dev \ + libexpat1-dev \ + libffi-dev \ + libgc-dev \ + libgdbm-dev \ + liblzma-dev \ + libncurses5-dev \ + libncursesw5-dev \ + libsqlite3-dev \ + libssl-dev \ + make \ + pkg-config \ + pypy-dev \ + python \ + python-cffi \ + python-dev \ + python-pip \ + python-virtualenv \ + tk-dev \ + zlib1g-dev \ + && \ + apt-get -y clean diff --git a/.gitlab-ci/build.sh b/.gitlab-ci/build.sh new file mode 100644 index 0000000000..68859cbc7a --- /dev/null +++ b/.gitlab-ci/build.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set -e + +GITLAB="registry.heptapod.net" +TAG="${GITLAB}/pypy/pypy/ci:v1" + +docker build --tag "${TAG}" --file "Dockerfile" . + +echo -e "run: \n docker login ${GITLAB} # first time\n docker push ${TAG}"
\ No newline at end of file @@ -95,3 +95,9 @@ d38cd66c14b86430155e2a122e59648e93011cc0 release-pypy3.6-v7.3.2 a29ef73f9b32953753d0dd6d2a56255fa2892e24 release-pypy2.7-v7.3.3rc1 fab92f174c7754272e9bef31d2a6d66d8d45188b release-pypy3.6-v7.3.3rc1 6cf8fa20a7f6fc61dc07402e1e859cf31bf977ad release-pypy3.7-v7.3.3rc1 +a29ef73f9b32953753d0dd6d2a56255fa2892e24 release-pypy2.7-v7.3.3rc2 +db1e853f94de42ad711bd930222bd2434e0f900d release-pypy3.6-v7.3.3rc2 +7e6e2bb30ac5fbdbd443619cae28c51d5c162a02 release-pypy3.7-v7.3.3rc2 +a29ef73f9b32953753d0dd6d2a56255fa2892e24 release-pypy2.7-v7.3.3 +db1e853f94de42ad711bd930222bd2434e0f900d release-pypy3.6-v7.3.3 +7e6e2bb30ac5fbdbd443619cae28c51d5c162a02 release-pypy3.7-v7.3.3 diff --git a/lib-python/2.7/test/test_xml_etree.py b/lib-python/2.7/test/test_xml_etree.py index f6d5d17fc3..f1f1c21cd1 100644 --- a/lib-python/2.7/test/test_xml_etree.py +++ b/lib-python/2.7/test/test_xml_etree.py @@ -87,6 +87,19 @@ ENTITY_XML = """\ <document>&entity;</document> """ +# backport from https://github.com/python/cpython/pull/22987 +ATTLIST_XML = """\ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE Foo [ +<!ELEMENT foo (bar*)> +<!ELEMENT bar (#PCDATA)*> +<!ATTLIST bar xml:lang CDATA "eng"> +<!ENTITY qux "quux"> +]> +<foo> +<bar>&qux;</bar> +</foo> +""" def checkwarnings(*filters): def decorator(test): @@ -1002,6 +1015,13 @@ class ElementTreeTest(unittest.TestCase): method='html') self.assertEqual(serialized, expected) + # backported from https://github.com/python/cpython/pull/22987 + def test_attlist_default(self): + # Test default attribute values; See BPO 42151. + root = ET.fromstring(ATTLIST_XML) + self.assertEqual(root[0].attrib, + {'{http://www.w3.org/XML/1998/namespace}lang': 'eng'}) + # # xinclude tests (samples from appendix C of the xinclude specification) diff --git a/lib-python/2.7/xml/etree/ElementTree.py b/lib-python/2.7/xml/etree/ElementTree.py index dca69106d1..4cae355398 100644 --- a/lib-python/2.7/xml/etree/ElementTree.py +++ b/lib-python/2.7/xml/etree/ElementTree.py @@ -1226,7 +1226,6 @@ class _IterParseIterator(object): if event == "start": try: parser.ordered_attributes = 1 - parser.specified_attributes = 1 def handler(tag, attrib_in, event=event, append=append, start=self._parser._start_list): append((event, start(tag, attrib_in))) @@ -1505,7 +1504,6 @@ class XMLParser(object): # use new-style attribute handling, if supported try: self._parser.ordered_attributes = 1 - self._parser.specified_attributes = 1 parser.StartElementHandler = self._start_list except AttributeError: pass diff --git a/pypy/doc/contributing.rst b/pypy/doc/contributing.rst index f5477e5805..959cecb065 100644 --- a/pypy/doc/contributing.rst +++ b/pypy/doc/contributing.rst @@ -40,14 +40,16 @@ details of which can be found in our :ref:`contact <contact>` section. The folks there are very friendly, and can point you in the right direction. We give out commit rights usually fairly liberally, so if you want to do something -with PyPy, you can become a committer. We also run frequent coding sprints which -are separately announced and often happen around Python conferences such as -EuroPython or PyCon. Upcoming events are usually announced on `the blog`_. +with PyPy, you can become a "developer" by logging into https://foss.heptapod.net +and clicking the "Request Access" link on the `PyPy group page`. We also run +coding sprints which are separately announced and are usually announced on `the +blog`_. Further Reading: :ref:`Contact <contact>` .. _the blog: https://morepypy.blogspot.com .. _pypy-dev mailing list: https://mail.python.org/mailman/listinfo/pypy-dev +.. _`PyPy group page`: https://foss.heptapod.net/pypy Your first contribution @@ -96,31 +98,19 @@ Thanks to `Octobus <https://octobus.net/>`_ and `Clever Cloud </a> </h1> -If you are new with Mercurial and Heptapod, you can read this `short tutorial`_ - -.. _`short tutorial`: https://heptapod.net/pages/quick-start-guide.html - -However, we recommend at this time you **not** use topic branches. We prefer -the usual mercurial named branch model, as pointed out in the :ref:`FAQ -<github>` about why we didn't move to git. - Get Access ---------- -The important take-away from that tutorial for experienced developers is that -since the free hosting on foss.heptapod.net does not allow personal forks, you +As stated above, you need to request access to the repo. +Since the free hosting on foss.heptapod.net does not allow personal forks, you need permissions to push your changes directly to our repo. Once you sign in to https://foss.heptapod.net using either a new login or your GitHub or Atlassian logins, you can get developer status for pushing directly to the project (just ask by clicking the link at foss.heptapod.net/pypy just under the logo, and you'll get it, basically). Once you have it you can rewrite your file ``.hg/hgrc`` to contain ``default = ssh://hg@foss.heptapod.net/pypy/pypy``. -Your changes will then be pushed directly to the official repo, but (if you -follow these rules) they are still on a branch, and we can still review the -branches you want to merge. With developer status, you can push topic -branches. If you wish to push long-lived branches, you will need to ask for -higher permissions. - +Your changes will then be pushed directly to a branch on the official repo, and +we will review the branches you want to merge. Clone ----- @@ -138,8 +128,7 @@ Clone then edit ``.hg/hgrc`` as above and do ``hg pull && hg up``. * Now you have a complete copy of the PyPy repo. Make a long-lived branch - with a command like ``hg branch name_of_your_branch``, or make a short- - lived branch for a simple fix with a command like ``hg topic issueXXXX``. + with a command like ``hg branch name_of_your_branch``. Edit ---- diff --git a/pypy/doc/how-to-release.rst b/pypy/doc/how-to-release.rst index 43cc823fcb..81d6fe0ee2 100644 --- a/pypy/doc/how-to-release.rst +++ b/pypy/doc/how-to-release.rst @@ -96,6 +96,10 @@ Other steps create a fresh whatsnew_head.rst after the release and add the new file to pypy/doc/index-of-whatsnew.rst + * rename pypy/doc/whatsnew-pypy3-HEAD.rst to whatsnew-pypy3-VERSION.rst + create a fresh whatsnew-pypy3_HEAD.rst after the release + and add the new file to pypy/doc/index-of-whatsnew.rst + * write release announcement pypy/doc/release-VERSION.rst The release announcement should contain a direct link to the download page @@ -125,7 +129,8 @@ Other steps Also repackage and upload source "-src.tar.bz2" - * Upload binaries to https://buildbot.pypy.org/mirror + * Upload binaries to https://buildbot.pypy.org/mirror. It takes an hour for + https://downloads.python.org/pypy/ to sync * Send out a mailing list message asking for last-minute comments and testing @@ -142,5 +147,4 @@ Other steps * add a tag on the codespeed web site that corresponds to pypy release * revise versioning at https://readthedocs.org/projects/pypy - * tag the final release(s) with appropriate tags diff --git a/pypy/doc/project-ideas.rst b/pypy/doc/project-ideas.rst index 9eeabedd49..70026f9597 100644 --- a/pypy/doc/project-ideas.rst +++ b/pypy/doc/project-ideas.rst @@ -150,26 +150,18 @@ knowledge of the internals. Head over to `vmprof-python`_, `vmprof-server`_ and .. _vmprof-server: https://github.com/vmprof/vmprof-server .. _vmprof-integration: https://github.com/vmprof/vmprof-integration -Optimized Unicode Representation --------------------------------- - -CPython 3.3 will use an optimized unicode representation (see :pep:`0393`) which switches between -different ways to represent a unicode string, depending on whether the string -fits into ASCII, has only two-byte characters or needs four-byte characters. - -The actual details would be rather different in PyPy, but we would like to have -the same optimization implemented. - -Or maybe not. We can also play around with the idea of using a single -representation: as a byte string in utf-8. (This idea needs some extra logic -for efficient indexing, like a cache.) Work has begun on the ``unicode-utf`` -and ``unicode-utf8-py3`` branches. More is needed, for instance there are -SIMD optimizations that are not yet used. - Convert RPython to Python3 -------------------------- -The world is moving on, we should too. +The world is moving on, we should too. Work in this direction has begun on the +``rpython3`` branch, mainly to enable building documentation with Python3. Some +things that are known to need careful refactoring: +- a single character in python3 is an int, not a byte +- we use ``str``/``unicode`` to distiguish between different modes of + operation for windows in ``make_win32_traits``. + +There are probably more. The branch currently does not pass rpython tests so +work is needed to back out some of the changes and redo them properly Improve performance ------------------- @@ -273,7 +265,10 @@ and it is hard to imagine NumPy abandoning the C-API. Here are a few ideas: Support more platforms ---------------------- -We have a plan for a `Windows 64`_ port. +We have a plan for a `Windows 64`_ port. There is progress on the ``win64`` +branch. Help is needed to continue the work. Stage I is complete: we now have +a 64-bit PyPy2.7 on windows. But it is missing cpyext and other tidbits to +enable releasing it. .. _`Windows 64`: windows.html#what-is-missing-for-a-full-64-bit-translation @@ -322,3 +317,6 @@ good work that needs to be finished: TODO: see the end of the blog post +Work has begun on HPy_ to enable a faster C-API. + +.. _HPy: https://hpy.readthedocs.io/en/latest/ diff --git a/pypy/doc/release-v7.3.3.rst b/pypy/doc/release-v7.3.3.rst index 2a072b1525..ea6c0cf537 100644 --- a/pypy/doc/release-v7.3.3.rst +++ b/pypy/doc/release-v7.3.3.rst @@ -2,12 +2,6 @@ PyPy v7.3.3: release of 2.7, 3.6, and 3.7 beta ============================================== -.. note:: - This is a pre-release announcement. When the release actually happens, it - will be announced on the `morepypy blog`_ - -.. _`morepypy blog`: https://morepypy.blogspot.com - -The PyPy team is proud to release the version 7.3.3 of PyPy, which includes three different interpreters: @@ -39,9 +33,9 @@ releases, but read on to find out what is new. interpreters like GraalPython_ (written on top of the Java virtual machine), RustPython_, and PyPy. Thanks to Oracle for sponsoring work on HPy. -Several issues were exposed in the 7.3.2 release. Many of them came from the -great work ongoing to ship PyPy packages in `conda-forge`_. A big shout out -to them for taking this on. +Several issues exposed in the 7.3.2 release were fixed. Many of them came from the +great work ongoing to ship PyPy-compatible binary packages in `conda-forge`_. +A big shout out to them for taking this on. Development of PyPy has moved to https://foss.heptapod.net/pypy/pypy. This was covered more extensively in this `blog post`_. We have seen an @@ -52,7 +46,7 @@ The `CFFI`_ backend has been updated to version 1.14.3. We recommend using CFFI rather than c-extensions to interact with C, and using cppyy_ for performant wrapping of C++ code for Python. -A new contributor took us up on the challenge to get `windows 64-bit`` support. +A new contributor took us up on the challenge to get `windows 64-bit`_ support. The work is proceeding on the ``win64`` branch, more help in coding or sponsorship is welcome. In anticipation of merging this large change, we fixed many test failures on windows. @@ -161,17 +155,22 @@ Python 3.6+ - bpo-17288_: Prevent jump from a yield statement - bpo-11471_: avoid generating a ``JUMP_FORWARD`` instruction at the end of an ``if``-block if there is no ``else``-clause -- Fix ``os.listdir('')`` and ``os.stat('')`` on windows (issue 3331) +- Fix ``os.listdir('')`` and ``os.stat('')`` on windows (issue 3331_) - Fix many unicode encoding/decoding errors on windows -- Fix pickling of time subclasses (issue 3324, bpo 41966) -- Add support for ``sqlite3_load_extension`` (issue 3334) +- Fix pickling of time subclasses (issue 3324_, bpo-41966_) +- Add support for ``sqlite3_load_extension`` (issue 3334_) - Change default file encoding from mbcs to utf-8 on windows - Change default file encoding from ascii to utf-8 on linux - Add ``resource.prlimit()`` +- Accept PathLike in ``nt._getfullpathname`` (issue 3343_) +- Fix some problems with ``winreg`` + Python 3.6 C-API ~~~~~~~~~~~~~~~~ +- Export ``PyStructSequence_NewType`` (issue 3346_) + .. _3312: https://foss.heptapod.net/pypy/pypy/-/issues/3312 .. _3315: https://foss.heptapod.net/pypy/pypy/-/issues/3315 .. _3321: https://foss.heptapod.net/pypy/pypy/-/issues/3321 @@ -180,6 +179,8 @@ Python 3.6 C-API .. _3324: https://foss.heptapod.net/pypy/pypy/-/issues/3324 .. _3334: https://foss.heptapod.net/pypy/pypy/-/issues/3334 .. _3307: https://foss.heptapod.net/pypy/pypy/-/issues/3307 +.. _3343: https://foss.heptapod.net/pypy/pypy/-/issues/3343 +.. _3346: https://foss.heptapod.net/pypy/pypy/-/issues/3346 .. _`merge request 723`: https://foss.heptapod.net/pypy/pypy/-/merge_request/723 diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst index 5f21ccff61..6fe082cc9a 100644 --- a/pypy/doc/whatsnew-head.rst +++ b/pypy/doc/whatsnew-head.rst @@ -5,3 +5,14 @@ What's new in PyPy2.7 7.3.3+ .. this is a revision shortly after release-pypy-7.3.3 .. startrev: de512cf13506 +.. branch: new-ci-image + +CI: Add a Dockerfile for CI to prevent hitting pull limits on docker hub + +.. branch: issue-3333 + +Fix xml.etree.ElementTree assigning default attribute values: issue 3333 + +.. branch: rpython-rsre-for-37 + +Support for the new format of regular expressions in Python 3.7 diff --git a/pypy/tool/release/repackage.sh b/pypy/tool/release/repackage.sh index b6b4f801c2..29636ab220 100755 --- a/pypy/tool/release/repackage.sh +++ b/pypy/tool/release/repackage.sh @@ -1,12 +1,12 @@ #! /bin/bash # Edit these appropriately before running this script -pmaj=2 # python main version: 2 or 3 -pmin=7 # python minor version +pmaj=3 # python main version: 2 or 3 +pmin=6 # python minor version maj=7 min=3 -rev=2 -# rc=rc3 # set to blank for actual release +rev=3 +#rc=rc2 # set to blank for actual release function maybe_exit { if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then diff --git a/pypy/tool/test/test_tab.py b/pypy/tool/test/test_tab.py index 0aea831a39..ce96b61cf0 100644 --- a/pypy/tool/test/test_tab.py +++ b/pypy/tool/test/test_tab.py @@ -44,6 +44,8 @@ def test_no_pypy_import_in_rpython(): if os.path.isfile(path): if not path.lower().endswith('.py'): return + if path.lower().endswith('rsre_constants.py'): + return # exception in this file with file(path) as f: for line in f: if "import" not in line: diff --git a/rpython/rlib/rsre/rpy/_sre.py b/rpython/rlib/rsre/rpy/_sre.py index 617345483a..70d7737297 100644 --- a/rpython/rlib/rsre/rpy/_sre.py +++ b/rpython/rlib/rsre/rpy/_sre.py @@ -22,6 +22,9 @@ def get_code(regexp, flags=0, allargs=False): """NOT_RPYTHON: you can't compile new regexps in an RPython program, you can only use precompiled ones""" from . import sre_compile + if rsre_constants.V37: + import pytest + pytest.skip("This test cannot run in a 3.7 branch of pypy") try: sre_compile.compile(regexp, flags) except GotIt as e: diff --git a/rpython/rlib/rsre/rpy/sre_constants.py b/rpython/rlib/rsre/rpy/sre_constants.py index 89cbdb0d5f..4b9deac743 100644 --- a/rpython/rlib/rsre/rpy/sre_constants.py +++ b/rpython/rlib/rsre/rpy/sre_constants.py @@ -94,35 +94,17 @@ CATEGORY_UNI_NOT_WORD = "category_uni_not_word" CATEGORY_UNI_LINEBREAK = "category_uni_linebreak" CATEGORY_UNI_NOT_LINEBREAK = "category_uni_not_linebreak" -OPCODES = [ - - # failure=0 success=1 (just because it looks better that way :-) - FAILURE, SUCCESS, - - ANY, ANY_ALL, - ASSERT, ASSERT_NOT, - AT, - BRANCH, - CALL, - CATEGORY, - CHARSET, BIGCHARSET, - GROUPREF, GROUPREF_EXISTS, GROUPREF_IGNORE, - IN, IN_IGNORE, - INFO, - JUMP, - LITERAL, LITERAL_IGNORE, - MARK, - MAX_UNTIL, - MIN_UNTIL, - NOT_LITERAL, NOT_LITERAL_IGNORE, - NEGATE, - RANGE, - REPEAT, - REPEAT_ONE, - SUBPATTERN, - MIN_REPEAT_ONE, - RANGE_IGNORE, -] +def _rpython_opcodes(): + from rpython.rlib.rsre import rsre_constants as consts + mapping = {} + for name, value in consts.__dict__.items(): + if name.startswith('OPCODE') and isinstance(value, int) and value < 70: + name = name[6:].lstrip('012346789_').lower() + mapping[value] = name + # check that there are no holes + assert sorted(mapping.keys()) == range(len(mapping)) + return [name for value, name in sorted(mapping.items())] +OPCODES = _rpython_opcodes() ATCODES = [ AT_BEGINNING, AT_BEGINNING_LINE, AT_BEGINNING_STRING, AT_BOUNDARY, diff --git a/rpython/rlib/rsre/rsre_char.py b/rpython/rlib/rsre/rsre_char.py index 1680d2973d..49787946ae 100644 --- a/rpython/rlib/rsre/rsre_char.py +++ b/rpython/rlib/rsre/rsre_char.py @@ -25,6 +25,7 @@ def set_unicode_db(newunicodedb): for i in range(128): assert newunicodedb.tolower(i) == getlower_ascii(i) assert newunicodedb.toupper(i) == getupper_ascii(i) + assert newunicodedb.toupper_full(i) == [getupper_ascii(i)] global unicodedb unicodedb = newunicodedb @@ -45,10 +46,10 @@ else: # codesize. But sre_compile will compile some stuff differently depending on the # codesize (e.g., charsets). from rpython.rlib.runicode import MAXUNICODE -if MAXUNICODE == 65535: +if MAXUNICODE == 65535 and not consts.V37: CODESIZE = 2 else: - CODESIZE = 4 + CODESIZE = 4 # always 4 from py3.7 copyright = "_sre.py 2.4 Copyright 2005 by Nik Haldimann" @@ -57,16 +58,22 @@ BIG_ENDIAN = sys.byteorder == "big" def getlower_ascii(char_ord): return char_ord + int_between(ord('A'), char_ord, ord('Z') + 1) * (ord('a') - ord('A')) +def getlower_locale(char_ord): + if char_ord < 256: # cheating! Well, CPython does too. + char_ord = tolower(char_ord) + return char_ord + +def getlower_unicode(char_ord): + if char_ord < 128: # shortcut for ascii + return getlower_ascii(char_ord) + assert unicodedb is not None + return unicodedb.tolower(char_ord) + def getlower(char_ord, flags): if flags & consts.SRE_FLAG_LOCALE: - if char_ord < 256: # cheating! Well, CPython does too. - char_ord = tolower(char_ord) - return char_ord + char_ord = getlower_locale(char_ord) elif flags & consts.SRE_FLAG_UNICODE: - if char_ord < 128: # shortcut for ascii - return getlower_ascii(char_ord) - assert unicodedb is not None - char_ord = unicodedb.tolower(char_ord) + char_ord = getlower_unicode(char_ord) else: char_ord = getlower_ascii(char_ord) return char_ord @@ -74,20 +81,43 @@ def getlower(char_ord, flags): def getupper_ascii(char_ord): return char_ord - int_between(ord('a'), char_ord, ord('z') + 1) * (ord('a') - ord('A')) +def getupper_locale(char_ord): + if char_ord < 256: # cheating! Well, CPython does too. + char_ord = toupper(char_ord) + return char_ord + +def getupper_unicode(char_ord): + if char_ord < 128: # shortcut for ascii + return getupper_ascii(char_ord) + # Note: this is like CPython's sre_upper_unicode(), including for a few + # arguments like 0xfb05, whose uppercase is *several letters* in unicode. + # We return the first of these letters. That's rather random but no + # caller expects a sane result in this case, I think: iscased_unicode() + # is fine as long as it returns anything != char_ord in this case. + assert unicodedb is not None + return unicodedb.toupper_full(char_ord)[0] + def getupper(char_ord, flags): if flags & consts.SRE_FLAG_LOCALE: - if char_ord < 256: # cheating! Well, CPython does too. - char_ord = toupper(char_ord) - return char_ord + char_ord = getupper_locale(char_ord) elif flags & consts.SRE_FLAG_UNICODE: - if char_ord < 128: # shortcut for ascii - return getupper_ascii(char_ord) - assert unicodedb is not None - char_ord = unicodedb.toupper(char_ord) + char_ord = getupper_unicode(char_ord) else: char_ord = getupper_ascii(char_ord) return char_ord +def iscased_ascii(char_ord): # used by py3.7 + upper = int_between(ord('A'), char_ord, ord('Z')+1) + lower = int_between(ord('a'), char_ord, ord('z')+1) + return upper | lower + +def iscased_unicode(char_ord): # used by py3.7 + # NOTE: this is not unicodedb.iscased(). As per CPython 3.7, it is + # something different which---as far as I can tell---doesn't really + # have a meaning on its own, but well. + return (char_ord != getlower_unicode(char_ord) or + char_ord != getupper_unicode(char_ord)) + #### Category helpers is_a_word = [(chr(i).isalnum() or chr(i) == '_') for i in range(256)] @@ -223,12 +253,22 @@ def set_range(ctx, pattern, index, char_code): def set_range_ignore(ctx, pattern, index, char_code): # <RANGE_IGNORE> <lower> <upper> # the char_code is already lower cased + assert not consts.V37 lower = pattern.pattern[index + 1] upper = pattern.pattern[index + 2] match1 = int_between(lower, char_code, upper + 1) match2 = int_between(lower, getupper(char_code, pattern.flags), upper + 1) return match1 | match2, index + 3 +def set_range_uni_ignore(ctx, pattern, index, char_code): + # <RANGE_UNI_IGNORE> <lower> <upper> + # the char_code is already lower cased + lower = pattern.pattern[index + 1] + upper = pattern.pattern[index + 2] + match1 = int_between(lower, char_code, upper + 1) + match2 = int_between(lower, getupper_unicode(char_code), upper + 1) + return match1 | match2, index + 3 + def set_bigcharset(ctx, pattern, index, char_code): # <BIGCHARSET> <blockcount> <256 blockindices> <blocks> count = pattern.pattern[index+1] @@ -300,7 +340,9 @@ set_dispatch_table = { consts.OPCODE_BIGCHARSET: set_bigcharset, consts.OPCODE_LITERAL: set_literal, consts.OPCODE_RANGE: set_range, - consts.OPCODE_RANGE_IGNORE: set_range_ignore, + consts.OPCODE27_RANGE_IGNORE: set_range_ignore, + consts.OPCODE37_RANGE_UNI_IGNORE: set_range_uni_ignore, consts.OPCODE_UNICODE_GENERAL_CATEGORY: set_unicode_general_category, } +set_dispatch_table.pop(None, None) # remove the OPCODE27_* or OPCODE37_* set_dispatch_unroll = unrolling_iterable(sorted(set_dispatch_table.items())) diff --git a/rpython/rlib/rsre/rsre_constants.py b/rpython/rlib/rsre/rsre_constants.py index 9af708532a..9994db7b05 100644 --- a/rpython/rlib/rsre/rsre_constants.py +++ b/rpython/rlib/rsre/rsre_constants.py @@ -1,3 +1,15 @@ +# Horrible import-time hack. +# Blame CPython for renumbering these OPCODE_* at some point. +from rpython.rlib.objectmodel import specialize +try: + import pypy.module.sys.version + V37 = pypy.module.sys.version.CPYTHON_VERSION >= (3, 7) +except ImportError: + raise ImportError("Cannot import pypy.module.sys.version. You can safely " + "remove this 'raise' line if you are not interested in " + "PyPy but only RPython.") + V37 = False + OPCODE_FAILURE = 0 OPCODE_SUCCESS = 1 OPCODE_ANY = 2 @@ -6,35 +18,49 @@ OPCODE_ASSERT = 4 OPCODE_ASSERT_NOT = 5 OPCODE_AT = 6 OPCODE_BRANCH = 7 -#OPCODE_CALL = 8 +OPCODE_CALL = 8 # not used OPCODE_CATEGORY = 9 OPCODE_CHARSET = 10 OPCODE_BIGCHARSET = 11 OPCODE_GROUPREF = 12 OPCODE_GROUPREF_EXISTS = 13 -OPCODE_GROUPREF_IGNORE = 14 -OPCODE_IN = 15 -OPCODE_IN_IGNORE = 16 -OPCODE_INFO = 17 -OPCODE_JUMP = 18 -OPCODE_LITERAL = 19 -OPCODE_LITERAL_IGNORE = 20 -OPCODE_MARK = 21 -OPCODE_MAX_UNTIL = 22 -OPCODE_MIN_UNTIL = 23 -OPCODE_NOT_LITERAL = 24 -OPCODE_NOT_LITERAL_IGNORE = 25 -OPCODE_NEGATE = 26 -OPCODE_RANGE = 27 -OPCODE_REPEAT = 28 -OPCODE_REPEAT_ONE = 29 -#OPCODE_SUBPATTERN = 30 -OPCODE_MIN_REPEAT_ONE = 31 -OPCODE_RANGE_IGNORE = 32 +OPCODE_GROUPREF_IGNORE = 28 if V37 else 14 +OPCODE_IN = 14 if V37 else 15 +OPCODE_IN_IGNORE = 29 if V37 else 16 +OPCODE_INFO = 15 if V37 else 17 +OPCODE_JUMP = 16 if V37 else 18 +OPCODE_LITERAL = 17 if V37 else 19 +OPCODE_LITERAL_IGNORE = 30 if V37 else 20 +OPCODE_MARK = 18 if V37 else 21 +OPCODE_MAX_UNTIL = 19 if V37 else 22 +OPCODE_MIN_UNTIL = 20 if V37 else 23 +OPCODE_NOT_LITERAL = 21 if V37 else 24 +OPCODE_NOT_LITERAL_IGNORE = 31 if V37 else 25 +OPCODE_NEGATE = 22 if V37 else 26 +OPCODE_RANGE = 23 if V37 else 27 +OPCODE_REPEAT = 24 if V37 else 28 +OPCODE_REPEAT_ONE = 25 if V37 else 29 +OPCODE_SUBPATTERN = 26 if V37 else 30 # not used +OPCODE_MIN_REPEAT_ONE = 27 if V37 else 31 +OPCODE27_RANGE_IGNORE = None if V37 else 32 + +OPCODE37_GROUPREF_LOC_IGNORE = 32 if V37 else None +OPCODE37_IN_LOC_IGNORE = 33 if V37 else None +OPCODE37_LITERAL_LOC_IGNORE = 34 if V37 else None +OPCODE37_NOT_LITERAL_LOC_IGNORE = 35 if V37 else None +OPCODE37_GROUPREF_UNI_IGNORE = 36 if V37 else None +OPCODE37_IN_UNI_IGNORE = 37 if V37 else None +OPCODE37_LITERAL_UNI_IGNORE = 38 if V37 else None +OPCODE37_NOT_LITERAL_UNI_IGNORE = 39 if V37 else None +OPCODE37_RANGE_UNI_IGNORE = 40 if V37 else None # not used by Python itself OPCODE_UNICODE_GENERAL_CATEGORY = 70 +@specialize.argtype(1) +def eq(op, const): + return const is not None and op == const + AT_BEGINNING = 0 AT_BEGINNING_LINE = 1 diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py index 489636b783..3ce901c46a 100644 --- a/rpython/rlib/rsre/rsre_core.py +++ b/rpython/rlib/rsre/rsre_core.py @@ -55,7 +55,8 @@ class CompiledPattern(object): def __init__(self, pattern, flags): self.pattern = pattern - self.flags = flags + if not consts.V37: # 'flags' is ignored in >=3.7 mode + self.flags = flags # check we don't get the old value of MAXREPEAT # during the untranslated tests. # On python3, MAXCODE can appear in patterns. It will be 65535 @@ -63,6 +64,29 @@ class CompiledPattern(object): if not we_are_translated() and rsre_char.CODESIZE != 2: assert 65535 not in pattern + def lowa(self, char_ord): + """Pre-3.7: uses getlower(flags). + Post-3.7: this is always getlower_ascii(). + """ + if not consts.V37: + return rsre_char.getlower(char_ord, self.flags) + else: + return rsre_char.getlower_ascii(char_ord) + + def char_loc_ignore(self, index, char_ord): + assert consts.V37 + pattern = self.pat(index) + return (char_ord == pattern or + rsre_char.getlower_locale(char_ord) == pattern or + rsre_char.getupper_locale(char_ord) == pattern) + + def charset_loc_ignore(self, ctx, ppos, char_ord): + lo = rsre_char.getlower_locale(char_ord) + if rsre_char.check_charset(ctx, self, ppos, lo): + return True + up = rsre_char.getupper_locale(char_ord) + return up != lo and rsre_char.check_charset(ctx, self, ppos, up) + def pat(self, index): jit.promote(self) check_nonneg(index) @@ -74,6 +98,10 @@ class CompiledPattern(object): assert result >= 0 return result +MODE_ANY = '\x00' # an empty match is fine +MODE_NONEMPTY = '\x01' # must have a non-empty match +MODE_FULL = '\x02' # must match the whole string + class AbstractMatchContext(object): """Abstract base class""" _immutable_fields_ = ['end'] @@ -81,7 +109,7 @@ class AbstractMatchContext(object): match_end = 0 match_marks = None match_marks_flat = None - fullmatch_only = False + match_mode = MODE_ANY def __init__(self, match_start, end): # 'match_start' and 'end' must be known to be non-negative @@ -91,25 +119,30 @@ class AbstractMatchContext(object): self.match_start = match_start self.end = end - def reset(self, start): + def reset(self, start, must_advance=False): self.match_start = start self.match_marks = None self.match_marks_flat = None + # + assert MODE_ANY == chr(False) + assert MODE_NONEMPTY == chr(True) + self.match_mode = chr(must_advance) + + @not_rpython + def _fullmatch_only(self, x=None): + raise Exception("'ctx.fullmatch_only' was replaced with" + " 'ctx.match_mode'") + fullmatch_only = property(_fullmatch_only, _fullmatch_only) @not_rpython def str(self, index): """Must be overridden in a concrete subclass. - The tag ^^^ here is used to generate a translation-time crash + The @not_rpython is used to generate a translation-time crash if there is a call to str() that is indirect. All calls must be direct for performance reasons; you need to specialize the caller with @specializectx.""" raise NotImplementedError - @not_rpython - def lowstr(self, index, flags): - """Similar to str().""" - raise NotImplementedError - # The following methods are provided to be overriden in # Utf8MatchContext. The non-utf8 implementation is provided # by the FixedMatchContext abstract subclass, in order to use @@ -236,10 +269,6 @@ class BufMatchContext(FixedMatchContext): check_nonneg(index) return ord(self._buffer.getitem(index)) - def lowstr(self, index, flags): - c = self.str(index) - return rsre_char.getlower(c, flags) - def fresh_copy(self, start): return BufMatchContext(self._buffer, start, self.end) @@ -261,10 +290,6 @@ class StrMatchContext(FixedMatchContext): check_nonneg(index) return ord(self._string[index]) - def lowstr(self, index, flags): - c = self.str(index) - return rsre_char.getlower(c, flags) - def fresh_copy(self, start): return StrMatchContext(self._string, start, self.end) @@ -289,10 +314,6 @@ class UnicodeMatchContext(FixedMatchContext): check_nonneg(index) return ord(self._unicodestr[index]) - def lowstr(self, index, flags): - c = self.str(index) - return rsre_char.getlower(c, flags) - def fresh_copy(self, start): return UnicodeMatchContext(self._unicodestr, start, self.end) @@ -599,9 +620,13 @@ def sre_match(ctx, pattern, ppos, ptr, marks): return elif op == consts.OPCODE_SUCCESS: - if ctx.fullmatch_only: + mode = ctx.match_mode + if mode == MODE_FULL: if ptr != ctx.end: return # not a full match + elif mode == MODE_NONEMPTY: + if ptr == ctx.match_start: + return # empty match ctx.match_end = ptr ctx.match_marks = marks return MATCHED_OK @@ -633,10 +658,10 @@ def sre_match(ctx, pattern, ppos, ptr, marks): ptr1 = ctx.prev_n(ptr, pattern.pat(ppos+1), ctx.ZERO) except EndOfString: return - saved = ctx.fullmatch_only - ctx.fullmatch_only = False + saved = ctx.match_mode + ctx.match_mode = MODE_ANY stop = sre_match(ctx, pattern, ppos + 2, ptr1, marks) is None - ctx.fullmatch_only = saved + ctx.match_mode = saved if stop: return marks = ctx.match_marks @@ -651,10 +676,10 @@ def sre_match(ctx, pattern, ppos, ptr, marks): except EndOfString: pass else: - saved = ctx.fullmatch_only - ctx.fullmatch_only = False + saved = ctx.match_mode + ctx.match_mode = MODE_ANY stop = sre_match(ctx, pattern, ppos + 2, ptr1, marks) is not None - ctx.fullmatch_only = saved + ctx.match_mode = saved if stop: return ppos += pattern.pat(ppos) @@ -699,7 +724,29 @@ def sre_match(ctx, pattern, ppos, ptr, marks): startptr, length_bytes = get_group_ref(ctx, marks, pattern.pat(ppos)) if length_bytes < 0: return # group was not previously defined - ptr = match_repeated_ignore(ctx, ptr, startptr, length_bytes, pattern.flags) + ptr = match_repeated_ignore(ctx, ptr, startptr, length_bytes, pattern) + if ptr < ctx.ZERO: + return # no match + ppos += 1 + + elif consts.eq(op, consts.OPCODE37_GROUPREF_UNI_IGNORE): + # unicode version of OPCODE_GROUPREF_IGNORE + # <GROUPREF> <groupnum> + startptr, length_bytes = get_group_ref(ctx, marks, pattern.pat(ppos)) + if length_bytes < 0: + return # group was not previously defined + ptr = match_repeated_uni_ignore(ctx, ptr, startptr, length_bytes) + if ptr < ctx.ZERO: + return # no match + ppos += 1 + + elif consts.eq(op, consts.OPCODE37_GROUPREF_LOC_IGNORE): + # locale version of OPCODE_GROUPREF_IGNORE + # <GROUPREF> <groupnum> + startptr, length_bytes = get_group_ref(ctx, marks, pattern.pat(ppos)) + if length_bytes < 0: + return # group was not previously defined + ptr = match_repeated_loc_ignore(ctx, ptr, startptr, length_bytes) if ptr < ctx.ZERO: return # no match ppos += 1 @@ -726,7 +773,25 @@ def sre_match(ctx, pattern, ppos, ptr, marks): # match set member (or non_member), ignoring case # <IN> <skip> <set> if ptr >= ctx.end or not rsre_char.check_charset(ctx, pattern, ppos+1, - ctx.lowstr(ptr, pattern.flags)): + pattern.lowa(ctx.str(ptr))): + return + ppos += pattern.pat(ppos) + ptr = ctx.next(ptr) + + elif consts.eq(op, consts.OPCODE37_IN_UNI_IGNORE): + # match set member (or non_member), ignoring case, unicode mode + # <IN> <skip> <set> + if ptr >= ctx.end or not rsre_char.check_charset(ctx, pattern, ppos+1, + rsre_char.getlower_unicode(ctx.str(ptr))): + return + ppos += pattern.pat(ppos) + ptr = ctx.next(ptr) + + elif consts.eq(op, consts.OPCODE37_IN_LOC_IGNORE): + # match set member (or non_member), ignoring case, locale mode + # <IN> <skip> <set> + if ptr >= ctx.end or not pattern.charset_loc_ignore(ctx, ppos+1, + ctx.str(ptr)): return ppos += pattern.pat(ppos) ptr = ctx.next(ptr) @@ -752,7 +817,23 @@ def sre_match(ctx, pattern, ppos, ptr, marks): elif op == consts.OPCODE_LITERAL_IGNORE: # match literal string, ignoring case # <LITERAL_IGNORE> <code> - if ptr >= ctx.end or ctx.lowstr(ptr, pattern.flags) != pattern.pat(ppos): + if ptr >= ctx.end or pattern.lowa(ctx.str(ptr)) != pattern.pat(ppos): + return + ppos += 1 + ptr = ctx.next(ptr) + + elif consts.eq(op, consts.OPCODE37_LITERAL_UNI_IGNORE): + # match literal string, ignoring case, unicode mode + # <LITERAL_IGNORE> <code> + if ptr >= ctx.end or rsre_char.getlower_unicode(ctx.str(ptr)) != pattern.pat(ppos): + return + ppos += 1 + ptr = ctx.next(ptr) + + elif consts.eq(op, consts.OPCODE37_LITERAL_LOC_IGNORE): + # match literal string, ignoring case, locale mode + # <LITERAL_IGNORE> <code> + if ptr >= ctx.end or not pattern.char_loc_ignore(ppos, ctx.str(ptr)): return ppos += 1 ptr = ctx.next(ptr) @@ -775,7 +856,23 @@ def sre_match(ctx, pattern, ppos, ptr, marks): elif op == consts.OPCODE_NOT_LITERAL_IGNORE: # match if it's not a literal string, ignoring case # <NOT_LITERAL> <code> - if ptr >= ctx.end or ctx.lowstr(ptr, pattern.flags) == pattern.pat(ppos): + if ptr >= ctx.end or pattern.lowa(ctx.str(ptr)) == pattern.pat(ppos): + return + ppos += 1 + ptr = ctx.next(ptr) + + elif consts.eq(op, consts.OPCODE37_NOT_LITERAL_UNI_IGNORE): + # match if it's not a literal string, ignoring case, unicode mode + # <NOT_LITERAL> <code> + if ptr >= ctx.end or rsre_char.getlower_unicode(ctx.str(ptr)) == pattern.pat(ppos): + return + ppos += 1 + ptr = ctx.next(ptr) + + elif consts.eq(op, consts.OPCODE37_NOT_LITERAL_LOC_IGNORE): + # match if it's not a literal string, ignoring case, locale mode + # <NOT_LITERAL> <code> + if ptr >= ctx.end or pattern.char_loc_ignore(ppos, ctx.str(ptr)): return ppos += 1 ptr = ctx.next(ptr) @@ -883,12 +980,36 @@ def match_repeated(ctx, ptr, oldptr, length_bytes): return True @specializectx -def match_repeated_ignore(ctx, ptr, oldptr, length_bytes, flags): +def match_repeated_ignore(ctx, ptr, oldptr, length_bytes, pattern): + oldend = ctx.go_forward_by_bytes(oldptr, length_bytes) + while oldptr < oldend: + if ptr >= ctx.end: + return -1 + if pattern.lowa(ctx.str(ptr)) != pattern.lowa(ctx.str(oldptr)): + return -1 + ptr = ctx.next(ptr) + oldptr = ctx.next(oldptr) + return ptr + +@specializectx +def match_repeated_uni_ignore(ctx, ptr, oldptr, length_bytes): + oldend = ctx.go_forward_by_bytes(oldptr, length_bytes) + while oldptr < oldend: + if ptr >= ctx.end: + return -1 + if rsre_char.getlower_unicode(ctx.str(ptr)) != rsre_char.getlower_unicode(ctx.str(oldptr)): + return -1 + ptr = ctx.next(ptr) + oldptr = ctx.next(oldptr) + return ptr + +@specializectx +def match_repeated_loc_ignore(ctx, ptr, oldptr, length_bytes): oldend = ctx.go_forward_by_bytes(oldptr, length_bytes) while oldptr < oldend: if ptr >= ctx.end: return -1 - if ctx.lowstr(ptr, flags) != ctx.lowstr(oldptr, flags): + if rsre_char.getlower_locale(ctx.str(ptr)) != rsre_char.getlower_locale(ctx.str(oldptr)): return -1 ptr = ctx.next(ptr) oldptr = ctx.next(oldptr) @@ -955,54 +1076,63 @@ def match_IN(ctx, pattern, ptr, ppos): return rsre_char.check_charset(ctx, pattern, ppos+2, ctx.str(ptr)) @specializectx def match_IN_IGNORE(ctx, pattern, ptr, ppos): - return rsre_char.check_charset(ctx, pattern, ppos+2, ctx.lowstr(ptr, pattern.flags)) + return rsre_char.check_charset(ctx, pattern, ppos+2, pattern.lowa(ctx.str(ptr))) +@specializectx +def match_IN_UNI_IGNORE(ctx, pattern, ptr, ppos): + return rsre_char.check_charset(ctx, pattern, ppos+2, rsre_char.getlower_unicode(ctx.str(ptr))) +@specializectx +def match_IN_LOC_IGNORE(ctx, pattern, ptr, ppos): + return pattern.charset_loc_ignore(ctx, ppos+2, ctx.str(ptr)) @specializectx def match_LITERAL(ctx, pattern, ptr, ppos): return ctx.str(ptr) == pattern.pat(ppos+1) @specializectx def match_LITERAL_IGNORE(ctx, pattern, ptr, ppos): - return ctx.lowstr(ptr, pattern.flags) == pattern.pat(ppos+1) + return pattern.lowa(ctx.str(ptr)) == pattern.pat(ppos+1) +@specializectx +def match_LITERAL_UNI_IGNORE(ctx, pattern, ptr, ppos): + return rsre_char.getlower_unicode(ctx.str(ptr)) == pattern.pat(ppos+1) +@specializectx +def match_LITERAL_LOC_IGNORE(ctx, pattern, ptr, ppos): + return pattern.char_loc_ignore(ppos+1, ctx.str(ptr)) @specializectx def match_NOT_LITERAL(ctx, pattern, ptr, ppos): return ctx.str(ptr) != pattern.pat(ppos+1) @specializectx def match_NOT_LITERAL_IGNORE(ctx, pattern, ptr, ppos): - return ctx.lowstr(ptr, pattern.flags) != pattern.pat(ppos+1) + return pattern.lowa(ctx.str(ptr)) != pattern.pat(ppos+1) +@specializectx +def match_NOT_LITERAL_UNI_IGNORE(ctx, pattern, ptr, ppos): + return rsre_char.getlower_unicode(ctx.str(ptr)) != pattern.pat(ppos+1) +@specializectx +def match_NOT_LITERAL_LOC_IGNORE(ctx, pattern, ptr, ppos): + return not pattern.char_loc_ignore(ppos+1, ctx.str(ptr)) def _make_fre(checkerfn): if checkerfn == match_ANY_ALL: def fre(ctx, pattern, ptr, end, ppos): return end - elif checkerfn == match_IN: - install_jitdriver_spec('MatchIn', + elif checkerfn in (match_IN, match_IN_IGNORE, match_IN_UNI_IGNORE): + # produces three jitdrivers: + # MatchIn + # MatchInIgnore + # MatchInUniIgnore + name = checkerfn.__name__.title().replace('_', '') + method_name = "jitdriver_" + name + install_jitdriver_spec(name, greens=['ppos', 'pattern'], reds=['ptr', 'end', 'ctx'], debugprint=(1, 0)) @specializectx def fre(ctx, pattern, ptr, end, ppos): while True: - ctx.jitdriver_MatchIn.jit_merge_point(ctx=ctx, ptr=ptr, + getattr(ctx, method_name).jit_merge_point(ctx=ctx, ptr=ptr, end=end, ppos=ppos, pattern=pattern) if ptr < end and checkerfn(ctx, pattern, ptr, ppos): ptr = ctx.next(ptr) else: return ptr - elif checkerfn == match_IN_IGNORE: - install_jitdriver_spec('MatchInIgnore', - greens=['ppos', 'pattern'], - reds=['ptr', 'end', 'ctx'], - debugprint=(1, 0)) - @specializectx - def fre(ctx, pattern, ptr, end, ppos): - while True: - ctx.jitdriver_MatchInIgnore.jit_merge_point(ctx=ctx, ptr=ptr, - end=end, ppos=ppos, - pattern=pattern) - if ptr < end and checkerfn(ctx, pattern, ptr, ppos): - ptr = ctx.next(ptr) - else: - return ptr else: # in the other cases, the fre() function is not JITted at all # and is present as a residual call. @@ -1019,11 +1149,19 @@ unroll_char_checker = [ (consts.OPCODE_ANY_ALL, match_ANY_ALL), (consts.OPCODE_IN, match_IN), (consts.OPCODE_IN_IGNORE, match_IN_IGNORE), + (consts.OPCODE37_IN_UNI_IGNORE, match_IN_UNI_IGNORE), + (consts.OPCODE37_IN_LOC_IGNORE, match_IN_LOC_IGNORE), (consts.OPCODE_LITERAL, match_LITERAL), (consts.OPCODE_LITERAL_IGNORE, match_LITERAL_IGNORE), + (consts.OPCODE37_LITERAL_UNI_IGNORE, match_LITERAL_UNI_IGNORE), + (consts.OPCODE37_LITERAL_LOC_IGNORE, match_LITERAL_LOC_IGNORE), (consts.OPCODE_NOT_LITERAL, match_NOT_LITERAL), (consts.OPCODE_NOT_LITERAL_IGNORE, match_NOT_LITERAL_IGNORE), + (consts.OPCODE37_NOT_LITERAL_UNI_IGNORE, match_NOT_LITERAL_UNI_IGNORE), + (consts.OPCODE37_NOT_LITERAL_LOC_IGNORE, match_NOT_LITERAL_LOC_IGNORE), ] +unroll_char_checker = [(_op, _fn) for (_op, _fn) in unroll_char_checker + if _op is not None] # possibly removes the OPCODE37_* unroll_fre_checker = [(_op, _make_fre(_fn)) for (_op, _fn) in unroll_char_checker] @@ -1119,7 +1257,8 @@ def match(pattern, string, start=0, end=sys.maxint, fullmatch=False): assert isinstance(pattern, CompiledPattern) start, end = _adjust(start, end, len(string)) ctx = StrMatchContext(string, start, end) - ctx.fullmatch_only = fullmatch + if fullmatch: + ctx.match_mode = MODE_FULL if match_context(ctx, pattern): return ctx else: diff --git a/rpython/rlib/rsre/rsre_utf8.py b/rpython/rlib/rsre/rsre_utf8.py index 834748ebaa..7617acc5fd 100644 --- a/rpython/rlib/rsre/rsre_utf8.py +++ b/rpython/rlib/rsre/rsre_utf8.py @@ -20,10 +20,6 @@ class Utf8MatchContext(AbstractMatchContext): check_nonneg(index) return rutf8.codepoint_at_pos(self._utf8, index) - def lowstr(self, index, flags): - c = self.str(index) - return rsre_char.getlower(c, flags) - def get_single_byte(self, base_position, index): return self._utf8[base_position + index] @@ -97,10 +93,11 @@ def utf8match(pattern, utf8string, bytestart=0, byteend=sys.maxint, fullmatch=False): # bytestart and byteend must be valid byte positions inside the # utf8string. - from rpython.rlib.rsre.rsre_core import match_context + from rpython.rlib.rsre.rsre_core import match_context, MODE_FULL ctx = make_utf8_ctx(utf8string, bytestart, byteend) - ctx.fullmatch_only = fullmatch + if fullmatch: + ctx.match_mode = MODE_FULL if match_context(ctx, pattern): return ctx else: diff --git a/rpython/rlib/rsre/test/support.py b/rpython/rlib/rsre/test/support.py index 9e8fafc7c1..4e12277bcb 100644 --- a/rpython/rlib/rsre/test/support.py +++ b/rpython/rlib/rsre/test/support.py @@ -1,6 +1,7 @@ import sys, random from rpython.rlib import debug from rpython.rlib.rsre.rsre_core import _adjust, match_context, search_context +from rpython.rlib.rsre.rsre_core import MODE_FULL from rpython.rlib.rsre.rsre_core import StrMatchContext, EndOfString @@ -112,7 +113,8 @@ def match(pattern, string, start=0, end=sys.maxint, fullmatch=False): start = Position(start) end = Position(end) ctx = MatchContextForTests(string, start, end) - ctx.fullmatch_only = fullmatch + if fullmatch: + ctx.match_mode = MODE_FULL if match_context(ctx, pattern): return ctx else: diff --git a/rpython/rlib/rsre/test/test_char.py b/rpython/rlib/rsre/test/test_char.py index bd3a6f2936..6e7d6f3e33 100644 --- a/rpython/rlib/rsre/test/test_char.py +++ b/rpython/rlib/rsre/test/test_char.py @@ -204,3 +204,15 @@ def test_general_category(): assert check_charset(pat, 0, 99) # Lcheck_charset(pat, 0, 453) # Lt assert not check_charset(pat, 0, 688) # Lm assert not check_charset(pat, 0, 5870) # Nl + +def test_iscased(): + assert rsre_char.iscased_ascii(65) + assert rsre_char.iscased_ascii(100) + assert rsre_char.iscased_ascii(64) is False + assert rsre_char.iscased_ascii(126) is False + assert rsre_char.iscased_ascii(1260) is False + assert rsre_char.iscased_ascii(12600) is False + for i in range(65536): + assert rsre_char.iscased_unicode(i) == ( + rsre_char.getlower_unicode(i) != i or + rsre_char.getupper_unicode(i) != i) diff --git a/rpython/rlib/rsre/test/test_match.py b/rpython/rlib/rsre/test/test_match.py index c832244b11..758c015f7a 100644 --- a/rpython/rlib/rsre/test/test_match.py +++ b/rpython/rlib/rsre/test/test_match.py @@ -1,5 +1,5 @@ import re, random, py -from rpython.rlib.rsre import rsre_char +from rpython.rlib.rsre import rsre_char, rsre_constants from rpython.rlib.rsre.rpy import get_code, VERSION from rpython.rlib.rsre.test.support import match, fullmatch, Position as P @@ -306,6 +306,10 @@ class TestMatch: rsre_char.set_unicode_db(unicodedb) # r = get_code(u"[\U00010428-\U0001044f]", re.I) - assert r.pattern.count(27) == 1 # OPCODE_RANGE - r.pattern[r.pattern.index(27)] = 32 # => OPCODE_RANGE_IGNORE + assert r.pattern.count(rsre_constants.OPCODE_RANGE) == 1 + if rsre_constants.V37: + repl = rsre_constants.OPCODE37_RANGE_UNI_IGNORE + else: + repl = rsre_constants.OPCODE27_RANGE_IGNORE + r.pattern[r.pattern.index(rsre_constants.OPCODE_RANGE)] = repl assert match(r, u"\U00010428") diff --git a/rpython/rlib/rwinreg.py b/rpython/rlib/rwinreg.py index 7f45088ba3..4628804faf 100644 --- a/rpython/rlib/rwinreg.py +++ b/rpython/rlib/rwinreg.py @@ -49,71 +49,75 @@ PHKEY = rffi.CArrayPtr(HKEY) REGSAM = rwin32.DWORD def get_traits(suffix): + if suffix == 'A': + strp = rffi.CCHARP + else: + strp = rffi.CWCHARP RegSetValue = external( 'RegSetValue' + suffix, - [HKEY, rffi.CCHARP, rwin32.DWORD, rffi.CCHARP, rwin32.DWORD], + [HKEY, strp, rwin32.DWORD, strp, rwin32.DWORD], rffi.LONG) RegSetValueEx = external( 'RegSetValueEx' + suffix, - [HKEY, rffi.CCHARP, rwin32.DWORD, - rwin32.DWORD, rffi.CCHARP, rwin32.DWORD], + [HKEY, strp, rwin32.DWORD, + rwin32.DWORD, strp, rwin32.DWORD], rffi.LONG) RegQueryValue = external( 'RegQueryValue' + suffix, - [HKEY, rffi.CCHARP, rffi.CCHARP, rwin32.PLONG], + [HKEY, strp, strp, rwin32.PLONG], rffi.LONG) RegQueryValueEx = external( 'RegQueryValueEx' + suffix, - [HKEY, rffi.CCHARP, rwin32.LPDWORD, rwin32.LPDWORD, - rffi.CCHARP, rwin32.LPDWORD], + [HKEY, strp, rwin32.LPDWORD, rwin32.LPDWORD, + strp, rwin32.LPDWORD], rffi.LONG) RegCreateKey = external( 'RegCreateKey' + suffix, - [HKEY, rffi.CCHARP, PHKEY], + [HKEY, strp, PHKEY], rffi.LONG) RegCreateKeyEx = external( 'RegCreateKeyEx' + suffix, - [HKEY, rffi.CCHARP, rwin32.DWORD, rffi.CCHARP, rwin32.DWORD, + [HKEY, strp, rwin32.DWORD, strp, rwin32.DWORD, REGSAM, rffi.VOIDP, PHKEY, rwin32.LPDWORD], rffi.LONG) RegDeleteValue = external( 'RegDeleteValue' + suffix, - [HKEY, rffi.CCHARP], + [HKEY, strp], rffi.LONG) RegDeleteKey = external( 'RegDeleteKey' + suffix, - [HKEY, rffi.CCHARP], + [HKEY, strp], rffi.LONG) RegOpenKeyEx = external( 'RegOpenKeyEx' + suffix, - [HKEY, rffi.CCHARP, rwin32.DWORD, REGSAM, PHKEY], + [HKEY, strp, rwin32.DWORD, REGSAM, PHKEY], rffi.LONG) RegEnumValue = external( 'RegEnumValue' + suffix, - [HKEY, rwin32.DWORD, rffi.CCHARP, + [HKEY, rwin32.DWORD, strp, rwin32.LPDWORD, rwin32.LPDWORD, rwin32.LPDWORD, rffi.CCHARP, rwin32.LPDWORD], rffi.LONG) RegEnumKeyEx = external( 'RegEnumKeyEx' + suffix, - [HKEY, rwin32.DWORD, rffi.CCHARP, + [HKEY, rwin32.DWORD, strp, rwin32.LPDWORD, rwin32.LPDWORD, - rffi.CCHARP, rwin32.LPDWORD, rwin32.PFILETIME], + strp, rwin32.LPDWORD, rwin32.PFILETIME], rffi.LONG) RegQueryInfoKey = external( 'RegQueryInfoKey' + suffix, - [HKEY, rffi.CCHARP, rwin32.LPDWORD, rwin32.LPDWORD, + [HKEY, strp, rwin32.LPDWORD, rwin32.LPDWORD, rwin32.LPDWORD, rwin32.LPDWORD, rwin32.LPDWORD, rwin32.LPDWORD, rwin32.LPDWORD, rwin32.LPDWORD, rwin32.LPDWORD, rwin32.PFILETIME], @@ -121,17 +125,17 @@ def get_traits(suffix): RegLoadKey = external( 'RegLoadKey' + suffix, - [HKEY, rffi.CCHARP, rffi.CCHARP], + [HKEY, strp, strp], rffi.LONG) RegSaveKey = external( 'RegSaveKey' + suffix, - [HKEY, rffi.CCHARP, rffi.VOIDP], + [HKEY, strp, rffi.VOIDP], rffi.LONG) RegConnectRegistry = external( 'RegConnectRegistry' + suffix, - [rffi.CCHARP, HKEY, PHKEY], + [strp, HKEY, PHKEY], rffi.LONG) return (RegSetValue, RegSetValueEx, RegQueryValue, RegQueryValueEx, |