[blib2to3] Support non-ASCII identifiers

This support isn't *exactly* right per PEP 3131: the regex engine is a bit
too limited for that, and I didn't want to spend time on Other_ID_Start and
Other_ID_Continue unless they're actually needed.

Hopefully this doesn't slow tokenization down too much.
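The heart of the change is replacing the ASCII-only Name pattern with one built from \w. A minimal sketch of what that buys, and of the Other_ID_Start gap the message mentions (U+2118 ℘ is in Other_ID_Start, so CPython accepts it as an identifier even though it is not a \w character):

```python
import re

old_name = re.compile(r'[a-zA-Z_]\w*')            # previous pattern: ASCII start only
new_name = re.compile(r'[^\d\W]\w*', re.UNICODE)  # new pattern: any non-digit word char
# (re.UNICODE is the default for str patterns in Python 3; it is spelled
# out here as in the commit.)

for ident in ('Ø', 'łukasz'):
    assert old_name.match(ident) is None  # rejected before this commit
    assert new_name.match(ident)          # accepted now

# The documented imprecision: U+2118 (℘) is a valid identifier per
# PEP 3131 via Other_ID_Start, but it isn't a word character, so the
# regex still rejects it.
assert '℘'.isidentifier()
assert new_name.match('℘') is None
```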
Łukasz Langa 2018-04-04 21:38:25 -07:00
parent 2e0bb0fa9f
commit e36b8c71bb
3 changed files with 20 additions and 8 deletions

blib2to3/pgen2/tokenize.py

@@ -29,7 +29,7 @@
 __credits__ = \
     'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
-import string, re
+import string, re, unicodedata
 from codecs import BOM_UTF8, lookup
 from blib2to3.pgen2.token import *
@@ -52,7 +52,7 @@ def maybe(*choices): return group(*choices) + '?'
 Whitespace = r'[ \f\t]*'
 Comment = r'#[^\r\n]*'
 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
-Name = r'[a-zA-Z_]\w*'
+Name = r'[^\d\W]\w*'
 Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
 Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
@@ -103,8 +103,10 @@ def maybe(*choices): return group(*choices) + '?'
 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
-tokenprog, pseudoprog, single3prog, double3prog = list(map(
-    re.compile, (Token, PseudoToken, Single3, Double3)))
+tokenprog = re.compile(Token, re.UNICODE)
+pseudoprog = re.compile(PseudoToken, re.UNICODE)
+single3prog = re.compile(Single3)
+double3prog = re.compile(Double3)
 endprogs = {"'": re.compile(Single), '"': re.compile(Double),
             "'''": single3prog, '"""': double3prog,
             "r'''": single3prog, 'r"""': double3prog,
@@ -358,6 +360,8 @@ def untokenize(iterable):
     ut = Untokenizer()
     return ut.untokenize(iterable)

+InitialCategories = {'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'}
+
 def generate_tokens(readline):
     """
     The generate_tokens() generator requires one argument, readline, which
@@ -473,6 +477,8 @@ def generate_tokens(readline):
         while pos < max:
             pseudomatch = pseudoprog.match(line, pos)
+            if not pseudomatch:
+                print('no pseudomatch')
             if pseudomatch:                                # scan for tokens
                 start, end = pseudomatch.span(1)
                 spos, epos, pos = (lnum, start), (lnum, end), end
@@ -528,7 +534,8 @@ def generate_tokens(readline):
                         yield stashed
                         stashed = None
                     yield (STRING, token, spos, epos, line)
-                elif initial in namechars:                 # ordinary name
+                elif (initial in namechars or              # ordinary name
+                      unicodedata.category(initial) in InitialCategories):
                     if token in ('async', 'await'):
                         if async_def:
                             yield (ASYNC if token == 'async' else AWAIT,
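
In generate_tokens(), the pseudo-regex match is followed by a dispatch on the token's first character; the new elif arm accepts a non-ASCII initial when its Unicode category is in InitialCategories. A standalone sketch of that check, with namechars and InitialCategories as in the diff (looks_like_name is a hypothetical helper, not a function in the module):

```python
import string
import unicodedata

namechars = string.ascii_letters + '_'  # the module's ASCII name alphabet
InitialCategories = {'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'}

def looks_like_name(initial: str) -> bool:
    # ASCII fast path first; one unicodedata lookup for everything else.
    return initial in namechars or unicodedata.category(initial) in InitialCategories

assert looks_like_name('l')      # ASCII fast path
assert looks_like_name('Ø')      # category 'Lu'
assert not looks_like_name('+')  # category 'Sm'
```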

tests/expression.diff

@@ -103,7 +103,7 @@
  ]
  slice[0]
  slice[0:1]
-@@ -114,71 +123,90 @@
+@@ -114,73 +123,92 @@
  numpy[-(c + 1):, d]
  numpy[:, l[-2]]
  numpy[:, ::-1]
@@ -142,8 +142,10 @@
 +).order_by(
 +    models.Customer.id.asc()
 +).all()
 +
+ Ø = set()
+ authors.łukasz.say_thanks()
 +
  def gen():
      yield from outside_of_generator
 +
@@ -235,4 +237,3 @@
-+
  last_call()
  # standalone comment at ENDMARKER

tests/expression.py

@@ -135,6 +135,8 @@
 what_is_up_with_those_new_coord_names = (coord_names + set(vars_to_create)) + set(vars_to_remove)
 what_is_up_with_those_new_coord_names = (coord_names | set(vars_to_create)) - set(vars_to_remove)
 result = session.query(models.Customer.id).filter(models.Customer.account_id == account_id, models.Customer.email == email_address).order_by(models.Customer.id.asc(),).all()
+Ø = set()
+authors.łukasz.say_thanks()
 def gen():
     yield from outside_of_generator
@@ -340,6 +342,8 @@ async def f():
 ).order_by(
     models.Customer.id.asc()
 ).all()
+Ø = set()
+authors.łukasz.say_thanks()
 def gen():
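
The test additions above (Ø = set() and authors.łukasz.say_thanks() in both the input and expected-output sections) can be exercised directly against the tokenizer. A smoke test along these lines, assuming a Black checkout on PYTHONPATH so blib2to3 imports:

```python
from io import StringIO

from blib2to3.pgen2 import token, tokenize

src = "Ø = set()\nauthors.łukasz.say_thanks()\n"
# generate_tokens yields (type, string, start, end, line) 5-tuples.
tokens = tokenize.generate_tokens(StringIO(src).readline)
names = [tok_str for tok_type, tok_str, *_ in tokens if tok_type == token.NAME]
assert names == ['Ø', 'set', 'authors', 'łukasz', 'say_thanks']
```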