Accelerate Unicode identifier support (backport from Lib/tokenize.py)
This commit is contained in:
parent
82198030ee
commit
c86fb36232
@ -29,7 +29,7 @@
|
|||||||
__credits__ = \
|
__credits__ = \
|
||||||
'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
|
'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
|
||||||
|
|
||||||
import string, re, unicodedata
|
import re
|
||||||
from codecs import BOM_UTF8, lookup
|
from codecs import BOM_UTF8, lookup
|
||||||
from blib2to3.pgen2.token import *
|
from blib2to3.pgen2.token import *
|
||||||
|
|
||||||
@ -56,7 +56,7 @@ def _combinations(*l):
|
|||||||
Whitespace = r'[ \f\t]*'
|
Whitespace = r'[ \f\t]*'
|
||||||
Comment = r'#[^\r\n]*'
|
Comment = r'#[^\r\n]*'
|
||||||
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
|
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
|
||||||
Name = r'[^\d\W]\w*'
|
Name = r'\w+' # deliberately loose: \w+ also matches text that starts with a digit, which is not a valid identifier, but that is harmless because Name comes after Number in all groups
|
||||||
|
|
||||||
Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
|
Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
|
||||||
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
|
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
|
||||||
@ -334,8 +334,6 @@ def untokenize(iterable):
|
|||||||
ut = Untokenizer()
|
ut = Untokenizer()
|
||||||
return ut.untokenize(iterable)
|
return ut.untokenize(iterable)
|
||||||
|
|
||||||
InitialCategories = {'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'}
|
|
||||||
|
|
||||||
def generate_tokens(readline):
|
def generate_tokens(readline):
|
||||||
"""
|
"""
|
||||||
The generate_tokens() generator requires one argument, readline, which
|
The generate_tokens() generator requires one argument, readline, which
|
||||||
@ -353,7 +351,7 @@ def generate_tokens(readline):
|
|||||||
logical line; continuation lines are included.
|
logical line; continuation lines are included.
|
||||||
"""
|
"""
|
||||||
lnum = parenlev = continued = 0
|
lnum = parenlev = continued = 0
|
||||||
namechars, numchars = string.ascii_letters + '_', '0123456789'
|
numchars = '0123456789'
|
||||||
contstr, needcont = '', 0
|
contstr, needcont = '', 0
|
||||||
contline = None
|
contline = None
|
||||||
indents = [0]
|
indents = [0]
|
||||||
@ -506,8 +504,7 @@ def generate_tokens(readline):
|
|||||||
yield stashed
|
yield stashed
|
||||||
stashed = None
|
stashed = None
|
||||||
yield (STRING, token, spos, epos, line)
|
yield (STRING, token, spos, epos, line)
|
||||||
elif (initial in namechars or # ordinary name
|
elif initial.isidentifier(): # ordinary name
|
||||||
unicodedata.category(initial) in InitialCategories):
|
|
||||||
if token in ('async', 'await'):
|
if token in ('async', 'await'):
|
||||||
if async_def:
|
if async_def:
|
||||||
yield (ASYNC if token == 'async' else AWAIT,
|
yield (ASYNC if token == 'async' else AWAIT,
|
||||||
|
Loading…
Reference in New Issue
Block a user