Reduce usage of regex (#2644)

This removes all but one usage of the `regex` dependency.

Tricky bits included:

- A bug in test_black.py where we were incorrectly using a character range. Fix also submitted separately in #2643.
- `tokenize.py` was the original use case for regex (#1047). The important bit is that we rely on `\w` to match anything valid in an identifier, and `re` fails to match a few characters as part of identifiers. My solution is to instead match all characters *except* those we know to mean something else in Python: whitespace and ASCII punctuation. This will make Black able to parse some invalid Python programs, like those that contain non-ASCII punctuation in the place of an identifier, but that seems fine to me.
- One import of `regex` remains, in `trans.py`. We use a recursive regex to parse f-strings, and only `regex` supports that. I haven't thought of a better fix there (except maybe writing a manual parser), so I'm leaving that for now.

My goal is to remove the `regex` dependency to reduce the risk of breakage due to dependencies and make life easier for users on platforms without wheels.
2021-11-30 18:01:36 -08:00 · 2021-11-30 18:01:36 -08:00 · 5e2bb528e0
commit 5e2bb528e0
parent b336b390d0
7 changed files with 13 additions and 12 deletions
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -7,12 +7,13 @@
 - Cell magics are now only processed if they are known Python cell magics. Earlier, all
  cell magics were tokenized, leading to possible indentation errors e.g. with
  `%%writefile`. (#2630)
-- Fixed Python 3.10 support on platforms without ProcessPoolExecutor (#2631)
-- Fixed `match` statements with open sequence subjects, like `match a, b:` or
+- Fix Python 3.10 support on platforms without ProcessPoolExecutor (#2631)
+- Reduce usage of the `regex` dependency (#2644)
+- Fix `match` statements with open sequence subjects, like `match a, b:` or
  `match a, *b:` (#2639) (#2659)
-- Fixed `match`/`case` statements that contain `match`/`case` soft keywords multiple
+- Fix `match`/`case` statements that contain `match`/`case` soft keywords multiple
  times, like `match re.match()` (#2661)
-- Fixed assignment to environment variables in Jupyter Notebooks (#2642)
+- Fix assignment to environment variables in Jupyter Notebooks (#2642)
 - Add `flake8-simplify` and `flake8-comprehensions` plugins (#2653)

 ## 21.11b1
--- a/src/black/__init__.py
+++ b/src/black/__init__.py
@@ -10,7 +10,7 @@
 import os
 from pathlib import Path
 from pathspec.patterns.gitwildmatch import GitWildMatchPatternError
-import regex as re
+import re
 import signal
 import sys
 import tokenize
--- a/src/black/comments.py
+++ b/src/black/comments.py
@@ -1,7 +1,7 @@
 import sys
 from dataclasses import dataclass
 from functools import lru_cache
-import regex as re
+import re
 from typing import Iterator, List, Optional, Union

 if sys.version_info >= (3, 8):
--- a/src/black/strings.py
+++ b/src/black/strings.py
@@ -2,7 +2,7 @@
 Simple formatting on strings. Further string formatting code is in trans.py.
 """

-import regex as re
+import re
 import sys
 from functools import lru_cache
 from typing import List, Pattern
@@ -156,7 +156,7 @@ def normalize_string_prefix(s: str, remove_u_prefix: bool = False) -> str:
 # performance on a long list literal of strings by 5-9% since lru_cache's
 # caching overhead is much lower.
 @lru_cache(maxsize=64)
-def _cached_compile(pattern: str) -> re.Pattern:
+def _cached_compile(pattern: str) -> Pattern[str]:
    return re.compile(pattern)


--- a/src/black/trans.py
+++ b/src/black/trans.py
@@ -4,7 +4,7 @@
 from abc import ABC, abstractmethod
 from collections import defaultdict
 from dataclasses import dataclass
-import regex as re
+import regex as re  # We need recursive patterns here (?R)
 from typing import (
    Any,
    Callable,
--- a/src/blib2to3/pgen2/conv.py
+++ b/src/blib2to3/pgen2/conv.py
@@ -29,7 +29,7 @@
 """

 # Python imports
-import regex as re
+import re

 # Local imports
 from pgen2 import grammar, token
--- a/src/blib2to3/pgen2/tokenize.py
+++ b/src/blib2to3/pgen2/tokenize.py
@@ -52,7 +52,7 @@
 __author__ = "Ka-Ping Yee <ping@lfw.org>"
 __credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"

-import regex as re
+import re
 from codecs import BOM_UTF8, lookup
 from blib2to3.pgen2.token import *

@@ -86,7 +86,7 @@ def _combinations(*l):
 Comment = r"#[^\r\n]*"
 Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
 Name = (  # this is invalid but it's fine because Name comes after Number in all groups
-    r"\w+"
+    r"[^\s#\(\)\[\]\{\}+\-*/!@$%^&=|;:'\",\.<>/?`~\\]+"
 )

 Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"