blob: 6b47953577e15b32543a7e3c06b47e16afb2ac0c [file] [log] [blame]
IRIS YANG31213572020-08-18 13:17:02 +00001import itertools
2import os
3import re
4import sys
5
6
def get_characters():
    """Find every Unicode character that is valid in a Python `identifier`_
    but is not matched by the regex ``\\w`` group.

    ``\\w`` matches some characters that aren't valid in identifiers, but
    :meth:`str.isidentifier` will catch that later in lexing.

    All start characters are valid continue characters, so only the
    continue position is tested, by appending each candidate to ``"a"``.

    :return: A generator yielding the matching one-character strings in
        ascending code point order.

    .. _identifier: https://docs.python.org/3/reference/lexical_analysis.html#identifiers
    """
    # Resolve the pattern once: the loop below visits every code point up
    # to ``sys.maxunicode``, so a per-iteration lookup is wasted work.
    is_word_char = re.compile(r"\w").match

    for cp in range(sys.maxunicode + 1):
        s = chr(cp)

        if ("a" + s).isidentifier() and not is_word_char(s):
            yield s
24
25
def collapse_ranges(data):
    """Group an iterable of single characters into runs of consecutive
    code points.

    A run continues for as long as each character's code point is exactly
    one greater than that of the character before it.

    :param data: Iterable of one-character strings, expected to be sorted
        and unique.
    :return: A generator of ``(first, last)`` character pairs, one pair
        per run. A run of length one yields the same character twice.
    """
    run_start = run_end = None
    prev_cp = None

    for ch in data:
        cp = ord(ch)

        if prev_cp is not None and cp == prev_cp + 1:
            # Still inside the current run; just move its end forward.
            run_end = ch
        else:
            # A gap (or the very first character): emit the finished run,
            # if there is one, and open a new run at this character.
            if prev_cp is not None:
                yield run_start, run_end

            run_start = run_end = ch

        prev_cp = cp

    # Emit the trailing run unless the input was empty.
    if prev_cp is not None:
        yield run_start, run_end
35
36
def build_pattern(ranges):
    """Render character ranges as the body of a regex character class.

    :param ranges: Iterable of ``(first, last)`` character pairs.
    :return: The concatenated fragments for all ranges. A pair spanning
        one or two characters is written as the literal characters; any
        longer span is written as ``first-last``.
    """

    def render(first, last):
        # Compare the characters before calling ``ord`` so a single
        # character range never needs its code point computed.
        if first == last:
            return first

        if ord(last) - ord(first) == 1:
            # Two adjacent characters: a dash would not save anything.
            return first + last

        return f"{first}-{last}"

    return "".join(render(first, last) for first, last in ranges)
54
55
def main():
    """Generate the identifier regex and write it to
    ``src/jinja2/_identifier.py``, resolved relative to this script's
    directory.

    The target file is overwritten with a small module that compiles the
    pattern into a module-level ``pattern`` object.
    """
    script_dir = os.path.dirname(__file__)
    target = os.path.abspath(
        os.path.join(script_dir, "..", "src", "jinja2", "_identifier.py")
    )

    characters = get_characters()
    pattern = build_pattern(collapse_ranges(characters))

    module_lines = [
        "import re\n\n",
        "# generated by scripts/generate_identifier_pattern.py\n",
        "pattern = re.compile(\n",
        f' r"[\\w{pattern}]+" # noqa: B950\n',
        ")\n",
    ]

    with open(target, "w", encoding="utf8") as out:
        out.writelines(module_lines)
71
72
# Only regenerate the pattern file when executed as a script, so importing
# this module has no side effects.
if __name__ == "__main__":
    main()