Skip to content

Commit 2bdcd06

Browse files
committed
Replace idlelib.colorizer with a faster solution
1 parent b428513 commit 2bdcd06

3 files changed

Lines changed: 221 additions & 24 deletions

File tree

Lib/_pyrepl/reader.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,7 @@ def calc_screen(self) -> list[str]:
320320
colors = list(gen_colors(self.get_unicode()))
321321
else:
322322
colors = None
323+
trace("colors = {colors}", colors=colors)
323324
lines = "".join(self.buffer[offset:]).split("\n")
324325
cursor_found = False
325326
lines_beyond_cursor = 0

Lib/_pyrepl/utils.py

Lines changed: 128 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
1+
import builtins
2+
import functools
3+
import keyword
14
import re
5+
import token as T
6+
import tokenize
27
import unicodedata
3-
import functools
48

5-
from idlelib import colorizer
6-
from typing import cast, Iterator, Literal, Match, NamedTuple, Pattern, Self
9+
from io import StringIO
10+
from typing import cast, Iterator, Literal, Match, NamedTuple, Self
711
from _colorize import ANSIColors
812

913
from .types import CharBuffer, CharWidths
@@ -12,17 +16,19 @@
1216
ANSI_ESCAPE_SEQUENCE = re.compile(r"\x1b\[[ -@]*[A-~]")
1317
ZERO_WIDTH_BRACKET = re.compile(r"\x01.*?\x02")
1418
ZERO_WIDTH_TRANS = str.maketrans({"\x01": "", "\x02": ""})
15-
COLORIZE_RE: Pattern[str] = colorizer.prog
16-
IDENTIFIER_RE: Pattern[str] = colorizer.idprog
1719
IDENTIFIERS_AFTER = {"def", "class"}
18-
COLORIZE_GROUP_NAME_MAP: dict[str, str] = colorizer.prog_group_name_to_tag
20+
BUILTINS = {str(name) for name in dir(builtins) if not name.startswith('_')}
21+
1922

2023
type ColorTag = (
2124
Literal["KEYWORD"]
2225
| Literal["BUILTIN"]
2326
| Literal["COMMENT"]
2427
| Literal["STRING"]
28+
| Literal["NUMBER"]
29+
| Literal["OP"]
2530
| Literal["DEFINITION"]
31+
| Literal["SOFT_KEYWORD"]
2632
| Literal["SYNC"]
2733
)
2834

@@ -38,6 +44,13 @@ def from_re(cls, m: Match[str], group: int | str) -> Self:
3844
re_span = m.span(group)
3945
return cls(re_span[0], re_span[1] - 1)
4046

47+
@classmethod
48+
def from_token(cls, token: tokenize.TokenInfo, line_len: list[int]) -> Self:
49+
return cls(
50+
line_len[token.start[0] - 1] + token.start[1],
51+
line_len[token.end[0] - 1] + token.end[1] - 1,
52+
)
53+
4154

4255
class ColorSpan(NamedTuple):
4356
span: Span
@@ -49,7 +62,10 @@ class ColorSpan(NamedTuple):
4962
"BUILTIN": ANSIColors.CYAN,
5063
"COMMENT": ANSIColors.RED,
5164
"STRING": ANSIColors.GREEN,
65+
"NUMBER": ANSIColors.YELLOW,
66+
"OP": ANSIColors.RESET,
5267
"DEFINITION": ANSIColors.BOLD_WHITE,
68+
"SOFT_KEYWORD": ANSIColors.BOLD_GREEN, # FIXME: change to RESET
5369
"SYNC": ANSIColors.RESET,
5470
}
5571

@@ -86,17 +102,19 @@ def unbracket(s: str, including_content: bool = False) -> str:
86102

87103

88104
def gen_colors(buffer: str) -> Iterator[ColorSpan]:
89-
"""Returns a list of index spans to color using the given color tag.
90-
91-
The input `buffer` should be a valid start of a Python code block, i.e.
92-
it cannot be a block starting in the middle of a multiline string.
93-
"""
105+
# FIXME: delete this previous version, now only kept for debugging.
106+
from idlelib import colorizer
107+
COLORIZE_RE = colorizer.prog
94108
for match in COLORIZE_RE.finditer(buffer):
95109
yield from gen_color_spans(match)
96110

97111

98112
def gen_color_spans(re_match: Match[str]) -> Iterator[ColorSpan]:
99-
"""Generate non-empty color spans."""
113+
# FIXME: delete this previous version, now only kept for debugging.
114+
from idlelib import colorizer
115+
COLORIZE_GROUP_NAME_MAP = colorizer.prog_group_name_to_tag
116+
IDENTIFIER_RE = colorizer.idprog
117+
100118
for tag, data in re_match.groupdict().items():
101119
if not data:
102120
continue
@@ -109,6 +127,104 @@ def gen_color_spans(re_match: Match[str]) -> Iterator[ColorSpan]:
109127
yield ColorSpan(span, "DEFINITION")
110128

111129

130+
def gen_colors(buffer: str) -> Iterator[ColorSpan]:
    """Yield the index spans to color in `buffer`, each with its color tag.

    The input `buffer` should be a valid start of a Python code block, i.e.
    it cannot be a block starting in the middle of a multiline string.

    The buffer is tokenized with `tokenize.generate_tokens`.  Because the
    buffer is usually code that is still being typed, tokenizer failures
    are expected and must not propagate:

    * `tokenize.TokenError` is handed to `recover_unterminated_string`,
      which may emit one final STRING span.
    * `SyntaxError` (including subclasses such as `IndentationError`)
      simply ends the stream; spans already yielded stay valid.
    """
    sio = StringIO(buffer)

    # line_lengths[i] is the absolute offset at which (1-based) line i + 1
    # starts, i.e. the cumulative length of all preceding lines.
    line_lengths = [0]
    for line in sio.readlines():
        line_lengths.append(line_lengths[-1] + len(line))

    sio.seek(0)
    gen = tokenize.generate_tokens(sio.readline)
    last_emitted: ColorSpan | None = None
    try:
        for color in gen_colors_from_token_stream(gen, line_lengths):
            yield color
            last_emitted = color
    except SyntaxError:
        # e.g. an IndentationError for a dedent that matches no outer
        # level; nothing more can be colored reliably.
        return
    except tokenize.TokenError as te:
        yield from recover_unterminated_string(
            te, line_lengths, last_emitted, buffer
        )
153+
154+
155+
def recover_unterminated_string(
    exc: tokenize.TokenError,
    line_lengths: list[int],
    last_emitted: ColorSpan | None,
    buffer: str,
) -> Iterator[ColorSpan]:
    """Yield a STRING span for a string literal the tokenizer gave up on.

    `exc` is the `tokenize.TokenError` raised while tokenizing `buffer`;
    its args are unpacked as `(message, (row, column))`.  When the message
    reports an unterminated (f-)string, a single span running from the
    reported location to the end of the buffer is yielded.  Any other
    token error is only traced and yields nothing.

    `line_lengths` holds the cumulative line lengths computed by
    `gen_colors`, and `last_emitted` is the last span already yielded for
    this buffer (or None), used to avoid overlapping it.
    """
    # NOTE(review): `trace` is not among the imports visible for this
    # module -- confirm it is in scope.
    msg, loc = exc.args
    unterminated_prefixes = (
        "unterminated string literal",
        "unterminated f-string literal",
        "EOF in multi-line string",
        "unterminated triple-quoted f-string literal",
    )
    if not msg.startswith(unterminated_prefixes):
        trace(
            "unhandled token error({buffer}) = {te}",
            buffer=repr(buffer),
            te=str(exc),
        )
        return

    start = line_lengths[loc[0] - 1] + loc[1] - 1
    end = line_lengths[-1] - 1

    # in case FSTRING_START was already emitted
    if last_emitted is not None and start <= last_emitted.span.start:
        trace("before last emitted = {s}", s=start)
        start = last_emitted.span.end + 1

    span = Span(start, end)
    trace("yielding span {a} -> {b}", a=span.start, b=span.end)
    yield ColorSpan(span, "STRING")
185+
186+
187+
def gen_colors_from_token_stream(
188+
token_generator: Iterator[tokenize.TokenInfo],
189+
line_lengths: list[int],
190+
) -> Iterator[ColorSpan]:
191+
is_def_name = False
192+
for token in token_generator:
193+
if token.start == token.end:
194+
continue
195+
196+
match token.type:
197+
case T.STRING | T.FSTRING_START | T.FSTRING_MIDDLE | T.FSTRING_END:
198+
span = Span.from_token(token, line_lengths)
199+
yield ColorSpan(span, "STRING")
200+
case T.COMMENT:
201+
span = Span.from_token(token, line_lengths)
202+
yield ColorSpan(span, "COMMENT")
203+
case T.NUMBER:
204+
span = Span.from_token(token, line_lengths)
205+
yield ColorSpan(span, "NUMBER")
206+
case T.OP:
207+
span = Span.from_token(token, line_lengths)
208+
yield ColorSpan(span, "OP")
209+
case T.NAME:
210+
if is_def_name:
211+
is_def_name = False
212+
span = Span.from_token(token, line_lengths)
213+
yield ColorSpan(span, "DEFINITION")
214+
elif keyword.iskeyword(token.string):
215+
span = Span.from_token(token, line_lengths)
216+
yield ColorSpan(span, "KEYWORD")
217+
if token.string in IDENTIFIERS_AFTER:
218+
is_def_name = True
219+
elif keyword.issoftkeyword(token.string):
220+
span = Span.from_token(token, line_lengths)
221+
yield ColorSpan(span, "SOFT_KEYWORD")
222+
elif token.string in BUILTINS:
223+
span = Span.from_token(token, line_lengths)
224+
yield ColorSpan(span, "BUILTIN")
225+
# TODO: soft keywords
226+
227+
112228
def disp_str(
113229
buffer: str, colors: list[ColorSpan] | None = None, start_index: int = 0
114230
) -> tuple[CharBuffer, CharWidths]:

Lib/test/test_pyrepl/test_reader.py

Lines changed: 92 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@
1313
from _pyrepl.utils import TAG_TO_ANSI
1414

1515

16-
colors = {k[0].lower(): v for k, v in TAG_TO_ANSI.items() if k != "SYNC"}
17-
colors["z"] = TAG_TO_ANSI["SYNC"]
16+
overrides = {"SYNC": "z", "SOFT_KEYWORD": "K"}
17+
colors = {overrides.get(k, k[0].lower()): v for k, v in TAG_TO_ANSI.items()}
1818

1919

2020
class TestReader(ScreenEqualMixin, TestCase):
@@ -384,20 +384,20 @@ def funct(case: str = sys.platform) -> None:
384384
)
385385
expected = dedent(
386386
"""\
387-
{k}import{z} re, sys
388-
{a}{k}def{z} {d}funct{z}(case: {b}str{z} = sys.platform) -> {k}None{z}:
389-
match = re.search(
390-
{s}"(me)"{z},
387+
{k}import{z} re{o},{z} sys
388+
{a}{k}def{z} {d}funct{z}{o}({z}{K}case{z}{o}:{z} {b}str{z} {o}={z} sys{o}.{z}platform{o}){z} {o}->{z} {k}None{z}{o}:{z}
389+
{K}match{z} {o}={z} re{o}.{z}search{o}({z}
390+
{s}"(me)"{z}{o},{z}
391391
{s}'''{z}
392392
{s} Come on{z}
393393
{s} Come on now{z}
394394
{s} You know that it's time to emerge{z}
395-
{s} '''{z},
396-
)
397-
{k}match{z} case:
398-
{k}case{z} {s}"emscripten"{z}: {b}print{z}({s}"on the web"{z})
399-
{k}case{z} {s}"ios"{z} | {s}"android"{z}: {b}print{z}({s}"on the phone"{z})
400-
{k}case{z} {k}_{z}: {b}print{z}({s}'arms around'{z}, match.group(1))
395+
{s} '''{z}{o},{z}
396+
{o}){z}
397+
{K}match{z} {K}case{z}{o}:{z}
398+
{K}case{z} {s}"emscripten"{z}{o}:{z} {b}print{z}{o}({z}{s}"on the web"{z}{o}){z}
399+
{K}case{z} {s}"ios"{z} {o}|{z} {s}"android"{z}{o}:{z} {b}print{z}{o}({z}{s}"on the phone"{z}{o}){z}
400+
{K}case{z} {K}_{z}{o}:{z} {b}print{z}{o}({z}{s}'arms around'{z}{o},{z} {K}match{z}{o}.{z}group{o}({z}{n}1{z}{o}){z}{o}){z}
401401
"""
402402
)
403403
expected_sync = expected.format(a="", **colors)
@@ -419,3 +419,83 @@ def funct(case: str = sys.platform) -> None:
419419
self.assert_screen_equal(reader, expected_async)
420420
self.assertEqual(reader.pos, 21)
421421
self.assertEqual(reader.cxy, (6, 1))
422+
423+
def test_syntax_highlighting_incomplete_string_first_line(self):
424+
code = dedent(
425+
"""\
426+
def unfinished_function(arg: str = "still typing
427+
"""
428+
)
429+
expected = dedent(
430+
"""\
431+
{k}def{z} {d}unfinished_function{z}{o}({z}arg{o}:{z} {b}str{z} {o}={z} {s}"still typing{z}
432+
"""
433+
).format(**colors)
434+
events = code_to_events(code)
435+
reader, _ = handle_all_events(events, prepare_reader=reader_force_colors)
436+
self.assert_screen_equal(reader, code, clean=True)
437+
self.assert_screen_equal(reader, expected)
438+
439+
def test_syntax_highlighting_incomplete_string_another_line(self):
440+
code = dedent(
441+
"""\
442+
def unfinished_function(
443+
arg: str = "still typing
444+
"""
445+
)
446+
expected = dedent(
447+
"""\
448+
{k}def{z} {d}unfinished_function{z}{o}({z}
449+
arg{o}:{z} {b}str{z} {o}={z} {s}"still typing{z}
450+
"""
451+
).format(**colors)
452+
events = code_to_events(code)
453+
reader, _ = handle_all_events(events, prepare_reader=reader_force_colors)
454+
self.assert_screen_equal(reader, code, clean=True)
455+
self.assert_screen_equal(reader, expected)
456+
457+
def test_syntax_highlighting_incomplete_multiline_string(self):
458+
code = dedent(
459+
"""\
460+
def unfinished_function():
461+
'''Still writing
462+
the docstring
463+
"""
464+
)
465+
expected = dedent(
466+
"""\
467+
{k}def{z} {d}unfinished_function{z}{o}({z}{o}){z}{o}:{z}
468+
{s}'''Still writing{z}
469+
{s} the docstring{z}
470+
"""
471+
).format(**colors)
472+
events = code_to_events(code)
473+
reader, _ = handle_all_events(events, prepare_reader=reader_force_colors)
474+
self.assert_screen_equal(reader, code, clean=True)
475+
self.assert_screen_equal(reader, expected)
476+
477+
def test_syntax_highlighting_incomplete_fstring(self):
478+
code = dedent(
479+
"""\
480+
def unfinished_function():
481+
var = f"Single-quote but {
482+
1
483+
+
484+
1
485+
} multi-line!
486+
"""
487+
)
488+
expected = dedent(
489+
"""\
490+
{k}def{z} {d}unfinished_function{z}{o}({z}{o}){z}{o}:{z}
491+
var {o}={z} {s}f"{z}{s}Single-quote but {z}{o}{OB}{z}
492+
{n}1{z}
493+
{o}+{z}
494+
{n}1{z}
495+
{o}{CB}{z}{s} multi-line!{z}
496+
"""
497+
).format(OB="{", CB="}", **colors)
498+
events = code_to_events(code)
499+
reader, _ = handle_all_events(events, prepare_reader=reader_force_colors)
500+
self.assert_screen_equal(reader, code, clean=True)
501+
self.assert_screen_equal(reader, expected)

0 commit comments

Comments
 (0)