1+ import builtins
2+ import functools
3+ import keyword
14import re
5+ import token as T
6+ import tokenize
27import unicodedata
3- import functools
48
5- from idlelib import colorizer
6- from typing import cast , Iterator , Literal , Match , NamedTuple , Pattern , Self
9+ from io import StringIO
10+ from typing import cast , Iterator , Literal , Match , NamedTuple , Self
711from _colorize import ANSIColors
812
913from .types import CharBuffer , CharWidths
1216ANSI_ESCAPE_SEQUENCE = re .compile (r"\x1b\[[ -@]*[A-~]" )
1317ZERO_WIDTH_BRACKET = re .compile (r"\x01.*?\x02" )
1418ZERO_WIDTH_TRANS = str .maketrans ({"\x01 " : "" , "\x02 " : "" })
15- COLORIZE_RE : Pattern [str ] = colorizer .prog
16- IDENTIFIER_RE : Pattern [str ] = colorizer .idprog
1719IDENTIFIERS_AFTER = {"def" , "class" }
18- COLORIZE_GROUP_NAME_MAP : dict [str , str ] = colorizer .prog_group_name_to_tag
20+ BUILTINS = {str (name ) for name in dir (builtins ) if not name .startswith ('_' )}
21+
1922
2023type ColorTag = (
2124 Literal ["KEYWORD" ]
2225 | Literal ["BUILTIN" ]
2326 | Literal ["COMMENT" ]
2427 | Literal ["STRING" ]
28+ | Literal ["NUMBER" ]
29+ | Literal ["OP" ]
2530 | Literal ["DEFINITION" ]
31+ | Literal ["SOFT_KEYWORD" ]
2632 | Literal ["SYNC" ]
2733)
2834
@@ -38,6 +44,13 @@ def from_re(cls, m: Match[str], group: int | str) -> Self:
3844 re_span = m .span (group )
3945 return cls (re_span [0 ], re_span [1 ] - 1 )
4046
47+ @classmethod
48+ def from_token (cls , token : tokenize .TokenInfo , line_len : list [int ]) -> Self :
49+ return cls (
50+ line_len [token .start [0 ] - 1 ] + token .start [1 ],
51+ line_len [token .end [0 ] - 1 ] + token .end [1 ] - 1 ,
52+ )
53+
4154
4255class ColorSpan (NamedTuple ):
4356 span : Span
@@ -49,7 +62,10 @@ class ColorSpan(NamedTuple):
4962 "BUILTIN" : ANSIColors .CYAN ,
5063 "COMMENT" : ANSIColors .RED ,
5164 "STRING" : ANSIColors .GREEN ,
65+ "NUMBER" : ANSIColors .YELLOW ,
66+ "OP" : ANSIColors .RESET ,
5267 "DEFINITION" : ANSIColors .BOLD_WHITE ,
68+ "SOFT_KEYWORD" : ANSIColors .BOLD_GREEN , # FIXME: change to RESET
5369 "SYNC" : ANSIColors .RESET ,
5470}
5571
@@ -86,17 +102,19 @@ def unbracket(s: str, including_content: bool = False) -> str:
86102
87103
def gen_colors(buffer: str) -> Iterator[ColorSpan]:
    """Yield color spans for `buffer` using IDLE's colorizer regex.

    Every match of `idlelib.colorizer.prog` in `buffer` is handed to
    `gen_color_spans`, and whatever that yields is passed through.

    NOTE(review): a second `gen_colors` is defined further down in this
    module and rebinds the name, so this version is not reachable through
    `gen_colors` once the module has finished importing.
    """
    # FIXME: delete this previous version, now only kept for debugging.
    from idlelib import colorizer

    for found in colorizer.prog.finditer(buffer):
        yield from gen_color_spans(found)
96110
97111
98112def gen_color_spans (re_match : Match [str ]) -> Iterator [ColorSpan ]:
99- """Generate non-empty color spans."""
113+ # FIXME: delete this previous version, now only kept for debugging.
114+ from idlelib import colorizer
115+ COLORIZE_GROUP_NAME_MAP = colorizer .prog_group_name_to_tag
116+ IDENTIFIER_RE = colorizer .idprog
117+
100118 for tag , data in re_match .groupdict ().items ():
101119 if not data :
102120 continue
@@ -109,6 +127,104 @@ def gen_color_spans(re_match: Match[str]) -> Iterator[ColorSpan]:
109127 yield ColorSpan (span , "DEFINITION" )
110128
111129
def gen_colors(buffer: str) -> Iterator[ColorSpan]:
    """Yield the index spans to color in `buffer`, each with its color tag.

    The input `buffer` should be a valid start of a Python code block, i.e.
    it cannot be a block starting in the middle of a multiline string.

    The buffer is tokenized with `tokenize.generate_tokens`.  Highlighting is
    best-effort: spans produced before a tokenizer failure are kept.  A
    `tokenize.TokenError` is handed to `recover_unterminated_string`; a
    `SyntaxError` (including subclasses such as `IndentationError`) simply
    ends the stream.
    """
    sio = StringIO(buffer)

    # Despite the name, line_lengths is cumulative: entry i is the total
    # length of the first i lines, i.e. the buffer offset at which the
    # (0-based) line i starts.  The final entry is the length of the buffer.
    line_lengths = [0]
    for line in sio.readlines():
        line_lengths.append(line_lengths[-1] + len(line))

    sio.seek(0)
    gen = tokenize.generate_tokens(sio.readline)
    last_emitted: ColorSpan | None = None
    try:
        for color in gen_colors_from_token_stream(gen, line_lengths):
            yield color
            last_emitted = color
    except SyntaxError:
        # The tokenizer reports some malformed input (for example a dedent
        # that matches no outer indentation level) by raising SyntaxError
        # subclasses instead of TokenError.  Incomplete code is expected
        # here, so stop quietly rather than propagate the error.
        return
    except tokenize.TokenError as te:
        yield from recover_unterminated_string(
            te, line_lengths, last_emitted, buffer
        )
153+
154+
def recover_unterminated_string(
    exc: tokenize.TokenError,
    line_lengths: list[int],
    last_emitted: ColorSpan | None,
    buffer: str,
) -> Iterator[ColorSpan]:
    """Yield one STRING span covering an unterminated string literal.

    Called when tokenizing `buffer` stopped with `exc`.  If the error message
    is one of the known "unterminated string" messages, the text from the
    reported location to the end of the buffer is colored as a string.  Any
    other tokenizer error is only traced and produces no span.

    Parameters:
        exc: the tokenizer error; its args are unpacked as (message, location),
            with location a (row, column) pair.
        line_lengths: cumulative line offsets, indexed by row minus one; the
            last entry is used as the total buffer length.
        last_emitted: the last span yielded before the error, or None if
            nothing was yielded.
        buffer: the text being colored; used only in the trace message.
    """
    msg, loc = exc.args
    unterminated_prefixes = (
        "unterminated string literal",
        "unterminated f-string literal",
        "EOF in multi-line string",
        "unterminated triple-quoted f-string literal",
    )
    if not msg.startswith(unterminated_prefixes):
        trace(
            "unhandled token error({buffer}) = {te}",
            buffer=repr(buffer),
            te=str(exc),
        )
        return

    start = line_lengths[loc[0] - 1] + loc[1] - 1
    end = line_lengths[-1] - 1

    # in case FSTRING_START was already emitted: do not overlap the span that
    # is already colored, continue right after it instead
    if last_emitted is not None and start <= last_emitted.span.start:
        trace("before last emitted = {s}", s=start)
        start = last_emitted.span.end + 1

    span = Span(start, end)
    trace("yielding span {a} -> {b}", a=span.start, b=span.end)
    yield ColorSpan(span, "STRING")
185+
186+
def gen_colors_from_token_stream(
    token_generator: Iterator[tokenize.TokenInfo],
    line_lengths: list[int],
) -> Iterator[ColorSpan]:
    """Translate a stream of tokens into color spans.

    Tokens whose start and end positions coincide are skipped.  String and
    f-string pieces, comments, numbers and operators map directly to their
    tags.  A NAME token is tagged, in order of precedence, as DEFINITION
    (when it is the first name after a keyword listed in IDENTIFIERS_AFTER),
    KEYWORD, SOFT_KEYWORD or BUILTIN; any other name yields nothing.

    `line_lengths` is passed through to `Span.from_token` to convert token
    positions into buffer indices.
    """
    string_kinds = (T.STRING, T.FSTRING_START, T.FSTRING_MIDDLE, T.FSTRING_END)
    expect_definition = False

    for tok in token_generator:
        if tok.start == tok.end:
            continue

        kind = tok.type
        tag: ColorTag | None = None

        if kind in string_kinds:
            tag = "STRING"
        elif kind == T.COMMENT:
            tag = "COMMENT"
        elif kind == T.NUMBER:
            tag = "NUMBER"
        elif kind == T.OP:
            tag = "OP"
        elif kind == T.NAME:
            text = tok.string
            if expect_definition:
                # The name following `def` / `class` is the thing defined.
                expect_definition = False
                tag = "DEFINITION"
            elif keyword.iskeyword(text):
                tag = "KEYWORD"
                expect_definition = text in IDENTIFIERS_AFTER
            elif keyword.issoftkeyword(text):
                tag = "SOFT_KEYWORD"
            elif text in BUILTINS:
                tag = "BUILTIN"

        if tag is not None:
            yield ColorSpan(Span.from_token(tok, line_lengths), tag)
226+
227+
112228def disp_str (
113229 buffer : str , colors : list [ColorSpan ] | None = None , start_index : int = 0
114230) -> tuple [CharBuffer , CharWidths ]:
0 commit comments