Skip to content

Commit 7ea88f7

Browse files
committed
py/lexer: Add support for nested f-strings within f-strings.
It turns out that it's relatively simple to support nested f-strings, which is what this commit implements. The way the MicroPython f-string parser works at the moment is: 1. it extracts the f-string arguments (things in curly braces) into a temporary buffer (a vstr); 2. once the f-string ends (reaches its closing quote) the lexer switches to tokenizing the temporary buffer; 3. once the buffer is empty it switches back to the stream. The temporary buffer can easily hold f-strings itself (i.e. nested f-strings) and they can be re-parsed by the lexer using the same algorithm. The only thing stopping that from working is that the temporary buffer can't be reused for the nested f-string because it's currently being parsed. This commit fixes that by adding a second temporary buffer, which is the "injection" buffer. That allows an arbitrary number of nestings with a simple modification to the original algorithm: 1. when an f-string is encountered the string is parsed and its arguments are extracted into `fstring_args`; 2. when the f-string finishes, `fstring_args` is inserted into the current position in `inject_chrs` (which is the start of that buffer if no injection is ongoing); 3. `fstring_args` is now cleared and ready for any further f-strings (nested or not); 4. the lexer switches to `inject_chrs` if it's not already reading from it; 5. if an f-string appeared inside the f-string then it is in `inject_chrs` and can be processed as before, extracting its arguments into `fstring_args`, which can then be inserted again into `inject_chrs`; 6. once `inject_chrs` is exhausted (meaning that all levels of f-strings have been fully processed) the lexer switches back to tokenizing the stream. Amazingly, this scheme supports arbitrary numbers of nestings of f-strings using the same quote style. This adds some code size and a bit more memory usage for the lexer.
In particular, for a single (non-nested) f-string it now makes an extra copy of the `fstring_args` data, when copying it across to `inject_chrs`. Otherwise, memory use only goes up with the complexity of nested f-strings. Signed-off-by: Damien George <damien@micropython.org>
1 parent 617c7db commit 7ea88f7

5 files changed

Lines changed: 48 additions & 35 deletions

File tree

py/lexer.c

Lines changed: 29 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -145,27 +145,17 @@ static void next_char(mp_lexer_t *lex) {
145145
lex->chr0 = lex->chr1;
146146
lex->chr1 = lex->chr2;
147147

148-
// and add the next byte from either the fstring args or the reader
148+
// and add the next byte from either inject_chrs or the reader
149149
mp_uint_t chr2;
150150
fetch_next_byte:
151151
#if MICROPY_PY_FSTRINGS
152-
if (lex->fstring_args_idx) {
153-
// if there are saved chars, then we're currently injecting fstring args
154-
if (lex->fstring_args_idx < lex->fstring_args.len) {
155-
chr2 = lex->fstring_args.buf[lex->fstring_args_idx++];
156-
} else {
157-
// no more fstring arg bytes
158-
chr2 = '\0';
159-
}
160-
161-
if (lex->chr0 == '\0') {
162-
// consumed all fstring data, restore saved input queue
163-
lex->chr0 = lex->chr0_saved;
164-
lex->chr1 = lex->chr1_saved;
165-
chr2 = lex->chr2_saved;
166-
// stop consuming fstring arg data
167-
vstr_reset(&lex->fstring_args);
168-
lex->fstring_args_idx = 0;
152+
if (lex->inject_chrs_idx) {
153+
// if there are saved chars, then we're currently injecting them
154+
chr2 = lex->inject_chrs.buf[lex->inject_chrs_idx++];
155+
if (lex->inject_chrs_idx >= lex->inject_chrs.len) {
156+
// consumed all injected characters, switch back to the input stream
157+
vstr_reset(&lex->inject_chrs);
158+
lex->inject_chrs_idx = 0;
169159
}
170160
} else
171161
#endif
@@ -346,8 +336,7 @@ static void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring)
346336
#if MICROPY_PY_FSTRINGS
347337
if (is_fstring) {
348338
// assume there's going to be interpolation, so prep the injection data
349-
// fstring_args_idx==0 && len(fstring_args)>0 means we're extracting the args.
350-
// only when fstring_args_idx>0 will we consume the arg data
339+
// len(fstring_args)>0 means we're extracting the args.
351340
// lex->fstring_args is reset when finished, so at this point there are two cases:
352341
// - lex->fstring_args is empty: start of a new f-string
353342
// - lex->fstring_args is non-empty: concatenation of adjacent f-strings
@@ -570,19 +559,26 @@ static bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
570559

571560
void mp_lexer_to_next(mp_lexer_t *lex) {
572561
#if MICROPY_PY_FSTRINGS
573-
if (lex->fstring_args.len && lex->fstring_args_idx == 0) {
562+
if (lex->fstring_args.len) {
574563
// moving onto the next token means the literal string is complete.
575564
// switch into injecting the format args.
576565
vstr_add_byte(&lex->fstring_args, ')');
577-
lex->chr0_saved = lex->chr0;
578-
lex->chr1_saved = lex->chr1;
579-
lex->chr2_saved = lex->chr2;
580-
lex->chr0 = lex->fstring_args.buf[0];
581-
lex->chr1 = lex->fstring_args.buf[1];
582-
lex->chr2 = lex->fstring_args.buf[2];
583-
// we've already extracted 3 chars, but setting this non-zero also
584-
// means we'll start consuming the fstring data
585-
lex->fstring_args_idx = 3;
566+
if (lex->inject_chrs_idx == 0) {
567+
// switch from stream to inject_chrs
568+
char *s = vstr_add_len(&lex->inject_chrs, 3);
569+
s[0] = lex->chr0;
570+
s[1] = lex->chr1;
571+
s[2] = lex->chr2;
572+
} else {
573+
// already consuming from inject_chrs, rewind cached chars to insert new ones
574+
assert(lex->inject_chrs_idx >= 3);
575+
lex->inject_chrs_idx -= 3;
576+
}
577+
vstr_ins_strn(&lex->inject_chrs, lex->inject_chrs_idx, lex->fstring_args.buf, lex->fstring_args.len);
578+
vstr_reset(&lex->fstring_args);
579+
lex->chr0 = lex->inject_chrs.buf[lex->inject_chrs_idx++];
580+
lex->chr1 = lex->inject_chrs.buf[lex->inject_chrs_idx++];
581+
lex->chr2 = lex->inject_chrs.buf[lex->inject_chrs_idx++];
586582
}
587583
#endif
588584

@@ -867,8 +863,9 @@ mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
867863
lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
868864
vstr_init(&lex->vstr, 32);
869865
#if MICROPY_PY_FSTRINGS
866+
vstr_init(&lex->inject_chrs, 0);
867+
lex->inject_chrs_idx = 0;
870868
vstr_init(&lex->fstring_args, 0);
871-
lex->fstring_args_idx = 0;
872869
#endif
873870

874871
// store sentinel for first indentation level
@@ -925,6 +922,7 @@ void mp_lexer_free(mp_lexer_t *lex) {
925922
lex->reader.close(lex->reader.data);
926923
vstr_clear(&lex->vstr);
927924
#if MICROPY_PY_FSTRINGS
925+
vstr_clear(&lex->inject_chrs);
928926
vstr_clear(&lex->fstring_args);
929927
#endif
930928
m_del(uint16_t, lex->indent_level, lex->alloc_indent_level);

py/lexer.h

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -164,9 +164,6 @@ typedef struct _mp_lexer_t {
164164

165165
uint32_t chr0; // first cached byte from source (32-bits for efficient access)
166166
uint8_t chr1, chr2; // subsequent cached bytes from source
167-
#if MICROPY_PY_FSTRINGS
168-
uint8_t chr0_saved, chr1_saved, chr2_saved; // current cached bytes from alt source
169-
#endif
170167

171168
size_t line; // current source line
172169
size_t column; // current source column
@@ -183,8 +180,9 @@ typedef struct _mp_lexer_t {
183180
mp_token_kind_t tok_kind; // token kind
184181
vstr_t vstr; // token data
185182
#if MICROPY_PY_FSTRINGS
183+
vstr_t inject_chrs; // characters currently being injected into the stream
184+
size_t inject_chrs_idx; // current index into inject_chrs
186185
vstr_t fstring_args; // extracted arguments to pass to .format()
187-
size_t fstring_args_idx; // how many bytes of fstring_args have been read
188186
#endif
189187
} mp_lexer_t;
190188

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Test nesting of f-strings within f-strings.
2+
3+
x = 1
4+
5+
# 2-level nesting, with padding.
6+
print(f"a{f'b{x:2}c':>5}d")
7+
8+
# 4-level nesting using the different styles of quotes.
9+
print(f"""a{f'''b{f"c{f'd{x}e'}f"}g'''}h""")
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Test nesting of f-strings within f-strings.
2+
# These tests rely on Python 3.12+ to use the same quote style for nesting.
3+
4+
x = 1
5+
6+
# 8-level nesting using the same quote style.
7+
print(f"a{f"b{f"c{f"d{f"e{f"f{f"g{f"h{x}i"}j"}k"}l"}m"}n"}o"}p")
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
abcdefgh1ijklmnop

0 commit comments

Comments
 (0)