From 9f155f4a45b6b4b76ec66dbfa89a180fa377d082 Mon Sep 17 00:00:00 2001 From: Chad Murphy Date: Fri, 24 Apr 2026 06:48:32 -0700 Subject: [PATCH] fix(emphasis): flip markers for nested emphasis round-trip (GH-12) Problem: emphasis uses the configured marker unconditionally, so `emphasis > emphasis > text` serializes as `**a**`, which re-parses as `strong > text`. The same path erases the outer type from `strong > emphasis > text` (`***a***` re-parses as `emphasis > strong`) and collapses any strict three-or-more nested emphasis chain (`***a***` re-parses as `emphasis > strong` rather than three nested emphases). GH-12 catalogued a broader family of related shapes: `***emphasis*in emphasis*`, `*a*_b__`, `a ***b*c d*`, and variants with different leading run lengths. As wooorm noted in the thread, "pulling a thread somewhere will have something happen somewhere entirely different": marker choice in one place interacts with flanking rules elsewhere, and covering every shape needs coordination beyond picking a single marker per node. Scope: land the minimal change that fixes the shapes guaranteed to drift under the current serializer, without regressing shapes where CommonMark's attention algorithm already recovers the original tree through fusion. Escape-based work on the remaining GH-12 shapes is left for follow-up. Approach: introduce `lib/util/emphasis-marker.js`. Both the emphasis handler and its peek route marker selection through it, keeping lookahead in `container-phrasing` consistent with what the handler emits. The helper flips in two narrow situations: 1. The emphasis is the only child of an attention parent (emphasis or strong), and both its opening and closing markers would sit immediately next to the parent's primary marker. Using the opposite marker (`*_a_*`, `**_a_**`) breaks the fusion into strong or em+strong. 2. 
The emphasis sits at the top of a strict same-type chain of depth two or more (every link has exactly one emphasis child) when the primary marker is `*`. Three-deep emphasis only round-trips with `_` on the outside, because `_`'s flanking rules are stricter than `*`'s. The rule is asymmetric on purpose: with primary `_` the first rule's adjacency flip alternates correctly on its own. Strong is never flipped. A run of four asterisks already pairs as two strong delimiters, six as three, and so on, so strong round-trips without help. Journey (what was tried and why the scope narrowed): - An earlier iteration flipped strong too and regressed ~18 corpus fixtures whose nested-strong shapes relied on long fused runs of asterisks. Strong was dropped from the flip to recover them. - Flipping whenever `info.before` or `info.after` matches the primary caused cascading flips on paragraph-level attention siblings: `[emphasis, strong, emphasis]` serialized as `_a___a__*a*`, where `_` + `__` at the em/strong boundary re-tokenised as a single `___` run. The flip was narrowed to attention parents only. - Widening rule 1 to first-or-last-child of any attention parent fixed several GH-12 shapes (`***emphasis*in emphasis*`, `***x*y z*`, `****x*y z*`) but regressed `***a*a*-*` (`emphasis > [emphasis > [emphasis, text], text]`): CommonMark's delimiter matching uses the leading `***` fusion to recover the three-deep structure, and the flip broke that recovery. The rule was tightened to only-child plus strict-chain. - The only-child formulation plus the strict-chain rule is the widest version verified to cause zero transitions from ok to finding across 600 corpus files (commonmark, gfm, all configurations). Edge cases covered by new tests: - Plain emphasis and strong, with primaries `*` and `_`, showing the helper is inert on non-nested attention. - `emphasis > emphasis` with each of primary `*` and `_`, yielding `*_a_*` and `_*a*_`. - `strong > emphasis` yielding `**_a_**`. 
- `emphasis > strong` and `strong > strong` preserved at `***a***` and `****a****`, proving strong is untouched. - Strict three-deep emphasis chains with both primaries, both yielding `_*_a_*_` (chain flip vs adjacency flip arrive at the same output by different routes). - Emphasis parents with more than one child, demonstrating the only-child guard preserves shapes vanilla handles. - Middle-sibling emphasis, confirming no flip at non-boundary positions. - Top-level `[emphasis, strong, emphasis]` round-trip preserved. - `***a*a*-*` fusion shape explicitly preserved as a regression guard against future widening. - Round-trips for parsed `*_a_*`, parsed `_*_a_*_`, synthesised `emphasis > emphasis`, synthesised `strong > emphasis`, and a three-deep chain preceded by a text sibling. Refs: #12 --- lib/handle/emphasis.js | 16 +- lib/util/emphasis-marker.js | 77 +++++++++ test/index.js | 300 ++++++++++++++++++++++++++++++++++++ 3 files changed, 385 insertions(+), 8 deletions(-) create mode 100644 lib/util/emphasis-marker.js diff --git a/lib/handle/emphasis.js b/lib/handle/emphasis.js index 92be547..506a308 100644 --- a/lib/handle/emphasis.js +++ b/lib/handle/emphasis.js @@ -3,7 +3,7 @@ * @import {Emphasis, Parents} from 'mdast' */ -import {checkEmphasis} from '../util/check-emphasis.js' +import {emphasisMarker} from '../util/emphasis-marker.js' import {encodeCharacterReference} from '../util/encode-character-reference.js' import {encodeInfo} from '../util/encode-info.js' @@ -11,13 +11,13 @@ emphasis.peek = emphasisPeek /** * @param {Emphasis} node - * @param {Parents | undefined} _ + * @param {Parents | undefined} parent * @param {State} state * @param {Info} info * @returns {string} */ -export function emphasis(node, _, state, info) { - const marker = checkEmphasis(state) +export function emphasis(node, parent, state, info) { + const marker = emphasisMarker(node, parent, state, info) const exit = state.enter('emphasis') const tracker = state.createTracker(info) const 
before = tracker.move(marker) @@ -59,11 +59,11 @@ export function emphasis(node, _, state, info) { } /** - * @param {Emphasis} _ - * @param {Parents | undefined} _1 + * @param {Emphasis} node + * @param {Parents | undefined} parent * @param {State} state * @returns {string} */ -function emphasisPeek(_, _1, state) { - return state.options.emphasis || '*' +function emphasisPeek(node, parent, state) { + return emphasisMarker(node, parent, state, {before: '', after: ''}) } diff --git a/lib/util/emphasis-marker.js b/lib/util/emphasis-marker.js new file mode 100644 index 0000000..7be3d48 --- /dev/null +++ b/lib/util/emphasis-marker.js @@ -0,0 +1,77 @@ +/** + * @import {Emphasis, Parents} from 'mdast' + * @import {State} from 'mdast-util-to-markdown' + */ + +import {checkEmphasis} from './check-emphasis.js' + +/** + * Pick the marker to use for an emphasis node, flipping from the configured + * marker to its opposite when the configured marker would fuse with an + * adjacent attention delimiter and re-parse as a different construct. + * + * Only emphasis gets the flip. Strong already round-trips through the + * spec's attention algorithm because a run of 4 asterisks pairs as two + * strong delimiters, and a run of 6 as three, and so on. Nested emphasis + * is the asymmetric case: a run of 2 asterisks pairs as one strong, not as + * two nested emphases, so without a flip `emphasis > emphasis > text` + * round-trips as `strong > text`. + * + * Two situations drive a flip, both narrowly scoped to avoid disturbing + * shapes the serializer already handles via fusion: + * + * 1. The emphasis is an only child of an attention parent (emphasis or + * strong), and both its opening and closing markers would be adjacent + * to the parent's primary marker. Using the opposite marker (for + * example, `*_a_*` for `emphasis > emphasis > text` with primary + * `*`) breaks the fusion. + * + * 2. 
The emphasis sits at the top of a strict same-type chain of depth at + * least 2 (each link has exactly one emphasis child), with primary + * `*`. Three-deep emphasis collapses under CommonMark rules 13-14 unless + * the outermost marker is `_`, because `_`'s flanking rules are stricter + * than `*`'s. The check is asymmetric by design: when the configured + * marker is already `_`, the adjacency flip in rule 1 alone is enough. + * + * @param {Emphasis} node + * @param {Parents | undefined} parent + * @param {State} state + * @param {{before: string, after: string}} info + * Only the `before` and `after` fields are read. + * @returns {'*' | '_'} + */ +export function emphasisMarker(node, parent, state, info) { + const primary = checkEmphasis(state) + const other = primary === '*' ? '_' : '*' + + if ( + parent && + (parent.type === 'emphasis' || parent.type === 'strong') && + 'children' in parent && + parent.children.length === 1 && + info.before.charAt(info.before.length - 1) === primary && + info.after.charAt(0) === primary + ) { + return other + } + + if (primary === '*' && strictChainDepth(node) >= 2) return other + + return primary +} + +/** + * Count the depth of a strict single-child emphasis chain descending from + * `node`. A chain is strict when every link has exactly one child and that + * child is also `emphasis`. 
+ * + * @param {Emphasis} node + * @returns {number} + */ +function strictChainDepth(node) { + const children = node.children + if (!children || children.length !== 1) return 0 + const only = children[0] + if (only.type !== 'emphasis') return 0 + return 1 + strictChainDepth(only) +} diff --git a/test/index.js b/test/index.js index e47facd..b317b69 100644 --- a/test/index.js +++ b/test/index.js @@ -1262,6 +1262,306 @@ test('emphasis', async function (t) { ) } ) + + await t.test( + 'should flip the inner marker for emphasis nested in emphasis', + async function () { + assert.equal( + to({ + type: 'emphasis', + children: [{type: 'emphasis', children: [{type: 'text', value: 'a'}]}] + }), + '*_a_*\n' + ) + } + ) + + await t.test( + 'should flip the inner marker for emphasis nested in strong', + async function () { + assert.equal( + to({ + type: 'strong', + children: [{type: 'emphasis', children: [{type: 'text', value: 'a'}]}] + }), + '**_a_**\n' + ) + } + ) + + await t.test( + 'should not flip strong nested in emphasis (spec fusion handles it)', + async function () { + assert.equal( + to({ + type: 'emphasis', + children: [{type: 'strong', children: [{type: 'text', value: 'a'}]}] + }), + '***a***\n' + ) + } + ) + + await t.test( + 'should not flip strong nested in strong (spec fusion handles it)', + async function () { + assert.equal( + to({ + type: 'strong', + children: [{type: 'strong', children: [{type: 'text', value: 'a'}]}] + }), + '****a****\n' + ) + } + ) + + await t.test( + 'should alternate markers for a strict three-deep emphasis chain', + async function () { + assert.equal( + to({ + type: 'emphasis', + children: [ + { + type: 'emphasis', + children: [ + {type: 'emphasis', children: [{type: 'text', value: 'a'}]} + ] + } + ] + }), + '_*_a_*_\n' + ) + } + ) + + await t.test( + 'should flip emphasis-in-emphasis when primary is `_`', + async function () { + assert.equal( + to( + { + type: 'emphasis', + children: [ + {type: 'emphasis', children: [{type: 'text', 
value: 'a'}]} + ] + }, + {emphasis: '_'} + ), + '_*a*_\n' + ) + } + ) + + await t.test( + 'should alternate markers for a three-deep chain when primary is `_`', + async function () { + assert.equal( + to( + { + type: 'emphasis', + children: [ + { + type: 'emphasis', + children: [ + {type: 'emphasis', children: [{type: 'text', value: 'a'}]} + ] + } + ] + }, + {emphasis: '_'} + ), + '_*_a_*_\n' + ) + } + ) + + await t.test( + 'should not flip when the emphasis parent has more than one child', + async function () { + assert.equal( + to({ + type: 'emphasis', + children: [ + {type: 'emphasis', children: [{type: 'text', value: 'a'}]}, + {type: 'text', value: 'x'} + ] + }), + '**a*x*\n' + ) + } + ) + + await t.test( + 'should not flip when the nested emphasis is a middle sibling', + async function () { + assert.equal( + to({ + type: 'emphasis', + children: [ + {type: 'text', value: 'x'}, + {type: 'emphasis', children: [{type: 'text', value: 'a'}]}, + {type: 'text', value: 'y'} + ] + }), + '*x*a*y*\n' + ) + } + ) + + await t.test( + 'should keep adjacent attention siblings on their configured marker', + async function () { + assert.equal( + to({ + type: 'paragraph', + children: [ + {type: 'emphasis', children: [{type: 'text', value: 'a'}]}, + {type: 'strong', children: [{type: 'text', value: 'a'}]}, + {type: 'emphasis', children: [{type: 'text', value: 'a'}]} + ] + }), + '*a***a***a*\n' + ) + } + ) + + await t.test( + 'should roundtrip parsed emphasis-in-emphasis', + async function () { + const tree = from('*_a_*') + removePosition(tree, {force: true}) + const out = from(to(tree)) + removePosition(out, {force: true}) + assert.deepEqual(out, tree) + } + ) + + await t.test( + 'should roundtrip synthesized emphasis-in-emphasis', + async function () { + /** @type {Root} */ + const tree = { + type: 'root', + children: [ + { + type: 'paragraph', + children: [ + { + type: 'emphasis', + children: [ + {type: 'emphasis', children: [{type: 'text', value: 'a'}]} + ] + } + ] + } + ] + 
} + const out = from(to(tree)) + removePosition(out, {force: true}) + assert.deepEqual(out, tree) + } + ) + + await t.test( + 'should roundtrip a three-deep emphasis chain', + async function () { + const tree = from('_*_a_*_') + removePosition(tree, {force: true}) + const out = from(to(tree)) + removePosition(out, {force: true}) + assert.deepEqual(out, tree) + } + ) + + await t.test('should roundtrip strong wrapping emphasis', async function () { + /** @type {Root} */ + const tree = { + type: 'root', + children: [ + { + type: 'paragraph', + children: [ + { + type: 'strong', + children: [ + {type: 'emphasis', children: [{type: 'text', value: 'a'}]} + ] + } + ] + } + ] + } + const out = from(to(tree)) + removePosition(out, {force: true}) + assert.deepEqual(out, tree) + }) + + await t.test( + 'should roundtrip a three-deep chain that follows sibling content', + async function () { + /** @type {Root} */ + const tree = { + type: 'root', + children: [ + { + type: 'paragraph', + children: [ + {type: 'text', value: 'x'}, + { + type: 'emphasis', + children: [ + { + type: 'emphasis', + children: [ + { + type: 'emphasis', + children: [{type: 'text', value: 'a'}] + } + ] + } + ] + } + ] + } + ] + } + const out = from(to(tree)) + removePosition(out, {force: true}) + assert.deepEqual(out, tree) + } + ) + + await t.test( + 'should leave `emphasis > strong` adjacent-attention shapes untouched', + async function () { + // GH-12 edge case: emphasis and strong siblings whose markers touch + // still pair up through the spec's attention algorithm. Verify this fix + // does not disturb a shape where the serializer and parser already + // agree: `*a***a***a*` round-trips as `[emphasis, strong, emphasis]`. 
+ const tree = from('*a***a***a*') + removePosition(tree, {force: true}) + const out = from(to(tree)) + removePosition(out, {force: true}) + assert.deepEqual(out, tree) + } + ) + + await t.test( + 'should leave `***a*a*-*` fusion shape untouched', + async function () { + // GH-12 edge case: `emphasis > [emphasis > [emphasis, text], text]`. + // CommonMark's delimiter matching pairs the leading `***` as fused + // openers and recovers the three-deep nesting. The helper must not + // intervene here, even though the outer emphasis has a first-child + // emphasis, because the fusion is what makes the roundtrip work. + const tree = from('***a*a*-*') + removePosition(tree, {force: true}) + const out = from(to(tree)) + removePosition(out, {force: true}) + assert.deepEqual(out, tree) + } + ) }) test('heading', async function (t) {