• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Yoast / wordpress-seo / 73b72946e659e5fc73d3db84de3e2f50d08337dc

17 Mar 2025 06:07PM UTC coverage: 58.067%. First build
73b72946e659e5fc73d3db84de3e2f50d08337dc

Pull #21974

github

web-flow
Merge 8798bef71 into 31bce8f10
Pull Request #21974: Converts the sentence beginnings assessment to use the HTML parser

7987 of 14094 branches covered (56.67%)

Branch coverage included in aggregate %.

77 of 79 new or added lines in 13 files covered. (97.47%)

13819 of 23459 relevant lines covered (58.91%)

101643.55 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

92.5
/packages/js/src/decorator/helpers/positionBasedAnnotationHelper.js
1
import { forEachRight } from "lodash";
2
import { helpers } from "yoastseo";
3

4
/**
 * Regex to detect HTML tags.
 *
 * A tag is matched as `<`, followed by a letter or a slash, followed by any number
 * (including zero) of characters other than `<` and `>`, followed by `>`.
 * Allowing zero characters after the first one is what makes attribute-less single-letter
 * opening tags such as `<b>`, `<i>`, `<s>`, `<u>` and `<p>` match, just like their closing tags do.
 *
 * Please note that this regex will also detect non-HTML tags that are wrapped in `<>`.
 * For example, in the following sentence, `<strong class="">cats <dogs> rabbit </strong>`,
 * we will match `<strong class="">`, `<dogs>` and `</strong>`. This is an edge case though.
 *
 * The regex carries the global flag: use it with `String.prototype.matchAll` or `String.prototype.match`.
 *
 * @type {RegExp}
 */
const htmlTagsRegex = /(<([a-z]|\/)[^<>]*>)/ig;
8✔
12

13
/**
 * Regex to detect HTML entities, as provided by the `htmlEntities` helpers of `yoastseo`.
 *
 * Within this file it is used with `String.prototype.matchAll`, and the first capture group of
 * every match is read as the entity without its leading ampersand (e.g. `amp;` for `&amp;`).
 * NOTE(review): judging by the name, the greater-than sign entity is presumably excluded - confirm in `yoastseo`.
 *
 * @type {RegExp}
 */
const entitiesWithoutGTSRegex = helpers.htmlEntities.entitiesWithoutGTSRegex;
8✔
18

19
/**
 * Shifts the offsets of a Mark that belongs to the first section of a Yoast sub-block.
 *
 * The first section of a Yoast block is always wrapped in a `<strong>` tag, and `yoastseo`
 * includes the length of that opening tag when it calculates the position of a matched token.
 * The HTML of the first section that is available here does not contain that tag,
 * so both offsets are moved back by the length of the opening `<strong>` tag.
 *
 * @param {number} blockStartOffset The block start offset of the Mark to adjust.
 * @param {number} blockEndOffset   The block end offset of the Mark to adjust.
 * @param {string} blockName        The block name.
 *
 * @returns {{blockStartOffset: number, blockEndOffset: number}} The adjusted start offset and end offset of the Mark.
 */
const adjustFirstSectionOffsets = ( blockStartOffset, blockEndOffset, blockName ) => {
	// The opening tag that wraps the question of a Yoast FAQ block.
	const faqQuestionOpenTag = "<strong class=\"schema-faq-question\">";
	// The opening tag that wraps the step name of a Yoast How-To block.
	const howToStepNameOpenTag = "<strong class=\"schema-how-to-step-name\">";

	// Every block name other than the FAQ block is handled as a How-To block.
	let openTagLength = howToStepNameOpenTag.length;
	if ( blockName === "yoast/faq-block" ) {
		openTagLength = faqQuestionOpenTag.length;
	}

	return {
		blockStartOffset: blockStartOffset - openTagLength,
		blockEndOffset: blockEndOffset - openTagLength,
	};
};
55

56
/**
 * Retrieves the combined length of the given HTML tags, counting every `<br>` tag one character short.
 *
 * `<br>` tags are treated as sentence delimiters, which is why 1 is subtracted from their length.
 * A tag counts as a `<br>` tag when its name is exactly `br` in any letter case
 * (`<br>`, `<br/>`, `<br />`, `<BR>`, `</br>`); other tags whose name merely starts with `br` are counted in full.
 *
 * @param {Array<ArrayLike<string>>} htmlTags The tag matches (e.g. the result of `matchAll`):
 *                                            the first element of each entry is the full tag string.
 *                                            A missing list is treated as empty.
 *
 * @returns {number} The length of the given HTML tags.
 */
const getTagsLength = ( htmlTags ) => {
	// The character after `br` must end the tag name: whitespace, a self-closing slash or the closing bracket.
	const brTagRegex = /^<\/?br[\s/>]/i;

	let tagsLength = 0;
	// The iteration order is irrelevant, since the lengths are only summed.
	for ( const [ tag ] of htmlTags || [] ) {
		tagsLength += brTagRegex.test( tag ) ? tag.length - 1 : tag.length;
	}

	return tagsLength;
};
76

77
/**
 * Adjusts the block start and end offsets of a given Mark when the block HTML contains HTML tags.
 *
 * `yoastseo` and the block annotation API "parse" a block differently:
 * inside `yoastseo`, the position of a token also includes the length of all the HTML tags that precede it in the block,
 * whereas the block annotation API applies annotations to "clean" text without any HTML tags.
 * Each offset is therefore reduced by the length of the HTML tags found in the HTML that precedes it.
 *
 * Example:
 * From `yoastseo`:
 * - Text: This is a giant <strong>panda</strong>.
 * - Range of "panda": 24 - 29
 * In the block:
 * - Text: This is a giant panda.
 * - Range of "panda": 16 - 21
 *
 * @param {string} slicedBlockHtmlToStartOffset The block HTML from the 0 index to the index of the block start offset.
 * @param {string} slicedBlockHtmlToEndOffset   The block HTML from the 0 index to the index of the block end offset.
 * @param {number} blockStartOffset             The block start offset of the Mark to adjust.
 * @param {number} blockEndOffset               The block end offset of the Mark to adjust.
 *
 * @returns {{blockStartOffset: number, blockEndOffset: number}} The adjusted start offset and end offset of the Mark.
 */
const adjustOffsetsForHtmlTags = ( slicedBlockHtmlToStartOffset, slicedBlockHtmlToEndOffset, blockStartOffset, blockEndOffset ) => {
	// Measures the length of all the HTML tags found in a piece of HTML.
	const measureTags = ( htmlSlice ) => getTagsLength( Array.from( htmlSlice.matchAll( htmlTagsRegex ) ) );

	return {
		blockStartOffset: blockStartOffset - measureTags( slicedBlockHtmlToStartOffset ),
		blockEndOffset: blockEndOffset - measureTags( slicedBlockHtmlToEndOffset ),
	};
};
112

113
/**
 * Adjusts a block start or end offset of a given Mark when the block HTML contains HTML entities.
 *
 * @param {string} html     The block HTML: the callers in this file pass the HTML sliced from the 0 index up to the offset.
 * @param {number} offset   The block start or end offset of the Mark to adjust.
 * @param {string} richText The rich text of the block.
 *
 * @returns {number} The adjusted offset.
 */
const adjustOffsetsForHtmlEntities = ( html, offset, richText ) => {
	let adjustedOffset = offset;

	/*
	 * The first capture group of a match is the entity without its ampersand: `amp;` for `&amp;`.
	 * Subtracting the length of that group makes every matched entity count as one single character.
	 */
	for ( const entityMatch of html.matchAll( entitiesWithoutGTSRegex ) ) {
		adjustedOffset -= entityMatch[ 1 ].length;
	}

	/*
	 * Special case for the non-breaking space: it is sometimes represented as its unicode character `\u00a0`
	 * in the block's HTML, in which case the offset is reduced by the length of `&nbsp;` minus one.
	 * NOTE(review): the reduction is applied once, no matter how many non-breaking spaces the HTML contains - confirm this is intended.
	 */
	if ( html.match( /\u00a0/ig ) !== null ) {
		adjustedOffset -= "&nbsp;".length - 1;
	}

	/*
	 * Special case for the greater-than sign: it is looked up in the rich text rather than in the HTML.
	 * This minimizes the risk of adjusting the offset for the `>` that closes an HTML tag, e.g. `<strong>` or `<a>`.
	 * NOTE(review): the rich text is checked as a whole and the reduction is applied once,
	 * regardless of where and how often `>` occurs - confirm this is intended.
	 */
	if ( richText.match( /\u003e/ig ) !== null ) {
		adjustedOffset -= "&gt;".length - 1;
	}

	return adjustedOffset;
};
149

150
/**
 * Adjusts the block start and end offsets of a given Mark when the block HTML contains HTML tags or entities.
 *
 * The start and the end offset are deliberately adjusted separately, each one based on the HTML that precedes it.
 * An HTML tag or entity can also be present inside the range of the Mark itself, so the end offset
 * additionally has to be reduced by the length of the tags and entities found within that range.
 *
 * For example, we want to highlight the word "Bear™" of this HTML "The great <em><strong>Panda &amp; Bear</strong></em>&trade;".
 * The Mark's offsets from `yoastseo` are { blockStartOffset: 34, blockEndOffset: 53 }.
 * However, since in Gutenberg we apply the annotation to the rich text "The great Panda & Bear™",
 * we need to adjust the offsets above to { blockStartOffset: 18, blockEndOffset: 23 }.
 * Only subtracting the end offset by the length of the tags and entities found before the start offset
 * would result in incorrect position information.
 *
 * @param {number} blockStartOffset The block start offset of the Mark to adjust.
 * @param {number} blockEndOffset   The block end offset of the Mark to adjust.
 * @param {string} blockHtml        The HTML of the block.
 * @param {string} richText         The rich text of the block.
 *
 * @returns {{blockStartOffset: number, blockEndOffset: number}} The adjusted start offset and end offset of the Mark.
 */
const adjustMarkOffsets = ( blockStartOffset, blockEndOffset, blockHtml, richText ) => {
	const htmlUpToStart = blockHtml.slice( 0, blockStartOffset );
	const htmlUpToEnd = blockHtml.slice( 0, blockEndOffset );

	// First, discount the HTML tags found between the start of the HTML and the start/end offset of the Mark.
	const offsetsWithoutTags = adjustOffsetsForHtmlTags( htmlUpToStart, htmlUpToEnd, blockStartOffset, blockEndOffset );

	/*
	 * Then, discount the HTML entities found in those same pieces of HTML.
	 *
	 * In `yoastseo`, the length of an HTML entity is the length of its extended version: the ampersand counts as `&amp;` => 5.
	 * In the Gutenberg editor, where the rich text is annotated, the ampersand is represented as `&`,
	 * so keeping a length of 5 would result in an incorrect annotation.
	 */
	return {
		blockStartOffset: adjustOffsetsForHtmlEntities( htmlUpToStart, offsetsWithoutTags.blockStartOffset, richText ),
		blockEndOffset: adjustOffsetsForHtmlEntities( htmlUpToEnd, offsetsWithoutTags.blockEndOffset, richText ),
	};
};
199

200

201
/**
 * Creates an annotation range if the given Mark has position information.
 * A helper for position-based highlighting.
 *
 * @param {Mark}   mark          The Mark to apply to the content.
 * @param {string} blockClientId The client id of the block.
 * @param {string} blockName     The name of the block.
 * @param {string} blockHtml     The HTML of the block: possibly contains HTML tags.
 * @param {string} richText      The rich text of the block: the text without HTML tags.
 *
 * @returns {Array<{startOffset: number, endOffset: number}>} An array with one annotation range,
 *                                                            or an empty array when the Mark is not meant for this block.
 */
export function createAnnotationsFromPositionBasedMarks( mark, blockClientId, blockName, blockHtml, richText ) {
	// A Mark is only intended for the block whose client id it carries.
	if ( blockClientId !== mark.getBlockClientId() ) {
		return [];
	}

	let startOffset = mark.getBlockPositionStart();
	let endOffset = mark.getBlockPositionEnd();

	// A Mark created for the first section of a Yoast sub-block needs its offsets shifted first.
	if ( mark.isMarkForFirstBlockSection() ) {
		( { blockStartOffset: startOffset, blockEndOffset: endOffset } = adjustFirstSectionOffsets( startOffset, endOffset, blockName ) );
	}

	/*
	 * Compare the part of the HTML and the part of the rich text that the offsets point at.
	 * When the two differ, the offsets need to be adjusted further for HTML tags and entities.
	 */
	const isHtmlEqualToRichText = blockHtml.slice( startOffset, endOffset ) === richText.slice( startOffset, endOffset );
	if ( ! isHtmlEqualToRichText ) {
		( { blockStartOffset: startOffset, blockEndOffset: endOffset } = adjustMarkOffsets( startOffset, endOffset, blockHtml, richText ) );
	}

	return [ { startOffset, endOffset } ];
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc