• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Yoast / wordpress-seo / 34a7d53098782b86fad431e478d55f430e3192a4

19 Mar 2025 09:24AM UTC coverage: 58.06%. First build
34a7d53098782b86fad431e478d55f430e3192a4

Pull #21974

github

web-flow
Merge 9e1b1bfbc into 92d7e7010
Pull Request #21974: Converts the sentence beginnings assessment to use the HTML parser

7988 of 14099 branches covered (56.66%)

Branch coverage included in aggregate %.

78 of 80 new or added lines in 13 files covered. (97.5%)

13828 of 23476 relevant lines covered (58.9%)

101569.95 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.59
/packages/js/src/decorator/helpers/positionBasedAnnotationHelper.js
1
import { forEachRight } from "lodash";
2
import { helpers } from "yoastseo";
3

4
/**
 * Regex to detect HTML tags.
 *
 * An opening tag is `<`, an ASCII letter, any run of characters other than `<` and `>`, and `>`.
 * The run may be empty, so bare single-letter tags such as `<s>`, `<b>`, `<i>` or `<a>` are matched too.
 * A closing tag is `</`, at least one character other than `<` and `>`, and `>`.
 *
 * Please note that this regex will also detect non-HTML tags that are also wrapped in `<>`.
 * For example, in the following sentence, `<strong class="">cats <dogs> rabbit </strong>`,
 * we will match `<strong class="">`, `<dogs>` and `</strong>`. This is an edge case though.
 *
 * @type {RegExp}
 */
const htmlTagsRegex = /(<(?:[a-z][^<>]*|\/[^<>]+)>)/ig;
12

13
/**
 * Regex to detect HTML entities, taken from the `htmlEntities` helpers of `yoastseo`.
 *
 * `adjustOffsetsForHtmlEntities` relies on the first capture group of every match being the entity
 * without its leading ampersand (e.g. `amp;` for `&amp;`).
 * NOTE(review): judging by its name, this regex presumably does not cover `&gt;` - confirm in `yoastseo`.
 *
 * @type {RegExp}
 */
const { entitiesWithoutGTSRegex } = helpers.htmlEntities;
18

19
/**
 * Adjusts the block start and end offset for a given Mark from the first section of a Yoast sub-block.
 *
 * For the first section Marks, we need to adjust the block start and end offset.
 *
 * This is because the first section of a Yoast block is always wrapped in `<strong>` tags.
 * In `yoastseo`, when calculating the position information of the matched token, we also take
 * into account the length of `<strong>` tags.
 * However, here, the HTML for the first section doesn't include the `<strong>` tags.
 * As a result, the position information of the matched token will be incorrect.
 * Hence, the block start and end offset of the Mark will be subtracted by the length
 * of the opening of the `<strong>` tag.
 *
 * @param {number} blockStartOffset The block start offset of the Mark to adjust.
 * @param {number} blockEndOffset   The block end offset of the Mark to adjust.
 * @param {string} blockName        The block name. Every name other than "yoast/faq-block" is treated as the How-To block.
 *
 * @returns {{blockStartOffset: number, blockEndOffset: number}} The adjusted start offset and end offset of the Mark.
 */
const adjustFirstSectionOffsets = ( blockStartOffset, blockEndOffset, blockName ) => {
	/*
	 * Get the opening HTML tag for the first section of a Yoast sub-block.
	 *
	 * The Yoast sub-block's first section is always wrapped in a `<strong>` tag with the following class name:
	 * - For the Yoast FAQ block, the class name is "schema-faq-question",
	 * - For the Yoast How-To block, the class name is "schema-how-to-step-name".
	 */
	const firstSectionOpenTag = blockName === "yoast/faq-block"
		? "<strong class=\"schema-faq-question\">"
		: "<strong class=\"schema-how-to-step-name\">";

	// Both offsets shift left by the same amount: the length of the opening tag that is absent from the section's HTML.
	const shift = firstSectionOpenTag.length;

	return {
		blockStartOffset: blockStartOffset - shift,
		blockEndOffset: blockEndOffset - shift,
	};
};
55

56
/**
 * Retrieves the combined length of the given HTML tags, counting every `<br>` tag one character shorter.
 *
 * @param {Array<string[]>} htmlTags The regex matches of the HTML tags (e.g. the result of `String.prototype.matchAll`).
 *                                   Only the first element of each match, the full tag, is used.
 *
 * @returns {number} The length of the given HTML tags.
 */
const getTagsLength = ( htmlTags ) => {
	let tagsLength = 0;

	// A missing list of matches contributes nothing.
	for ( const [ tag ] of htmlTags || [] ) {
		/*
		 * Here, we need to account for treating <br> tags as sentence delimiters, and subtract 1 from the tag length.
		 * The check is case-insensitive and requires the tag name to end right after "br" (whitespace, `/` or `>`),
		 * so `<BR/>` is recognized while a tag that merely starts with "br" is not.
		 */
		const isLineBreak = /^<\/?br(?=[\s/>])/i.test( tag );
		tagsLength += isLineBreak ? tag.length - 1 : tag.length;
	}

	return tagsLength;
};
76

77
/**
 * Adjusts the block start and end offsets of a given Mark when the block HTML contains HTML tags.
 *
 * @param {string} slicedBlockHtmlToStartOffset The block HTML from the 0 index to the index of the block start offset.
 * @param {string} slicedBlockHtmlToEndOffset   The block HTML from the 0 index to the index of the block end offset.
 * @param {number} blockStartOffset             The block start offset of the Mark to adjust.
 * @param {number} blockEndOffset               The block end offset of the Mark to adjust.
 *
 * @returns {{blockStartOffset: number, blockEndOffset: number}} The adjusted start offset and end offset of the Mark.
 */
const adjustOffsetsForHtmlTags = ( slicedBlockHtmlToStartOffset, slicedBlockHtmlToEndOffset, blockStartOffset, blockEndOffset ) => {
	/*
	 * Collect the HTML tags found before each offset, and adjust the start and end offsets of the Mark
	 * by subtracting the length of those tags.
	 *
	 * This step is necessary to account for the difference in the way we "parse" the block and calculate the token position
	 * between the `yoastseo` package and the block annotation API.
	 * Inside `yoastseo`, the token's position information also takes into account all the HTML tags surrounding it in a block.
	 * However, the block annotation API applies annotations to "clean" text/html without any HTML tags.
	 * As a result, the token position information we retrieve from `yoastseo` wouldn't match that of the block annotation API.
	 * Example:
	 * From `yoastseo`:
	 * - Text: This is a giant <strong>panda</strong>.
	 * - Range of "panda": 24 - 29
	 * In the block:
	 * - Text: This is a giant panda.
	 * - Range of "panda": 16 - 21
	 */
	const tagsBeforeStart = [ ...slicedBlockHtmlToStartOffset.matchAll( htmlTagsRegex ) ];
	const tagsBeforeEnd = [ ...slicedBlockHtmlToEndOffset.matchAll( htmlTagsRegex ) ];

	// The end offset gets its own tag list because tags may also occur inside the Mark's range.
	return {
		blockStartOffset: blockStartOffset - getTagsLength( tagsBeforeStart ),
		blockEndOffset: blockEndOffset - getTagsLength( tagsBeforeEnd ),
	};
};
112

113
/**
 * Adjusts a single block offset (start or end) of a given Mark when the block HTML contains HTML entities.
 *
 * @param {string} html     The block HTML, sliced from the 0 index up to the offset to adjust.
 * @param {number} offset   The block start or end offset of the Mark to adjust.
 * @param {string} richText The rich text of the block.
 *
 * @returns {number} The adjusted offset.
 */
const adjustOffsetsForHtmlEntities = ( html, offset, richText ) => {
	let adjustedOffset = offset;

	const matchedHtmlEntities = [ ...html.matchAll( entitiesWithoutGTSRegex ) ];
	forEachRight( matchedHtmlEntities, ( [ , matchedEntityWithoutAmp ] ) => {
		/*
		 * If the matched entity is `&amp;`, matchedEntityWithoutAmp (the first capture group) is `amp;`.
		 * To get the length of the HTML entity to be 1, we subtract the length of the matched entity minus the ampersand.
		 */
		adjustedOffset -= matchedEntityWithoutAmp.length;
	} );

	// Special cases for the `&nbsp;` and `&gt;` entities.
	/*
	 * A non-breaking space is sometimes represented as its unicode character `\u00a0` in the block's HTML.
	 * In this case, we need to adjust the offset by the length of the HTML entity, minus the one character it occupies.
	 * `String.prototype.match` returns `null` when nothing is found, hence the fallback to an empty list.
	 */
	const nbspCount = ( html.match( /\u00a0/g ) || [] ).length;
	adjustedOffset -= nbspCount * ( "&nbsp;".length - 1 );

	/*
	 * When adjusting the offset for the `&gt;` entity, we only consider the `>` characters found in the rich text, not in the HTML.
	 * This way, we minimize the risk of adjusting the offset incorrectly for the `>` inside an HTML tag, e.g. `<strong>` or `<a>`.
	 *
	 * NOTE(review): the rich text is sliced with an offset that is not yet corrected for `&gt;`, so `>` characters located
	 * shortly after the real position may presumably be counted as well - confirm whether that is acceptable.
	 */
	const gtCount = ( richText.slice( 0, adjustedOffset ).match( /\u003e/g ) || [] ).length;
	adjustedOffset -= gtCount * ( "&gt;".length - 1 );

	return adjustedOffset;
};
153

154
/**
 * Adjusts the block start and end offsets of a given Mark when the block HTML contains HTML tags or entities.
 *
 * @param {number} blockStartOffset The block start offset of the Mark to adjust.
 * @param {number} blockEndOffset   The block end offset of the Mark to adjust.
 * @param {string} blockHtml        The HTML of the block.
 * @param {string} richText         The rich text of the block.
 *
 * @returns {{blockStartOffset: number, blockEndOffset: number}} The adjusted start offset and end offset of the Mark.
 */
const adjustMarkOffsets = ( blockStartOffset, blockEndOffset, blockHtml, richText ) => {
	const slicedBlockHtmlToStartOffset = blockHtml.slice( 0, blockStartOffset );
	const slicedBlockHtmlToEndOffset = blockHtml.slice( 0, blockEndOffset );

	// Adjust the offsets when there are HTML tags found between the start of the HTML and the start/end offset of the Mark.
	const offsetsWithoutTags = adjustOffsetsForHtmlTags(
		slicedBlockHtmlToStartOffset,
		slicedBlockHtmlToEndOffset,
		blockStartOffset,
		blockEndOffset
	);

	// Adjust the offsets when there are HTML entities found between the start of the HTML and the start/end offset of the Mark.
	/*
	 * In `yoastseo`, we process the HTML entities so that their length is the length of their extended version.
	 * For example, the ampersand `&` length is the length of `&amp;` => 5.
	 * However, in the Gutenberg editor where we annotate the rich text, the ampersand is represented as `&`.
	 * Hence, to say that its length is 5 is incorrect and will result in an incorrect annotation.
	 * For this reason, we also need to adjust the Mark block start and end offset when the block's HTML contains HTML entities.
	 *
	 * Note: the comment below also applies to the `adjustOffsetsForHtmlTags` function.
	 *
	 * Additionally, it's important to have a separate step for adjusting the start and end offset.
	 * This is because in the offsets range of the Mark, it's still possible that an HTML entity (or multiple) or an HTML tag is present.
	 * This means that we also need to subtract the end offset by the length of the HTML entities/tags found in the Mark's offsets range.
	 *
	 * For example, we want to highlight the word "Bear™" of this HTML "The great <em><strong>Panda &amp; Bear</strong></em>&trade;"
	 * The Mark's offsets from `yoastseo` are { blockStartOffset: 34, blockEndOffset: 53 }
	 * However, since in Gutenberg we apply the annotation to the rich text "The great Panda & Bear™",
	 * we need to adjust the offsets above to { blockStartOffset: 18, blockEndOffset: 23 }.
	 * Only subtracting the end offset by the length of the HTML entities/tags found between the 0 index of the HTML
	 * to the start offset of the Mark will result in incorrect position information.
	 */
	return {
		blockStartOffset: adjustOffsetsForHtmlEntities( slicedBlockHtmlToStartOffset, offsetsWithoutTags.blockStartOffset, richText ),
		blockEndOffset: adjustOffsetsForHtmlEntities( slicedBlockHtmlToEndOffset, offsetsWithoutTags.blockEndOffset, richText ),
	};
};
203

204

205
/**
 * Creates an annotation range if the given Mark has position information.
 * A helper for position-based highlighting.
 *
 * @param {Mark}   mark          The Mark to apply to the content.
 * @param {string} blockClientId The client id of the block.
 * @param {string} blockName     The name of the block.
 * @param {string} blockHtml     The HTML of the block: possibly contains HTML tags.
 * @param {string} richText      The rich text of the block: the text without HTML tags.
 *
 * @returns {Array<{startOffset: number, endOffset: number}>} An array with one annotation range,
 *                                                            or an empty array when the Mark belongs to another block.
 */
export function createAnnotationsFromPositionBasedMarks( mark, blockClientId, blockName, blockHtml, richText ) {
	// If the block client id differs from the Mark's block client id, this Mark is not intended for this block.
	if ( blockClientId !== mark.getBlockClientId() ) {
		return [];
	}

	let blockStartOffset = mark.getBlockPositionStart();
	let blockEndOffset = mark.getBlockPositionEnd();

	// If the Mark is created for the first section of a Yoast sub-block, we need to adjust the block start and end offsets of the Mark.
	if ( mark.isMarkForFirstBlockSection() ) {
		( { blockStartOffset, blockEndOffset } = adjustFirstSectionOffsets( blockStartOffset, blockEndOffset, blockName ) );
	}

	// Get the HTML part from the block start offset of the Mark until the block end offset of the Mark.
	const slicedHtml = blockHtml.slice( blockStartOffset, blockEndOffset );
	// Get the rich text part from the block start offset of the Mark until the block end offset of the Mark.
	const slicedRichText = richText.slice( blockStartOffset, blockEndOffset );

	/*
	 * If the HTML and the rich text are equal, return the current offsets.
	 * NOTE(review): only the marked span is compared, not the text before it - confirm this is sufficient
	 * when the same text repeats at a distance equal to the length of the preceding tags/entities.
	 */
	if ( slicedHtml === slicedRichText ) {
		return [
			{
				startOffset: blockStartOffset,
				endOffset: blockEndOffset,
			},
		];
	}

	// If not, adjust the offsets further by checking for HTML tags or entities.
	const adjustedMarkOffsets = adjustMarkOffsets( blockStartOffset, blockEndOffset, blockHtml, richText );
	return [
		{
			startOffset: adjustedMarkOffsets.blockStartOffset,
			endOffset: adjustedMarkOffsets.blockEndOffset,
		},
	];
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc