23728902738

Committed 30 Mar 2026 05:07AM UTC coverage: 69.753%. First build

Build # 23728902738

Build Type

Pull #3

github

Committed by

cursoragent

Commit Message

style: format onnx embedding helper

Co-authored-by: gaoyang2024 <gaoyang2024@users.noreply.github.com>

Pull Request #3: Cursor/ bc a79f8a8c ff83 498c a86e 8c1ae44409a8 cf0a

Coverage Stats

65 of 85 branches covered (76.47%)

Branch coverage included in aggregate %.

29 of 51 new or added lines in 3 files covered. (56.86%)

387 of 563 relevant lines covered (68.74%)

27.17 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

58.25

/src/core/markdown-parser.ts

// Markdown 解析器

import { copyFile } from 'node:fs/promises';
import { marked, type Token, type Tokens, Marked } from 'marked';
import { basename, dirname, extname, join } from 'node:path';
import type { Env, Heading, MarkdownFile } from '../types/index.js';
import { generateIdFromText, isMarkdownFile, mkdirAsync, readFile } from '../utils';
import { gitbookExtension } from './marked-plugins/gitbook.plugin.js';
import { katexExtension } from './marked-plugins/katex.plugin.js';
import { gitbookTabExtension } from './marked-plugins/gitbook-tab.plugin.js';
import { gitbookStepperExtension } from './marked-plugins/gitbook-stepper.plugin.js';
import {
  gitbookIncludeExtension,
  type GitbookIncludeToken,
  IncludeTokenType,
} from './marked-plugins/gitbook-include.plugin.js';

/**
 * Shared renderer that emits headings with a stable `id` and an empty anchor
 * link pointing at that id.
 *
 * The id is derived from the heading's full source text (the same text
 * `extractHeadings` feeds to `generateIdFromText`), and the heading body is
 * rendered from ALL inline tokens, so headings containing emphasis, links or
 * inline code are no longer truncated to their first token. An empty heading
 * (no inline tokens) renders with an empty body instead of throwing.
 */
const renderer = new marked.Renderer();
renderer.heading = ({ tokens, depth, text }: Tokens.Heading) => {
  const id = generateIdFromText(text);
  // marked assigns `renderer.parser` before rendering starts, so it is
  // available by the time this callback runs.
  const innerHtml = renderer.parser.parseInline(tokens);
  return `<h${depth} id="${id}">
  <a href="#${id}" class="anchor"></a>
  ${innerHtml}
</h${depth}>`;
};

/** Construction options for {@link MarkdownParser}. */
export interface MarkdownParserOptions {
  /** Target output environment; when it is `'pdf'`, `toHtml` inlines local images as base64 data URIs. */
  env: Env;
}
/** Per-document options for `MarkdownParser.toHtml`. */
export interface ToHTMLOptions {
  /** Path of the Markdown file being converted; relative resources and includes are resolved against its directory. */
  contentPath: string;
  /** Directory into which referenced local resources are copied (keeping their relative path). */
  destDir: string;
}
/**
 * Parses Markdown files into a title and heading tree, and renders Markdown
 * to HTML while copying locally referenced resources into the output
 * directory.
 */
export class MarkdownParser {
  /** Image MIME subtypes whose name differs from the (lower-cased) file extension. */
  private static readonly IMAGE_MIME_SUBTYPES: ReadonlyMap<string, string> = new Map([
    ['jpg', 'jpeg'],
    ['jpe', 'jpeg'],
    ['svg', 'svg+xml'],
    ['ico', 'x-icon'],
    ['tif', 'tiff'],
  ]);

  private readonly marked: Marked;
  private readonly env: Env;

  /**
   * Creates a parser with GFM + line breaks enabled, the shared heading
   * renderer, and the GitBook / KaTeX extensions registered.
   */
  constructor(options: MarkdownParserOptions) {
    this.env = options.env;
    this.marked = new Marked();
    this.marked.setOptions({
      gfm: true,
      breaks: true,
      renderer,
    });
    this.marked.use(gitbookExtension);
    this.marked.use(gitbookTabExtension);
    this.marked.use(gitbookStepperExtension);
    this.marked.use(katexExtension);
    this.marked.use(gitbookIncludeExtension);
  }

  /**
   * Returns true for references that do not point at a local file: URLs with
   * a scheme (`https:`, `mailto:`, `data:`, `blob:` ...), protocol-relative
   * URLs (`//host/...`) and in-page anchors (`#section`).
   *
   * The scheme must be at least two characters long so that a Windows drive
   * letter is not mistaken for one.
   */
  private static isNonLocalReference(ref: string): boolean {
    return /^[a-z][a-z0-9+.-]+:/i.test(ref) || ref.startsWith('//') || ref.startsWith('#');
  }

  /** Splits a reference into its path part and its `?query` / `#fragment` suffix. */
  private static splitReference(ref: string): { path: string; suffix: string } {
    const cut = ref.search(/[?#]/);
    return cut === -1
      ? { path: ref, suffix: '' }
      : { path: ref.slice(0, cut), suffix: ref.slice(cut) };
  }

  /**
   * Reads a Markdown file and extracts its title and heading tree.
   *
   * @param filePath path of the Markdown file to read
   * @returns the file path, title, raw content and nested headings
   */
  async parseFile(filePath: string): Promise<MarkdownFile> {
    const content = await readFile(filePath);
    const headings = this.extractHeadings(content);
    const title = this.extractTitle(content, headings);

    return {
      path: filePath,
      title,
      content,
      headings,
    };
  }

  /**
   * Reduces Markdown to a single line of plain text.
   *
   * Strips a leading front-matter block, backtick-fenced code blocks,
   * blockquote and heading markers and emphasis / table punctuation; keeps the
   * visible text of inline code, images and links; then collapses whitespace.
   *
   * @param content raw Markdown
   * @returns whitespace-normalised plain text
   */
  public toPlainText(content: string): string {
    return (
      content
        .replace(/\r\n?/g, '\n')
        // Front matter only exists at the very start of the document. The
        // pattern is deliberately NOT multiline, otherwise two `---` rules in
        // the body would delete everything between them. It is replaced by ''
        // so a heading that directly follows still starts its line.
        .replace(/^---[ \t]*\n(?:[\s\S]*?\n)?---[ \t]*(?:\n|$)/, '')
        .replace(/```[\s\S]*?```/g, ' ')
        .replace(/`([^`]+)`/g, '$1')
        .replace(/!\[([^\]]*)\]\([^)]+\)/g, '$1')
        .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
        .replace(/^>\s?/gm, '')
        .replace(/^#{1,6}\s+/gm, '')
        .replace(/[*_~]/g, ' ')
        .replace(/\|/g, ' ')
        .replace(/\n+/g, ' ')
        .replace(/\s+/g, ' ')
        .trim()
    );
  }

  /**
   * Builds the nested heading tree of a document.
   *
   * Lines inside fenced code blocks (``` or ~~~) are skipped so that, for
   * example, shell comments are not reported as headings.
   *
   * @param content raw Markdown
   * @returns top-level headings, each carrying its sub-headings in `children`
   */
  private extractHeadings(content: string): Heading[] {
    const normalizedStr = content.replace(/\r\n?/g, '\n');
    const lines = normalizedStr.split('\n');
    const headings: Heading[] = [];
    // Chain of currently open ancestors; the last entry is the deepest one.
    const stack: Heading[] = [];
    // Marker of the fenced code block we are currently inside, if any.
    let openFence: string | null = null;

    for (const line of lines) {
      const trimmed = line.trim();

      const fenceMatch = trimmed.match(/^(`{3,}|~{3,})(.*)$/);
      if (fenceMatch) {
        const [, marker, rest] = fenceMatch;
        if (openFence === null) {
          // A backtick fence's info string cannot contain backticks; such a
          // line is inline code, not the start of a fence.
          if (!(marker[0] === '`' && rest.includes('`'))) {
            openFence = marker;
            continue;
          }
        } else {
          // A fence closes on the same character, at least as long, with
          // nothing else on the line.
          if (marker[0] === openFence[0] && marker.length >= openFence.length && rest === '') {
            openFence = null;
          }
          continue;
        }
      }
      if (openFence !== null) {
        continue;
      }

      const match = trimmed.match(/^(#{1,6})\s+(.+)$/);
      if (match) {
        const level = match[1].length;
        const text = match[2].trim();
        const id = generateIdFromText(text);

        const heading: Heading = {
          level,
          text,
          id,
          children: [],
        };

        // Drop every open heading that is not a strict ancestor of this one.
        while (stack.length > 0 && stack[stack.length - 1].level >= level) {
          stack.pop();
        }

        if (stack.length === 0) {
          headings.push(heading);
        } else {
          stack[stack.length - 1].children.push(heading);
        }

        stack.push(heading);
      }
    }

    return headings;
  }

  /**
   * Picks the document title from the top-level entries of the heading tree.
   *
   * @param _content unused; kept for signature compatibility
   * @param headings top-level headings as returned by `extractHeadings`
   * @returns the first level-1 heading, else the first heading, else `'Untitled'`
   */
  private extractTitle(_content: string, headings: Heading[]): string {
    // Prefer the first level-1 heading.
    const firstH1 = headings.find((h) => h.level === 1);
    if (firstH1) {
      return firstH1.text;
    }

    // Otherwise fall back to the first heading of any level.
    if (headings.length > 0) {
      return headings[0].text;
    }

    // No headings at all: use a fixed placeholder.
    return 'Untitled';
  }

  /**
   * Copies a locally referenced resource into the output directory, keeping
   * its relative path.
   *
   * @param src resource path relative to the Markdown file (may be percent-encoded)
   * @param options supplies the source file path and the destination directory
   * @returns the path the resource was copied to
   */
  private async copyResource(src: string, options: ToHTMLOptions): Promise<string> {
    let decodedSrc = src;
    try {
      decodedSrc = decodeURIComponent(src);
    } catch {
      // Malformed percent-escape (e.g. a literal '%' in a file name): use the
      // reference as written instead of aborting the whole conversion.
    }
    // NOTE(review): a reference containing `..` resolves outside `destDir`;
    // confirm callers only pass trusted content.
    const imageFromPath = join(dirname(options.contentPath), decodedSrc);
    const imageToPath = join(options.destDir, decodedSrc);
    const imageToDir = dirname(imageToPath);
    await mkdirAsync(imageToDir);
    await copyFile(imageFromPath, imageToPath);
    return imageToPath;
  }

  // /**
  //  * Parse links inside table cells
  //  */
  // private parseTableCellLinks(cell: Tokens.TableCell): void {
  //   // If the cell is plain text that looks like a link, turn it into a link
  //   if (cell.tokens.length === 1 && cell.tokens[0].type === 'text') {
  //     const textToken = cell.tokens[0] as Tokens.Text;
  //     const linkMatch = textToken.text.match(/^\[([^\]]+)\]\(([^)]+)\)$/);
  //     if (linkMatch) {
  //       let href = linkMatch[2];
  //       if (isMarkdownFile(href)) {
  //         const path = dirname(href);
  //         const filename = basename(href, extname(href));
  //         href = path === '.' ? `./${filename}.html` : `${path}/${filename}.html`;
  //       }
  //       // Replace the text token with a link token
  //       const linkToken: Tokens.Link = {
  //         type: 'link',
  //         raw: textToken.raw,
  //         href,
  //         title: null,
  //         text: linkMatch[1],
  //         tokens: [
  //           {
  //             type: 'text',
  //             raw: linkMatch[1],
  //             text: linkMatch[1],
  //           },
  //         ],
  //       };
  //       cell.tokens = [linkToken];
  //     }
  //   }
  // }

  /**
   * Renders Markdown to HTML.
   *
   * While walking the tokens it:
   * - copies local images to `destDir` (and inlines them as data URIs when
   *   the environment is `'pdf'`);
   * - leaves external, protocol-relative and in-page anchor links untouched;
   * - rewrites links to Markdown files to the matching `.html` file, keeping
   *   any `?query` / `#fragment`;
   * - copies other locally linked files to `destDir`;
   * - turns `mermaid` code blocks into `<pre class="mermaid">` and labels
   *   language-less code blocks as `plain`;
   * - expands GitBook include tokens by rendering the included file.
   *
   * @param content raw Markdown
   * @param options source file path and destination directory
   * @returns the rendered HTML
   */
  async toHtml(content: string, options: ToHTMLOptions): Promise<string> {
    const html = await this.marked.parse(content, {
      async: true,
      walkTokens: async (token: Token) => {
        if (token.type === 'image') {
          const src = token.href;
          if (!src || MarkdownParser.isNonLocalReference(src)) {
            return;
          }
          // A query string or fragment is not part of the file name on disk.
          const { path: imagePath } = MarkdownParser.splitReference(src);
          if (!imagePath) {
            return;
          }
          const imageToPath = await this.copyResource(imagePath, options);
          if (this.env === 'pdf') {
            const buffer = await readFile(imageToPath, 'base64');
            const ext = extname(imageToPath).slice(1).toLowerCase();
            // The extension is not always the MIME subtype (svg -> svg+xml, jpg -> jpeg).
            const subtype = MarkdownParser.IMAGE_MIME_SUBTYPES.get(ext) ?? ext;
            token.href = `data:image/${subtype};base64,${buffer}`;
          }
        } else if (token.type === 'link') {
          const href = token.href;
          // Nothing to do for empty, external or in-page links.
          if (!href || MarkdownParser.isNonLocalReference(href)) {
            return;
          }
          const { path: target, suffix } = MarkdownParser.splitReference(href);
          if (!target) {
            return;
          }
          if (!isMarkdownFile(target)) {
            await this.copyResource(target, options);
            return;
          }
          const dir = dirname(target);
          const filename = basename(target, extname(target));
          // Keep the link explicitly relative when the target is in the same directory.
          const link = dir === '.' ? `./${filename}.html` : `${dir}/${filename}.html`;
          token.href = link + suffix;
        } else if (token.type === 'code') {
          // Mermaid code blocks are emitted as raw HTML.
          const codeToken = token as Tokens.Code;
          if (codeToken.lang === 'mermaid') {
            const diagram = codeToken.text;
            token.type = 'html';
            token.text = `<pre class="mermaid">
${diagram}
</pre>`;
          } else if (!codeToken.lang) {
            // No language given: label the block as plain text.
            codeToken.lang = 'plain';
          }
        } else if (token.type === IncludeTokenType) {
          const includeToken = token as GitbookIncludeToken;
          const includeContent = await readFile(
            join(dirname(options.contentPath), includeToken.path),
          );
          // NOTE(review): the included file is rendered with the SAME options,
          // so its relative paths resolve against the including file, and a
          // cyclic include would recurse without bound; confirm this is intended.
          token.type = 'html';
          token.text = await this.toHtml(includeContent, options);
        }
      },
    });
    return html;
  }
}

1	// Markdown 解析器
2
3	import { copyFile } from 'node:fs/promises';
4	import { marked, type Token, type Tokens, Marked } from 'marked';
5	import { basename, dirname, extname, join } from 'node:path';
6	import type { Env, Heading, MarkdownFile } from '../types/index.js';
7	import { generateIdFromText, isMarkdownFile, mkdirAsync, readFile } from '../utils';
8	import { gitbookExtension } from './marked-plugins/gitbook.plugin.js';
9	import { katexExtension } from './marked-plugins/katex.plugin.js';
10	import { gitbookTabExtension } from './marked-plugins/gitbook-tab.plugin.js';
11	import { gitbookStepperExtension } from './marked-plugins/gitbook-stepper.plugin.js';
12	import {
13	gitbookIncludeExtension,
14	type GitbookIncludeToken,
15	IncludeTokenType,
16	} from './marked-plugins/gitbook-include.plugin.js';
17
18	const renderer = new marked.Renderer();	15✔
19	renderer.heading = ({ tokens, depth }: Tokens.Heading) => {	15✔
20	const token = tokens[0] as unknown as Heading;	57✔
21	token.id = generateIdFromText(token.text);	57✔
22	return `<h${depth} id="${token.id}">	57✔
23	<a href="#${token.id}" class="anchor"></a>
24	${token.text}
25	</h${depth}>`;
26	};
27
28	export interface MarkdownParserOptions {
29	env: Env;
30	}
31	export interface ToHTMLOptions {
32	contentPath: string;
33	destDir: string;
34	}
35	export class MarkdownParser {
36	private marked: Marked;
37	private readonly env: Env;
38
39	constructor(options: MarkdownParserOptions) {
40	this.env = options.env;	123✔
41	this.marked = new Marked();	123✔
42	this.marked.setOptions({	123✔
43	gfm: true,
44	breaks: true,
45	renderer,
46	});
47	this.marked.use(gitbookExtension);	123✔
48	this.marked.use(gitbookTabExtension);	123✔
49	this.marked.use(gitbookStepperExtension);	123✔
50	this.marked.use(katexExtension);	123✔
51	this.marked.use(gitbookIncludeExtension);	123✔
52	}
53
54	/**
55	* 解析 markdown 文件
56	*/
57	async parseFile(filePath: string): Promise<MarkdownFile> {
58	const content = await readFile(filePath);	48✔
59	const headings = this.extractHeadings(content);	45✔
60	const title = this.extractTitle(content, headings);	45✔
61
62	return {	45✔
63	path: filePath,
64	title,
65	content,
66	headings,
67	};
68	}
69
70	public toPlainText(content: string): string {
NEW 71	return content	×
72	.replace(/\r\n?/g, '\n')
73	.replace(/^---[\s\S]*?\n---\n?/m, ' ')
74	.replace(/```[\s\S]*?```/g, ' ')
75	.replace(/`([^`]+)`/g, '$1')
76	.replace(/!\[([^\]]*)\]\([^)]+\)/g, '$1')
77	.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
78	.replace(/^>\s?/gm, '')
79	.replace(/^#{1,6}\s+/gm, '')
80	.replace(/[*_~]/g, ' ')
81	.replace(/\|/g, ' ')
82	.replace(/\n+/g, ' ')
83	.replace(/\s+/g, ' ')
84	.trim();
85	}
86
87	/**
88	* 提取标题结构
89	*/
90	private extractHeadings(content: string): Heading[] {
91	const normalizedStr = content.replace(/\r\n?/g, '\n');	54✔
92	const lines = normalizedStr.split('\n');	54✔
93	// logger.info('normalized', normalizedStr);
94	const headings: Heading[] = [];	54✔
95	const stack: Heading[] = [];	54✔
96
97	for (const line of lines) {	54✔
98	const match = line.trim().match(/^(#{1,6})\s+(.+)$/);	858✔
99	if (match) {	858✔
100	const level = match[1].length;	153✔
101	const text = match[2].trim();	153✔
102	const id = generateIdFromText(text);	153✔
103
104	const heading: Heading = {	153✔
105	level,
106	text,
107	id,
108	children: [],
109	};
110
111	// 找到合适的父级标题
112	while (stack.length > 0 && stack[stack.length - 1].level >= level) {	153✔
113	stack.pop();	57✔
114	}
115
116	if (stack.length === 0) {	153✔
117	headings.push(heading);	51✔
118	} else {
119	stack[stack.length - 1].children.push(heading);	102✔
120	}
121
122	stack.push(heading);	153✔
123	}
124	}
125
126	return headings;	54✔
127	}
128
129	/**
130	* 提取文档标题
131	*/
132	private extractTitle(_content: string, headings: Heading[]): string {
133	// 优先使用第一个一级标题
134	const firstH1 = headings.find((h) => h.level === 1);	45✔
135	if (firstH1) {	45✔
136	return firstH1.text;	42✔
137	}
138
139	// 如果没有一级标题，使用第一个标题
140	if (headings.length > 0) {	3✔
141	return headings[0].text;	×
142	}
143
144	// 如果没有任何标题，使用文件名
145	return 'Untitled';	3✔
146	}
147
148	private async copyResource(src: string, options: ToHTMLOptions) {
149	const decodedSrc = decodeURIComponent(src);	×
150	const imageFromPath = join(dirname(options.contentPath), decodedSrc);	×
151	const imageToPath = join(options.destDir, decodedSrc);	×
152	const imageToDir = dirname(imageToPath);	×
153	await mkdirAsync(imageToDir);	×
154	await copyFile(imageFromPath, imageToPath);	×
155	return imageToPath;	×
156	}
157
158	// /**
159	// * 解析表格单元格中的链接
160	// */
161	// private parseTableCellLinks(cell: Tokens.TableCell): void {
162	// // 如果单元格内容只是纯文本且包含链接格式，则解析为链接
163	// if (cell.tokens.length === 1 && cell.tokens[0].type === 'text') {
164	// const textToken = cell.tokens[0] as Tokens.Text;
165	// const linkMatch = textToken.text.match(/^\[([^\]]+)\]\(([^)]+)\)$/);
166	// if (linkMatch) {
167	// let href = linkMatch[2];
168	// if (isMarkdownFile(href)) {
169	// const path = dirname(href);
170	// const filename = basename(href, extname(href));
171	// href = path === '.' ? `./${filename}.html` : `${path}/${filename}.html`;
172	// }
173	// // 将文本 token 替换为链接 token
174	// const linkToken: Tokens.Link = {
175	// type: 'link',
176	// raw: textToken.raw,
177	// href,
178	// title: null,
179	// text: linkMatch[1],
180	// tokens: [
181	// {
182	// type: 'text',
183	// raw: linkMatch[1],
184	// text: linkMatch[1],
185	// },
186	// ],
187	// };
188	// cell.tokens = [linkToken];
189	// }
190	// }
191	// }
192
193	/**
194	* 将 markdown 转换为 HTML
195	*/
196	async toHtml(content: string, options: ToHTMLOptions): Promise<string> {
197	const html = await this.marked.parse(content, {	57✔
198	async: true,
199	walkTokens: async (token: Token) => {
200	if (token.type === 'image') {	264✔
201	const src = token.href;	×
202	if (	×
203	!src
204	|| src.startsWith('http')
205	|| src.startsWith('data:image/')
206	|| src.startsWith('blob:')
207	|| src.startsWith('//')
208	) {
209	return;	×
210	}
211	const imageToPath = await this.copyResource(src, options);	×
212	if (this.env === 'pdf') {	×
213	const buffer = await readFile(imageToPath, 'base64');	×
214	const ext = extname(imageToPath).slice(1);	×
215	token.href = `data:image/${ext};base64,${buffer}`;	×
216	}
217	} else if (token.type === 'link') {	264✔
218	const href = token.href;	×
219	// 检查 href 是否存在
220	if (!href) {	×
221	return;	×
222	}
223	if (href.startsWith('http') || href.startsWith('https')) {	×
224	return;	×
225	}
226	if (!isMarkdownFile(href)) {	×
227	await this.copyResource(href, options);	×
228	return;	×
229	}
230	const path = dirname(href);	×
231	const filename = basename(href, extname(href));	×
232	// 确保路径格式正确：如果 path 是 '.'，则使用相对路径
233	const link = path === '.' ? `./${filename}.html` : `${path}/${filename}.html`;	×
234	token.href = link;	×
235	} else if (token.type === 'code') {	264✔
236	// 处理 mermaid 代码块
237	const codeToken = token as Tokens.Code;	3✔
238	if (codeToken.lang === 'mermaid') {	3✔
239	const diagram = codeToken.text;	×
240	token.type = 'html';	×
241	token.text = `<pre class="mermaid">	×
242	${diagram}
243	</pre>`;
244	} else if (!codeToken.lang) {	3✔
245	// 如果没有指定语言，设置为 plain text
246	codeToken.lang = 'plain';	×
247	}
248	} else if (token.type === IncludeTokenType) {	261✔
249	const includeToken = token as GitbookIncludeToken;	×
250	const includeContent = await readFile(	×
251	join(dirname(options.contentPath), includeToken.path),
252	);
253	token.type = 'html';	×
254	token.text = await this.toHtml(includeContent, options);	×
255	}
256	},
257	});
258	return html;	57✔
259	}
260	}

yunnysunny / bookforge / 23728902738

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous