Skip to content

Commit e0994fa

Browse files
committed
✨ feat: refactor with new version
1 parent 8762b3d commit e0994fa

File tree

6 files changed

+98
-36
lines changed

6 files changed

+98
-36
lines changed

api/parse.ts

Lines changed: 0 additions & 19 deletions
This file was deleted.

api/v1/_utils.ts

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import { Readability } from '@mozilla/readability';
2+
import { JSDOM } from 'jsdom';
3+
import { NodeHtmlMarkdown } from 'node-html-markdown';
4+
5+
/**
 * Root URL of the Browserless instance used to render pages.
 *
 * Read from `BROWSERLESS_URL`; an unset, empty or whitespace-only value falls
 * back to the hosted service. Trailing slashes are stripped so that appending
 * `/content` never produces a double slash.
 */
const BASE_URL = (process.env.BROWSERLESS_URL?.trim() || 'https://chrome.browserless.io').replace(
  /\/+$/,
  '',
);

/** Browserless API token read from `BROWSERLESS_TOKEN`; `undefined` when the variable is not set. */
const BROWSERLESS_TOKEN = process.env.BROWSERLESS_TOKEN;
7+
8+
/**
 * Extracts the readable article from an HTML document and converts its body
 * to Markdown.
 *
 * @param html - Raw HTML of the page.
 * @param url - URL handed to JSDOM as the document's `url` option.
 * @returns The fields produced by Readability's `parse()` (none when it
 * yields nothing), with `content` replaced by the Markdown translation. When
 * no article content is available, `content` is the translation of an empty
 * string.
 */
export const htmlToMarkdown = (html: string, url: string) => {
  const { document } = new JSDOM(html, { url }).window;
  const parsed = new Readability(document).parse();

  // A missing or empty article body is translated as an empty string.
  const markdown = NodeHtmlMarkdown.translate(parsed?.content || '', {});

  return { ...parsed, content: markdown };
};
16+
17+
/**
 * Renders `url` through the Browserless `/content` endpoint and converts the
 * returned HTML into Markdown.
 *
 * This function never rejects: any failure (network error, non-2xx response
 * from Browserless, parsing error) is logged and reported through the
 * returned object instead.
 *
 * @param params - Object carrying the `url` of the page to crawl.
 * @returns On success `{ content, title, url, website }`; on failure
 * `{ content: '抓取失败', errorMessage, url }`.
 */
const runner = async ({ url }: { url: string }) => {
  const input = {
    gotoOptions: { waitUntil: 'networkidle2' },
    url,
  };

  try {
    const res = await fetch(`${BASE_URL}/content?token=${BROWSERLESS_TOKEN}`, {
      body: JSON.stringify(input),
      headers: {
        'Content-Type': 'application/json',
      },
      method: 'POST',
    });

    // Without this check an error page from Browserless (bad token, rate
    // limit, server error) would be parsed as if it were the target page.
    if (!res.ok) {
      throw new Error(`Browserless request failed: ${res.status} ${res.statusText}`);
    }

    const html = await res.text();

    const article = htmlToMarkdown(html, url);

    return { content: article.content, title: article?.title, url, website: article?.siteName };
  } catch (error) {
    console.error(error);

    // Thrown values are not guaranteed to be Error instances.
    const errorMessage = error instanceof Error ? error.message : String(error);

    return { content: '抓取失败', errorMessage, url };
  }
};
41+
42+
export default runner;

api/v1/index.ts

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import fetchContent from './_utils';
2+
3+
/**
 * Route configuration exported next to the handler; selects the `edge` runtime.
 *
 * NOTE(review): `./_utils` imports `jsdom`, which presumably relies on Node.js
 * APIs — confirm it actually runs under this runtime.
 */
export const config = { runtime: 'edge' };
6+
7+
/**
 * HTTP entry point: accepts a `POST` with a JSON body of the form
 * `{ "url": string }`, crawls that URL via `fetchContent` and responds with
 * the result serialized as JSON.
 *
 * Responses:
 * - `405` for any method other than `POST`.
 * - `400` when the body is not valid JSON or lacks a non-empty string `url`.
 * - otherwise the JSON-encoded crawl result.
 */
export default async (req: Request) => {
  if (req.method !== 'POST') {
    return new Response('Method Not Allowed', { headers: { Allow: 'POST' }, status: 405 });
  }

  // `req.json()` rejects on an empty or malformed body; report that as a
  // client error instead of letting the rejection escape the handler.
  let body: unknown;
  try {
    body = await req.json();
  } catch {
    return new Response('Invalid JSON body', { status: 400 });
  }

  const url = (body as { url?: unknown } | null)?.url;
  if (typeof url !== 'string' || url.length === 0) {
    return new Response('Missing "url" in request body', { status: 400 });
  }

  const result = await fetchContent({ url });

  return new Response(JSON.stringify(result), {
    headers: { 'Content-Type': 'application/json' },
  });
};

api/v1/type.ts

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
/**
 * Shape of a crawl result.
 *
 * NOTE(review): presumably mirrors the object returned by the crawler in
 * `./_utils` — confirm against its callers.
 */
export type Result = {
  /** Required textual content of the result. */
  content: string;

  /** Optional title. */
  title?: string;

  /** Required URL the result refers to. */
  url: string;

  /** Optional website name. */
  website?: string;
};
7+
8+
/**
 * Fields describing a parsed article.
 *
 * NOTE(review): looks like the result shape of Readability's `parse()` —
 * confirm against the library's typings.
 */
export interface ParserResponse {
  /** Author metadata. */
  byline: string;

  /** Processed article content as an HTML string. */
  content: string;

  /** Direction of the content. */
  dir: string;

  /** Article description, or a short excerpt taken from the content. */
  excerpt: string;

  /** Language of the content. */
  lang: string;

  /** Article length, counted in characters. */
  length: number;

  /** Name of the site. */
  siteName: string;

  /** Article text with every HTML tag removed. */
  textContent: string;

  /** Title of the article. */
  title: string;
}

package.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
{
2-
"name": "lobehub-html-parser",
2+
"name": "@lobehub/chat-plugin-web-crawler",
33
"version": "1.0.1",
44
"private": true,
5-
"description": "HTML 转 markdown 服务",
6-
"repository": "https://github.com/arvinxx/vercel-serverless-api-template.git",
5+
"description": "Lobe Chat 网页抓取服务",
6+
"repository": "https://github.com/lobehub/chat-plugin-web-crawler.git",
77
"scripts": {
88
"ci": "npm run lint && npm run type-check",
99
"lint": "npm run lint:js && npm run lint:prettier",

tests/parse.test.ts

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,14 @@
1-
import { VercelRequest, VercelResponse } from '@vercel/node';
21
import { readFileSync } from 'node:fs';
32
import * as path from 'node:path';
43

54
import { expect } from 'vitest';
6-
import Api from '../api/parse';
5+
import { htmlToMarkdown } from '../api/v1/_utils';
76

87
describe('html-to-markdown', () => {
9-
it('Zhihu', async () => {
8+
it('Zhihu', () => {
109
const html = readFileSync(path.join(__dirname, './html/zhihu.html'), { encoding: 'utf8' });
1110

12-
const data = await Api(
13-
<VercelRequest>(<unknown>{
14-
body: {
15-
html,
16-
url: 'https://zhuanlan.zhihu.com/p/641434725',
17-
},
18-
}),
19-
<VercelResponse>(<unknown>{
20-
send: () => {},
21-
}),
22-
);
11+
const data = htmlToMarkdown(html, 'https://zhuanlan.zhihu.com/p/641434725');
2312

2413
expect(data).toMatchSnapshot();
2514
});

0 commit comments

Comments
 (0)