-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtextHandle.js
130 lines (109 loc) · 3.65 KB
/
textHandle.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
const axios = require('axios');
const cheerio = require('cheerio');
const { parseJsonSafely } = require('./utils')
/**
 * Downloads a page with axios and returns its body.
 *
 * This is a best-effort fetch: any failure is logged and an empty string is
 * returned instead of rejecting.
 *
 * @param {string} url - Address of the page to download.
 * @returns {Promise<*>} The `data` field of the axios response (the page HTML),
 *   or '' when the request fails.
 */
async function fetchPageContent(url) {
  try {
    const { data } = await axios.get(url);
    // `data` carries the HTML of the page.
    return data;
  } catch (fetchError) {
    console.error(`Error fetching page content from ${url}: `, fetchError);
    // Fall back to an empty string on failure.
    return '';
  }
}
/**
 * Collects the text of every <p> element in an HTML document.
 *
 * Each paragraph's text is trimmed and followed by a single space, so the
 * result ends with a trailing space whenever at least one paragraph exists.
 * Further clean-up (collapsing whitespace, merging broken lines, ...) is left
 * to the caller.
 *
 * @param {string} htmlContent - HTML markup to parse.
 * @returns {{allText: string}} Object holding the concatenated paragraph text.
 */
function parseJobDetail(htmlContent) {
  const $ = cheerio.load(htmlContent);
  const paragraphTexts = $('p')
    .toArray()
    .map((paragraph) => $(paragraph).text().trim() + ' ');
  const allText = paragraphTexts.join('');
  return { allText };
}
/**
 * Fetches and parses the detail page behind every search result.
 *
 * Pages are requested strictly one after another, and the output keeps the
 * order of the input.
 *
 * @param {Iterable<{href: string}>} searchResults - Search hits; only `href` is read.
 * @returns {Promise<Array<{allText: string}>>} One parsed entry per search result.
 */
async function fetchJobDetails(searchResults) {
  const collected = [];
  for (const { href } of searchResults) {
    const pageHtml = await fetchPageContent(href);
    collected.push(parseJobDetail(pageHtml));
  }
  return collected;
}
/**
 * Downloads a page with axios and returns its body, or null on failure.
 *
 * Unlike fetchPageContent, a failed request is reported as null so callers can
 * tell "request failed" apart from "page was empty".
 *
 * @param {string} url - Address of the page to download.
 * @returns {Promise<*|null>} The `data` field of the axios response, or null
 *   when the request fails.
 */
const fetchWebPageContent = async (url) => {
  try {
    const response = await axios.get(url);
    return response.data;
  } catch (error) {
    // Include the failure reason; previously the message ended in ':' with
    // nothing after it, which made failed fetches impossible to diagnose.
    const reason = error instanceof Error ? error.message : error;
    console.error(`Failed to fetch ${url}:`, reason);
    return null;
  }
};
/**
 * Extracts readable text from an HTML document.
 *
 * Text is taken from the <p> elements; when they yield no visible text the
 * <span> elements are used instead. The text of separate elements is joined
 * with a space so that the last word of one element is not fused with the
 * first word of the next. The result is whitespace-normalised with formatText
 * and capped with limitTextLength.
 *
 * @param {string} html - HTML markup to parse.
 * @returns {string} Normalised, length-limited text ('' when nothing is found).
 */
const extractTextFromHtml = (html) => {
  const $ = cheerio.load(html);

  // Gathers the text of every element matching `selector`, one space apart.
  const collectText = (selector) =>
    formatText(
      $(selector)
        .toArray()
        .map((element) => $(element).text())
        .join(' ')
    );

  // Decide on the fallback AFTER normalising: whitespace-only paragraphs are
  // truthy as raw text but carry no content, so they must not block the
  // <span> fallback.
  const text = collectText('p') || collectText('span');
  return limitTextLength(text);
};
/**
 * Fetches and extracts the text of the search results selected in `urlsJson`.
 *
 * Every entry of `urlsJson` is expected to carry a `序号` field: the index of
 * the chosen item inside `searchRes`. For each key the matching page is
 * downloaded and reduced to text. A key maps to null when its page could not
 * be fetched or when its entry does not point at a usable search result
 * (missing or out-of-range `序号`, or a result without `href`).
 *
 * @param {string|Object} urlsJson - Selection map, or its JSON text; it is
 *   passed through parseJsonSafely first (presumably a parsed object comes
 *   back unchanged; confirm against ./utils).
 * @param {Array<{href: string}>} searchRes - Search results indexed by `序号`.
 * @returns {Promise<Object<string, {content: string, url: string}|null>>}
 *   One entry per key of `urlsJson`; empty when `urlsJson` is not an object.
 */
const fetchAndParseUrls = async (urlsJson, searchRes) => {
  const selection = parseJsonSafely(urlsJson);
  const results = {};

  // Nothing to iterate when parsing did not produce an object.
  if (selection === null || typeof selection !== 'object') {
    return results;
  }

  for (const key of Object.keys(selection)) {
    const entry = selection[key];
    const index = entry == null ? undefined : entry.序号;
    const searchResult = searchRes == null ? undefined : searchRes[index];
    const href = searchResult == null ? undefined : searchResult.href;

    // A selection that does not resolve to a link is reported like a failed
    // fetch instead of crashing the whole batch with a TypeError.
    if (!href) {
      console.warn(`No search result with a link for "${key}" (序号: ${index})`);
      results[key] = null;
      continue;
    }

    const htmlContent = await fetchWebPageContent(href);
    if (htmlContent === null) {
      results[key] = null;
      continue;
    }

    results[key] = {
      content: extractTextFromHtml(htmlContent),
      url: href
    };
  }
  return results;
};
/**
 * @description: Caps text at `maxLength` UTF-16 code units. When the text is
 *   longer it is cut and '...' is appended, so a truncated result is up to
 *   `maxLength + 3` units long. The cut never lands in the middle of a
 *   surrogate pair, so characters such as emoji are dropped whole rather than
 *   left as a broken half.
 * @param {string} htmlContent Text to limit.
 * @param {number} [maxLength=2000] Maximum number of code units kept before the ellipsis.
 * @returns {string} The original text when short enough, otherwise the truncated text plus '...'.
 */
const limitTextLength = (htmlContent, maxLength = 2000) => {
  if (htmlContent.length <= maxLength) {
    return htmlContent;
  }

  let cutAt = maxLength;
  // 0xD800-0xDBFF are high surrogates: if the last kept unit is one, its low
  // surrogate would be cut off, so drop the whole pair instead.
  const lastKeptUnit = htmlContent.charCodeAt(cutAt - 1);
  if (lastKeptUnit >= 0xd800 && lastKeptUnit <= 0xdbff) {
    cutAt -= 1;
  }
  return htmlContent.slice(0, cutAt) + '...';
};
/**
 * @description: Normalises whitespace: every run of whitespace (spaces, tabs,
 *   line breaks, ...) becomes a single space, and leading/trailing whitespace
 *   is removed.
 * @param {string} text Text to normalise.
 * @returns {string} The normalised text.
 */
const formatText = (text) => {
  // Splitting on whitespace runs and re-joining with one space collapses each run.
  const pieces = text.split(/\s+/g);
  const collapsed = pieces.join(' ');
  return collapsed.trim();
};
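// Example usage.
// Each entry's `序号` is the index of the chosen item in `searchRes`;
// the page behind that item's `href` is fetched and converted to text.
// const searchRes = [
// { href: "https://developer.mozilla.org/zh-CN/docs/learn/Front-end_web_developer" },
// { href: "https://www.zhihu.com/topic/19550901/intro" },
// { href: "https://zhuanlan.zhihu.com/p/337513783" }
// ];
// const urlsJson = {
// "最有价值的搜索结果1": { 序号: 0 },
// "最有价值的搜索结果2": { 序号: 1 },
// "最有价值的搜索结果3": { 序号: 2 }
// };
// fetchAndParseUrls(urlsJson, searchRes).then(results => {
// console.log(JSON.stringify(results, null, 2));
// });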
// Public API of this module. The remaining helpers defined above
// (fetchWebPageContent, extractTextFromHtml, limitTextLength, formatText)
// are not exported.
module.exports = {
fetchAndParseUrls,
fetchPageContent,
parseJobDetail,
fetchJobDetails
}