文章目录
一、TypeScript-爬虫-基本实现
-
示例代码
/**
 * Minimal news crawler: downloads one page, extracts the link text of every
 * headline item, and appends the result to a JSON file keyed by the
 * timestamp of the crawl.
 *
 * Constructing an instance immediately starts one crawl.
 */
class Crowller {
  // URL of the page to crawl.
  private readonly url: string =
    "https://learning.sohu.com/?spm=smpc.news-home.header.7.1580904485627XJyFma3";

  // Path of the JSON file that accumulates the extracted data.
  private readonly filePath = path.resolve(__dirname, "../data/news.json");

  constructor() {
    // A constructor cannot await, so the crawl runs in the background.
    // Attach a handler so a network or file-system failure is reported
    // instead of surfacing as an unhandled promise rejection.
    this.initSpiderRrocess().catch((error: unknown) => {
      console.error("Crowller: crawl failed", error);
    });
  }

  /**
   * Extracts the headline texts from the page markup.
   *
   * @param html Raw HTML of the crawled page.
   * @returns One string per `.z-head-news_item` element, holding the combined
   *          text of the `<a>` elements found inside it.
   */
  getJsonInfo(html: string): string[] {
    const $ = cheerio.load(html);
    const contents: string[] = [];
    // `.each` is the iteration primitive; the callback exists only for its
    // side effect of collecting text, so `.map` is not appropriate here.
    $(".z-head-news_item").each((_index, element) => {
      contents.push($(element).find("a").text());
    });
    return contents;
  }

  /**
   * Downloads the page.
   *
   * @returns The response body as text.
   */
  async getRawHtml(): Promise<string> {
    const response = await superagent.get(this.url);
    return response.text;
  }

  /**
   * Merges freshly extracted data into whatever is already stored on disk.
   *
   * @param data Headline texts produced by the current crawl.
   * @returns The previously stored content (or an empty object when the file
   *          is missing or empty) with `data` added under the current
   *          millisecond timestamp.
   * @throws SyntaxError if the existing file contains invalid JSON.
   */
  generateJsonContent(data: string[]): Content {
    let fileContent: Content = {};
    if (fs.existsSync(this.filePath)) {
      const raw = fs.readFileSync(this.filePath, "utf-8");
      // An existing but empty file means "nothing stored yet";
      // JSON.parse("") would throw.
      if (raw.trim() !== "") {
        fileContent = JSON.parse(raw);
      }
    }
    const time = new Date().getTime();
    fileContent[time] = data;
    return fileContent;
  }

  /**
   * Writes the serialized content to the data file, replacing its contents.
   *
   * @param content Serialized JSON to store.
   */
  writeFile(content: string): void {
    // writeFileSync does not create parent directories; make sure the
    // target directory exists so the first run does not fail with ENOENT.
    fs.mkdirSync(path.dirname(this.filePath), { recursive: true });
    fs.writeFileSync(this.filePath, content);
  }

  /**
   * Runs one full crawl:
   * 1. download the HTML,
   * 2. extract the data from the HTML,
   * 3. merge it with the content already stored in the file,
   * 4. write the merged content back to the file.
   */
  async initSpiderRrocess(): Promise<void> {
    const html = await this.getRawHtml();
    const jsonInfo = this.getJsonInfo(html);
    const content = this.generateJsonContent(jsonInfo);
    this.writeFile(JSON.stringify(content));
  }
}

const crowller = new Crowller();
- 缺点:
- 网页爬取和数据分析逻辑高度耦合
- 缺点:
-
package.json配置
{
"name": "demo-2",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"dev": "ts-node ./src/crowller.ts"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"cheerio": "^1.0.0-rc.3",
"superagent": "^5.2.2"
},
"devDependencies": {
"@types/cheerio": "^0.22.18",
"@types/superagent": "^4.1.7"
}
}