一、TypeScript-爬虫-扩展-组合模式
功能拆分:
- 将爬取网页内容和数据写入文件放在一起
- 将网页数据处理提炼出来,同时实现统一数据处理接口
具体实现:
-
新建 NewAnalyzer 类,实现统一的 Analyzer 数据处理接口
import fs from "fs";
import cheerio from "cheerio";
import { Analyzer } from "./crowller";

// Accumulated scrape results: keyed by a millisecond timestamp, each
// entry is the list of headline strings collected in that run.
interface Content {
  [propName: number]: string[];
}

export default class NewAnalyzer implements Analyzer {
  /**
   * Extract the news headlines from the raw html.
   * Selects every ".z-head-news_item" element and collects the text of
   * its <a> descendants.
   * @param html raw html of the fetched page
   * @returns list of headline strings, in document order
   */
  private getJsonInfo(html: string): string[] {
    const contents: string[] = [];
    const $ = cheerio.load(html);
    const lines = $(".z-head-news_item");
    // .each instead of .map: we only want the push side effect,
    // not a mapped cheerio collection.
    lines.each((index, element) => {
      const content = $(element).find("a").text();
      contents.push(content);
    });
    return contents;
  }

  /**
   * Merge this run's data into the previously saved file content
   * (when the file exists), keyed by the current timestamp.
   * @param data headlines scraped in this run
   * @param filePath path of the JSON file earlier runs were written to
   * @returns the merged content object
   */
  generateJsonContent(data: string[], filePath: string): Content {
    let fileContent: Content = {};
    if (fs.existsSync(filePath)) {
      // NOTE(review): JSON.parse output is trusted to match Content;
      // a hand-edited/corrupt file would propagate bad data — consider validating.
      fileContent = JSON.parse(fs.readFileSync(filePath, "utf-8"));
    }
    const time = Date.now(); // const + Date.now() over let + new Date().getTime()
    fileContent[time] = data;
    return fileContent;
  }

  /**
   * Analyzer contract: turn raw html plus previously saved data into
   * the JSON string that should be written back to filePath.
   */
  public analyze(html: string, filePath: string): string {
    const jsonInfo = this.getJsonInfo(html);
    const content = this.generateJsonContent(jsonInfo, filePath);
    return JSON.stringify(content);
  }
}
-
通用网页爬取,数据读取与写入
import superagent from "superagent"; import fs from "fs"; import path from "path"; import NewsAnalyzer from "./newsAnalyzer"; export interface Analyzer { analyze(html: string, filePath: string): string; // analyze: (html: string, filePath: string) => string; } class Crowller { // 定义处理好的数据保存路径 private filePath = path.resolve(__dirname, "../data/news.json"); /** * 获取网页内容 */ async getRawHtml() { const html = await superagent.get(this.url); return html.text; } /** * 将处理好的数据写入文件中 * @param content */ writeFile(content: string) { fs.writeFileSync(this.filePath, content); } /** * 1. 获取html * 2. 处理htmlz中的数据 * 3. 获取文件中的内容 * 4. 将处理好的数据写入文件中 */ async initSpiderRrocess() { const html = await this.getRawHtml(); const content = this.analyzer.analyze(html, this.filePath); this.writeFile(content); } constructor(private analyzer: Analyzer, private url: string) { this.initSpiderRrocess(); } } // 定义需要爬虫的url链接 const url: string = "https://learning.sohu.com/?spm=smpc.news-home.header.7.1580904485627XJyFma3"; const newsAnalyzer = new NewsAnalyzer(); new Crowller(newsAnalyzer, url);