1:首先可以先获取你需要采集的网页的地址:url
2:将你需要采集的url用nodejs实现请求:
const cheerio = require('cheerio');
const restler = Object.assign({}, require("restler"));
import htmlObject from '../utils/html-str.js';
import functionTool from '../utils/function.js';
import md5 from 'md5';
parseHtml: function (result, link, charset) {
let $ = cheerio.load(result);
let linkId = md5(link);
toolManager.charsets[linkId] = charset;
toolManager.docs[linkId] = result;
let $head = $('head'), $body = $('body');
$head.append('<script type="text/javascript">' + functionTool.injectJavascript + '</script>');
$head.append(htmlObject.injectCss);
$body.append($(htmlObject.injectMenu));
$body.append($(htmlObject.injectToolbar));
$body.append($(htmlObject.injectLinkId).val(linkId));
return $.html();
},
processDoc(req, response) {
let link = req.query['link'];
restler.get(link).on('complete', function (result, res) {
if (result instanceof Error) {
console.log('Error:', result.message);
} else {
let charset = res.headers['content-type'];
if (charset) {
charset = /\bcharset=(.+)(?:;|$)/i.exec(charset);
if (charset) {
charset = charset[1].trim().toLowerCase();
}
}
response.write(parseHtml(result, link, charset));
}
});
}
上面的functionTool.injectJavascript为你需要嵌入到采集到的网页中的js代码
同样,上面代码中的htmlObject.injectCss、injectMenu、injectToolbar等分别代表需要嵌入的样式与html代码
其中引入cheerio模块的话 可以直接操作采集下来的页面(api:https://cnodejs.org/topic/5203a71844e76d216a727d2e)
通过restler模块去请求页面的话,它会帮你处理好页面中文的乱码问题(api:https://www.npmjs.com/package/restler)