'use strict';
const rp = require('request-promise');
// HTTP request library.
// `request` is a preconfigured client created with rp.defaults(): every call
// made through it shares one cookie jar (rp.jar()), has gzip enabled, and
// sends a desktop Chrome user-agent header.
const request = rp.defaults({
jar: rp.jar(),
gzip: true,
headers: {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
},
});
// HTML parsing library (jQuery-style selectors over the downloaded markup).
const cheerio = require('cheerio');
// File read/write library.
const fs = require('fs');
/**
 * Scrapes every article listed in `zhuanyezixunUrl.txt` and writes the cleaned
 * results to `zhuanyezixun.json`.
 *
 * For each URL the page is downloaded, the title and time text are extracted,
 * and the `#article-content` markup is cleaned (images removed, links
 * unwrapped, style/class attributes stripped, the "推荐阅读:" paragraph and
 * everything after it dropped) before a site-name keyword is replaced.
 *
 * Pages are fetched one at a time, in file order. Pages without an
 * `#article-content` element are skipped with a warning instead of aborting
 * the whole run.
 *
 * @returns {Promise<void>} Resolves once the JSON file has been written.
 * @throws Rejects if the URL file cannot be read, a request fails, or the
 *   output file cannot be written.
 */
async function main() {
  // The URL file was produced by writing an array straight to disk, so the
  // entries are comma-separated. Trim each entry and drop blanks so a trailing
  // newline or an empty file does not turn into a bogus request.
  const urlList = fs
    .readFileSync('zhuanyezixunUrl.txt', 'utf-8')
    .split(',')
    .map((entry) => entry.trim())
    .filter((entry) => entry !== '');

  const dataList = [];

  for (const url of urlList) {
    // Must be awaited: the statements below need the downloaded HTML.
    const html = await request(url);
    const $ = cheerio.load(html, { decodeEntities: false });

    // A fresh record per iteration, so entries never share one object.
    const data = {
      title: $('h1').text(),
      time: $('.conter_main_one_nav').children('p').text(),
    };

    // Remove every <img> tag from the article body.
    $('#article-content img').remove();
    // Replace each <a> with its inner HTML, i.e. keep the text, drop the link.
    // A regular function is required here because cheerio binds `this`.
    $('#article-content a').replaceWith(function () {
      return $(this).html();
    });
    // Strip style and class attributes from every tag in the article body.
    $('#article-content [style]').removeAttr('style');
    $('#article-content [class]').removeAttr('class');
    // Drop everything after the <p> containing "推荐阅读:", then that <p> itself.
    $('p:contains(推荐阅读:)').nextAll().remove();
    $('p:contains(推荐阅读:)').remove();

    // .html() yields null when the selection is empty; guard before trimming.
    const content = $('#article-content').html();
    if (content == null) {
      console.warn(`skipped (no #article-content found): ${url}`);
      continue;
    }

    // Take the cleaned body and replace the site-name keyword.
    data.content = content.trim().replace(/出国留学网/g, '智课网');
    dataList.push(data);
  }

  const json = JSON.stringify(dataList, null, 2);
  fs.writeFileSync('zhuanyezixun.json', json, 'utf-8');
  console.log('ok');
}
// Entry point: run the scraper. On failure, log the error (with its stack when
// available) and set a non-zero exit code so shells and CI can detect it.
main().catch((err) => {
  console.error(err instanceof Error ? err.stack : err);
  process.exitCode = 1;
});
// async function main() {
// console.log('正在爬取网页。。。。。。');
// const urlList = [];
// for (let i = 0; i < 10; i++) {
// const url = `https://www.example.com/tiaojian/${(i === 0) ? '' : `${i + 1}.html`}`;
// console.log(`正在爬取第${i + 1}页+${url}`);
// const html = await request(url);
// const $ = cheerio.load(html);
// $('.news-title').each(function(){
// urlList.push($('a', this).attr('href'))
// console.log($('a', this).attr('href'));
// });
// }
// fs.writeFileSync('tiaojianUrl.txt', urlList, 'utf-8');
// console.log('写入完毕');
// }
// main().catch(err => {
// console.error(err.stack);
// });
// Equivalent cleanup written with Python's pyquery, kept for comparison
// (reference only, never executed by this script):
// url = 'https://www.example.com/a/3764073.html'
// html = requests.get(url)
// html.encoding = 'utf-8'
// doc = pq(html.text)
// doc('#article-content img').remove()
// doc('#article-content [style]').remove_attr('style')
// doc('#article-content [class]').remove_attr('class')
// doc('p:contains(推荐阅读:)').next_all().remove()
// doc('p:contains(推荐阅读:)').remove()
// # Unwrapping the contents of <a> tags is still unsolved in this version
// doc('#article-content [href]').remove_attr('href')
// doc('#article-content [target]').remove_attr('target')
// doc = doc('#article-content').html().strip().replace('<a>', '').replace('</a>', '')
// print(doc)