// 风是自由的,你也是 ("The wind is free, and so are you.")
import puppeteer from 'puppeteer-core';
import pLimit from 'p-limit';
import xpath from 'xpath'
import {DOMParser} from 'xmldom'
import {MongoClient} from 'mongodb'
// Concurrency gate built with p-limit: with a limit of 1, functions passed to
// `limit(...)` are run one at a time, in the order they were queued.
const limit = pLimit(1);
/**
 * Builds one record per `tr.gsc_a_tr` row found in the given page markup.
 *
 * @param {string} htmlSource - Serialized markup as returned by `page.content()`.
 * @param {object} task - Crawl task; its `name`, `description`, `url` and
 *   `imgage` properties are copied onto every record.
 * @returns {object[]} Records in document order; empty when no rows match.
 */
const extractScholarRows = (htmlSource, task) => {
  // Origin prepended to the (relative) href of each row's title link.
  const detailOrigin = 'https://scholar.lanfanshu.cn';

  const parser = new DOMParser({
    errorHandler: {
      // Warnings and recoverable errors are intentionally ignored;
      // only fatal parser errors are reported.
      warning() {},
      error() {},
      fatalError(e) {
        console.error(e);
      },
    },
  });
  const doc = parser.parseFromString(htmlSource);
  const rows = xpath.select("//tr[@class='gsc_a_tr']", doc);

  return rows.map((row) => {
    // Evaluates `string(<expr>)` relative to the current row.
    const text = (expr) => xpath.select(`string(${expr})`, row);
    const [hrefAttr] = xpath.select(".//a[@class='gsc_a_at']/@href", row);

    return {
      title: text(".//a[@class='gsc_a_at']"),
      // A row without a title link yields null instead of throwing on `[0].value`.
      DetailUrl: hrefAttr ? `${detailOrigin}${hrefAttr.value}` : null,
      year: text(".//a[@class='gsc_a_ac gs_ibl']"),
      quote: text(".//span[@class='gsc_a_h gsc_a_hc gs_ibl']"),
      authorList: text(".//div[@class='gs_gray'][1]"),
      category: text(".//div[@class='gs_gray'][2]"),
      author: task['name'],
      authorDesc: task['description'],
      authorUrl: task['url'],
      // NOTE(review): 'imgage' looks like a typo for 'image' — confirm against
      // whatever produces the task objects before renaming the key.
      authorCover: task['imgage'],
    };
  });
};

/**
 * Opens `task.url` in a fresh Chromium instance, extracts the result rows from
 * the rendered page, logs each record and returns them.
 *
 * The browser is always closed before this function settles, including when
 * navigation or content retrieval fails. A navigation failure is logged
 * (labelled as a timeout only when it actually is one) and yields an empty
 * result instead of parsing a page that never loaded.
 *
 * @param {object} task - Crawl task. `url` is the page to load; `name`,
 *   `description` and `imgage` are copied onto each extracted record.
 * @returns {Promise<object[]>} The extracted records; `[]` when navigation failed.
 * @throws Rejects if the browser cannot be launched or the page content cannot
 *   be read.
 */
const performPuppeteerTask = async (task) => {
  const pUrl = task['url'];
  const browser = await puppeteer.launch({
    headless: false,
    args: [
      '--no-sandbox',
      '--proxy-server=http://127.0.0.1:2333',
      // No inner quotes: args are handed to Chromium without a shell, so quotes
      // would become part of the User-Agent string itself.
      '--user-agent=Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36',
    ],
    executablePath: '/Applications/Chromium.app/Contents/MacOS/Chromium',
    ignoreHTTPSErrors: true,
  });

  let htmlSource;
  try {
    const page = await browser.newPage();
    page.setDefaultTimeout(20000);
    try {
      await page.goto(pUrl);
    } catch (error) {
      const isTimeout = error instanceof Error && error.name === 'TimeoutError';
      console.log(`${isTimeout ? '超时' : '其他错误'}${error}`);
      return [];
    }
    htmlSource = await page.content();
  } finally {
    // Runs on success, on the early return above, and on any thrown error.
    await browser.close();
  }

  const items = extractScholarRows(htmlSource, task);
  for (const item of items) {
    console.log(item);
  }
  console.log(`Url:${pUrl}采集完成`);
  return items;
};
// Seed list of pages to crawl; each entry is handed to performPuppeteerTask as-is.
const urlTasks = [
  {
    url: 'https://scholar.lanfanshu.cn/citations?view_op=search_authors&hl=zh-CN&mauthors=smith&btnG=',
  },
];

// Every task is queued through `limit`, which decides how many run at once.
const tasks = urlTasks.map((task) => limit(() => performPuppeteerTask(task)));

// Wait for the whole batch; the first rejection is reported instead of the success line.
try {
  await Promise.all(tasks);
  console.log('所有任务完成');
} catch (error) {
  console.error('出现错误:', error);
}