【谷歌学术】爬虫数据采集案例(Google Scholar)

风是自由的,你也是

import puppeteer from 'puppeteer-core';
import pLimit from 'p-limit';

import xpath from 'xpath'
import {DOMParser} from 'xmldom'
import {MongoClient} from 'mongodb'


const limit = pLimit(1);   // Cap concurrency at 1: tasks wrapped by `limit` run one at a time



/**
 * Parses rendered HTML and builds one record per `tr.gsc_a_tr` row.
 *
 * @param {string} htmlSource - Serialized page markup.
 * @param {object} task - Task descriptor; its `name`, `description`, `url`
 *     and cover-image fields are copied onto every record.
 * @returns {object[]} Extracted records (empty when no row matches).
 */
const extractPublications = (htmlSource, task) => {
    const detailOrigin = 'https://scholar.lanfanshu.cn';

    const doc = new DOMParser({
        errorHandler: {
            // Real-world HTML is rarely well-formed; only fatal parser errors are reported.
            warning() {
            },
            error() {
            },
            fatalError(e) {
                console.error(e);
            },
        },
    }).parseFromString(htmlSource);

    const rows = xpath.select("//tr[@class='gsc_a_tr']", doc);

    return rows.map((row) => {
        // A row without a title link would otherwise throw on `[0].value`.
        const hrefAttr = xpath.select(".//a[@class='gsc_a_at']/@href", row)[0];

        return {
            title: xpath.select("string(.//a[@class='gsc_a_at'])", row),
            DetailUrl: hrefAttr ? `${detailOrigin}${hrefAttr.value}` : '',
            // NOTE(review): on Google Scholar profile pages `gsc_a_ac` looks like the
            // "cited by" link and `gsc_a_hc` the year cell — confirm that the `year`
            // and `quote` selectors below are not swapped.
            year: xpath.select("string(.//a[@class='gsc_a_ac gs_ibl'])", row),
            quote: xpath.select("string(.//span[@class='gsc_a_h gsc_a_hc gs_ibl'])", row),
            authorList: xpath.select("string(.//div[@class='gs_gray'][1])", row),
            category: xpath.select("string(.//div[@class='gs_gray'][2])", row),
            author: task['name'],
            authorDesc: task['description'],
            authorUrl: task['url'],
            // `imgage` is the key the original code read (presumably a typo of `image`);
            // prefer `image` when the task provides it, otherwise keep the old key.
            authorCover: task['image'] !== undefined ? task['image'] : task['imgage'],
        };
    });
};

/**
 * Opens `task.url` in a fresh Chromium instance, extracts the publication rows
 * from the rendered page and logs each extracted record.
 *
 * The browser is always closed, even when navigation fails. A navigation or
 * page error is logged and results in an empty list instead of a rejection,
 * so one bad URL does not abort the remaining tasks.
 *
 * @param {object} task - Task descriptor; must contain `url`, may contain
 *     `name`, `description` and a cover-image field.
 * @returns {Promise<object[]>} The extracted records (empty on failure).
 */
const performPuppeteerTask = async (task) => {
    const pUrl = task['url'];

    const browser = await puppeteer.launch({
        headless: false, // show the browser window
        args: [
            '--no-sandbox',
            '--proxy-server=http://127.0.0.1:2333',
            // No shell is involved when passing args, so the value must not be
            // wrapped in quotes — they would become part of the user-agent string.
            '--user-agent=Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36',
        ],
        executablePath: '/Applications/Chromium.app/Contents/MacOS/Chromium',
        ignoreHTTPSErrors: true,
    });

    let htmlSource;
    try {
        const page = await browser.newPage();
        page.setDefaultTimeout(20000);
        await page.goto(pUrl);
        htmlSource = await page.content();
    } catch (error) {
        // Puppeteer's timeout errors carry the name 'TimeoutError'.
        if (error && error.name === 'TimeoutError') {
            console.log(`超时${error}`);
        } else {
            console.log(`其他错误${error}`);
        }
        return [];
    } finally {
        // Release the Chromium process on every path.
        await browser.close();
    }

    const items = extractPublications(htmlSource, task);
    for (const item of items) {
        console.log(item);
    }

    console.log(`Url:${pUrl}采集完成`);
    return items;
};


// Seed list of pages to scrape; every entry carries the `url` to visit.
const urlTasks = [
    {url: 'https://scholar.lanfanshu.cn/citations?view_op=search_authors&hl=zh-CN&mauthors=smith&btnG='},
];

// Route every task through the shared limiter so the concurrency cap applies.
const tasks = urlTasks.map((task) => limit(() => performPuppeteerTask(task)));

// Wait for the whole batch, then report overall success or the first failure.
(async () => {
    try {
        await Promise.all(tasks);
        console.log('所有任务完成');
    } catch (error) {
        console.error('出现错误:', error);
    }
})();



  • 10
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值