使用Nodejs爬取网页某个数据并把爬到的数据写入excel (服务端部分)

2 篇文章 0 订阅
2 篇文章 0 订阅

本文承接我的上一篇文章(地址如下):前端把请求发送过来之后,服务端先解析收到的数据,再按设定的时间间隔逐个请求其中的链接。

https://blog.csdn.net/qq_45104282/article/details/127669095

详情请看代码:

部分逻辑封装成了独立的函数,以提高代码可读性、减少重复代码。

启动服务需要先安装 Node.js 运行环境(可自行安装),并用 npm 安装代码中引入的依赖(express、morgan、superagent、cheerio、exceljs)。启动命令为:node 文件名

例如:node scrapingPage.js

//引入模块
const express=require('express'),  
urls  = require('url'),
querystring=require('querystring'),
morgan=require('morgan'),
request = require('superagent'),
fs = require('fs'),
cheerio = require('cheerio'),
excelJS = require('exceljs')

// Module-level scratch state. `data` collects text fragments while a page is
// being parsed; `list` is declared but never assigned anywhere in this file.
let list;
let data = [];

// Express application that serves the scraping endpoint.
const app = express();

// Append HTTP access logs (morgan "short" format) to ./access.log.
const accessLogStream = fs.createWriteStream('./access.log', { flags: 'a' });
app.use(morgan('short', { stream: accessLogStream }));

/**
 * Endpoint that receives the list of page URLs to scrape.
 *
 * The request body is read as a URL-encoded form with two fields:
 *   - `code`: a JSON-encoded array of page URLs
 *   - `time`: the gap between consecutive page requests, in seconds
 *
 * The scraping itself is only scheduled here (see resultData); the response is
 * sent as soon as the jobs are queued, not when they finish.
 */
app.all('/scrapingPage',(request,response)=>{

    const pathName = urls.parse(request.url).pathname;

    // Express also routes variants such as a trailing slash here; answer them
    // explicitly instead of leaving the connection hanging without a response.
    if (pathName !== '/scrapingPage') {
        response.status(404).end();
        return;
    }

    let body = '';
    request.on('data', (chunk) => {
        body += chunk;
    });
    request.on('end', () => {
        const params = querystring.parse(body);

        // The payload comes from the client, so a malformed `code` must not be
        // allowed to throw inside this event callback and take the server down.
        let links;
        try {
            links = JSON.parse(params.code);
        } catch (parseError) {
            response.status(400).end('"code" must be a JSON array of URLs');
            return;
        }
        if (!Array.isArray(links)) {
            response.status(400).end('"code" must be a JSON array of URLs');
            return;
        }

        console.log('---------------链接',links)
        // `time` is the request interval in seconds, as entered by the user.
        resultData(links, params.time);
        response.end();
    });
})

// Start the HTTP server on port 8888 and log once it is accepting connections.
const LISTEN_PORT = 8888;
app.listen(LISTEN_PORT, () => {
    console.log('服务器已启动,监听端口中');
});

/**
 * Schedules one page capture per URL, spacing the requests out in time.
 *
 * The URL at position `index` is handed to capturePageResult after
 * `index * intervalTimeValue` seconds, so the first URL is scheduled with no
 * delay and each following one waits one more interval.
 *
 * @param {Array} urlArr - URLs to capture.
 * @param {number|string} intervalTimeValue - Gap between consecutive requests,
 *   in seconds. Missing, non-numeric or negative values mean "no gap".
 */
function resultData(urlArr,intervalTimeValue) {
    const seconds = Number(intervalTimeValue);
    // The interval arrives as form text; without this guard a missing value
    // would turn every delay into NaN.
    const gapMs = Number.isFinite(seconds) && seconds > 0 ? seconds * 1000 : 0;

    urlArr.forEach((url, index) => {
        // The delay must be derived from this loop's own index. It previously
        // referenced an undeclared `i`, which threw before anything was scheduled.
        setTimeout(() => {
            capturePageResult(url);
        }, index * gapMs);
    });
}

/**
 * Fetches one product page and records its "Best Sellers Rank" text.
 *
 * The page is downloaded with superagent and parsed with cheerio. Two page
 * layouts are probed:
 *   - a details table (`#productDetails_detailBullets_sections1`)
 *   - a bullet list (`#detailBulletsWrapper_feature_div > ul > li`)
 * Whatever is found (or a status message when nothing is found or the request
 * fails) is written through readXlsxFile together with the current timestamp.
 *
 * @param urlArr  the page URL; converted with toString() for the request and
 *                passed unchanged to readXlsxFile as the row identifier
 */
function capturePageResult(urlArr) {
    request.get(urlArr.toString())
        .end((err, res) => {
            if (err) {
                console.log(urlArr + '请求延时稍后再试');
                readXlsxFile(urlArr,"The request timeout please try again later(请求延时)",formattingDate())
                return;
            }

            // Load the downloaded HTML for querying.
            const $ = cheerio.load(res.text, {decodeEntities: false});
            const $itemMod = $('#productDetails_detailBullets_sections1');
            const $itemMods = $('#detailBulletsWrapper_feature_div > ul > li');
            const tableLen = $itemMod.length;
            const spanLen = $itemMods.length;
            console.log('tableLen长度', tableLen)
            console.log('spanLen长度', spanLen)

            if (tableLen > 0) {
                // Fragments gathered for this page only. A local array replaces the
                // shared module-level buffer so a failure mid-parse cannot leak
                // text into the next page's result.
                const ratingTails = [];

                $itemMod.each((i, e) => {
                    const spanText = $(e).find('span').text();
                    // Text that follows the last occurrence of "ratings".
                    const afterRatings = spanText.split("ratings").slice(-1).toString();

                    if (afterRatings === "") {
                        // Nothing after "ratings": take the ranks from the first line.
                        const pieces = spanText.split("\n")[0].split("#");
                        if (pieces.length >= 6) {
                            // Keep at most the first five distinct parts. Joining the
                            // slice avoids printing "undefined" when de-duplication
                            // leaves fewer than five.
                            const rankStrSave = noRepeat(pieces).slice(0, 5).join('\n#');
                            readXlsxFile(urlArr, rankStrSave, formattingDate())
                        }
                    } else {
                        ratingTails.push(afterRatings);

                        const rankNewStr = noRepeat(ratingTails.toString().split('#'));
                        console.log('rankNewStr',rankNewStr)
                        // Skip the text before the first '#', then keep up to three ranks.
                        const rankData = rankNewStr.length >= 2
                            ? '#' + rankNewStr.slice(1, 4).join('\n#')
                            : '';
                        console.log('rankData',rankData)
                        readXlsxFile(urlArr, rankData, formattingDate())
                    }
                });
            }

            if (spanLen === 0 && tableLen === 0){
                console.log(urlArr + '没有商品排名')
                readXlsxFile(urlArr,"Not Data",formattingDate())
            }

            if (spanLen > 0) {
                const bulletTexts = [];
                $itemMods.each((i, e) => {
                    bulletTexts.push($(e).find('span').text().trim());
                });

                // Drop everything from "Customer Reviews:" onwards, then keep what
                // follows the "Best Sellers Rank:" label.
                const removeBest = bulletTexts.toString().split("Customer Reviews:")[0].toString();
                const rankText = removeBest.split("Best Sellers Rank:  ")[1];

                console.log(rankText)

                if (rankText === undefined) {
                    console.log(urlArr + '没有商品排名')
                    readXlsxFile(urlArr,"The commodity doesn't rank(该商品没有排名)",formattingDate())
                } else {
                    // Put every "#n ..." rank on its own line.
                    readXlsxFile(urlArr,rankText.replace(/#{1}\s*/g,"\n#"),formattingDate())
                }
            }
        });
}

/**
 * De-duplicates a list after trimming each entry.
 *
 * Every entry is converted with toString() and stripped of leading and
 * trailing whitespace; the distinct results are returned in first-seen order.
 *
 * @param arr  values to de-duplicate
 * @returns {*[]} the distinct trimmed strings
 */
function noRepeat(arr) {
    // A Set keeps insertion order, so the first occurrence of each value wins.
    const distinct = new Set();
    for (const entry of arr) {
        distinct.add(entry.toString().trim());
    }
    return [...distinct];
}

/**
 * Formats the current local time as "YYYY/M/D H:mm:ss".
 *
 * Only minutes and seconds are zero-padded; month, day and hour are written
 * as plain numbers.
 *
 * @returns {string} the formatted timestamp
 */
function formattingDate() {
    const now = new Date();
    const twoDigits = (value) => String(value).padStart(2, '0');

    const datePart = [now.getFullYear(), now.getMonth() + 1, now.getDate()].join('/');
    const timePart = [
        now.getHours(),
        twoDigits(now.getMinutes()),
        twoDigits(now.getSeconds()),
    ].join(':');

    return `${datePart} ${timePart}`;
}

/**
 * Appends one result row to ./readFile.xlsx, creating the workbook on first use.
 *
 * When the file does not exist yet, a workbook with a single worksheet, a
 * frozen and filterable header row and fixed column widths is created. When it
 * exists, the row is appended to the last worksheet; once that worksheet holds
 * more than 20000 rows a new worksheet is started first.
 *
 * The work is asynchronous and nothing is returned; success and failure are
 * only reported on the console.
 * NOTE(review): overlapping calls each read and rewrite the same file, so rows
 * may be lost if two calls run at once — confirm whether callers need
 * serialising.
 *
 * @param code  value for the first column (callers pass the page URL)
 * @param name  value for the second column (rank text or a status message)
 * @param date  value for the third column (timestamp string)
 */
function readXlsxFile(code,name,date) {

    const files = './readFile.xlsx'
    const headerTitles = ['编号', '商品排名详细信息', '获取时间']
    const record = [code, name, date]
    console.log([record])

    // Center the first three cells of a row.
    const centerCells = (row) => {
        for (let col = 1; col <= 3; col++) {
            row.getCell(col).alignment = {vertical: 'middle', horizontal: 'center'}
        }
    }

    // Append the data row and style it through the row object that addRow
    // returns, so the styling always lands on the row just written. This also
    // holds right after rolling over to a fresh worksheet.
    const appendRecord = (worksheet) => {
        const row = worksheet.addRow(record)
        row.height = 100
        centerCells(row)
    }

    const workbook = new excelJS.Workbook()

    // Only existence matters here, so probe the path instead of reading the
    // whole binary workbook as text.
    fs.access(files, async function (err) {
        try {
            if (err) {
                // First use: build the workbook from scratch.
                const sheet = workbook.addWorksheet('商品排名统计表', {views: [{state: 'frozen', ySplit: 1}]})
                sheet.addRow(headerTitles).height = 30
                sheet.columns = [
                    {header: '编号', key: 'code', width: 40},
                    {header: '商品排名详细信息', key: 'name', width: 130},
                    {header: '获取时间', key: 'date', width: 20}
                ]
                // Filter and header cell formatting.
                sheet.autoFilter = 'A1:C1'
                centerCells(sheet.getRow(1))

                appendRecord(sheet)

                // Await the write so a failure is reported below instead of
                // becoming an unhandled rejection.
                await workbook.xlsx.writeFile(files)
                console.log(`${code}的内容已保存到${files.substring(2,10)}文件`)
                return
            }

            // The file exists: load it and locate the last worksheet.
            const loaded = await workbook.xlsx.readFile(files)

            const sheetNames = []
            loaded.eachSheet(function (worksheet) {
                sheetNames.push(worksheet.name)
            })
            let target = loaded.getWorksheet(sheetNames.length)
            const sheetLine = target.lastRow.number
            console.log('sheetLine=====>', sheetLine)

            // Start a new worksheet once the current one exceeds 20000 rows.
            if (sheetLine > 20000) {
                target = loaded.addWorksheet(`商品排名统计表${sheetNames.length + 1}`, {views: [{state: 'frozen', ySplit: 1}]})
                target.addRow(headerTitles).height = 30
            }

            appendRecord(target)

            await loaded.xlsx.writeFile(files)
            console.log(`${code}的内容已保存到${files.substring(2,10)}文件(2号入口)`)
        } catch (e) {
            // Keep the original user-facing message, but do not discard the cause.
            console.error(e)
            console.log(`${code}失效  请重新获取`)
        }
    })
}


服务端启动之后,按上一篇文章的方式,由前端提交四个链接和五秒的请求间隔,看数据是否成功写入 Excel。

 

爬到的数据写入成功,且间隔是每五秒请求一次。overover~~~

如果有不懂的欢迎在评论区留言,我会一一解答的,或者私聊也可。下期见!!

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值