Part 1: Finding the Data
For the snack-food category, besides the product name and units-sold figures, we also want the points price (积分价) and the promotion price (促销价). Let's cover each in turn.
Points price (积分价)
1. Find the JS functions that transform the data
The points-price data is produced entirely by JavaScript in the page source:
$("#merPromPrice").html("<span class='jifen_1'>"+changeNum(promotionIntegralPriceJson[prodSkuObject.prodSkuId])+"</span>"+"<span class='jifen_2'>积分</span>"+"<span class='jifen_2'>+</span>"+"<span class='jifen_3'>"+$("#prodPriceUnit").val() +promPrice+"</span>");
Following that call, we find the functions that convert the raw values:
// Show values of 10000+ in units of 万 (10,000), two decimals either way.
function changeNum(val) {
    var temp = parseFloat(val) / 10000;
    if (temp < 1) {
        val = parseFloat(val).toFixed(2);
    } else {
        val = temp.toFixed(2);
    }
    return val;
}
// Format a number with thousands separators and n decimal places.
function famout(s, n) {
    n = n > 0 && n <= 20 ? n : 2;
    s = parseFloat((s + "").replace(/[^\d\.-]/g, "")).toFixed(n) + "";
    var l = s.split(".")[0].split("").reverse();
    var r = s.split(".")[1];
    var len = (s.indexOf("-") != -1) ? l.length - 1 : l.length;
    var t = "";
    for (var i = 0; i < len; i++) {
        t += l[i] + ((i + 1) % 3 == 0 && (i + 1) != len ? "," : "");
    }
    return ((s.indexOf("-") != -1) ? "-" : "") + t.split("").reverse().join("") + "." + r;
}
// Wrapper around famout() that tolerates empty values.
function fNum(val) {
    if (val) {
        return val == "" ? val : famout(parseFloat(val), 2);
    }
}
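These helpers are simple enough that you could also port them straight to Python and skip the JS engine entirely. A rough port (my own sketch, not the site's code; it assumes the inputs are plain numeric strings, which is all the page ever passes in):

# Rough Python equivalents of the site's JS helpers (my own sketch; assumes
# plain numeric-string inputs, which is all the page ever passes in).
def change_num(val):
    # Values of 10000+ are shown in units of 万, two decimals either way.
    temp = float(val) / 10000
    return f'{temp:.2f}' if temp >= 1 else f'{float(val):.2f}'

def f_num(val, n=2):
    # Thousands-separated price string, like the famout()/fNum() pair.
    return f'{float(val):,.{n}f}'

print(change_num('1234567'))  # -> '123.46' (i.e. 123.46万)
print(f_num('12345.6'))       # -> '12,345.60'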
Alternatively, keep the original JavaScript: save these functions into separate .js files so Python can call them through execjs (the scraper below loads them as 工商1.js and 工商2.js).
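A minimal round-trip through execjs, assuming changeNum() was saved to 工商1.js as described:

import execjs

# Compile the saved JS source and call the function from Python.
ctx = execjs.compile(open('./工商1.js', encoding='utf-8').read())
print(ctx.call('changeNum', '1234567'))  # -> '123.46'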
2. Extract the arguments passed into those functions
Once you've located the inline JSON in the page source, pull the values out with a regular expression:
jsons = re.findall('jQuery.parseJSON.*?":".*?"}', res.text)
for x, json in enumerate(jsons[:2]):
    js = re.search('jQuery.parseJSON.*?":"(.*?)"}', json).group(1)
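To see what the pattern actually captures, here is a made-up fragment shaped like the page's inline script (real keys and values differ per product):

import re

# Made-up inline-JS fragment in the shape the product page embeds.
sample = 'var prices = jQuery.parseJSON(\'{"000123":"1234567"}\');'
val = re.search('jQuery.parseJSON.*?":"(.*?)"}', sample).group(1)
print(val)  # -> '1234567', ready to feed into changeNum()/fNum()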
Promotion price (促销价)
The promotion price sits directly in the page's HTML tags, so XPath alone is enough (on this layout, span[3] and span[4] together form the price string):
html = etree.HTML(res.text)
span = html.xpath('//ul[@class="prop"]/div[1]//text()')
jiage = span[3] + span[4]
Part 2: Scraping the Data
Goal: scrape the data with multiple threads and store it in a MongoDB database.
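The code below is a producer/consumer pipeline: queue 1 hands detail-page URLs to scraper threads, and queue 2 hands scraped records to MongoDB writer threads. In miniature (placeholder URL and a stand-in for the real parsing):

import queue
import threading

def scraper(page_q, data_q):
    # Drain URLs until the queue is empty, pushing one record per URL.
    while True:
        try:
            url = page_q.get(block=False)
        except queue.Empty:
            break
        data_q.put({'url': url})  # stand-in for the real page parsing

page_q, data_q = queue.Queue(), queue.Queue()
page_q.put('https://mall.icbc.com.cn/...')  # placeholder URL
t = threading.Thread(target=scraper, args=(page_q, data_q))
t.start()
t.join()
print(data_q.get())  # -> {'url': 'https://mall.icbc.com.cn/...'}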
# Load the required modules
import requests
from fake_useragent import UserAgent
from lxml import etree
import chardet
import re
import execjs
import pymongo
import threading
import queue

# Pick up the local JS runtime (Node.js if available) for the site's functions
node = execjs.get()
def mulu():
    # Fetch the snack-food listing page and collect each product's detail URL
    url = 'https://mall.icbc.com.cn/searchproducts/pv.jhtml?query=&catId=&displayCatId=000000000010048&selectCatId=&storeName=&isCoupon=&searchType=DISPLAYCATEGORY'
    headers = {
        'User-Agent': UserAgent().random,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'If-None-Match': '"09dfdbc81c92a8ca373ecf04a56621c0a"',
        'Referer': 'https://mall.icbc.com.cn/searchproducts/pv.jhtml?searchType=DISPLAYCATEGORY&displayCatId=000000000010048',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin'
    }
    range_list = []
    data = {
        'storeFilter': '[]',
        'categoryFilter': '[]',
        'propFilter': '[]',
        'displayCategoryFilter': '[]',
        'leftSideDC': '',
        'brandFilter': '[]',
        'skuFilter': '[]',
        'provinceFilter': '',
        'cityFilter': '',
        'priceRegionFilter': '{}',
        'sortFilter': '0',
        'dirFilter': '1',
        'promotionFilter': '[]',
        'freeDelivery': '',
        'installment': '',
        'sev_refund': '',
        'guarantee_pay': '',
        'score_pay': '',
        'loan_pay': '',
        'over_sea': '',
        'resultSearchFilter': '',
        'viewType': 'large',
        'selectedSearch': '[]',
        'exQuery': '',
        'currentPage': '1',
    }
    res = requests.post(url, headers=headers, data=data)
    html = etree.HTML(res.text)
    lis = html.xpath('//div[@id="ajaxQueryContent"]/ul[1]/li')
    for li in lis:
        href = li.xpath('./div[1]/div[1]/a/@href')
        join_href = 'https://mall.icbc.com.cn' + href[0]
        range_list.append(join_href)
    return range_list
class Sp(threading.Thread):
    # Scraper thread: pulls detail-page URLs from page_queue, parses each page,
    # and pushes a result dict onto data_queue.
    def __init__(self, page_queue, data_queue, *args, **kwargs):
        super(Sp, self).__init__(*args, **kwargs)
        self.page = page_queue
        self.data = data_queue
        self.headers = {
            'User-Agent': UserAgent().random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'If-None-Match': '"09dfdbc81c92a8ca373ecf04a56621c0a"',
            'Referer': 'https://mall.icbc.com.cn/searchproducts/pv.jhtml?searchType=DISPLAYCATEGORY&displayCatId=000000000010048',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin'
        }

    def run(self) -> None:
        # Keep working until the URL queue is empty (grabbing a single URL per
        # thread, as before, would leave any surplus threads blocked forever).
        while True:
            try:
                url = self.page.get(block=False)
            except queue.Empty:
                break
            res = requests.get(url, headers=self.headers)
            res.encoding = chardet.detect(res.content)['encoding']
            html = etree.HTML(res.text)
            title = html.xpath('//h3[@class="cfx"]/div[1]/p[1]//text()')
            Sold = html.xpath('//ul[@class="prop"]/li[6]//text()')
            # Some layouts put the units-sold row one <li> earlier
            if Sold[0][-3:-1] != '已售':
                Sold = html.xpath('//ul[@class="prop"]/li[5]//text()')
            jfj = html.xpath('//ul[@class="prop"]/div[1]/li[2]/span[1]//text()')
            # Strip whitespace and the trailing colon, leaving '积分价' or '促销价'
            str_jfj = str(jfj[0]).replace('\n', '').replace('\r', '').replace(' ', '').strip()[:-1]
            str_title = ''.join(title).replace('\n', '').replace('\r', '').replace(' ', '').strip()
            str_Sold = ''.join(Sold).replace('\n', '').replace('\r', '').replace(' ', '').replace('已售:', '').strip()
            jf_list = []
            cxj = []
            a = 0
            # Skip '扶贫特产' items, which use a different page layout
            if '扶贫特产' not in str_title:
                if str_jfj == '积分价':
                    a = 1
                    jsons = re.findall('jQuery.parseJSON.*?":".*?"}', res.text)
                    for x, json in enumerate(jsons[:2]):
                        js = re.search('jQuery.parseJSON.*?":"(.*?)"}', json).group(1)
                        if x == 1:
                            # Points portion: run it through the site's changeNum()
                            gs1 = node.compile(open(r'./工商1.js', encoding='utf-8').read())
                            funcName = 'changeNum("' + js + '")'
                            jf = gs1.eval(funcName)
                            jf_list.append(jf + '万')
                        if x == 0:
                            # Cash portion: format it with the site's fNum()
                            gs2 = node.compile(open(r'./工商2.js', encoding='utf-8').read())
                            funcName = 'fNum("' + js + '")'
                            zjj = gs2.eval(funcName)
                            rmb = re.search('prodPriceUnit" value="(.*?)"/>', res.text).group(1)
                            jf_list.append(rmb + zjj)
                if str_jfj == '促销价':
                    a = 2
                    span = html.xpath('//ul[@class="prop"]/div[1]//text()')
                    jiage = span[3] + span[4]
                    cxj.append(jiage)
            if jf_list and a == 1:
                join_jfj = '积分价:' + jf_list[1] + '积分+' + jf_list[0]
                self.data.put({'商品名称': str_title, '已售': str_Sold, '积分价': join_jfj})
            if cxj and a == 2:
                jg = cxj[0]
                self.data.put({'商品名称': str_title, '已售': str_Sold, '促销价': jg})
class Mongo(threading.Thread):
    # Writer thread: drains data_queue into MongoDB.
    def __init__(self, data_queue, *args, **kwargs):
        super(Mongo, self).__init__(*args, **kwargs)
        self.data = data_queue
        # Connect to the local MongoDB and select the target collection.
        # (MongoClient is thread-safe, so a single shared client would also do.)
        client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.db = client['xiaoye']['jf']

    def run(self) -> None:
        while True:
            try:
                obj = self.data.get(timeout=10)
            except queue.Empty:
                # Nothing arrived for 10 seconds: assume the scrapers are done
                break
            self.db.insert_one(obj)
def main():
    # Queue 1: detail-page URLs for the scraper threads
    page_queue = queue.Queue()
    # Queue 2: scraped records waiting to be written to MongoDB
    data_queue = queue.Queue()
    # Fill queue 1 with the detail-page URLs from the listing page
    for page_url in mulu():
        page_queue.put(page_url)
    # Start the scraper threads
    for x in range(60):
        th = Sp(page_queue, data_queue)
        th.start()
    # Start the MongoDB writer threads (generous counts; tune as needed)
    for x in range(100):
        th = Mongo(data_queue)
        th.start()

if __name__ == '__main__':
    main()
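Before running, note what the script assumes: a JS runtime that execjs can find (typically Node.js), a local MongoDB listening on 127.0.0.1:27017, and the two files 工商1.js and 工商2.js from Part 1 sitting next to the script.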
If you spot any mistakes in the code, or places it could be optimized, feel free to raise them in the comments.
I'm 夜逍尘, still hard at work learning web scraping. Thanks for reading!