Part 1: Finding the Data
For the snack-food category, besides the product name and units-sold figures, we also want the points price (积分价) and the promotion price (促销价). Let's cover each in turn.
Points price (积分价)
1. Find the JS functions that transform the data
The points-price data is produced entirely by JavaScript in the page source:
$("#merPromPrice").html("<span class='jifen_1'>"+changeNum(promotionIntegralPriceJson[prodSkuObject.prodSkuId])+"</span>"+"<span class='jifen_2'>积分</span>"+"<span class='jifen_2'>+</span>"+"<span class='jifen_3'>"+$("#prodPriceUnit").val() +promPrice+"</span>");
Following that call, we find the functions that convert the raw values:
// Show values of 10000+ in units of 万 (10,000), two decimals either way.
function changeNum(val) {
    var temp = parseFloat(val) / 10000;
    if (temp < 1) {
        val = parseFloat(val).toFixed(2);
    } else {
        val = temp.toFixed(2);
    }
    return val;
}
// Format a number with thousands separators and n decimal places.
function famout(s, n) {
    n = n > 0 && n <= 20 ? n : 2;
    s = parseFloat((s + "").replace(/[^\d\.-]/g, "")).toFixed(n) + "";
    var l = s.split(".")[0].split("").reverse();
    var r = s.split(".")[1];
    var len = (s.indexOf("-") != -1) ? l.length - 1 : l.length;
    var t = "";
    for (var i = 0; i < len; i++) {
        t += l[i] + ((i + 1) % 3 == 0 && (i + 1) != len ? "," : "");
    }
    return ((s.indexOf("-") != -1) ? "-" : "") + t.split("").reverse().join("") + "." + r;
}
// Wrapper around famout() that tolerates empty values.
function fNum(val) {
    if (val) {
        return val == "" ? val : famout(parseFloat(val), 2);
    }
}
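These helpers are simple enough that you could also port them straight to Python and skip the JS engine entirely. A rough port (my own sketch, not the site's code; it assumes the inputs are plain numeric strings, which is all the page ever passes in):

# Rough Python equivalents of the site's JS helpers (my own sketch; assumes
# plain numeric-string inputs, which is all the page ever passes in).
def change_num(val):
    # Values of 10000+ are shown in units of 万, two decimals either way.
    temp = float(val) / 10000
    return f'{temp:.2f}' if temp >= 1 else f'{float(val):.2f}'

def f_num(val, n=2):
    # Thousands-separated price string, like the famout()/fNum() pair.
    return f'{float(val):,.{n}f}'

print(change_num('1234567'))  # -> '123.46' (i.e. 123.46万)
print(f_num('12345.6'))       # -> '12,345.60'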
Alternatively, keep the original JavaScript: save these functions into separate .js files so Python can call them through execjs (the scraper below loads them as 工商1.js and 工商2.js).
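A minimal round-trip through execjs, assuming changeNum() was saved to 工商1.js as described:

import execjs

# Compile the saved JS source and call the function from Python.
ctx = execjs.compile(open('./工商1.js', encoding='utf-8').read())
print(ctx.call('changeNum', '1234567'))  # -> '123.46'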
2. Extract the arguments passed into those functions
Once you've located the inline JSON in the page source, pull the values out with a regular expression:
jsons = re.findall('jQuery.parseJSON.*?":".*?"}', res.text)
for x, json in enumerate(jsons[:2]):
    js = re.search('jQuery.parseJSON.*?":"(.*?)"}', json).group(1)
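To see what the pattern actually captures, here is a made-up fragment shaped like the page's inline script (real keys and values differ per product):

import re

# Made-up inline-JS fragment in the shape the product page embeds.
sample = 'var prices = jQuery.parseJSON(\'{"000123":"1234567"}\');'
val = re.search('jQuery.parseJSON.*?":"(.*?)"}', sample).group(1)
print(val)  # -> '1234567', ready to feed into changeNum()/fNum()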
Promotion price (促销价)
The promotion price sits directly in the page's HTML tags, so XPath alone is enough (on this layout, span[3] and span[4] together form the price string):
html = etree.HTML(res.text)
span = html.xpath('//ul[@class="prop"]/div[1]//text()')
jiage = span[3] + span[4]
Part 2: Scraping the Data
Goal: scrape the data with multiple threads and store it in a MongoDB database.
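The code below is a producer/consumer pipeline: queue 1 hands detail-page URLs to scraper threads, and queue 2 hands scraped records to MongoDB writer threads. In miniature (placeholder URL and a stand-in for the real parsing):

import queue
import threading

def scraper(page_q, data_q):
    # Drain URLs until the queue is empty, pushing one record per URL.
    while True:
        try:
            url = page_q.get(block=False)
        except queue.Empty:
            break
        data_q.put({'url': url})  # stand-in for the real page parsing

page_q, data_q = queue.Queue(), queue.Queue()
page_q.put('https://mall.icbc.com.cn/...')  # placeholder URL
t = threading.Thread(target=scraper, args=(page_q, data_q))
t.start()
t.join()
print(data_q.get())  # -> {'url': 'https://mall.icbc.com.cn/...'}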
# Load the required modules
import requests
from fake_useragent import UserAgent
from lxml import etree
import chardet
import re
import execjs
import pymongo
import threading
import queue

# Pick up the local JS runtime (Node.js if available) for the site's functions
node = execjs.get()
def mulu():
    # Fetch the snack-food listing page and collect each product's detail URL
    url = 'https://mall.icbc.com.cn/searchproducts/pv.jhtml?query=&catId=&displayCatId=000000000010048&selectCatId=&storeName=&isCoupon=&searchType=DISPLAYCATEGORY'
    headers = {
        'User-Agent': UserAgent().random,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'If-None-Match': '"09dfdbc81c92a8ca373ecf04a56621c0a"',
        'Referer': 'https://mall.icbc.com.cn/searchproducts/pv.jhtml?searchType=DISPLAYCATEGORY&displayCatId=000000000010048',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin'
    }
    range_list = []
    data = {
        'storeFilter': '[]',
        'categoryFilter': '[]',
        'propFilter': '[]',
        'displayCategoryFilter': '[]',
        'leftSideDC': '',
        'brandFilter': '[]',
        'skuFilter': '[]',
        'provinceFilter': '',
        'cityFilter': '',
        'priceRegionFilter': '{}',
        'sortFilter': '0',
        'dirFilter': '1',
        'promotionFilter': '[]',
        'freeDelivery': '',
        'installment': '',
        'sev_refund': '',
        'guarantee_pay': '',
        'score_pay': '',
        'loan_pay': '',
        'over_sea': '',
        'resultSearchFilter': '',
        'viewType': 'large',
        'selectedSearch': '[]',
        'exQuery': '',
        'currentPage': '1',
    }
    res = requests.post(url, headers=headers, data=data)
    html = etree.HTML(res.text)
    lis = html.xpath('//div[@id="ajaxQueryContent"]/ul[1]/li')
    for li in lis:
        href = li.xpath('./div[1]/div[1]/a/@href')
        join_href = 'https://mall.icbc.com.cn' + href[0]
        range_list.append(join_href)
    return range_list
class Sp(threading.Thread):
    # Scraper thread: pulls detail-page URLs from page_queue, parses each page,
    # and pushes a result dict onto data_queue.
    def __init__(self, page_queue, data_queue, *args, **kwargs):
        super(Sp, self).__init__(*args, **kwargs)
        self.page = page_queue
        self.data = data_queue
        self.headers = {
            'User-Agent': UserAgent().random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'If-None-Match': '"09dfdbc81c92a8ca373ecf04a56621c0a"',
            'Referer': 'https://mall.icbc.com.cn/searchproducts/pv.jhtml?searchType=DISPLAYCATEGORY&displayCatId=000000000010048',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin'
        }

    def run(self) -> None:
        # Keep working until the URL queue is empty (grabbing a single URL per
        # thread, as before, would leave any surplus threads blocked forever).
        while True:
            try:
                url = self.page.get(block=False)
            except queue.Empty:
                break
            res = requests.get(url, headers=self.headers)
            res.encoding = chardet.detect(res.content)['encoding']
            html = etree.HTML(res.text)
            title = html.xpath('//h3[@class="cfx"]/div[1]/p[1]//text()')
            Sold = html.xpath('//ul[@class="prop"]/li[6]//text()')
            # Some layouts put the units-sold row one <li> earlier
            if Sold[0][-3:-1] != '已售':
                Sold = html.xpath('//ul[@class="prop"]/li[5]//text()')
            jfj = html.xpath('//ul[@class="prop"]/div[1]/li[2]/span[1]//text()')
            # Strip whitespace and the trailing colon, leaving '积分价' or '促销价'
            str_jfj = str(jfj[0]).replace('\n', '').replace('\r', '').replace(' ', '').strip()[:-1]
            str_title = ''.join(title).replace('\n', '').replace('\r', '').replace(' ', '').strip()
            str_Sold = ''.join(Sold).replace('\n', '').replace('\r', '').replace(' ', '').replace('已售:', '').strip()
            jf_list = []
            cxj = []
            a = 0
            # Skip '扶贫特产' items, which use a different page layout
            if '扶贫特产' not in str_title:
                if str_jfj == '积分价':
                    a = 1
                    jsons = re.findall('jQuery.parseJSON.*?":".*?"}', res.text)
                    for x, json in enumerate(jsons[:2]):
                        js = re.search('jQuery.parseJSON.*?":"(.*?)"}', json).group(1)
                        if x == 1:
                            # Points portion: run it through the site's changeNum()
                            gs1 = node.compile(open(r'./工商1.js', encoding='utf-8').read())
                            funcName = 'changeNum("' + js + '")'
                            jf = gs1.eval(funcName)
                            jf_list.append(jf + '万')
                        if x == 0:
                            # Cash portion: format it with the site's fNum()
                            gs2 = node.compile(open(r'./工商2.js', encoding='utf-8').read())
                            funcName = 'fNum("' + js + '")'
                            zjj = gs2.eval(funcName)
                            rmb = re.search('prodPriceUnit" value="(.*?)"/>', res.text).group(1)
                            jf_list.append(rmb + zjj)
                if str_jfj == '促销价':
                    a = 2
                    span = html.xpath('//ul[@class="prop"]/div[1]//text()')
                    jiage = span[3] + span[4]
                    cxj.append(jiage)
            if jf_list and a == 1:
                join_jfj = '积分价:' + jf_list[1] + '积分+' + jf_list[0]
                self.data.put({'商品名称': str_title, '已售': str_Sold, '积分价': join_jfj})
            if cxj and a == 2:
                jg = cxj[0]
                self.data.put({'商品名称': str_title, '已售': str_Sold, '促销价': jg})
class Mongo(threading.Thread):
    # Writer thread: drains data_queue into MongoDB.
    def __init__(self, data_queue, *args, **kwargs):
        super(Mongo, self).__init__(*args, **kwargs)
        self.data = data_queue
        # Connect to the local MongoDB and select the target collection.
        # (MongoClient is thread-safe, so a single shared client would also do.)
        client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.db = client['xiaoye']['jf']

    def run(self) -> None:
        while True:
            try:
                obj = self.data.get(timeout=10)
            except queue.Empty:
                # Nothing arrived for 10 seconds: assume the scrapers are done
                break
            self.db.insert_one(obj)
def main():
    # Queue 1: detail-page URLs for the scraper threads
    page_queue = queue.Queue()
    # Queue 2: scraped records waiting to be written to MongoDB
    data_queue = queue.Queue()
    # Fill queue 1 with the detail-page URLs from the listing page
    for page_url in mulu():
        page_queue.put(page_url)
    # Start the scraper threads
    for x in range(60):
        th = Sp(page_queue, data_queue)
        th.start()
    # Start the MongoDB writer threads (generous counts; tune as needed)
    for x in range(100):
        th = Mongo(data_queue)
        th.start()

if __name__ == '__main__':
    main()
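Before running, note what the script assumes: a JS runtime that execjs can find (typically Node.js), a local MongoDB listening on 127.0.0.1:27017, and the two files 工商1.js and 工商2.js from Part 1 sitting next to the script.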
If you spot any mistakes in the code, or places it could be optimized, feel free to raise them in the comments.
I'm 夜逍尘, still hard at work learning web scraping. Thanks for reading!