python购物打折_python爬取优惠信息

大部分银行的官网上都会公布促销信息,有些银行做得很规范,有些做得非常混乱。今天通过 Python 爬取一家相对比较规范的银行的促销信息,如下图:

当我看到这个促销信息,我内心是高兴的,特别的规整。

通过 Chrome 浏览器可以查看此页面的源码,检查后发现所有数据都是通过 Ajax 请求 JSON 来动态加载的,这样就省去了遍历页面的麻烦,可以直接解析 JSON 数据。

首先我们通过 Chrome 开发者工具中的 Network 面板来抓取请求,如下图:

同样以 JSON 格式获取信息:

整体代码如下:

# 导入包

import os

import requests

import json

from lxml import etree

from multiprocessing import Pool

def getHtml(url, timeout=10):
    """Fetch *url* with HTTP GET and return the body decoded as UTF-8.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text, or the sentinel string ``'req_error'`` when the
        request raises a network error or the status code is not 200.
    """
    try:
        # Without a timeout a single stalled connection blocks the crawl forever.
        req = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Report network failures through the same sentinel as bad status
        # codes so callers only need one check.
        return 'req_error'

    if req.status_code != 200:
        return 'req_error'

    # Force UTF-8 decoding instead of relying on the charset requests guesses.
    req.encoding = 'utf-8'
    return req.text

#请求url

def _load_json(url):
    """Fetch *url* through getHtml and parse the body as JSON.

    Returns the parsed object, or None when the request failed or the body
    is not valid JSON, so the caller can skip that node and keep crawling.
    """
    html = getHtml(url)
    if html == 'req_error':
        return None
    try:
        return json.loads(html.strip())
    except ValueError:
        # json.JSONDecodeError is a subclass of ValueError.
        print('invalid JSON from', url)
        return None


def _save_shop(city, area, shop):
    """Fetch one shop's detail record, print it and append it via writeTxt."""
    shop_id = shop['tshopId']
    # NOTE(review): 'mark =pc' contains a space, unlike 'mark=pc' in the list
    # endpoint. Kept exactly as in the original - confirm against the API.
    shop_url = 'https://creditcard.cmbc.com.cn/fe/find/fingShopByShopName.gsp?mark =pc&resolving=&rowser=&rowserEdition=&shopId=%s' % (shop_id)
    shop_json = _load_json(shop_url)
    if not shop_json:
        return
    print(shop_url)

    try:
        detail = shop_json[0]['ShopDetailList'][0]
    except (KeyError, IndexError, TypeError):
        print('no detail record for shop', shop_id)
        return

    # A shop without pictures must not abort the whole crawl.
    images = shop_json[0].get('ShopImgList') or []
    img_name = images[0]['timgName'] if images else ''

    print(city['tvalue'], area['businessAreaName'], shop['tcity'],
          shop['tlongitude'], shop['tstatus'], shop_id, shop['timgType'],
          shop['tlatitude'], shop['tmerchName'], shop['taddress'],
          img_name, shop['tdiscount'], img_name)

    separator = '--------------------------------------'
    # %-formatting tolerates non-string JSON values, where '+' would raise.
    parts = [
        '名称:%s\n' % shop['tmerchName'],
        '地址:%s\n' % shop['taddress'],
        '优惠信息:%s\n' % shop['tdiscount'],
    ]
    for item in detail['ditail']:
        parts.append('%s:%s\n' % (item['key'], item['value']))
    print(separator)

    img_url = 'https://creditcard.cmbc.com.cn/' + img_name if img_name else ''
    parts.append('图片地址:%s\n' % img_url)
    parts.append(separator + '\n')

    writeTxt('cmbc', city['tvalue'], area['businessAreaName'], ''.join(parts))


def _crawl_area(city, area):
    """Walk every result page of one business area and save each shop."""
    page_size = 10
    url_template = 'https://creditcard.cmbc.com.cn/fe/getShopListByCityName.gsp?city=%s&dictrict=%s&kindName=&latitude=&longitude=&mark=pc&page=%d&resolving=&rows=%d&typeName='

    def page_url(num):
        return url_template % (city['tkey'], area['businessAreaId'], num, page_size)

    first_page = _load_json(page_url(1))
    if not first_page:
        return
    row_cnt = first_page[0]['rowCount']
    if not row_cnt > 0:
        return

    # Ceiling division: exactly 10 rows is one page, 11 rows is two pages.
    page_total = (int(row_cnt) + page_size - 1) // page_size
    for num in range(1, page_total + 1):
        fy_url = page_url(num)
        print(fy_url)
        # Page 1 was already downloaded to read rowCount; do not fetch it twice.
        page = first_page if num == 1 else _load_json(fy_url)
        if not page:
            continue
        for shop in page[0]['ShopList']:
            _save_shop(city, area, shop)


def get_fenye(url):
    """Crawl all promotions reachable from the city-list endpoint *url*.

    For every city returned by *url* the business areas are looked up, each
    area's shop list is paged through, and every shop's detail record is
    appended to a text file through writeTxt. Nodes whose request fails or
    whose response has an unexpected shape are skipped.

    Args:
        url: URL returning a JSON list of cities; each entry is read through
            its 'tkey' and 'tvalue' keys.

    Returns:
        None.
    """
    cities = _load_json(url)
    if not cities:
        return

    for city in cities:
        area_url = 'https://creditcard.cmbc.com.cn/fe/getType.gsp?city=%s' % (city['tvalue'])
        areas_json = _load_json(area_url)
        # Anything that is not a non-empty list is skipped.
        if not isinstance(areas_json, list) or not areas_json:
            continue
        try:
            business_areas = areas_json[0]['cityList'][0]['businessAreaList']
        except (KeyError, IndexError, TypeError):
            continue

        for area in business_areas:
            _crawl_area(city, area)

def writeTxt(bank_name, menu_name, short_name, content):
    """Append *content* to ``<script dir>/<bank_name>/<menu_name>/<short_name>.txt``.

    The target directory is created on demand. The process working directory
    is left untouched: changing it would make a relative ``__file__`` resolve
    inside the previous output directory on the next call.

    Args:
        bank_name: First directory level below the script's directory.
        menu_name: Second directory level.
        short_name: File name without the ``.txt`` extension.
        content: Text appended to the file, encoded as UTF-8.

    Returns:
        None.
    """
    parent_dir = os.path.dirname(os.path.abspath(__file__))
    menu_dir = os.path.join(parent_dir, bank_name, menu_name)
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(menu_dir, exist_ok=True)

    file_name = os.path.join(menu_dir, short_name + '.txt')
    with open(file_name, 'a', encoding='utf-8') as file:
        file.write(content)

if __name__ == '__main__':
    # Start from the city-list endpoint; get_fenye walks everything below it.
    get_fenye('https://creditcard.cmbc.com.cn/fe/getCityList.gsp')

运行结果:

后续可以加上线程池的方式来加速爬取优惠信息。

1,增加状态返回码:单进程爬取的时候发现报错,但是重复爬取又没有错误信息,所以增加了对响应码的判断。

2,将爬取信息写到文件中,按城市划分目录,地区来命名。

3,

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值