# python 真好玩 (Python is fun!)
# -*- coding: utf-8 -*-
# ! 2019/3/6 13:51
# !@Author:Cy 2019 03
# !@File:jdsk.py
import os
import time
import json
import datetime
import requests
import re
# NOTE(review): module-level sample URL — appears unused (crawl() and main()
# define their own URLs); kept for reference.
url="https://item.jd.com/7293066.html#askAnswer"
class Crawl(object):
    """Scrape JD (jd.com) flash-sale ("miaosha") listings per category.

    Usage: ``Crawl().main()`` — iterates every known category id, fetches the
    goods list from JD's seckill JSONP endpoint, and writes
    ``{product_name: seckill_price}`` records to a dated ``.json`` file in the
    current working directory.
    """

    def __init__(self):
        """Prepare HTTP headers and open the dated output JSON file."""
        # Request headers; the Referer is what the seckill endpoint expects.
        self.headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip,deflate,br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'User-Agent': 'Mozilla/5.0(Windows NT 10.0;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/71.0.3578.80Safari/537.36',
            'Referer': 'https://miaosha.jd.com/category.html'
        }
        f_dir = os.getcwd()
        f_name = '京东秒杀%s.json' % str(datetime.datetime.now().date())
        print("爬虫_QQ小招 自检完成,初始化成功" + f_dir + f_name)
        self.path = os.path.join(f_dir, f_name)
        self.fp = open(self.path, 'w', encoding='utf-8')
        self.fp.write('[\n')
        # BUG FIX: the original wrote ',\n' after *every* record and then ']',
        # producing a trailing comma — invalid JSON. Track whether the next
        # record is the first so separators go *between* records instead.
        self._first = True
        print("准备进入主函数")

    def crawl(self, url, cate):
        """Fetch and store all flash-sale goods for the category named `cate`."""
        print('进入爬虫函数,现在采集', cate, '这类')
        # The incoming `url` is deliberately ignored: the goods data actually
        # lives on this API endpoint.
        url = 'https://ai.jd.com/index_new'
        data = {
            'app': 'Seckill',
            'action': 'pcSeckillCategoryGoods',
            'callback': 'pcSeckillCategoryGoods',
            'id': self.category_id[cate],  # set by main() before crawl() is called
            '_': int(time.time())          # cache-busting timestamp
        }
        res = self.response_handler(url, data)
        # The endpoint answers with JSONP: pcSeckillCategoryGoods({...});
        # strip the callback wrapper and parse the inner JSON payload.
        payloads = re.findall(r'pcSeckillCategoryGoods\((.*?)\);', res.text, re.S | re.M)
        datas = json.loads(payloads[0]).get('goodsList', [])
        for item in self.parse(datas):
            self.witer(item)

    def response_handler(self, url, data):
        """POST `data` to `url` with the crawler headers; return the Response."""
        print('开始构造响应')
        res = requests.post(url=url, data=data, headers=self.headers)
        print('构造响应成功')
        return res

    def parse(self, res):
        """Turn the raw goods list into ``[{name: seckill_price}, ...]``.

        `res` is the list from the endpoint's ``goodsList`` field; each entry
        is expected to carry ``wname`` (product name) and ``miaoShaPrice``.
        """
        items = [{goods['wname']: goods['miaoShaPrice']} for goods in res]
        print('解析数据完成')
        return items

    def witer(self, item):
        """Append one record to the output file (method name kept as-is for callers)."""
        record = json.dumps(item, ensure_ascii=False)
        print('存入数据', record)
        # Emit the separator before every record except the first, so the
        # finished file (after close()) is valid JSON with no trailing comma.
        if self._first:
            self._first = False
        else:
            self.fp.write(',\n')
        self.fp.write(record)

    def close(self):
        """Terminate the JSON array and close the output file."""
        self.fp.write('\n]')
        self.fp.close()
        # BUG FIX: the original printed the file object (self.fp); report the
        # actual file path instead.
        print('采集完成!数据存储在%s' % self.path)
        print('完成爬虫')

    def main(self):
        """Entry point: crawl every known category, then finalize the file."""
        url = "https://miaosha.jd.com/category.html"
        print(url)
        # Category display name -> JD seckill category id.
        self.category_id = {
            '电脑办公': 29,
            '生活电器': 19,
            '手机通讯': 30,
            '大家电': 25,
            '智能数码': 31,
            '饮料酒水': 45,
            '家居家装': 37,
            '母婴童装': 43,
            '食品生鲜': 44
        }
        print("进入主函数了")
        for cate in self.category_id:
            print(cate, self.category_id[cate])
            self.crawl(url, cate)
        self.close()
if __name__ == '__main__':
    # Script entry point: build the crawler and run every category.
    crawler = Crawl()
    crawler.main()