#
import csv
import random
import re
import time

import requests
# 定义字典,全局变量
dataDict = {}
def acquireData(goodsName):
headers = {
'authority': 's.taobao.com',
'cache-control': 'max-age=0',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'navigate',
'sec-fetch-user': '?1',
'sec-fetch-dest': 'document',
'referer': 'https://s.taobao.com/search?q=%E5%B9%B3%E6%9D%BF&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306',
'accept-language': 'zh-CN,zh;q=0.9',
'cookie': 'miid=886571451609503279; thw=cn; tracknick=%5Cu6F20%5Cu4E36%5Cu6D41%5Cu5E74; _cc_=UIHiLt3xSw%3D%3D; cna=XPY/Gcr64gICAd9oPwK6cZxh; __guid=154677242.610519076024746100.1624545374981.7412; hng=CN%7Czh-CN%7CCNY%7C156; _m_h5_tk=2c5f0fe716a02efbd2f0d1edc6bc542d_1627305813161; _m_h5_tk_enc=dd9c3694c5024570f06e00e5f4d17631; xlly_s=1; _samesite_flag_=true; cookie2=18f9e16f49510185a1b9646a8b714fe3; t=438d163c95833f6c8d65bb2403c13f3f; _tb_token_=ee55ebef1a838; sgcookie=E1004c16ExrCAD5x5j7kF8p4hlGs4f%2B1NcdPUrpCS4ycfh2OEZL0cUdvMGO8AtaHwNsybpZF4YFvqxGzT0M%2B337f%2Fw%3D%3D; unb=2747656955; uc3=nk2=p2bODe0%2FwE4%3D&vt3=F8dCuwJMTt25WuxtoX4%3D&lg2=V32FPkk%2Fw0dUvg%3D%3D&id2=UU8M9asjFAHX6A%3D%3D; csg=52a61766; lgc=%5Cu6F20%5Cu4E36%5Cu6D41%5Cu5E74; cancelledSubSites=empty; cookie17=UU8M9asjFAHX6A%3D%3D; dnk=%5Cu6F20%5Cu4E36%5Cu6D41%5Cu5E74; skt=d16ddf255b675ee3; existShop=MTYyNzI5ODMxOA%3D%3D; uc4=id4=0%40U22LO6DdRA6fglmTmrOjRlQC%2BSH7&nk4=0%40pVB%2Bf9qqQIYv2F%2BQgBgw%2FAOEuA%3D%3D; _l_g_=Ug%3D%3D; sg=%E5%B9%B454; _nk_=%5Cu6F20%5Cu4E36%5Cu6D41%5Cu5E74; cookie1=AiHKKKelDUvWRpdjtVjie2WqiG9GbZMAsBqLCOkT2H8%3D; enc=%2BhbdZ6vvp5GZYz6EoiYGDpJGNDOmzrBisa4yuSPRAlZ7ldU6FBRni09MHfDOHw%2BsIbxlchkO1d3fvAYynfTNzw%3D%3D; JSESSIONID=B53B6E4271AACD4A0697EA9F3178F9F9; monitor_count=1; mt=ci=88_1; uc1=cookie14=Uoe2yte7RKedug%3D%3D&cookie16=URm48syIJ1yk0MX2J7mAAEhTuw%3D%3D&pas=0&existShop=false&cookie21=UtASsssmeW6lpyd%2BB%2B3t&cookie15=WqG3DMC9VAQiUQ%3D%3D; tfstk=cFIPBsASVuEzkb-QW3tEdRQVBRxRZSXlKmJ6rZa7snwQEI8liRcpnuUJ3pisPUf..; l=eBPZ4LmIvZr8PwmDBOfwourza77OSIRAIuPzaNbMiOCP_85p5e_dW6TU29Y9C3GVh6jJR37vCcawBeYBq61Inxv92j-la_kmn; isg=BL-_Q5PT4wiHOtks72scEePaTpNJpBNG4JERY1GMW261YN_iWXSjlj04pjCeGeu-',
}
for page in range(100):
params = (
('q', goodsName), #对应获取商品的名称,变量
('imgfile', ''),
('commend', 'all'),
('ssid', 's5-e'),
('search_type', 'item'),
('sourceId', 'tb.index'),
('spm', 'a21bo.2017.201856-taobao-item.1'),
('ie', 'utf8'),
('initiative_id', 'tbindexz_20170306'),
('s',str(page*44)), #页数
)
response = requests.get('https://s.taobao.com/search', headers=headers, params=params)
goods = re.findall('"raw_title":"(.*?)"', response.text) #商品
price = re.findall('"view_price":"(.*?)"', response.text) #价格
pointOrigin = re.findall('"item_loc":"(.*?)"', response.text) #发货地
sales = re.findall('"view_sales":"(.*?)人付款"', response.text) #付款人数,销售量
merchant = re.findall('"nick":"(.*?)"', response.text) #店铺
#1页是44条数据,把所有数据全部保存到一个dataDict
#把商家信息和销量存入字典中,就是JAVA中的Map集合
for i in range(44):
#判断字典,也就是Map集合中存在不存在该商家
#如果存在则累加销售数量
salesnumber=0
if merchant[i] in dataDict:
# 修改字典中商家对应的销量值,根据字典的键获取值+新增加的
if '万+' in sales[i][:-3]:
salesnumber = int(sales[i][:-5]) * 10000
dataDict[merchant[i]] =dataDict.get(merchant[i])+salesnumber
elif '+' in sales[i][:-3]:
salesnumber = int(sales[i][:-4])
# 如果不存在,则加入字典中
dataDict[merchant[i]] =dataDict.get(merchant[i])+salesnumber
else:
salesnumber = int(sales[i][:-3])
# 如果不存在,则加入字典中
dataDict[merchant[i]] =dataDict.get(merchant[i])+salesnumber
else:
# 修改字典中商家对应的销量值,根据字典的键获取值+新增加的
if '万+' in sales[i][:-3]:
salesnumber = int(sales[i][:-5]) * 10000
# 如果不存在,则加入字典中
dataDict.setdefault(merchant[i].salesnumber)
elif '+' in sales[i][:-3]:
salesnumber = int(sales[i][:-4])
# 如果不存在,则加入字典中
dataDict.setdefault(merchant[i].salesnumber)
else:
salesnumber = int(sales[i][:-3])
# 如果不存在,则加入字典中
dataDict.setdefault(merchant[i].salesnumber)
# print(f"已爬取完第{page+1}页数据.......")
time.sleep(random.randint(2, 4))
# print(f'总共爬取{page+1}页数据.......')
return dataDict
#统计数据、找出该件商品销售前100的商家
def statisticData(dataDict1):
#把字典按照value从大到小排序
dataDict2= sorted(dataDict.items(),key=lambda x:x[1],reverse=True)
count=0
for key in dataDict2.keys():
if count>10:
break
else:
count = count + 1
with open('demo.txt', 'a') as file_obj:
file_obj.write(key)
file_obj.write(str(dataDict2[key])+"\n")
if __name__ == '__main__':
#输入商品名称
goodsName=input("请输入一个商品名称: ")
#爬取数据,并且放入一个字典中,也就是Map集合中
dataDict1=acquireData(goodsName)
#统计数据,找出该件商品销售前100的商家,保存到文件中
statisticData(dataDict1)
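# Trailing text that appears to come from the web page this script was copied
# from (not code): "python爬虫2" ("Python crawler 2");
# "最新推荐文章于 2022-08-12 20:46:18 发布" (latest recommended article published 2022-08-12 20:46:18).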