"""Taobao search-results scraper — notes:

- Key point: Taobao requires a login, so the request headers must carry a
  captured Cookie.
- The page data comes back embedded as JSON; an online JSON viewer helps to
  inspect its structure, and this file also includes two helper functions
  that print the key tree.
"""
import requests
import bs4
import re
import json
# json.loads() parses a JSON-formatted string into Python data structures.
# json.dumps() serializes Python data structures into a JSON string.
# Open the search-results URL
def open_url(keyword):
    """Fetch the first Taobao search-results page for *keyword*.

    The search is sorted by sales ("sale-desc") and restricted to the Tmall
    tab. Taobao requires a logged-in session, so a captured Cookie header is
    sent together with a desktop-browser User-Agent. Returns the
    requests.Response object for the listing page.
    """
    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
    # Session cookie captured from a logged-in browser; without it Taobao
    # redirects to the login page instead of returning results.
    cookie = "cna=ecr8FaGj4BkCAXQBA/ISif7h; t=74708015a4c8f195ffecf6910094518a; cookie2=1ba81d62075785c1aba0fef97e44a639; v=0; _tb_token_=ee75efeb6a6f3; _samesite_flag_=true; sgcookie=Ee1UjSL5Xy%2FoGCIy2YTej; unb=2567975303; uc3=nk2=BM4UqWtIwrLmo%2FTS&id2=UU20srK2IG68%2FA%3D%3D&lg2=Vq8l%2BKCLz3%2F65A%3D%3D&vt3=F8dBxGXFryZCZN77wes%3D; csg=ded34206; lgc=g812cm%5Cu7684%5Cu73AE%5Cu54E5; cookie17=UU20srK2IG68%2FA%3D%3D; dnk=g812cm%5Cu7684%5Cu73AE%5Cu54E5; skt=ff3aa2e9fdb99acd; existShop=MTU4OTM3NDg0Ng%3D%3D; uc4=id4=0%40U2%2Fz993QMiMBcqer33PZH8r8U2VT&nk4=0%40BsogzKLUdUAKNt4RccUT3lfRsxtfiYY%3D; tracknick=g812cm%5Cu7684%5Cu73AE%5Cu54E5; _cc_=WqG3DMC9EA%3D%3D; _l_g_=Ug%3D%3D; sg=%E5%93%A535; _nk_=g812cm%5Cu7684%5Cu73AE%5Cu54E5; cookie1=BxY5GoxuA9R6Jz%2FjbN3SW2nHhWhRwOZ7xqnSsqpvp6E%3D; enc=ueaGTkz%2FLBTfJlmU57xXHLpBRvG8gMuUQ1vbsr%2FC7%2BznvJM9wz9CcNW9oZJziPT5aGuke9p6l6uOqtAPluKTkg%3D%3D; tfstk=c8UOBww_q9XiuqNZ4rIncloQSHClZc_qddMXkszqEo5AFo8Ainzuyu4R5jvtJ6C..; hng=CN%7Czh-CN%7CCNY%7C156; thw=cn; mt=ci=112_1; uc1=cookie14=UoTUM2M25mx%2F5g%3D%3D&cookie16=UtASsssmPlP%2Ff1IHDsDaPRu%2BPw%3D%3D&existShop=false&cookie21=UtASsssmeW6lpyd%2BB%2B3t&cookie15=W5iHLLyFOGW7aA%3D%3D&pas=0; JSESSIONID=31174B96F0394FF0592B8156FBA4E94D; l=eBEwguePQlV4qScBBOfwPurza77OSIRAguPzaNbMiT5P9Hfp5khhWZbg1u89C3GVh6D9R3ykIQI_BeYBqIv4n5U62j-la_kmn; isg=BObmTB7XhMrekFBu76YpHChkN1xoxyqB1oftY9CP0onkU4ZtOFd6kcwhq09feyKZ"
    query = {'q': keyword, 'sort': 'sale-desc', 'tab': 'mall'}
    return requests.get(
        "https://s.taobao.com/search",
        params=query,
        headers={"User-Agent": user_agent, "Cookie": cookie},
    )
# Extract every product entry from the listing page
def get_items(res):
    """Extract the product list from a Taobao search-results response.

    The page embeds its data as a JavaScript assignment
    ``g_page_config = {...};`` on a single line — pull that JSON out and
    keep only the fields we care about for each auction entry.

    Returns a list of dicts with keys: nid, title, detail_url, view_price,
    view_sales, nick.
    """
    embedded = re.search(r"g_page_config = (.*?);\n", res.text)
    page_config = json.loads(embedded.group(1))
    auctions = page_config['mods']['itemlist']['data']['auctions']
    wanted = ('nid', 'title', 'detail_url', 'view_price', 'view_sales', 'nick')
    return [{field: auction[field] for field in wanted} for auction in auctions]
# Sum the sales figures of the matching (Huawei-titled) products on this page
def count_sales(items, keyword='华为'):
    """Sum the sales counts of items whose title contains *keyword*.

    Args:
        items: list of item dicts as produced by get_items(); each dict has
            'title' and 'view_sales' (free text such as "1200人付款") keys.
        keyword: substring the title must contain for the item to be
            counted. Defaults to '华为' (Huawei), preserving the original
            hard-coded behavior.

    Returns:
        int: total number of sales across the matching items.
    """
    count = 0
    for each in items:
        if keyword not in each['title']:
            continue
        # view_sales is free text; pull the leading run of digits. Guard
        # against listings with no numeric sales figure (the original code
        # raised AttributeError on re.search(...) returning None).
        match = re.search(r'\d+', each['view_sales'])
        if match:
            count += int(match.group())
    return count
def svae_to_text(res):
    """Dump the raw HTML of *res* to taobao.txt for offline inspection.

    NOTE(review): the name is a typo of "save_to_text"; kept as-is so
    existing callers (see main) keep working.
    """
    with open("taobao.txt", "w", encoding="utf-8") as out:
        out.write(res.text)
# Helpers for drawing a tree structure of nested JSON keys
def get_space_end(level):
    """Return the tree-drawing prefix for a leaf node: *level* spaces then '-'."""
    return f"{' ' * level}-"
def get_space_expand(level):
    """Return the tree-drawing prefix for an expandable node: *level* spaces then '+'."""
    return f"{' ' * level}+"
# Recursively walk and print every key of a nested JSON object
def find_keys(targets, level):
    """Recursively print the key tree of the nested dict *targets*.

    Keys whose value is another dict are printed with a '+' prefix and
    expanded one level deeper; all other keys are leaves and get a '-'
    prefix. *level* controls the indentation depth.
    """
    for key, value in targets.items():
        # Exact dict check (not isinstance) preserved from the original.
        if type(value) is dict:
            print(get_space_expand(level) + key)
            find_keys(value, level + 1)
        else:
            print(get_space_end(level) + key)
# From the previously downloaded page, save the interesting part to its own
# file and recursively call find_keys to print the tree structure
def read_g_page_config_from_text():
    """Re-parse a previously saved taobao.txt page dump.

    Extracts the embedded ``g_page_config = {...};`` JSON, writes the raw
    JSON to g_page_config.txt, and prints its key tree via find_keys().
    Debug helper; assumes svae_to_text() was run first.
    """
    with open("taobao.txt", "r", encoding="utf-8") as src:
        page_text = src.read()
    embedded = re.search(r"g_page_config = (.*?);\n", page_text)
    print(embedded)
    with open("g_page_config.txt", "w", encoding="utf-8") as dst:
        dst.write(embedded.group(1))
    find_keys(json.loads(embedded.group(1)), 1)
# Counts Huawei laptop sales on the FIRST page of a "笔记本电脑" search only.
# To cover every page, vary the page parameter in the URL and loop the fetch.
def main():
    """Fetch page one of a Taobao search for "笔记本电脑" (laptops), print
    the extracted items and the combined sales count of Huawei listings."""
    # keyword = input("请输入搜索关键词:")   # interactive alternative
    keyword = "笔记本电脑"
    response = open_url(keyword)
    # svae_to_text(response)             # uncomment to dump the raw HTML
    # read_g_page_config_from_text()     # uncomment to inspect the JSON keys
    listing = get_items(response)
    print(listing)
    grand_total = 0
    grand_total += count_sales(listing)
    print(grand_total)
# Standard script entry point: run main() only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    main()