# 写了玩的。爬淘宝商品信息。
# 技术不到家,没法自动化爬。
# 一页页的爬内容。
import urllib.request
from lxml import etree
# Pagination: per the commented formula below, the `s` query parameter
# advances by 44 items per page.
# page = 1
# base_url = 'https://s.taobao.com/search?initiative_id=tbindexz_20170306&ie=utf8&spm=a21bo.21814703.201856-taobao-item.2&sourceId=tb.index&search_type=item&ssid=s5-e&commend=all&imgfile=&q=%E7%94%B7%E8%A3%852021%E6%96%B0%E6%AC%BE&suggest=history_1&_input_charset=utf-8&wq=&suggest_query=&source=suggest&bcoffset=4&ntoffset=4&p4ppushleft=2%2C48&s='
# url = base_url + str((page-1)*44)
url = 'https://s.taobao.com/search?initiative_id=tbindexz_20170306&ie=utf8&spm=a21bo.21814703.201856-taobao-item.2&sourceId=tb.index&search_type=item&ssid=s5-e&commend=all&imgfile=&q=%E7%94%B7%E8%A3%852021%E6%96%B0%E6%AC%BE&suggest=history_1&_input_charset=utf-8&wq=&suggest_query=&source=suggest&bcoffset=-23&ntoffset=-23&p4ppushleft=2%2C48&s=396'
# Browser-like request headers copied from a real browser session.
# NOTE(review): the cookie is session-bound and will expire — refresh it
# from DevTools before running.
head = {
'accept':' text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-language':' zh-CN,zh;q=0.9',
'cache-control':' max-age=0',
'cookie':' cna=bqFwGQoUE0cCAXZwSsHATdPy; tracknick=%5Cu5B50%5Cu975E%5Cu9C7C%5Cu4F55%5Cu4EE5%5Cu77E5%5Cu9C7C; thw=cn; enc=3csyiHS08RoAWoycacYbmTdKfmE1LsPf6xOhW8OWER0INiXDvT1ffD1THLf7bzIvIauBGPEs4739%2Fv85HmQlNg%3D%3D; miid=2581005711670701776; _uab_collina=162849209050575801307335; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; _samesite_flag_=true; cookie2=10674bf5876339e11c0e06fbf191e876; t=a4c6c94c7ad4882b7fb55b42ce32217c; v=0; mt=ci%3D-1_1; lgc=%5Cu5B50%5Cu975E%5Cu9C7C%5Cu4F55%5Cu4EE5%5Cu77E5%5Cu9C7C; cancelledSubSites=empty; dnk=%5Cu5B50%5Cu975E%5Cu9C7C%5Cu4F55%5Cu4EE5%5Cu77E5%5Cu9C7C; _header_fixed_=1; sgcookie=E100Uzr12LRBmmIPXmrrho%2Fwlq2vFn3uNxNLlPLgUBQjlfZNjlxYWiN%2FuFGFEhvot8wIx5fFQesvAhmlva%2FjYn1I7ycoeHcfsEP8repPyP9pFG4%3D; uc3=nk2=tMy2OgMB%2FePd%2BxBYe%2BI%3D&lg2=U%2BGCWk%2F75gdr5Q%3D%3D&id2=UU6lT9S9SvObug%3D%3D&vt3=F8dCujaDVxG%2FNJkS85Y%3D; csg=68576094; skt=7bd2b48204a5c2f4; existShop=MTYzMzI0NTE2Mg%3D%3D; uc4=nk4=0%40tiqdeH7CmokvPOHTyluT7J67ouzB8EK9GQ%3D%3D&id4=0%40U2xo%2F%2F4qWJJEusvl7Ce8eweCyOk9; _cc_=V32FPkk%2Fhw%3D%3D; _m_h5_tk=42e95057a8dd55e29d36461d5ca11be4_1633342328627; _m_h5_tk_enc=ae7cd740e5a7a5d225183260c4e3a81b; _tb_token_=e7b805854feeb; uc1=cookie21=Vq8l%2BKCLjhS4UhJVbhgU&existShop=false&pas=0&cookie16=WqG3DMC9UpAPBHGz5QBErFxlCA%3D%3D&cookie14=Uoe3dP4gtXoqyw%3D%3D; l=eB_a0v64j8RVsIN2BO5Z-urza77tWnOflsPzaNbMiIncC6FPTJJ9qG-QcV0l8d-RR8XViGTv4-i7OIwT8ezu-ykjJ0YEae1VivIBCe8C.; isg=BNracjW_b4-6uOKQp69TDs3cK4D8C17l_Pv4J-RC8W-NV3CRwZov9LHlJyNLh9Z9; JSESSIONID=8E02C0558CEE9FCF32F6865BD5360A6D',
'sec-ch-ua':' "Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
'sec-ch-ua-mobile':' ?0',
'sec-ch-ua-platform':' "Windows"',
'sec-fetch-dest':' document',
'sec-fetch-mode':' navigate',
'sec-fetch-site':' same-origin',
'sec-fetch-user':' ?1',
'upgrade-insecure-requests':' 1',
'user-agent':' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=head)
# Context manager releases the HTTP connection even if read()/decode()
# raises (the original never closed the response object).
with urllib.request.urlopen(request) as res:
    content = res.read().decode('utf-8')
print(content)
# Disabled draft of an lxml/xpath extraction, kept as a no-op string
# literal for reference; the data was copy-pasted into JSON files instead.
'''
tree = etree.HTML(content)
# goods_name = tree.xpath('//div[@class="item J_MouserOnverReq"]//div[@class="pic"]/a/img/@alt')
goods_name = tree.xpath('//div[@class="item J_MouserOnverReq"]//div[@class="row row-2 title"]/a')
goods_img = tree.xpath('//div[@class="item J_MouserOnverReq"]//div[@class="pic"]/a/img/@src')
goods_price = tree.xpath('//div[@class="item J_MouserOnverReq"]//div[@class="price g_price g_price-highlight"]/strong/text()')
goods_evaluation = tree.xpath('//div[@class="item J_MouserOnverReq"]//div[@class="deal-cnt"]/text()')
# shop_name = tree.xpath('//div[@class="item J_MouserOnverReq"]//div[@class="shop"]/a/span[2]/text()')
shop_name = tree.xpath('//div[@class="item J_MouserOnverReq"]//div[@class="wangwang"]/span/@data-nick')
shop_address = tree.xpath('//div[@class="item J_MouserOnverReq"]//div[@class="location"]/text()')
shop_url = tree.xpath('//div[@class="item J_MouserOnverReq"]//div[@class="shop"]/a/@href')
# goods_url = tree.xpath('//div[@class="item J_MouserOnverReq"]//div[@class="pic"]/a/@data-href')
goods_url = tree.xpath('//div[@class="item J_MouserOnverReq"]//div[@class="row row-2 title"]/a/@href')
'''
# 爬到的内容。复制粘贴关键内容,制作成JSON文件。
# Create 10 empty JSON files to paste the captured search-result JSON into.
for i in range(1, 11):
    # Bug fix: the original path "F:Taobao/..." lacked the slash after the
    # drive letter (a drive-relative path on Windows); the loader reads
    # from "F:/Taobao/...", so create the files there.  `with` guarantees
    # each handle is closed.
    with open("F:/Taobao/taobao" + str(i) + ".json", "w") as fp:
        fp.write('')
# 解析JSON文件,生成DataFrame和Excel。
import json
import jsonpath
import pandas as pd


def _to_https(urls):
    """Prefix protocol-relative URLs ('//...') with 'https:' in place and
    return the same list."""
    for k in range(len(urls)):
        if urls[k].startswith('//'):
            urls[k] = "https:" + urls[k]
    return urls


# Accumulators for all 10 pasted result pages.
goods_id = []
goods_name = []
goods_img = []
goods_price = []
view_sales = []
comment_count = []
comment_url = []
goods_url = []
shop_id = []
shop_name = []
shop_address = []
shop_url = []
for num in range(1, 11):
    # `with` closes the handle (the original `json.load(open(...))` leaked it).
    with open('F:/Taobao/taobao' + str(num) + '.json', 'r', encoding='utf-8') as fh:
        obj = json.load(fh)
    # jsonpath.jsonpath returns False (not []) when nothing matches, which
    # would break the list concatenation below — fall back to [].
    tmp_goods_id = jsonpath.jsonpath(obj, '$.goods..nid') or []
    tmp_goods_name = jsonpath.jsonpath(obj, '$.goods..raw_title') or []
    tmp_goods_img = jsonpath.jsonpath(obj, '$.goods..pic_url') or []
    tmp_goods_price = jsonpath.jsonpath(obj, '$.goods..view_price') or []
    tmp_view_sales = jsonpath.jsonpath(obj, '$.goods..view_sales') or []
    tmp_comment_count = jsonpath.jsonpath(obj, '$.goods..comment_count') or []
    tmp_comment_url = jsonpath.jsonpath(obj, '$.goods..comment_url') or []
    tmp_goods_url = jsonpath.jsonpath(obj, '$.goods..detail_url') or []
    tmp_shop_id = jsonpath.jsonpath(obj, '$.goods..user_id') or []
    tmp_shop_name = jsonpath.jsonpath(obj, '$.goods..nick') or []
    tmp_shop_address = jsonpath.jsonpath(obj, '$.goods..item_loc') or []
    tmp_shop_url = jsonpath.jsonpath(obj, '$.goods..shopLink') or []
    # Taobao serves these as protocol-relative URLs; make them absolute.
    _to_https(tmp_goods_img)
    _to_https(tmp_comment_url)
    _to_https(tmp_goods_url)
    _to_https(tmp_shop_url)
    goods_id += tmp_goods_id
    goods_name += tmp_goods_name
    goods_img += tmp_goods_img
    goods_price += tmp_goods_price
    view_sales += tmp_view_sales
    comment_count += tmp_comment_count
    comment_url += tmp_comment_url
    goods_url += tmp_goods_url
    shop_id += tmp_shop_id
    shop_name += tmp_shop_name
    shop_address += tmp_shop_address
    shop_url += tmp_shop_url
df = pd.DataFrame({'商品ID':goods_id,
                   '商品名称':goods_name,
                   '商品图片':goods_img,
                   '商品价格':goods_price,
                   '商品详情':goods_url,
                   '销量':view_sales,
                   '评论数':comment_count,
                   '评论详情':comment_url,
                   '店铺ID':shop_id,
                   '店铺名称':shop_name,
                   '店铺地址':shop_address,
                   '店铺详情':shop_url
                   })
print(df)
# Write to Excel.  The context manager saves and closes the workbook;
# writer.save() was deprecated and then removed in modern pandas.
with pd.ExcelWriter(r'F:\TaoBao\Taobao.xlsx') as writer:
    df.to_excel(writer, sheet_name='taobao ', index=False)
# 生成一个新的JSON文件(留着备用)。
import json
import jsonpath


def _abs_https(urls):
    """Prefix protocol-relative URLs ('//...') with 'https:' in place and
    return the same list."""
    for k in range(len(urls)):
        if urls[k].startswith('//'):
            urls[k] = "https:" + urls[k]
    return urls


# Accumulators for all 10 pasted result pages.
goods_id = []
goods_name = []
goods_img = []
goods_price = []
view_sales = []
comment_count = []
comment_url = []
goods_url = []
shop_id = []
shop_name = []
shop_address = []
shop_url = []
for num in range(1, 11):
    # NOTE(review): this section reads from C:/Taobao while the earlier one
    # used F:/Taobao — presumably the files were moved; confirm the drive.
    # `with` closes the handle (json.load(open(...)) leaked it).
    with open('C:/Taobao/taobao' + str(num) + '.json', 'r', encoding='utf-8') as fh:
        obj = json.load(fh)
    # jsonpath.jsonpath returns False (not []) on no match; fall back to [].
    tmp_goods_id = jsonpath.jsonpath(obj, '$.goods..nid') or []
    tmp_goods_name = jsonpath.jsonpath(obj, '$.goods..raw_title') or []
    tmp_goods_img = jsonpath.jsonpath(obj, '$.goods..pic_url') or []
    tmp_goods_price = jsonpath.jsonpath(obj, '$.goods..view_price') or []
    tmp_view_sales = jsonpath.jsonpath(obj, '$.goods..view_sales') or []
    tmp_comment_count = jsonpath.jsonpath(obj, '$.goods..comment_count') or []
    tmp_comment_url = jsonpath.jsonpath(obj, '$.goods..comment_url') or []
    tmp_goods_url = jsonpath.jsonpath(obj, '$.goods..detail_url') or []
    tmp_shop_id = jsonpath.jsonpath(obj, '$.goods..user_id') or []
    tmp_shop_name = jsonpath.jsonpath(obj, '$.goods..nick') or []
    tmp_shop_address = jsonpath.jsonpath(obj, '$.goods..item_loc') or []
    tmp_shop_url = jsonpath.jsonpath(obj, '$.goods..shopLink') or []
    # Taobao serves these as protocol-relative URLs; make them absolute.
    _abs_https(tmp_goods_img)
    _abs_https(tmp_comment_url)
    _abs_https(tmp_goods_url)
    _abs_https(tmp_shop_url)
    goods_id += tmp_goods_id
    goods_name += tmp_goods_name
    goods_img += tmp_goods_img
    goods_price += tmp_goods_price
    view_sales += tmp_view_sales
    comment_count += tmp_comment_count
    comment_url += tmp_comment_url
    goods_url += tmp_goods_url
    shop_id += tmp_shop_id
    shop_name += tmp_shop_name
    shop_address += tmp_shop_address
    shop_url += tmp_shop_url
# Merge every record into one JSON document.
jsontext = {'goods': []}
# Bug fix: the original loop iterated `index` but built each dict from the
# stale loop variable `i` left over from the URL-fixing loop above, so every
# entry repeated the same record.  Use the loop variable itself.
for index in range(len(goods_name)):
    jsontext['goods'].append({'goods_id':goods_id[index],
                              'goods_name':goods_name[index],
                              'goods_img':goods_img[index],
                              'goods_price':goods_price[index],
                              'view_sales':view_sales[index],
                              'goods_url':goods_url[index],
                              'comment_count':comment_count[index],
                              'comment_url':comment_url[index],
                              'shop_id':shop_id[index],
                              'shop_name':shop_name[index],
                              'shop_address':shop_address[index],
                              'shop_url':shop_url[index],
                              })
# indent/separators only affect formatting of the output file.
jsondata = json.dumps(jsontext, indent=4, separators=(',', ' : '))
# print(jsondata)
# `with` guarantees the output file is flushed and closed.
with open('Taobao.json', 'w') as f:
    f.write(jsondata)