写着玩的:爬取淘宝商品信息。


技术不到家,没法自动化爬。

只能一页一页地爬内容。

import urllib.request
from lxml import etree

# Fetch one Taobao search-result page and print the raw response body.
#
# Pagination sketch (disabled): the trailing `s=` query parameter is built
# as (page - 1) * 44 from a 1-based page number.
# page = 1
# base_url = 'https://s.taobao.com/search?initiative_id=tbindexz_20170306&ie=utf8&spm=a21bo.21814703.201856-taobao-item.2&sourceId=tb.index&search_type=item&ssid=s5-e&commend=all&imgfile=&q=%E7%94%B7%E8%A3%852021%E6%96%B0%E6%AC%BE&suggest=history_1&_input_charset=utf-8&wq=&suggest_query=&source=suggest&bcoffset=4&ntoffset=4&p4ppushleft=2%2C48&s='
# url = base_url + str((page-1)*44)

# The single page that is actually requested (hard-coded, `s=396`).
url = 'https://s.taobao.com/search?initiative_id=tbindexz_20170306&ie=utf8&spm=a21bo.21814703.201856-taobao-item.2&sourceId=tb.index&search_type=item&ssid=s5-e&commend=all&imgfile=&q=%E7%94%B7%E8%A3%852021%E6%96%B0%E6%AC%BE&suggest=history_1&_input_charset=utf-8&wq=&suggest_query=&source=suggest&bcoffset=-23&ntoffset=-23&p4ppushleft=2%2C48&s=396'


# Browser-like request headers sent with the search request.
# NOTE(review): the `cookie` value is a hard-coded session cookie. Treat it
# as a credential; presumably it has to be refreshed by hand once it expires,
# and it should not be committed to a shared repository.
head = {
    'accept':' text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language':' zh-CN,zh;q=0.9',
    'cache-control':' max-age=0',
    'cookie':' cna=bqFwGQoUE0cCAXZwSsHATdPy; tracknick=%5Cu5B50%5Cu975E%5Cu9C7C%5Cu4F55%5Cu4EE5%5Cu77E5%5Cu9C7C; thw=cn; enc=3csyiHS08RoAWoycacYbmTdKfmE1LsPf6xOhW8OWER0INiXDvT1ffD1THLf7bzIvIauBGPEs4739%2Fv85HmQlNg%3D%3D; miid=2581005711670701776; _uab_collina=162849209050575801307335; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; _samesite_flag_=true; cookie2=10674bf5876339e11c0e06fbf191e876; t=a4c6c94c7ad4882b7fb55b42ce32217c; v=0; mt=ci%3D-1_1; lgc=%5Cu5B50%5Cu975E%5Cu9C7C%5Cu4F55%5Cu4EE5%5Cu77E5%5Cu9C7C; cancelledSubSites=empty; dnk=%5Cu5B50%5Cu975E%5Cu9C7C%5Cu4F55%5Cu4EE5%5Cu77E5%5Cu9C7C; _header_fixed_=1; sgcookie=E100Uzr12LRBmmIPXmrrho%2Fwlq2vFn3uNxNLlPLgUBQjlfZNjlxYWiN%2FuFGFEhvot8wIx5fFQesvAhmlva%2FjYn1I7ycoeHcfsEP8repPyP9pFG4%3D; uc3=nk2=tMy2OgMB%2FePd%2BxBYe%2BI%3D&lg2=U%2BGCWk%2F75gdr5Q%3D%3D&id2=UU6lT9S9SvObug%3D%3D&vt3=F8dCujaDVxG%2FNJkS85Y%3D; csg=68576094; skt=7bd2b48204a5c2f4; existShop=MTYzMzI0NTE2Mg%3D%3D; uc4=nk4=0%40tiqdeH7CmokvPOHTyluT7J67ouzB8EK9GQ%3D%3D&id4=0%40U2xo%2F%2F4qWJJEusvl7Ce8eweCyOk9; _cc_=V32FPkk%2Fhw%3D%3D; _m_h5_tk=42e95057a8dd55e29d36461d5ca11be4_1633342328627; _m_h5_tk_enc=ae7cd740e5a7a5d225183260c4e3a81b; _tb_token_=e7b805854feeb; uc1=cookie21=Vq8l%2BKCLjhS4UhJVbhgU&existShop=false&pas=0&cookie16=WqG3DMC9UpAPBHGz5QBErFxlCA%3D%3D&cookie14=Uoe3dP4gtXoqyw%3D%3D; l=eB_a0v64j8RVsIN2BO5Z-urza77tWnOflsPzaNbMiIncC6FPTJJ9qG-QcV0l8d-RR8XViGTv4-i7OIwT8ezu-ykjJ0YEae1VivIBCe8C.; isg=BNracjW_b4-6uOKQp69TDs3cK4D8C17l_Pv4J-RC8W-NV3CRwZov9LHlJyNLh9Z9; JSESSIONID=8E02C0558CEE9FCF32F6865BD5360A6D',
    'sec-ch-ua':' "Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
    'sec-ch-ua-mobile':' ?0',
    'sec-ch-ua-platform':' "Windows"',
    'sec-fetch-dest':' document',
    'sec-fetch-mode':' navigate',
    'sec-fetch-site':' same-origin',
    'sec-fetch-user':' ?1',
    'upgrade-insecure-requests':' 1',
    'user-agent':' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'
    }


request = urllib.request.Request(url=url, headers=head)
# Read the body inside a `with` so the HTTP response (and its socket) is
# closed even if reading or decoding raises.
with urllib.request.urlopen(request) as res:
    content = res.read().decode('utf-8')

print(content)



# Disabled HTML-parsing variant: the code below sits inside a string literal,
# so it is never executed. If enabled it would parse `content` with lxml and
# run one XPath query per field (title, image, price, deal count, shop nick,
# location, shop link, item link) against the "item J_MouserOnverReq" nodes.
'''
tree = etree.HTML(content)

# goods_name = tree.xpath('//div[@class="item J_MouserOnverReq"]//div[@class="pic"]/a/img/@alt')
goods_name = tree.xpath('//div[@class="item J_MouserOnverReq"]//div[@class="row row-2 title"]/a')
goods_img = tree.xpath('//div[@class="item J_MouserOnverReq"]//div[@class="pic"]/a/img/@src')
goods_price = tree.xpath('//div[@class="item J_MouserOnverReq"]//div[@class="price g_price g_price-highlight"]/strong/text()')
goods_evaluation = tree.xpath('//div[@class="item J_MouserOnverReq"]//div[@class="deal-cnt"]/text()')
# shop_name = tree.xpath('//div[@class="item J_MouserOnverReq"]//div[@class="shop"]/a/span[2]/text()')
shop_name = tree.xpath('//div[@class="item J_MouserOnverReq"]//div[@class="wangwang"]/span/@data-nick')
shop_address = tree.xpath('//div[@class="item J_MouserOnverReq"]//div[@class="location"]/text()')
shop_url = tree.xpath('//div[@class="item J_MouserOnverReq"]//div[@class="shop"]/a/@href')
# goods_url = tree.xpath('//div[@class="item J_MouserOnverReq"]//div[@class="pic"]/a/@data-href')
goods_url = tree.xpath('//div[@class="item J_MouserOnverReq"]//div[@class="row row-2 title"]/a/@href')
'''

把爬到的内容中的关键部分复制粘贴出来,制作成 JSON 文件。

# Create 10 empty JSON files (taobao1.json .. taobao10.json) that the scraped
# data is pasted into by hand afterwards.
for i in range(1, 11):
    # The path needs the slash after the drive letter: "F:Taobao/..." is
    # resolved relative to the current directory of drive F: on Windows and
    # would not match the 'F:/Taobao/...' location the files are read from.
    # `with` guarantees the handle is closed even if the write fails.
    with open("F:/Taobao/taobao" + str(i) + ".json", "w") as fp:
        fp.write('')

解析JSON文件,生成DataFrame和Excel

import json
import jsonpath
import pandas as pd


# Parse the ten hand-filled JSON files, collect one list per output column,
# build a DataFrame from them and export it to an Excel workbook.


def _extract(obj, path):
    """Return every jsonpath match for *path* in *obj* as a list.

    ``jsonpath.jsonpath`` returns ``False`` rather than an empty list when
    nothing matches; that is normalised to ``[]`` so callers can always
    iterate and concatenate the result.
    """
    return jsonpath.jsonpath(obj, path) or []


def _with_scheme(urls):
    """Return *urls* with protocol-relative entries ('//...') prefixed by 'https:'."""
    return ["https:" + u if u.startswith('//') else u for u in urls]


goods_id = []
goods_name = []
goods_img = []
goods_price = []
view_sales = []
comment_count = []
comment_url = []
goods_url = []
shop_id = []
shop_name = []
shop_address = []
shop_url = []

for num in range(1, 11):
    # Close each input file as soon as it has been parsed.
    with open('F:/Taobao/taobao' + str(num) + '.json', 'r', encoding='utf-8') as fp:
        obj = json.load(fp)

    goods_id += _extract(obj, '$.goods..nid')
    goods_name += _extract(obj, '$.goods..raw_title')
    goods_price += _extract(obj, '$.goods..view_price')
    view_sales += _extract(obj, '$.goods..view_sales')
    comment_count += _extract(obj, '$.goods..comment_count')
    shop_id += _extract(obj, '$.goods..user_id')
    shop_name += _extract(obj, '$.goods..nick')
    shop_address += _extract(obj, '$.goods..item_loc')

    # URL columns: each list is normalised on its own, so a short or missing
    # column can no longer cause an IndexError while fixing another one.
    goods_img += _with_scheme(_extract(obj, '$.goods..pic_url'))
    comment_url += _with_scheme(_extract(obj, '$.goods..comment_url'))
    goods_url += _with_scheme(_extract(obj, '$.goods..detail_url'))
    shop_url += _with_scheme(_extract(obj, '$.goods..shopLink'))


# pandas raises ValueError here if the columns ended up with different lengths.
df = pd.DataFrame({'商品ID': goods_id,
                   '商品名称': goods_name,
                   '商品图片': goods_img,
                   '商品价格': goods_price,
                   '商品详情': goods_url,
                   '销量': view_sales,
                   '评论数': comment_count,
                   '评论详情': comment_url,
                   '店铺ID': shop_id,
                   '店铺名称': shop_name,
                   '店铺地址': shop_address,
                   '店铺详情': shop_url,
                   })

print(df)

# Write the workbook. ExcelWriter is used as a context manager because
# `writer.save()` was removed in pandas 2.0; leaving the block saves and
# closes the file.
with pd.ExcelWriter(r'F:\TaoBao\Taobao.xlsx') as writer:
    df.to_excel(writer, sheet_name='taobao ', index=False)

生成一个新的JSON文件(留着备用)

import json
import jsonpath


# Merge the ten hand-filled JSON files into one compact JSON file
# (Taobao.json) holding a flat list of goods records, kept as a backup.


def _extract(obj, path):
    """Return every jsonpath match for *path* in *obj* as a list.

    ``jsonpath.jsonpath`` returns ``False`` rather than an empty list when
    nothing matches; that is normalised to ``[]`` so callers can always
    iterate and concatenate the result.
    """
    return jsonpath.jsonpath(obj, path) or []


def _with_scheme(urls):
    """Return *urls* with protocol-relative entries ('//...') prefixed by 'https:'."""
    return ["https:" + u if u.startswith('//') else u for u in urls]


goods_id = []
goods_name = []
goods_img = []
goods_price = []
view_sales = []
comment_count = []
comment_url = []
goods_url = []
shop_id = []
shop_name = []
shop_address = []
shop_url = []

for num in range(1, 11):
    # NOTE(review): the original read from 'C:/Taobao/...', while the other
    # snippets in this file create and read the same files under F:/Taobao.
    # Changed to F: on the assumption that C: was a typo; confirm.
    with open('F:/Taobao/taobao' + str(num) + '.json', 'r', encoding='utf-8') as fp:
        obj = json.load(fp)

    goods_id += _extract(obj, '$.goods..nid')
    goods_name += _extract(obj, '$.goods..raw_title')
    goods_price += _extract(obj, '$.goods..view_price')
    view_sales += _extract(obj, '$.goods..view_sales')
    comment_count += _extract(obj, '$.goods..comment_count')
    shop_id += _extract(obj, '$.goods..user_id')
    shop_name += _extract(obj, '$.goods..nick')
    shop_address += _extract(obj, '$.goods..item_loc')

    # URL columns are normalised independently of each other.
    goods_img += _with_scheme(_extract(obj, '$.goods..pic_url'))
    comment_url += _with_scheme(_extract(obj, '$.goods..comment_url'))
    goods_url += _with_scheme(_extract(obj, '$.goods..detail_url'))
    shop_url += _with_scheme(_extract(obj, '$.goods..shopLink'))


# Output key -> collected column, in the key order used for each record.
columns = {
    'goods_id': goods_id,
    'goods_name': goods_name,
    'goods_img': goods_img,
    'goods_price': goods_price,
    'view_sales': view_sales,
    'goods_url': goods_url,
    'comment_count': comment_count,
    'comment_url': comment_url,
    'shop_id': shop_id,
    'shop_name': shop_name,
    'shop_address': shop_address,
    'shop_url': shop_url,
}

# zip() would silently truncate to the shortest column, so fail loudly if the
# columns are not aligned.
if len({len(values) for values in columns.values()}) > 1:
    raise ValueError(
        'column lengths differ: '
        + ', '.join(f'{key}={len(values)}' for key, values in columns.items())
    )

# Build one record per item. The original loop iterated `index` but
# subscripted every list with a stale `i`, so each record was a copy of the
# same item; zipping the columns pairs up the values row by row.
jsontext = {'goods': [dict(zip(columns, row)) for row in zip(*columns.values())]}

# indent/separators only affect the layout of the generated JSON text.
jsondata = json.dumps(jsontext, indent=4, separators=(',', ' : '))

# print(jsondata)

with open('Taobao.json', 'w') as f:
    f.write(jsondata)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值