# Python crawler 2

#
import csv
import random
import re
import time

import requests

# Module-level accumulator: merchant nick -> total units sold across all pages.
dataDict = {}

# The search offset advances by this many items per page (the original
# source notes that one result page holds 44 items).
_PAGE_SIZE = 44

# Patterns applied to the raw search-page text.
_MERCHANT_RE = re.compile(r'"nick":"(.*?)"')
_SALES_RE = re.compile(r'"view_sales":"(.*?)人付款"')


def _parse_sales(label):
    """Convert a captured sales label ('23', '1000+', '1.5万+') to an int.

    The label is the text captured *before* '人付款', so only an optional
    trailing '+' and an optional '万' (x10000) unit have to be removed.
    Labels that cannot be parsed count as 0 so one odd entry does not abort
    the whole crawl.
    """
    text = label.strip().rstrip('+')
    multiplier = 1
    if text.endswith('万'):
        multiplier = 10000
        text = text[:-1]
    try:
        # round() guards against float artefacts such as 4.35 * 10000.
        return int(round(float(text) * multiplier))
    except ValueError:
        return 0


def _fetch_search_page(goodsName, page):
    """Download one search result page and return its text.

    `page` is the zero-based page index; it is converted to the item offset
    expected by the `s` query parameter.
    """
    # NOTE(review): the cookie below is a captured browser session embedded
    # in source; presumably it expires and has to be refreshed -- consider
    # loading it from configuration instead of hard-coding it.
    headers = {
        'authority': 's.taobao.com',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
        'referer': 'https://s.taobao.com/search?q=%E5%B9%B3%E6%9D%BF&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': 'miid=886571451609503279; thw=cn; tracknick=%5Cu6F20%5Cu4E36%5Cu6D41%5Cu5E74; _cc_=UIHiLt3xSw%3D%3D; cna=XPY/Gcr64gICAd9oPwK6cZxh; __guid=154677242.610519076024746100.1624545374981.7412; hng=CN%7Czh-CN%7CCNY%7C156; _m_h5_tk=2c5f0fe716a02efbd2f0d1edc6bc542d_1627305813161; _m_h5_tk_enc=dd9c3694c5024570f06e00e5f4d17631; xlly_s=1; _samesite_flag_=true; cookie2=18f9e16f49510185a1b9646a8b714fe3; t=438d163c95833f6c8d65bb2403c13f3f; _tb_token_=ee55ebef1a838; sgcookie=E1004c16ExrCAD5x5j7kF8p4hlGs4f%2B1NcdPUrpCS4ycfh2OEZL0cUdvMGO8AtaHwNsybpZF4YFvqxGzT0M%2B337f%2Fw%3D%3D; unb=2747656955; uc3=nk2=p2bODe0%2FwE4%3D&vt3=F8dCuwJMTt25WuxtoX4%3D&lg2=V32FPkk%2Fw0dUvg%3D%3D&id2=UU8M9asjFAHX6A%3D%3D; csg=52a61766; lgc=%5Cu6F20%5Cu4E36%5Cu6D41%5Cu5E74; cancelledSubSites=empty; cookie17=UU8M9asjFAHX6A%3D%3D; dnk=%5Cu6F20%5Cu4E36%5Cu6D41%5Cu5E74; skt=d16ddf255b675ee3; existShop=MTYyNzI5ODMxOA%3D%3D; uc4=id4=0%40U22LO6DdRA6fglmTmrOjRlQC%2BSH7&nk4=0%40pVB%2Bf9qqQIYv2F%2BQgBgw%2FAOEuA%3D%3D; _l_g_=Ug%3D%3D; sg=%E5%B9%B454; _nk_=%5Cu6F20%5Cu4E36%5Cu6D41%5Cu5E74; cookie1=AiHKKKelDUvWRpdjtVjie2WqiG9GbZMAsBqLCOkT2H8%3D; enc=%2BhbdZ6vvp5GZYz6EoiYGDpJGNDOmzrBisa4yuSPRAlZ7ldU6FBRni09MHfDOHw%2BsIbxlchkO1d3fvAYynfTNzw%3D%3D; JSESSIONID=B53B6E4271AACD4A0697EA9F3178F9F9; monitor_count=1; mt=ci=88_1; uc1=cookie14=Uoe2yte7RKedug%3D%3D&cookie16=URm48syIJ1yk0MX2J7mAAEhTuw%3D%3D&pas=0&existShop=false&cookie21=UtASsssmeW6lpyd%2BB%2B3t&cookie15=WqG3DMC9VAQiUQ%3D%3D; tfstk=cFIPBsASVuEzkb-QW3tEdRQVBRxRZSXlKmJ6rZa7snwQEI8liRcpnuUJ3pisPUf..; l=eBPZ4LmIvZr8PwmDBOfwourza77OSIRAIuPzaNbMiOCP_85p5e_dW6TU29Y9C3GVh6jJR37vCcawBeYBq61Inxv92j-la_kmn; isg=BL-_Q5PT4wiHOtks72scEePaTpNJpBNG4JERY1GMW261YN_iWXSjlj04pjCeGeu-',
    }
    params = (
        ('q', goodsName),  # search keyword supplied by the caller
        ('imgfile', ''),
        ('commend', 'all'),
        ('ssid', 's5-e'),
        ('search_type', 'item'),
        ('sourceId', 'tb.index'),
        ('spm', 'a21bo.2017.201856-taobao-item.1'),
        ('ie', 'utf8'),
        ('initiative_id', 'tbindexz_20170306'),
        ('s', str(page * _PAGE_SIZE)),  # item offset of the requested page
    )
    # A timeout keeps a stalled connection from hanging the crawl forever.
    response = requests.get(
        'https://s.taobao.com/search',
        headers=headers,
        params=params,
        timeout=10,
    )
    return response.text


def acquireData(goodsName, pages=100, fetch=None, delay=(2, 4)):
    """Crawl search result pages and total the sales of each merchant.

    Args:
        goodsName: Product keyword to search for.
        pages: Maximum number of result pages to request (default 100).
        fetch: Optional callable ``fetch(goodsName, page_index) -> str``
            returning the raw page text. Defaults to an HTTP request against
            ``https://s.taobao.com/search``.
        delay: ``(low, high)`` bounds, in whole seconds, of the random pause
            taken after each page; pass ``None`` to disable the pause.

    Returns:
        The module-level ``dataDict`` mapping merchant nick to accumulated
        sales. Results are added to whatever the dict already holds.
    """
    if fetch is None:
        fetch = _fetch_search_page

    for page in range(pages):
        text = fetch(goodsName, page)
        merchants = _MERCHANT_RE.findall(text)
        sales = _SALES_RE.findall(text)

        # No merchants at all means there is nothing left to read (or the
        # response was not a result page), so further requests are pointless.
        if not merchants:
            break

        # zip() stops at the shorter list, so a short last page no longer
        # raises IndexError.
        # NOTE(review): pairing by position assumes every item carries both
        # fields in the page text -- confirm against a live response.
        for merchant, label in zip(merchants, sales):
            dataDict[merchant] = dataDict.get(merchant, 0) + _parse_sales(label)

        if delay:
            time.sleep(random.randint(*delay))
    return dataDict

# Rank merchants by sales and save the best sellers of the product to a file.
def statisticData(dataDict1, topN=100, filename='demo.txt'):
    """Append the top-selling merchants to a text file.

    Args:
        dataDict1: Mapping of merchant name to total sales.
        topN: Maximum number of merchants to write (default 100).
        filename: Output path; opened in append mode, so earlier content
            is kept.

    Each written line has the form ``<merchant>\\t<sales>``, ordered from the
    highest to the lowest sales figure.
    """
    # sorted() returns a list of (merchant, sales) tuples, highest sales first.
    ranked = sorted(dataDict1.items(), key=lambda item: item[1], reverse=True)
    lines = [f"{merchant}\t{total}\n" for merchant, total in ranked[:topN]]
    # Open once for the whole batch; an explicit encoding keeps non-ASCII
    # merchant names writable regardless of the platform default.
    with open(filename, 'a', encoding='utf-8') as file_obj:
        file_obj.writelines(lines)
if __name__ == '__main__':
    # Script entry point: prompt for a product name, crawl the per-merchant
    # sales totals for it, then pass them to the ranking/report step.
    keyword = input("请输入一个商品名称: ")
    statisticData(acquireData(keyword))

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值