爬取tb100页商品信息完整代码

淘宝网站在升级,所以如果要爬取多页商品信息,要绕开反爬机制,下面提供在上一期基础上的翻页操作。先看我的上一篇文章。

1.打开淘宝网站,登录并搜索商品信息。这一次我们在浏览器开发者工具的 Network(网络)面板里找到名称以 search? 开头的请求,右击并复制为 cURL 命令。

2.在在线curl命令转代码网站将复制过来的东西转换成python代码后,运行得到response。

import requests

# Session cookies sent with every request in this script (passed as
# `cookies=` to requests.get).
# The `_m_h5_tk` entry has the form "<token>_<digits>"; the part before the
# underscore is the token used when signing the h5api requests later on.
# NOTE(review): these look like values copied from a logged-in browser
# session — presumably they must be replaced with your own and should not be
# committed to version control; confirm before reuse.
cookies = {
    'miid': '141439373371660658',
    't': 'aa8febcd45b715e1dccbe4f13510772a',
    'cna': 'bGsBGuHwHmQCAXju+IJgl+YX',
    'lgc': 'tb9330232098',
    'tracknick': 'tb9330232098',
    'thw': 'cn',
    'tk_trace': 'oTRxOWSBNwn9evfHtXQifmafmYzSU80EW6dedixskcJ%2Bc8J0R27pVpwR009bxUw3pJYMDEOjj9VTZncJdZFFzzCuQZlxUrfnSMxQAzTdsNfSmKKyqWcqKPn9v89RJSQ0oG%2FogMwgfvZEKclhcLXWhjzh3JBkQjA%2FL%2FOJkbI0oZ5jqzn2VG%2BKJci6PiuNfv4VwjmPxjt4W%2F83B9FUm3W%2FUoJxmXPRtc3s4xkDIrIG7gqxqJXEReV7jHT%2FgHjGUbuPSdePWpHou6KT1yXAyBnPwkfkI02byxjELdw9%2FCnKL7Iq4zadeWIERo47JW2ATJ2%2BGwPsEvJ48c%2Bk77bgKLvHB80SCgzUiEahg5BC9UQup1n9buUdjJ6NTEX8h6oM2W02ynYh5%2B3JyqGl0epQEXidvXp1hWDRmM5RtzB27%2FxcxSF84Q%3D%3D',
    'cookie2': '1d92c1a5ff6fdef92705799a5af7735b',
    '_tb_token_': '635e35bd9098',
    'mt': 'ci=-1_0',
    '_samesite_flag_': 'true',
    '3PcFlag': '1708143394509',
    'sgcookie': 'E100gH1TeVwBo1xKjL1pQrgJh%2Fw71O6qROkVBu0vqL5SWD%2BjBGqAkWHco8sWiWQQ7oaW%2BQQCV64b7dl3L8RWeoWhvZjGGN5IiG4%2Bw1ZXVqi51LkJSAY%2FZy4RpgVBtidp%2FJmF',
    'unb': '2212187893946',
    'uc1': 'existShop=false&cookie14=UoYenby%2B65DgNA%3D%3D&cookie15=URm48syIIVrSKA%3D%3D&cookie16=VT5L2FSpNgq6fDudInPRgavC%2BQ%3D%3D&pas=0&cookie21=W5iHLLyFe3xm',
    'uc3': 'lg2=V32FPkk%2Fw0dUvg%3D%3D&id2=UUpgRKyMGLkMj4yM%2Bg%3D%3D&nk2=F5RMGoevINQpxZ1H&vt3=F8dD3eu2Nt1EHpHuHKw%3D',
    'csg': '7c4e6fb4',
    'cancelledSubSites': 'empty',
    'cookie17': 'UUpgRKyMGLkMj4yM%2Bg%3D%3D',
    'dnk': 'tb9330232098',
    'skt': 'a559ddde015e7a2d',
    'existShop': 'MTcwODE0MzQ0OQ%3D%3D',
    'uc4': 'nk4=0%40FY4HX7UsnkJMHcfV%2FvxuM4Mhvok%2BNjE%3D&id4=0%40U2gqy133ULUMfb1HFmsUv%2FAJNYiWzPUf',
    '_cc_': 'URm48syIZQ%3D%3D',
    '_l_g_': 'Ug%3D%3D',
    'sg': '86a',
    '_nk_': 'tb9330232098',
    'cookie1': 'BYS%2B9xq28PSJVkxozh5affcpHv1mWy0wrw92uFs5TBg%3D',
    'JSESSIONID': 'C27CE0B0B7D5D995E6B227AE936C3DEB',
    'mtop_partitioned_detect': '1',
    # "<token>_<digits>": the prefix before "_" is the signing token.
    '_m_h5_tk': 'dda9dfe4b79bfd4871dda7dadc5863fe_1708163763078',
    '_m_h5_tk_enc': 'e333c6602045e023701a7ff3537717be',
    'ariaDefaultTheme': 'undefined',
    'isg': 'BDo6VM3LC2U7hof1pESeYVo9i2Bc677FxC_ChkQz5k2aN9pxLHsO1QBFg8XrpzZd',
}

# HTTP request headers sent with every request in this script (passed as
# `headers=` to requests.get). The user-agent and sec-ch-ua-* values identify
# the client as a mobile (Android) Edge/Chromium 121 browser.
headers = {
    'authority': 's.taobao.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'cache-control': 'max-age=0',
    # Requests sorts cookies= alphabetically
    # The raw 'cookie' header is intentionally not set here: the same values
    # are supplied through the `cookies` dict passed as `cookies=`.
    'referer': 'https://s.taobao.com/search/_____tmd_____/page/login_jump?rand=S3WxGHAgAt756EpznwfNzJq2AFA2qBNla3j6EINUS8We9dazM_iKElp8DwVSHZUevpC41Bx7RzivXIj9RnZgdg&_lgt_=1d1d0890ba2a99ebfe1b9f4d4d4047e1___215918___3c988e0606ae7125ee3e7e8dab056462___837b211a0c5c4d0311617da5fff37e257cad377890b4c45ccdf2cb0fd662949d0f671beae3225249f15a6f3a3482050da4620a65df4ca2ccd360f91c978d4836ba9e1bc97f6cf2099614489ebd0fe94736bcd2c97d15a129973c9575dead2965f30d05b945a6f833c995602b2cac735445c409a1bd8d83f0f1917a93a9ddc72bcf69b43df9d2bf64d2ed7dd91894c71ce678fa7bb3dd0a2e4529713eb2555f8d4947a01f65db828f5a70cc62d4f5a53cdfb517b26ed9411a62cb8c19ef2f4eabde074f7e5bbf59d6a639a8c8a4c5f6154df0199ae994081ac1d8faddf08af40b41b7153f55b897f62a57012762356fdf81a611b129f005d4ae34c0e06c92f0d0c86215672b4bed54f4d7bc9245a9e24060a640bd976e6f25f40bb71668af7e5185dd4c421de90d395ef5e1bc45f7882f4365be949bbfb85c2f3719bbec315ef960124a5246e6f08bac90304cc07dcf9d',
    'sec-ch-ua': '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
    'sec-ch-ua-mobile': '?1',
    'sec-ch-ua-platform': '"Android"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Mobile Safari/537.36 Edg/121.0.0.0',
}

# Query-string parameters for the first search-results page ('page': '1')
# of the keyword given in 'q'.
params = {
    'commend': 'all',
    'ie': 'utf8',
    'initiative_id': 'tbindexz_20170306',
    'page': '1',
    'q': '陈皮',
    'search_type': 'item',
    'sourceId': 'tb.index',
    'spm': 'a21bo.jianhua.201856-taobao-item.2',
    'ssid': 's5-e',
    'tab': 'all',
}

# Fetch the search page with the session cookies and browser-like headers.
# An explicit timeout is set because requests waits indefinitely by default,
# which would hang the script on a stalled connection.
response = requests.get(
    'https://s.taobao.com/search',
    params=params,
    cookies=cookies,
    headers=headers,
    timeout=10,
)

3.写循环函数,建立csv文件保存爬取的内容。

由于cookie是7天一更新,token是一小时一更新,过一个小时要重新取cookie和token。

重点在for循环里params内参数的修改,token取你获取到的cookie里_m_h5_tk参数里以“_”号分开的前半部分。其他参数不变。

import csv 
import json
import time
import requests
from pymongo import MongoClient
from urllib.parse import urlencode
import re
import hashlib
import time
from requests.exceptions import RequestException

# Open the output CSV in append mode so rows from earlier runs are kept.
# newline='' lets the csv module control line endings itself.
f = open("爬取淘宝数据.csv", mode='a', encoding='utf-8', newline='')
csv_write = csv.writer(f)
# In append mode the stream starts at end-of-file, so position 0 means the
# file is new or empty. Only then write the header row; otherwise every
# re-run would insert a duplicate header in the middle of the data.
if f.tell() == 0:
    csv_write.writerow(['title', 'shopinfo', 'price', 'procity', 'realSales'])

def get_fo(payload=None, writer=None):
    """Write one CSV row per item found in a search-result payload.

    Args:
        payload: Decoded JSON response whose ``['data']['itemsArray']`` is a
            list of item dicts. Defaults to the module-level ``json_data``,
            which is how the crawl loop calls this function.
        writer: Object with a ``writerow(list)`` method. Defaults to the
            module-level ``csv_write``.

    Raises:
        KeyError: If the payload or an item lacks one of the expected keys
            (e.g. the response is an error page without ``itemsArray``).
    """
    # Fall back to the module globals so the existing `get_fo()` call keeps
    # working unchanged.
    if payload is None:
        payload = json_data
    if writer is None:
        writer = csv_write

    for item in payload['data']['itemsArray']:
        writer.writerow([
            item['title'],              # item title
            item['shopInfo']['title'],  # shop name
            item['price'],              # price
            item['procity'],            # ship-from location
            item['realSales'],          # number of buyers
        ])
        
# Crawl result pages 1-100 through the h5api endpoint and append every item
# to the CSV via get_fo().

# Signing token: must equal the part of cookies['_m_h5_tk'] before the "_".
# It is kept as an explicit literal so it is updated together with the
# cookies whenever they are refreshed.
token = "dda9dfe4b79bfd4871dda7dadc5863fe"
appkey = "12574478"

# The request payload is identical for every page except the page number,
# so the constant text around it is built once outside the loop.
data_prefix = '{"appId":"34385","params":"{\\"device\\":\\"HMA-AL00\\",\\"isBeta\\":\\"false\\",\\"grayHair\\":\\"false\\",\\"from\\":\\"nt_history\\",\\"brand\\":\\"HUAWEI\\",\\"info\\":\\"wifi\\",\\"index\\":\\"4\\",\\"rainbow\\":\\"\\",\\"schemaType\\":\\"auction\\",\\"elderHome\\":\\"false\\",\\"isEnterSrpSearch\\":\\"true\\",\\"newSearch\\":\\"false\\",\\"network\\":\\"wifi\\",\\"subtype\\":\\"\\",\\"hasPreposeFilter\\":\\"false\\",\\"prepositionVersion\\":\\"v2\\",\\"client_os\\":\\"Android\\",\\"gpsEnabled\\":\\"false\\",\\"searchDoorFrom\\":\\"srp\\",\\"debug_rerankNewOpenCard\\":\\"false\\",\\"homePageVersion\\":\\"v7\\",\\"searchElderHomeOpen\\":\\"false\\",\\"search_action\\":\\"initiative\\",\\"sugg\\":\\"_4_1\\",\\"sversion\\":\\"13.6\\",\\"style\\":\\"list\\",\\"ttid\\":\\"600000@taobao_pc_10.7.0\\",\\"needTabs\\":\\"true\\",\\"areaCode\\":\\"CN\\",\\"vm\\":\\"nw\\",\\"countryNum\\":\\"156\\",\\"m\\":\\"pc\\",\\"page\\":'
data_suffix = ',\\"n\\":48,\\"q\\":\\"%E9%99%88%E7%9A%AE\\",\\"tab\\":\\"all\\",\\"pageSize\\":48,\\"totalPage\\":100,\\"totalResults\\":4800,\\"sourceS\\":\\"0\\",\\"sort\\":\\"_coefp\\",\\"bcoffset\\":\\"\\",\\"ntoffset\\":\\"\\",\\"filterTag\\":\\"\\",\\"service\\":\\"\\",\\"prop\\":\\"\\",\\"loc\\":\\"\\",\\"start_price\\":null,\\"end_price\\":null,\\"startPrice\\":null,\\"endPrice\\":null,\\"itemIds\\":null,\\"p4pIds\\":null}"}'

try:
    for i in range(1, 101):
        print('正在爬取'+str(i)+'页')
        page = str(i)
        data = data_prefix + page + data_suffix

        # Millisecond timestamp, as a string because it is concatenated
        # into the signature input and sent as the 't' parameter.
        t = str(int(time.time()*1000))

        # sign = md5("<token>&<t>&<appKey>&<data>"), hex-encoded.
        datas = token+'&'+t+'&'+appkey+'&'+data
        signs = hashlib.md5(datas.encode()).hexdigest()

        params = {
            'jsv': '2.6.2',
            'appKey': appkey,
            't': t,
            'sign': signs,
            'api': 'mtop.relationrecommend.WirelessRecommend.recommend',
            'v': '2.0',
            # 'json' replaces the 'jsonp' the browser request used, and no
            # 'callback' parameter is sent — presumably so the body is plain
            # JSON that response.json() can parse.
            'type': 'json',
            'dataType': 'json',
            'data': data,
        }

        # Pause between pages to avoid hammering the server.
        time.sleep(3)

        # One bad page (network error, non-JSON anti-bot page, payload
        # without the expected keys) is reported and skipped instead of
        # aborting the whole crawl. Only these failure types are caught, so
        # Ctrl-C and programming errors still surface.
        try:
            response = requests.get(
                'https://h5api.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/',
                params=params,
                cookies=cookies,
                headers=headers,
                timeout=10,  # requests waits forever without an explicit timeout
            )
            # Module-level name: get_fo() reads it when called without arguments.
            json_data = response.json()
            get_fo()
        except (RequestException, ValueError, LookupError, TypeError) as exc:
            print('爬取失败', repr(exc))
finally:
    # Flush and release the output file even if the crawl is interrupted.
    f.close()

结果展示:

 

 

完整代码就是先运行文章第一块代码,再运行第二块。读者使用时一定要自己去取自己的cookie,用我的cookie是无法使用的;第二块代码一定要修改token的值。over。

  • 13
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值