淘宝网站在升级,所以如果要爬取多页商品信息,要绕开反爬机制,下面提供在上一期基础上的翻页操作。先看我的上一篇文章。
1.打开淘宝网站,登录并搜索商品信息。这一次我们在浏览器开发者工具的网络(Network)面板中找到以“search?”开头的请求,右击并复制为cURL命令。
2.在在线的curl命令转代码网站上,将复制过来的内容转换成python代码,运行后得到response。
import requests
# Session cookies copied from a logged-in browser request to s.taobao.com and
# converted from the cURL command. They belong to one specific login session,
# so every reader has to replace them with the cookies of their own session.
# NOTE(review): these look like real account credentials - avoid committing
# live values to a shared repository.
cookies = {
'miid': '141439373371660658',
't': 'aa8febcd45b715e1dccbe4f13510772a',
'cna': 'bGsBGuHwHmQCAXju+IJgl+YX',
'lgc': 'tb9330232098',
'tracknick': 'tb9330232098',
'thw': 'cn',
'tk_trace': 'oTRxOWSBNwn9evfHtXQifmafmYzSU80EW6dedixskcJ%2Bc8J0R27pVpwR009bxUw3pJYMDEOjj9VTZncJdZFFzzCuQZlxUrfnSMxQAzTdsNfSmKKyqWcqKPn9v89RJSQ0oG%2FogMwgfvZEKclhcLXWhjzh3JBkQjA%2FL%2FOJkbI0oZ5jqzn2VG%2BKJci6PiuNfv4VwjmPxjt4W%2F83B9FUm3W%2FUoJxmXPRtc3s4xkDIrIG7gqxqJXEReV7jHT%2FgHjGUbuPSdePWpHou6KT1yXAyBnPwkfkI02byxjELdw9%2FCnKL7Iq4zadeWIERo47JW2ATJ2%2BGwPsEvJ48c%2Bk77bgKLvHB80SCgzUiEahg5BC9UQup1n9buUdjJ6NTEX8h6oM2W02ynYh5%2B3JyqGl0epQEXidvXp1hWDRmM5RtzB27%2FxcxSF84Q%3D%3D',
'cookie2': '1d92c1a5ff6fdef92705799a5af7735b',
'_tb_token_': '635e35bd9098',
'mt': 'ci=-1_0',
'_samesite_flag_': 'true',
'3PcFlag': '1708143394509',
'sgcookie': 'E100gH1TeVwBo1xKjL1pQrgJh%2Fw71O6qROkVBu0vqL5SWD%2BjBGqAkWHco8sWiWQQ7oaW%2BQQCV64b7dl3L8RWeoWhvZjGGN5IiG4%2Bw1ZXVqi51LkJSAY%2FZy4RpgVBtidp%2FJmF',
'unb': '2212187893946',
'uc1': 'existShop=false&cookie14=UoYenby%2B65DgNA%3D%3D&cookie15=URm48syIIVrSKA%3D%3D&cookie16=VT5L2FSpNgq6fDudInPRgavC%2BQ%3D%3D&pas=0&cookie21=W5iHLLyFe3xm',
'uc3': 'lg2=V32FPkk%2Fw0dUvg%3D%3D&id2=UUpgRKyMGLkMj4yM%2Bg%3D%3D&nk2=F5RMGoevINQpxZ1H&vt3=F8dD3eu2Nt1EHpHuHKw%3D',
'csg': '7c4e6fb4',
'cancelledSubSites': 'empty',
'cookie17': 'UUpgRKyMGLkMj4yM%2Bg%3D%3D',
'dnk': 'tb9330232098',
'skt': 'a559ddde015e7a2d',
'existShop': 'MTcwODE0MzQ0OQ%3D%3D',
'uc4': 'nk4=0%40FY4HX7UsnkJMHcfV%2FvxuM4Mhvok%2BNjE%3D&id4=0%40U2gqy133ULUMfb1HFmsUv%2FAJNYiWzPUf',
'_cc_': 'URm48syIZQ%3D%3D',
'_l_g_': 'Ug%3D%3D',
'sg': '86a',
'_nk_': 'tb9330232098',
'cookie1': 'BYS%2B9xq28PSJVkxozh5affcpHv1mWy0wrw92uFs5TBg%3D',
'JSESSIONID': 'C27CE0B0B7D5D995E6B227AE936C3DEB',
'mtop_partitioned_detect': '1',
# The part of this value before the "_" is the token that the paging script
# further down uses when it computes the request signature.
'_m_h5_tk': 'dda9dfe4b79bfd4871dda7dadc5863fe_1708163763078',
'_m_h5_tk_enc': 'e333c6602045e023701a7ff3537717be',
'ariaDefaultTheme': 'undefined',
'isg': 'BDo6VM3LC2U7hof1pESeYVo9i2Bc677FxC_ChkQz5k2aN9pxLHsO1QBFg8XrpzZd',
}
# Request headers taken from the same copied browser request. The user-agent
# and sec-ch-ua-* values describe a mobile (Android) Edge/Chromium browser.
headers = {
'authority': 's.taobao.com',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'cache-control': 'max-age=0',
# Requests sorts cookies= alphabetically
# The raw 'cookie' header of the copied request is intentionally not set here;
# the same values are sent through the cookies= argument of requests.get().
'referer': 'https://s.taobao.com/search/_____tmd_____/page/login_jump?rand=S3WxGHAgAt756EpznwfNzJq2AFA2qBNla3j6EINUS8We9dazM_iKElp8DwVSHZUevpC41Bx7RzivXIj9RnZgdg&_lgt_=1d1d0890ba2a99ebfe1b9f4d4d4047e1___215918___3c988e0606ae7125ee3e7e8dab056462___837b211a0c5c4d0311617da5fff37e257cad377890b4c45ccdf2cb0fd662949d0f671beae3225249f15a6f3a3482050da4620a65df4ca2ccd360f91c978d4836ba9e1bc97f6cf2099614489ebd0fe94736bcd2c97d15a129973c9575dead2965f30d05b945a6f833c995602b2cac735445c409a1bd8d83f0f1917a93a9ddc72bcf69b43df9d2bf64d2ed7dd91894c71ce678fa7bb3dd0a2e4529713eb2555f8d4947a01f65db828f5a70cc62d4f5a53cdfb517b26ed9411a62cb8c19ef2f4eabde074f7e5bbf59d6a639a8c8a4c5f6154df0199ae994081ac1d8faddf08af40b41b7153f55b897f62a57012762356fdf81a611b129f005d4ae34c0e06c92f0d0c86215672b4bed54f4d7bc9245a9e24060a640bd976e6f25f40bb71668af7e5185dd4c421de90d395ef5e1bc45f7882f4365be949bbfb85c2f3719bbec315ef960124a5246e6f08bac90304cc07dcf9d',
'sec-ch-ua': '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
'sec-ch-ua-mobile': '?1',
'sec-ch-ua-platform': '"Android"',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Mobile Safari/537.36 Edg/121.0.0.0',
}
# Query-string parameters of the copied search request: first result page
# ('page': '1') for the keyword in 'q'.
params = {
    'commend': 'all',
    'ie': 'utf8',
    'initiative_id': 'tbindexz_20170306',
    'page': '1',
    'q': '陈皮',
    'search_type': 'item',
    'sourceId': 'tb.index',
    'spm': 'a21bo.jianhua.201856-taobao-item.2',
    'ssid': 's5-e',
    'tab': 'all',
}

# Replay the search request with the copied session cookies and headers.
# requests applies no timeout by default, so an explicit one is passed to keep
# a stalled connection from hanging the script forever.
response = requests.get(
    'https://s.taobao.com/search',
    params=params,
    cookies=cookies,
    headers=headers,
    timeout=10,
)
3.写循环函数,建立csv文件保存爬取的内容。
由于cookie大约7天更新一次,而token大约一小时更新一次(token取自cookie里的_m_h5_tk参数),所以每过一个小时就要重新获取cookie和token。
重点在for循环里params内参数的修改,token取你获取到的cookie里_m_h5_tk参数里以“_”号分开的前半部分。其他参数不变。
import csv
import json
import time
import requests
from pymongo import MongoClient
from urllib.parse import urlencode
import re
import hashlib
import time
from requests.exceptions import RequestException
# Output file for the scraped rows. It is opened in append mode so rows from
# earlier runs are kept; newline='' is what the csv module expects so that it
# can control line endings itself.
f = open("爬取淘宝数据.csv", mode='a', encoding='utf-8', newline='')
csv_write = csv.writer(f)

# In append mode the stream starts at the end of the file, so position 0 means
# the file is new or empty. Only then is the header row written; otherwise each
# rerun would insert another header line in the middle of the data.
if f.tell() == 0:
    csv_write.writerow(['title', 'shopinfo', 'price', 'procity', 'realSales'])
def get_fo(data=None, writer=None):
    """Write one CSV row per item of a search-API response.

    Each entry of ``data['data']['itemsArray']`` becomes a row with the
    columns title, shop name, price, shipping location and sales count.

    Args:
        data: Decoded JSON response (a dict). When omitted, the module-level
            ``json_data`` is used, which is how the paging loop calls it.
        writer: Object with a ``writerow`` method (e.g. ``csv.writer``). When
            omitted, the module-level ``csv_write`` is used.

    Raises:
        KeyError: If the response or an item lacks one of the expected keys.
    """
    if data is None:
        data = json_data
    if writer is None:
        writer = csv_write

    for item in data['data']['itemsArray']:
        writer.writerow([
            item['title'],              # product title
            item['shopInfo']['title'],  # shop name
            item['price'],              # price
            item['procity'],            # shipping location
            item['realSales'],          # number of buyers
        ])
# Crawl result pages 1..100 of the search API and append every item to the CSV.
# Uses names defined earlier in the file: `cookies` and `headers` (from the
# first script), `f` (the open CSV file) and `get_fo()`, which reads the
# module-level `json_data` assigned in the loop below.

API_URL = 'https://h5api.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/'

# The token is the part of the `_m_h5_tk` cookie value before the "_". It has
# to be replaced together with the cookies whenever they are refreshed.
token = "dda9dfe4b79bfd4871dda7dadc5863fe"
appkey = "12574478"

# Raw `data` payload, split around the page number. The exact characters
# matter because the signature is computed over this string.
DATA_PREFIX = '{"appId":"34385","params":"{\\"device\\":\\"HMA-AL00\\",\\"isBeta\\":\\"false\\",\\"grayHair\\":\\"false\\",\\"from\\":\\"nt_history\\",\\"brand\\":\\"HUAWEI\\",\\"info\\":\\"wifi\\",\\"index\\":\\"4\\",\\"rainbow\\":\\"\\",\\"schemaType\\":\\"auction\\",\\"elderHome\\":\\"false\\",\\"isEnterSrpSearch\\":\\"true\\",\\"newSearch\\":\\"false\\",\\"network\\":\\"wifi\\",\\"subtype\\":\\"\\",\\"hasPreposeFilter\\":\\"false\\",\\"prepositionVersion\\":\\"v2\\",\\"client_os\\":\\"Android\\",\\"gpsEnabled\\":\\"false\\",\\"searchDoorFrom\\":\\"srp\\",\\"debug_rerankNewOpenCard\\":\\"false\\",\\"homePageVersion\\":\\"v7\\",\\"searchElderHomeOpen\\":\\"false\\",\\"search_action\\":\\"initiative\\",\\"sugg\\":\\"_4_1\\",\\"sversion\\":\\"13.6\\",\\"style\\":\\"list\\",\\"ttid\\":\\"600000@taobao_pc_10.7.0\\",\\"needTabs\\":\\"true\\",\\"areaCode\\":\\"CN\\",\\"vm\\":\\"nw\\",\\"countryNum\\":\\"156\\",\\"m\\":\\"pc\\",\\"page\\":'
DATA_SUFFIX = ',\\"n\\":48,\\"q\\":\\"%E9%99%88%E7%9A%AE\\",\\"tab\\":\\"all\\",\\"pageSize\\":48,\\"totalPage\\":100,\\"totalResults\\":4800,\\"sourceS\\":\\"0\\",\\"sort\\":\\"_coefp\\",\\"bcoffset\\":\\"\\",\\"ntoffset\\":\\"\\",\\"filterTag\\":\\"\\",\\"service\\":\\"\\",\\"prop\\":\\"\\",\\"loc\\":\\"\\",\\"start_price\\":null,\\"end_price\\":null,\\"startPrice\\":null,\\"endPrice\\":null,\\"itemIds\\":null,\\"p4pIds\\":null}"}'

try:
    for i in range(1, 101):
        print('正在爬取'+str(i)+'页')
        page = str(i)
        data = DATA_PREFIX + page + DATA_SUFFIX

        # Millisecond timestamp, as a string because it is concatenated below.
        t = str(int(time.time() * 1000))

        # sign = hex MD5 of "token&t&appkey&data"; the text must be encoded to
        # bytes before hashing (str.encode() defaults to UTF-8).
        datas = '&'.join([token, t, appkey, data])
        signs = hashlib.md5(datas.encode()).hexdigest()

        params = {
            'jsv': '2.6.2',
            'appKey': '12574478',
            't': t,
            'sign': signs,
            'api': 'mtop.relationrecommend.WirelessRecommend.recommend',
            'v': '2.0',
            # Requested as plain JSON: the captured request used 'jsonp' for
            # both fields plus a 'callback' parameter, which is omitted here.
            'type': 'json',
            'dataType': 'json',
            'data': data,
        }

        # Pause between pages to keep the request rate low.
        time.sleep(3)

        try:
            response = requests.get(
                API_URL,
                params=params,
                cookies=cookies,
                headers=headers,
                timeout=10,  # never wait forever on a stalled connection
            )
            # Assigned at module level because get_fo() reads `json_data`.
            json_data = response.json()
            get_fo()
        except (RequestException, ValueError, LookupError, TypeError) as exc:
            # Network failure, non-JSON body (ValueError) or a response without
            # the expected item structure: report it and go on with the next
            # page instead of aborting the whole crawl.
            print('爬取失败', repr(exc))
finally:
    # Close the CSV so all buffered rows are flushed to disk.
    f.close()
结果展示:
- params里参数的破解参考: python使用requests库爬取淘宝食品信息,包含sign参数破解_爬取淘宝数据代码-CSDN博客
完整代码就是先运行文章的第一块代码,再运行第二块代码。读者使用时一定要自己去取自己的cookie,用我的cookie是无法使用的;第二块代码一定要修改token的值,over。