1.首先导入库
import requests
import json
2.主函数
if __name__ == '__main__':
    # Fetch the first two result pages (offsets 44 and 88; the site pages
    # in steps of 44) and accumulate every product entry into one list.
    data = []
    for offset in range(44, 89, 44):
        data.extend(get_onePage_Text(offset))
    # Render the collected products as a numbered table.
    printGoodList(data)
注释:①在 params 中发现每一页的 “data_value” 值是变化的,所以我获取的是前两页的数据,取值为 44 和 88,步长设置为 44。
②先设置 data 空列表,通过循环将第一页和第二页获取到的数据一起放到 data 列表中。
3.封装的第一个方法:获取到每一页的数据
def get_onePage_Text(data_value):
    """Fetch one page of Taobao search results and return its product list.

    Parameters
    ----------
    data_value : int
        Page offset sent as the "data-value" query parameter (the site
        pages in steps of 44: 44 for page 2, 88 for page 3, ...).

    Returns
    -------
    list
        List of product dicts (each with keys such as "raw_title",
        "view_price", "view_sales"), or [] when the request fails or the
        payload does not have the expected JSON shape.
    """
    start_url = "https://s.taobao.com/search?"
    # Taobao's anti-scraping checks require a logged-in cookie and a
    # matching referer; without them the endpoint returns a login page.
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36",
        "cookie": "miid=167232731045619172; cna=a3qKGIeTNTcCAXZNvi/UAWvF; sgcookie=E100te2z8xEr5ZZSRyYvwNSRclA%2Fcv7zWfUGRveaaYZ26nKfaeaOtNqIoMT2xlZSPiIHlIQH033d4b9hcnvAOpiSSA%3D%3D; uc3=nk2=F5RHoJeHj5VwR14%3D&lg2=Vq8l%2BKCLz3%2F65A%3D%3D&vt3=F8dCuAVgJ1b5er%2B88v0%3D&id2=UUphzOfddVBrsr5vxA%3D%3D; lgc=tb215504465; uc4=nk4=0%40FY4MsnGJ2PYoMdwJRHB%2FNc14ROVf4g%3D%3D&id4=0%40U2grF8wQ6JI1vDi0TaquIW3dVERmBZfU; tracknick=tb215504465; _cc_=UIHiLt3xSw%3D%3D; enc=yaD4h0D5zNNkWHtDv3PSJPlqo4y8GmcPYN74HqOlw%2FQwyB0cC2fcxSPooOmQ13A9gdC9sRqoPvmZ1nYlClC4F%2FM2X5T6eSe1tmOK%2BPcuUdQ%3D; mt=ci=44_1; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; lLtC1_=1; _samesite_flag_=true; cookie2=10e0736bbd814669e23c3c07d35a7e0a; t=0e36379c7d71b21d182dcccc8058c5f8; v=0; _tb_token_=fe05eefbf7b43; _m_h5_tk=c6af335c8840cb194a651bdfa0de937f_1614844615269; _m_h5_tk_enc=dcd2ebb5064bd420ce7ed02d6c683dd1; xlly_s=1; uc1=cookie14=Uoe1hg5Tdt8p1g%3D%3D; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; x5sec=7b227365617263686170703b32223a226665336138323362623434323366633731336463363337613439666539383732434f4855675949474549657373734f57394c6e5356686f504d6a49774e6a557a4d7a59784f444d304e7a73784b4149776b7043442f674d3d227d; JSESSIONID=CACA90FA24491C1CF4A1CBD71EAEE804; l=eBTTuCVqjb49dC4jBO5Z-urza77TfIOfG1PzaNbMiIncC6IF_b9GahKQDmKlZp-RR8XciV8B4ztxcPwTfeqz-ykjJ0YEae1VivE2Cef..; tfstk=cS8VB0NJOq3qKI7svZ_ah4gzbtwAatIGYz5Omh3t1oaqL7SfLsczB_bzf_5dp4jc.; isg=BAUFdw8XycQZWO3ttKHvCPecFEE_wrlUC-OT_gdrqz0PniEQzxPfJZM4rMJo2dEM",
        "referer": "https://s.taobao.com/search?q=%E4%B9%A6%E5%8C%85&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20210303&ie=utf8"
    }
    params = {
        "data-key": "s",
        "data-value": data_value,
        "ajax": "true",
        "_ksTS": "1614820957807_415",
        # "callback" is deliberately omitted: with it the response is
        # JSONP ("jsonp416(...)"), which json.loads cannot parse directly.
        # "callback": "jsonp416",
        "q": "书包",
        # imgfile:
        "js": 1,
        "stats_click": "search_radio_all:1",
        "initiative_id": "staobaoz_20210303",
        "ie": "utf8",
        "bcoffset": 3,
        "ntoffset": 0,
        "p4ppushleft": "1,48"
    }
    r = requests.get(start_url, headers=headers, params=params)
    if r.status_code != 200:
        # Original code fell through here and implicitly returned None,
        # which crashed the caller's list concatenation. Return [] instead.
        return []
    try:
        content_dict = json.loads(r.text)
        # Product entries live at mods -> itemlist -> data -> auctions.
        return content_dict["mods"]["itemlist"]["data"]["auctions"]
    except (ValueError, KeyError, TypeError):
        # Anti-bot responses are HTML (not JSON) or have a different
        # shape; treat them as "no products on this page".
        return []
注释:①淘宝反爬机制比较严格,headers 中一定要加 cookie 字段。②刚开始获取到的数据前面带有一段 JSONP 回调字符串,不易处理;经过试验发现,params 中的 callback 参数可以注释掉,并且不影响最终结果。③用 json.loads 将 JSON 文本转化为 Python 内置的字典格式。④按层级准确定位到商品信息所在的位置,如下图:
⑤最终 content3 获取到的是一个列表,列表里的每一项是字典格式。
4.封装的第二个方法:打印输出数据
def printGoodList(data):
    """Print a numbered table of products: name, price and sales count.

    Parameters
    ----------
    data : list
        Product dicts, each expected to carry the "raw_title",
        "view_price" and "view_sales" keys returned by the search API.

    Returns
    -------
    None
        Output goes to stdout only.
    """
    row_fmt = "{:4}\t{:25}\t{:6}\t{:6}"
    # Header row (column titles kept exactly as in the original output).
    print(row_fmt.format("序号", "商品名称", "价格", "购买人数"))
    # enumerate(..., 1) replaces the original hand-maintained counter.
    for count, content in enumerate(data, start=1):
        print(row_fmt.format(count, content["raw_title"],
                             content["view_price"], content["view_sales"]))
注释: for循环获取列表每一项,并打印输出。
*
5.最终效果
我爬的数据也不多。刚开始淘宝确实要求登录,后来还出现了滑动验证;当我下午再完善代码时,很神奇地不再需要验证了。本来我是想用 xpath 表达式的,但没有好的解决方法——把三段字符串拼接起来不太现实,因为“书包”出现的位置也不一样,最后想到的办法就是进行翻页、动态请求。接下来要做的是将数据转化为 csv 格式。