python商品总价_使用python爬取淘宝商品名称和价格

最近在家闲得无聊,由于家里开网店,妈妈对于起商品标题感到很头痛,所以我就想从淘宝上爬取一些商品信息作为参考。

在小破站(B站)找了个学习视频,跟着做了一遍才发现视频是2018年的,而淘宝在2019年可能加入了反爬取机制,使用正常的方法爬不到结果。

但是有一种方式可以爬取:先登录淘宝网页版,然后进行搜索,从浏览器中获取该请求的cookie和user-agent。

代码如下:

import json
import re

import requests

def getHTMLText(url, headers=None):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        headers: Optional dict of HTTP request headers. When omitted, the
            hard-coded cookie / user-agent pair below is sent. That cookie
            was copied from a logged-in browser session, so callers can pass
            their own headers once it stops working.

    Returns:
        The decoded response text, or an empty string if the request fails
        or the server answers with an HTTP error status.
    """
    if headers is None:
        # NOTE: session cookie captured from a browser; it is account-specific
        # and will stop working over time -- pass `headers` to override it.
        headers = {
            'cookie': 'cna=54y8Fm+TyioCATzcP+BwvvDA; thw=cn; lgc=%5Cu58A8%5Cu8FF9%5Cu9519%5Cu54AF%5Cu548C; tracknick=%5Cu58A8%5Cu8FF9%5Cu9519%5Cu54AF%5Cu548C; tg=0; enc=p6YWWbSWACqr5t1PcdDiNADVd7zKpnQG9X%2FZ666%2Fl7CM9%2FsOLpiM1WX5QQNnS%2B5ydtOFYKtHlmwg9AgeUX0Rjg%3D%3D; mt=ci=25_1; hng=CN%7Czh-CN%7CCNY%7C156; _m_h5_tk=9564049e168909dda591afc00632fed0_1581060181038; _m_h5_tk_enc=69c0046fffbc1f3750258e3f8fb06eb6; v=0; t=a8c0eb2a0d2265808242379d7f81bf64; cookie2=1f0913be01a97ec8038fb3c5354c793a; _tb_token_=ed66e3be155ef; alitrackid=www.taobao.com; _samesite_flag_=true; sgcookie=Q0iMW4gaGTiQdKuBbG0Q; unb=2514996592; uc3=vt3=F8dBxdzxpjLCa1%2BSc2Y%3D&id2=UU2zVEbkyyO2WQ%3D%3D&nk2=p2NDIgVDkDZm7A%3D%3D&lg2=VT5L2FSpMGV7TQ%3D%3D; csg=5b7ad9c4; cookie17=UU2zVEbkyyO2WQ%3D%3D; dnk=%5Cu58A8%5Cu8FF9%5Cu9519%5Cu54AF%5Cu548C; skt=c0aa8f1a6e59edcd; existShop=MTU4MTU5NTkzOQ%3D%3D; uc4=nk4=0%40pVWU4YQkw8jOJbWHe0nFlK5IE6%2Bq&id4=0%40U2%2F0ltjMwUTM8KkFFqREWIu1Zr5o; _cc_=VFC%2FuZ9ajQ%3D%3D; _l_g_=Ug%3D%3D; sg=%E5%92%8C22; _nk_=%5Cu58A8%5Cu8FF9%5Cu9519%5Cu54AF%5Cu548C; cookie1=BxAV4i9dmFVSXeCQYeZRwnecoaXNB46utcHDDLh6ZgY%3D; lastalitrackid=i.taobao.com; _uab_collina=158159732272025423845542; uc1=cookie14=UoTUO8VjaYjE6g%3D%3D&lng=zh_CN&cookie16=U%2BGCWk%2F74Mx5tgzv3dWpnhjPaQ%3D%3D&existShop=false&cookie21=WqG3DMC9Fb5mPLIQo9kR&tag=8&cookie15=W5iHLLyFOGW7aA%3D%3D&pas=0; x5sec=7b227365617263686170703b32223a226538646339643635363639643831393663383261346336303365333530316433435075626c664946454b664b6b596569376f544571514561444449314d5451354f5459314f5449374e413d3d227d; JSESSIONID=A874DB22E1D5B34B2B463E86B2D768B4; isg=BKys9N3R3jpM2Mp_Xhy2vn8bfYreZVAPB10kRwbs5dfbEU0bMHcin-7jMNGpuohn; l=dBLXnIncQINxmXxCBOfwVSFu6kQOaIOb8iVrDzSKMICPOc12kkfPWZVSsMLyCnhNHsh9m3oTJ-juByT1byCqJxpsw3k_J_DmndC..',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
        }
    try:
        r = requests.get(url, headers=headers, timeout=30)
        # Turn 4xx/5xx answers into an exception so they also yield "".
        r.raise_for_status()
        # Decode using the encoding guessed from the body rather than the
        # one declared in the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: network and HTTP failures produce an empty page.
        # Unrelated errors (e.g. KeyboardInterrupt) are no longer swallowed.
        return ""

def parsePage(ilt, html):
    """Extract price/title pairs from a search-result page into *ilt*.

    The page text is scanned for ``"view_price":"..."`` and
    ``"raw_title":"..."`` fragments. The n-th price is paired with the n-th
    title and appended to *ilt* as ``[price, title]`` (both strings).

    Args:
        ilt: List that receives the ``[price, title]`` entries (mutated in
            place).
        html: Page source as text. Anything that is not a ``str`` is ignored.

    Returns:
        None. Results are delivered through *ilt*.
    """
    if not isinstance(html, str):
        return

    # Capture groups pull out the values directly, so titles containing ':'
    # are no longer cut apart by a split(':').
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    # Allow backslash escapes (e.g. \" or \uXXXX) inside the title value.
    raw_titles = re.findall(r'"raw_title":"((?:[^"\\\n]|\\.)*)"', html)

    # zip() stops at the shorter list, so a count mismatch cannot raise.
    # NOTE(review): prices and titles are matched independently; pairing by
    # position assumes both appear once per item in the same order.
    for price, raw_title in zip(prices, raw_titles):
        try:
            # Decode string escapes with the JSON parser instead of eval(),
            # which must never be run on scraped text.
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            # Not a valid JSON string body: keep the raw text.
            title = raw_title
        ilt.append([price, title])

def printGoodsList(ilt, filename='服饰.txt'):
    """Print the collected goods as a table and save the titles to a file.

    Args:
        ilt: Sequence of ``[price, title]`` entries.
        filename: Path of the UTF-8 text file to (over)write. It receives a
            header line followed by one title per line.

    Returns:
        None.
    """
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format("序号", "价格", "商品名称"))

    # The context manager guarantees the file is flushed and closed, even if
    # formatting one of the rows raises.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write('商品名称\n')
        for count, g in enumerate(ilt, start=1):
            print(tplt.format(count, g[0], g[1]))
            # Use '\n' like the header line; in text mode an explicit '\r\n'
            # is written as '\r\r\n' on Windows and mixes line endings.
            f.write(g[1] + '\n')

def main(goods='妈妈服饰', depth=20):
    """Crawl Taobao search results for *goods* and output what was found.

    Args:
        goods: Search keyword appended to the search URL.
        depth: Number of result pages to request.

    Returns:
        None. The collected entries are handed to ``printGoodsList``.
    """
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for i in range(depth):
        # The `s` query parameter is the result offset; it advances by 44 per
        # page (presumably the number of items per page -- verify).
        url = start_url + '&s=' + str(44 * i)
        try:
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception:
            # Best-effort crawl: skip a page that fails. Catching Exception
            # rather than everything lets Ctrl-C / SystemExit stop the run.
            continue
    printGoodsList(infoList)

# Start the crawl. There is no `__main__` guard, so this also runs when the
# file is imported as a module.
main()

最终爬取的结果如下图所示:

问题:

需要间歇地更换cookie,否则爬取几次之后就爬不到数据了。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值