1、目标:获取淘宝搜索页面的信息,提取其中的商品名称和价格
2、理解:我们需要解决淘宝的搜索接口、翻页的处理
3、技术路线:requests 库负责抓取页面,re 正则库负责提取字段
4、步骤:
①提交商品搜索请求,循环获取页面
②对于每个页面,提取商品名称和价格信息
③将信息输出到屏幕上
5、对原来代码的改进:增加销量的输出
6、编写代码时需注意的问题
①淘宝拒绝爬虫的访问,因此我们需要将headers改掉
②对翻页的处理:每页有44个商品,经过观察发现,网页链接后面的s=44即该页起始商品的编号,因此我们可以通过这个规律来实现翻页
import requests
import re
def getHTMLText(url):
    """Fetch *url* with a logged-in Taobao session and return the page text.

    Returns "" on any network/HTTP failure so callers can treat a failed
    page as empty instead of crashing.
    """
    # NOTE(review): this is a hard-coded personal session cookie — it only
    # works while that login session is valid, and it should not be committed
    # to source control. Consider loading it from a config file instead.
    # Bug fix: the value previously began with a literal "cookie: " prefix,
    # which corrupts the Cookie header the server receives.
    kv = {'cookie': 't=d61223e23cf95eaf3a7427e46620aa24; cna=KknhFroZHEcCAXgEbDkPkMA5; uc3=id2=VyyY5AHPxrqV%2Fw%3D%3D&vt3=F8dBxd7Ed9v7gopWytk%3D&nk2=F5RDL9RuzK7CKw%3D%3D&lg2=W5iHLLyFOGW7aA%3D%3D; lgc=tb63955929; uc4=nk4=0%40FY4I7KuEOrzqo%2FORxKFzLHtXG%2FsW&id4=0%40VXtXDzG6mttLTb9dnw5IY8EDQymW; tracknick=tb63955929; _cc_=U%2BGCWk%2F7og%3D%3D; tg=0; enc=LkWXCr2rjtrUCLxXukiml03lT75soVzYczcswxlr3pPb91SbW6nkCHg9kH18ZBufnpJmNA%2BV4hlHn3ZYmwG5wA%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; mt=ci=43_1; thw=cn; _samesite_flag_=true; cookie2=7ca742e000f0e87cb18cd16d396d59c7; _tb_token_=fb77408ed1e35; tfstk=cAHPBOXQbLpyCdOD1RyURCxwrZoRZHHnCQELZfX6epLMjPPli9bLmUI-y2z9n7f..; JSESSIONID=F698065DA30CCD2462BC75EEB3EAA4CB; v=0; l=dBaLyT4nQF2x7TpkBOCgCm5f0AQTdIRAgul44rNpi_5dp1T1cDbOour9He96cjWftUTB4o0COBe9-etktBDmndK-g3fPaxDc.; isg=BK-vdiweLUCDhSl8udLLyqTuPsO5VAN2g_ZtwcE8YZ4lEM8SySTgxq3KkgAuaNvu; uc1=cookie14=UoTUOan4slJdzQ%3D%3D',
          'user-agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, headers=kv, timeout=30)
        r.raise_for_status()
        # Use the detected encoding so Chinese text decodes correctly.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrowed from a bare except: only network/HTTP errors are expected
        # here; anything else (e.g. KeyboardInterrupt) should propagate.
        return ""
def parsePage(ilt, html):
    """Extract goods from a Taobao search-result page into *ilt*.

    Appends one [price, sales, title] list (all strings) per item found in
    *html*. Best-effort: a page that fails to parse is skipped silently
    (a blank line is printed, matching the original behaviour).
    """
    try:
        # Capture groups pull the value out directly — the original code
        # used eval() on scraped page content, which is a code-injection
        # risk and also broke on titles containing a ':' character.
        plt = re.findall(r'"view_price":"([\d.]*)"', html)
        slt = re.findall(r'"view_sales":"(.*?)"', html)   # sales count
        tlt = re.findall(r'"raw_title":"(.*?)"', html)
        # zip stops at the shortest list, so a malformed page with unequal
        # field counts cannot raise IndexError.
        for price, sales, title in zip(plt, slt, tlt):
            ilt.append([price, sales, title])
    except Exception:
        # Preserve the original best-effort contract: never crash the crawl.
        print("")
def printGoodList(ilt):
    """Print a numbered table of goods: price, sales and title per row."""
    row_fmt = "{:4}\t{:8}\t{:9}\t{:16}"
    # Header row, then one numbered row per [price, sales, title] record.
    print(row_fmt.format("序号", "价格", "销量", "商品名称"))
    for index, item in enumerate(ilt, start=1):
        print(row_fmt.format(index, item[0], item[1], item[2]))
def main():
    """Crawl several Taobao search-result pages for one keyword and print them.

    Taobao paginates 44 items per page; the `s` query parameter is the
    zero-based index of the first item on a page, so page i starts at 44*i.
    """
    goods = "书包"      # search keyword
    depth = 2           # number of extra pages beyond the first
    start_url = "https://s.taobao.com/search?q=" + goods
    infoList = []
    for i in range(depth + 1):
        try:
            url = start_url + "&s=" + str(44 * i)
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception:
            # Narrowed from a bare except: skip a failed page but let
            # KeyboardInterrupt/SystemExit stop the crawl.
            continue
    printGoodList(infoList)
# Guard the entry point so importing this module does not start a crawl.
if __name__ == "__main__":
    main()
输出结果: