import requests
import re
def getHTMLText(url):
    """Fetch *url* and return its decoded text, or "" on any request failure.

    Uses the content-sniffed encoding (``apparent_encoding``) so Chinese
    pages decode correctly even when the HTTP header lies about charset.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    # Narrowed from a bare `except:` so programming errors and
    # KeyboardInterrupt are no longer silently swallowed.
    except requests.RequestException:
        return ""
def parsePage(ilt, html):
    """Extract [price, title] pairs from Taobao search-result HTML into *ilt*.

    The page embeds JSON-ish fragments like ``"price":"12.30"`` and
    ``"description":"..."``; we pair them up positionally.

    :param ilt: list to append ``[price, title]`` string pairs to (mutated).
    :param html: raw page text; an empty/unmatched page appends nothing.
    """
    try:
        prices = re.findall(r'"price":"[\d.]*"', html)
        titles = re.findall(r'"description":".*?"', html)
        # zip() stops at the shorter list, so a mismatched page can no
        # longer raise IndexError like the old range(len(...)) loop did.
        for price_raw, title_raw in zip(prices, titles):
            # Strip the surrounding double quotes directly instead of
            # eval()-ing untrusted page content (security hazard).
            price = price_raw.split(':')[1].strip('"')
            title = title_raw.split(':')[1].strip('"')
            ilt.append([price, title])
    except (IndexError, TypeError):
        # Preserve the original best-effort behavior (it printed a blank
        # line); narrowed from a bare `except:`.
        print()
def printGoodList(ilt):
    """Print a numbered table (index, price, title) of the collected goods."""
    row = "{:4}\t{:8}\t{:16}"
    print(row.format("序号", "价格", "商品名称"))
    # enumerate replaces the manual counter; numbering starts at 1
    for idx, item in enumerate(ilt, 1):
        print(row.format(idx, item[0], item[1]))
def main():
    """Crawl up to *depth* Taobao result pages for the query and print a table.

    NOTE(review): this hard-coded URL reportedly no longer works without a
    login; kept as in the original script.
    """
    goods = "书包"
    depth = 2
    start_url = 'https://ai.taobao.com/search/index.htm?spm=' + goods
    infoList = []
    for i in range(depth):
        try:
            # Taobao paginates 44 items per page via the 's' offset parameter.
            url = start_url + '&s=' + str(44 * i)
            html = getHTMLText(url)
            parsePage(infoList, html)
        # Narrowed from a bare `except:`; still skips a broken page and
        # continues crawling the remaining ones.
        except Exception:
            continue
    printGoodList(infoList)


if __name__ == "__main__":
    main()
# NOTE: The original URL stopped working after a site update — web Taobao now
# requires login even for a plain search. Without logging in, only Tmall
# search is usable, and only the first results page is visible.
import requests
import re
from bs4 import BeautifulSoup
def getHTMLText(url):
    """Fetch *url* and return its decoded text, or "" on any request failure.

    ``apparent_encoding`` is used so pages with misdeclared charsets (common
    on Chinese sites) still decode correctly.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    # Narrowed from a bare `except:` so only network/HTTP errors are
    # converted into an empty result.
    except requests.RequestException:
        return ""
def parsePage(ilt, html):
    """Parse a Tmall search-result page and append [price, title] pairs to *ilt*.

    Titles and prices live in ``<p class="productTitle">`` /
    ``<p class="productPrice">``; we strip the HTML tags and the surrounding
    whitespace from each element's markup.
    """
    tag_re = re.compile(r"<[^>]+>")   # matches any HTML tag
    # BUG FIX: the original class was "[\t|\n| ]" — inside [...] the '|' is a
    # literal pipe, so '|' characters would wrongly be deleted from titles.
    ws_re = re.compile(r"[\t\n ]")    # tabs, newlines and spaces only
    soup = BeautifulSoup(html, "html.parser")
    title_labels = soup.find_all("p", class_="productTitle")
    price_labels = soup.find_all("p", class_="productPrice")
    for title_label, price_label in zip(title_labels, price_labels):
        # str(tag) -> raw markup; drop tags first, then collapse whitespace.
        title = ws_re.sub("", tag_re.sub("", str(title_label)))
        price = ws_re.sub("", tag_re.sub("", str(price_label)))
        ilt.append([price, title])
def printGoodList(ilt):
    """Render the goods list as a numbered price/title table on stdout."""
    template = "{:4}\t{:8}\t{:16}"
    header = template.format("序号", "价格", "商品名称")
    print(header)
    index = 0
    for entry in ilt:
        index += 1
        price, title = entry[0], entry[1]
        print(template.format(index, price, title))
def main():
    """Search Tmall for the query and print the first page of results.

    Tmall serves only the first results page without login, so a single
    fetch is performed.
    """
    goods = "书包"
    start_url = 'https://list.tmall.com/search_product.htm?q=' + goods
    infoList = []
    # The original looped `for i in range(depth)` with depth=1, but the URL
    # never changed between iterations — one fetch is equivalent.
    html = getHTMLText(start_url)
    parsePage(infoList, html)
    printGoodList(infoList)


if __name__ == "__main__":
    main()