因为淘宝网需要登录,而京东的 HTML 结构比较奇怪,所以最后选取了当当网。
技术路线:
requests(抓取页面)+ re(用正则表达式提取价格和商品名)
#CrowDangDangPrize.py
import requests
import re
import urllib
def getHTMLText(url):
    """Download a page and return its body as text.

    Issues a GET request with a 30-second timeout and decodes the response
    using the encoding detected from the body content.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body, or an empty string when the request
        fails or the server replies with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=30)
        # Convert 4xx/5xx replies into exceptions so they take the
        # failure path below instead of returning an error page.
        r.raise_for_status()
        # Prefer the encoding guessed from the content over the one
        # declared in the headers before decoding to text.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: request-level failures (connection, timeout,
        # bad URL, HTTP error) yield "" rather than propagating. Unlike a
        # bare ``except``, this lets KeyboardInterrupt/SystemExit through.
        return ""
def parsePage(ls,html):
try:
pls=re.findall(r'now\_price\"\>\¥\;[\d\.]+',html)
nls=re.findall(r"alt\=\'.*?\'",html)
for i in range(len(pls)):
price=eval(pls[i].split(';')[1])
name=eval(nls[i].split("=")[1])
ls.append([price,name])