首先查看当当网五星好评图书榜页面,可以发现翻页时只有 URL 末尾的页码数字在变化(本例抓取第 1~25 页)
直接用requests请求页面
resp = requests.get(url)
在页面源码中找到想要的信息(排名、封面图片、书名、推荐度、作者、日期、五星评分次数和两个价格),使用正则表达式把这些信息提取出来
# Raw string so that "\d" reaches the regex engine unchanged instead of being
# treated as an (invalid) string escape. re.S lets "." match newlines, so a
# single match may extend across line breaks in the page source.
pattern = re.compile(r'list_num.*?(\d+).<.*?<img src="(.*?)".*?title="(.*?)".*?tuijian">(.*?)</span>.*?title="(.*?)".*?<span>(\d{4}-\d{2}-\d{2}).*?(\d+)次.*?price_n">¥(.*?)</span>.*?price_r">¥(.*?)</span>', re.S)
# findall returns one 9-tuple of captured strings per match.
items = re.findall(pattern, html)
打印出来看看是否提取到
最后把每条信息转成一行 JSON,追加写入 res5.txt 保存下来就可以了
完整代码如下
import requests
import re
import json
def get_html(url, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded response body, or ``None`` when the request fails with
        any ``requests.RequestException`` (including a timeout).
    """
    try:
        resp = requests.get(url, timeout=timeout)
        return resp.text
    except requests.RequestException:
        return None
# Compiled once at import time. The nine capture groups are consumed, in
# order, by parse_html below. Written as a raw string so "\d" is passed to the
# regex engine unchanged; re.S lets "." match across line breaks.
_BOOK_PATTERN = re.compile(
    r'list_num.*?(\d+).<.*?<img src="(.*?)".*?title="(.*?)".*?tuijian">(.*?)</span>.*?title="(.*?)".*?<span>(\d{4}-\d{2}-\d{2}).*?(\d+)次.*?price_n">¥(.*?)</span>.*?price_r">¥(.*?)</span>',
    re.S,
)


def parse_html(html):
    """Extract book records from the source of one ranking page.

    Args:
        html: Page source as returned by ``get_html``. May be ``None`` or
            empty when the download failed.

    Yields:
        One dict per regex match with the keys ``range``, ``iamge``,
        ``title``, ``recommend``, ``author``, ``time``, ``fiveStar``,
        ``price_r`` and ``price_n``. Every value is the captured string.
    """
    if not html:
        # A failed download produces no records instead of a TypeError in re.
        return

    for match in _BOOK_PATTERN.finditer(html):
        (rank, image, title, recommend, author,
         date, five_star, price_n, price_r) = match.groups()
        yield {
            # NOTE: the keys 'range' and 'iamge' are kept exactly as they were
            # because they define the shape of the emitted records.
            'range': rank,
            'iamge': image,
            'title': title,
            'recommend': recommend,
            'author': author,
            'time': date,
            'fiveStar': five_star,
            # The pattern captures price_n before price_r; map each captured
            # value to the key that matches the markup it came from.
            'price_r': price_r,
            'price_n': price_n,
        }
# Every ranking page shares this prefix; the page number is appended to it.
baseUrl = 'http://bang.dangdang.com/books/fivestars/01.00.00.00.00.00-recent30-0-0-1-'

# Open the output file once for the whole run. Append mode keeps records
# written by earlier runs, and the with-statement closes the file on exit, so
# no explicit close() is needed.
with open("res5.txt", "a", encoding="utf-8") as f:
    for i in range(1, 26):  # pages 1 through 25
        url = baseUrl + str(i)
        html = get_html(url)
        if html is None:
            # get_html returns None when the request failed; skip this page
            # instead of letting the parser crash and abort the run.
            print("第{}页下载失败,已跳过".format(i))
            continue
        items = parse_html(html)
        print("处理第{}页".format(i))
        for item in items:
            # One JSON object per line; ensure_ascii=False keeps Chinese text
            # readable in the output file.
            f.write(json.dumps(item, ensure_ascii=False) + '\n')