现需爬取江门地区正在上映的所有电影短评及点赞数,并保存至Excel表中。
- 红色框是爬取目标
- 效果图如下:
该爬取数据量较大,建议使用代理IP进行爬取,否则很容易触发豆瓣的反爬机制导致本机IP被暂时封禁。
import time
import requests
from lxml import etree
import openpyxl
# Proxy used for every outgoing request (HTTPS only). Swap in your own
# proxy here; douban rate-limits aggressively without one.
proxies = {
    'https': '144.255.28.185:4364',
}

# Target: the "now playing" listing page for Jiangmen.
url = "https://movie.douban.com/cinema/nowplaying/jiangmen/"

# A desktop-browser User-Agent so douban serves the normal HTML page.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36 Edg/96.0.1054.53",
}

# Download the landing page and parse it into an lxml tree for XPath queries.
req = requests.get(url, headers=header, proxies=proxies)
html = req.content.decode("utf-8")
tree = etree.HTML(html)
# Walk the <li> entries of the now-playing list one by one, collecting each
# movie's id and title, until XPath finds no further entry.
# Each element of ids_ is a pair: [xpath-result for @id, xpath-result for @data-title].
ids_ = []
pos = 1
while True:
    item = f'/html/body/div[3]/div[1]/div/div[1]/div[3]/div[2]/ul/li[{pos}]'
    movie_id = tree.xpath(item + '/@id')
    if not movie_id:
        # No <li> at this position: we have seen every movie.
        break
    movie_title = tree.xpath(item + '/@data-title')
    ids_.append([movie_id, movie_title])
    pos += 1
print(f"热映电影共{len(ids_)}部")
# For each now-playing movie: crawl up to 5 pages x 20 short comments
# (text + upvote count), then write them into their own sheet of the
# shared Excel workbook.
crawled = 0  # movies whose comments were actually saved
# NOTE(review): original code reused `k` both as a running counter and as the
# save-loop index, so the final count printout was garbage; fixed with a
# dedicated `crawled` counter.
for id_ in ids_:
    print(id_[1][0])
    data = []  # [comment-xpath-result, upvote-xpath-result] pairs for this movie
    try:
        for page in range(5):
            url1 = f"https://movie.douban.com/subject/{id_[0][0]}/comments?start={20 * page}&limit=20&status=P&sort=new_score"
            req1 = requests.get(url=url1, headers=header, proxies=proxies)
            # Parse into a local tree; do not clobber the module-level `tree`.
            page_tree = etree.HTML(req1.content.decode("utf-8"))
            for j in range(20):
                comment = page_tree.xpath(f'//*[@id="comments"]/div[{j + 1}]/div[2]/p/span/text()')
                person = page_tree.xpath(f'//*[@id="comments"]/div[{j + 1}]/div[2]/h3/span[1]/span/text()')
                # A page may hold fewer than 20 comments; skip empty XPath
                # results here, otherwise saving hits IndexError on the first
                # empty entry and silently drops every later row.
                if comment and person:
                    data.append([comment, person])
    except Exception:
        print(f"{id_[1][0]}爬取失败")
    # Persist this movie's comments to Excel (one sheet per movie).
    print("save...")
    try:
        wb = openpyxl.load_workbook('.\\江门热门影片.xlsx')
    except Exception:
        # Workbook does not exist yet (first run): start a fresh one.
        wb = openpyxl.Workbook()
    sheet = wb.create_sheet(title=f"{id_[1][0]}", index=1)
    sheet.cell(1, 1).value = "评论"
    sheet.cell(1, 2).value = "点赞人数"
    try:
        if not data:
            continue  # nothing crawled; `finally` still saves and sleeps
        for row, (comment, person) in enumerate(data, start=2):
            print("第%d条" % (row - 1))
            sheet.cell(row, 1).value = comment[0]
            sheet.cell(row, 2).value = person[0]
        crawled += 1  # count the movie only after its rows were written
    except Exception:
        continue
    finally:
        # Same path as load_workbook above (the original saved to a raw
        # string with a literal double backslash — a different path).
        wb.save('.\\江门热门影片.xlsx')
        print("休息1秒钟")
        time.sleep(1)
print(f"成功爬取{crawled}部电影")