第三方库安装指令:
pip install requests -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install lxml -i https://pypi.tuna.tsinghua.edu.cn/simple
运行代码:
import requests
from lxml import etree
def save(commenters):
    """Write the collected review text to ``1.txt`` in the working directory.

    Any existing file content is overwritten.

    Args:
        commenters: The complete text to store.
    """
    # Explicit UTF-8: the text is mostly Chinese, and relying on the platform
    # default encoding can raise UnicodeEncodeError or yield a file that
    # reads back garbled on another machine.
    # The file could also be named 1.html; the HTML format suits it better.
    with open("1.txt", "w", encoding="utf-8") as file:
        file.write(commenters)
# Interactive scraper: asks for a page range, collects review links from each
# listing page, downloads every review, and stores the combined text via save().
begin = int(input("开始页码:"))
end = int(input("结束页码:"))
header = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 "
                  "Safari/537.36"
}
# Seconds to wait for a response; without a timeout a stalled connection
# would block the script indefinitely.
REQUEST_TIMEOUT = 10

# Step 1: gather the review URLs linked from each requested listing page.
url_ls = []
for page in range(begin, end + 1):
    # NOTE(review): the query string "app_name=movie=<n>" looks malformed for
    # pagination; confirm the listing's real paging parameter. Kept unchanged
    # here because the correct value cannot be verified from this file.
    url = f"https://movie.douban.com/review/best/?app_name=movie={(page - 1)}"
    rsp = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
    url_etree = etree.HTML(rsp.text)
    url_ls += url_etree.xpath('//h2/a/@href')

# Step 2: fetch each review and format one entry per review.
entries = []
for url in url_ls:
    rsp = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
    article_etree = etree.HTML(rsp.text)
    commentor_hits = article_etree.xpath('//header/a/span/text()')
    moviename_hits = article_etree.xpath('//header/a[2]/text()')
    if not commentor_hits or not moviename_hits:
        # The page lacks the expected header nodes. Skip it instead of
        # letting an IndexError abort the run and discard everything
        # collected so far.
        print(f"跳过(页面结构不符): {url}")
        continue
    commentor = commentor_hits[0]
    moviename = moviename_hits[0]
    # Join all paragraph text fragments of the review body with ";".
    comments = ";".join(article_etree.xpath("//div[@class='main-bd']//p/text()"))
    entries.append(
        f"评论者:{commentor}</br>电影:{moviename}</br>评论:{comments}</br></br></br>"
    )

# Build the output once rather than growing a string inside the loop.
commenters = "".join(entries)
save(commenters)
print("爬取完毕")