The regular expressions below are borrowed from another author's write-up. I'm skipping a full tutorial for now and just posting the code.
import requests
import random
import re
def get_html(url):
    # Pool of desktop User-Agent strings; one is picked at random for each request.
    user_agent = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
        'Opera/8.0 (Windows NT 5.1; U; en)',
        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0) ',
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52"
    ]
    headers = {
        # Douban session cookie copied from a logged-in browser; replace it with your own.
        'Cookie': 'll="108309"; bid=ELlOU2v6wzk; __yadk_uid=cra70mWYZDspH84ns7OQe5MlRXTYRcPQ; _vwo_uuid_v2=DC107E9CBF8E0A6699D04FC69E2982C81|48d6b514f6792d4dc1a14d5910891045; __gads=ID=2e9aa8e46ca22d82-227af46190c40025:T=1604664940:RT=1604664940:S=ALNI_MajmByuNuUd5eaZXqlcwPe6XdP25g; gr_user_id=412ad26e-7363-4fd6-bc57-68a7e7df2ec2; douban-fav-remind=1; __utmz=30149280.1623417103.11.11.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmz=223695111.1623417111.8.8.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; ap_v=0,6.0; _vwo_uuid_v2=DC107E9CBF8E0A6699D04FC69E2982C81|48d6b514f6792d4dc1a14d5910891045; __utma=30149280.1777458498.1604664845.1623417103.1623421591.12; __utmb=30149280.0.10.1623421591; __utmc=30149280; __utma=223695111.1738783349.1604664845.1623417111.1623421591.9; __utmb=223695111.0.10.1623421591; __utmc=223695111; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1623421591%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_id.100001.4cf6=d5e147dfb6e78760.1604664845.9.1623421591.1623417824.; _pk_ses.100001.4cf6=*',
        'User-Agent': random.choice(user_agent),
        "Host": "movie.douban.com",
        "Connection": "keep-alive"
    }
    res = requests.get(url, headers=headers)
    return res
def get_comment_id_time_star(data):
    # Pull review IDs, publish dates and star ratings from a review-list page
    # with three independent regexes.
    id_pattern = re.compile('<h2><a href="https://movie.douban.com/review/(.*?)/', re.S)
    id = re.findall(id_pattern, data)
    time_pattern = re.compile('<span content=".*?" class=".*?">(.*?)</span>', re.S)
    time = re.findall(time_pattern, data)
    star_pattern = re.compile('<span class="allstar(.*?) main-title-rating"', re.S)
    star = re.findall(star_pattern, data)
    # The three lists are assumed to line up one-to-one per review; a review
    # without a star rating would leave `star` shorter than `id`.
    id_time_star = [id, time, star]
    return id_time_star
def parse_comment(data):
    # The review body sits between the data-original attribute and the
    # main-author block in the response of the review "full" endpoint.
    content_pattern = re.compile('data-original(.*?)main-author', re.S)
    content = re.findall(content_pattern, data)
    # Keep only Chinese characters and the listed full-width punctuation;
    # tags, escapes and other ASCII are discarded.
    text_pattern = re.compile('[\u4e00-\u9fa5|,、“”‘’:!~@#¥【】*()——+。;?]+', re.S)
    text = re.findall(text_pattern, content[0])
    text = ''.join(text)
    return text
if __name__ == '__main__':
    # 48 list pages, 20 reviews per page (start = i * 20).
    for i in range(0, 48):
        print("正在爬取第{}页.......".format(i + 1))
        index_url = "https://movie.douban.com/subject/30228394/reviews?start={}".format(i * 20)
        index_res = get_html(index_url).text
        id_time_star = get_comment_id_time_star(index_res)
        id = id_time_star[0]
        time = id_time_star[1]
        star = id_time_star[2]
        print("正在处理评论内容........")
        for j in range(len(id)):
            print("正在爬取第{}条评论.......".format(j + 1))
            # Each review's full text is fetched from the /j/review/<id>/full endpoint.
            comment_url = "https://movie.douban.com/j/review/{}/full".format(id[j])
            # comment_url = "https://movie.douban.com/j/review/13300536/full"
            comment_res = get_html(comment_url).text
            content = parse_comment(comment_res)
            print("将评论写入文件......")
            with open("content.txt", "a", encoding="utf-8") as f:
                f.write(content + "\n")
        print("评论爬取完成........")
        # Append this page's star ratings to star.txt
        print("将评分和日期写入文件........")
        with open("star.txt", "a") as f:
            for _ in star:
                f.write(_ + "\n")
        # Append this page's publish dates to date.txt
        with open("date.txt", "a") as f:
            for _ in time:
                f.write(_ + "\n")
        print("第{}页爬取完成".format(i + 1))