# 期末项目 — final project: Douban movie short-comment scraper
import json
import time

import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
def openWeb(movie_id):
    """Open the Douban page for *movie_id* in Chrome and grab page 1 of short comments.

    Simulates a user clicking the "more short comments" link, then parses
    the comment section HTML with BeautifulSoup.

    Parameters:
        movie_id: Douban subject id (int or str).

    Returns:
        (msg, com): two lists of bs4 Tags — the 'comment-info' nodes
        (user name + metadata) and the 'comment-content' nodes (comment text).
    """
    driver = webdriver.Chrome()  # launch a fresh browser instance
    try:
        # Load the movie's subject page.
        driver.get('https://movie.douban.com/subject/{}/'.format(movie_id))
        # Simulate the user clicking the "more short comments" link.
        # Selenium 4 removed find_element_by_*; use the By locator API instead.
        link = driver.find_element(By.ID, 'comments-section')
        link = link.find_element(By.CLASS_NAME, 'pl')
        link = link.find_element(By.TAG_NAME, 'a')
        link.click()
        # Grab the raw HTML of the first comment page.
        html = driver.find_element(By.ID, 'comments').get_attribute('outerHTML')
    finally:
        driver.quit()  # always release the browser, even if a locator fails
    soup = bs(html, 'lxml')
    msg = soup.find_all(attrs={'class': 'comment-info'})
    com = soup.find_all(attrs={'class': 'comment-content'})
    return msg, com
def readOne(msg, com):
    """Parse the first page of comments into a list of dicts.

    Parameters:
        msg: sequence of bs4 Tags with class 'comment-info' (user + metadata).
        com: sequence of bs4 Tags with class 'comment-content' (comment body),
             parallel to *msg*.

    Returns:
        list[dict] with keys 'username', 'nowtime', 'comren'.
    """
    li = []
    # Cap at 20 (one Douban page), but also guard against short pages:
    # the original hard-coded range(20) raised IndexError when a page
    # held fewer than 20 comments.
    for i in range(min(len(msg), len(com), 20)):
        entry = {
            'username': msg[i].a.text,                             # user name
            # 'level' extraction intentionally disabled (as before)
            'nowtime': str(msg[i].find_all('span')[2].text).strip(),  # comment timestamp
            'comren': com[i].find('span').text,                    # comment text
        }
        li.append(entry)
    print('第1页爬取成功。')
    return li
def fw(params, movie_id):
    """GET one page of the comment list and return it as parsed soup.

    Relies on the module-level ``headers`` dict defined in ``__main__``
    (cookies + browser headers) — TODO: consider passing it explicitly.

    Parameters:
        params: query-string dict ('start', 'limit', 'status', 'sort').
        movie_id: Douban subject id.

    Returns:
        BeautifulSoup of the response body.
    """
    response = requests.get(
        'https://movie.douban.com/subject/{}/comments'.format(movie_id),
        params=params,
        headers=headers,
        timeout=10,  # never hang forever on a dead connection
    )
    soup = bs(response.text, 'lxml')
    return soup
def readNext(val, movie_id, li):
    """Fetch one follow-up page of comments via requests and append to *li*.

    Parameters:
        val: 'start' offset for the page (20 * page_index).
        movie_id: Douban subject id.
        li: list of dicts to append to (mutated in place).

    Returns:
        None. Silently gives up after 3 failed fetch attempts.
    """
    params = {
        'start': val,
        'limit': '20',
        'status': 'P',
        'sort': 'new_score',
    }
    # Retry the fetch a bounded number of times instead of the original
    # unbounded `while True: continue`, which spun forever (with no
    # backoff) on a persistent failure.
    for _attempt in range(3):
        try:
            soup = fw(params, movie_id)
            msg = soup.find_all(attrs={'class': 'comment-info'})
            com = soup.find_all(attrs={'class': 'comment-content'})
        except Exception as e:
            print(e)
            time.sleep(2)  # brief backoff before retrying
        else:
            break
    else:
        return  # all attempts failed; leave li unchanged
    # Guard against short pages (< 20 comments) to avoid IndexError.
    for i in range(min(len(msg), len(com), 20)):
        entry = {
            'username': msg[i].a.text,                             # user name
            # 'level' extraction intentionally disabled (as before)
            'nowtime': str(msg[i].find_all('span')[2].text).strip(),  # comment timestamp
            'comren': com[i].find('span').text,                    # comment text
        }
        li.append(entry)
    # Page number derived from our own offset (val == 20 * page), instead
    # of the original's read of a global `page` that this function never
    # received (NameError when called outside the __main__ loop).
    print('第{}页读取成功。'.format(val // 20 + 1))
if __name__ == '__main__':
    # Browser-like request headers (incl. session cookies) — read as a
    # module-level global by fw() for every requests-based page fetch.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'll="118254"; bid=olsdvBoaQlY; _pk_id.100001.4cf6=fc3c2f603219d618.1686893513.; _pk_ses.100001.4cf6=1; ap_v=0,6.0; __utma=30149280.2000670025.1686893513.1686893513.1686893513.1; __utmc=30149280; __utmz=30149280.1686893513.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmt_douban=1; __utma=223695111.1622331921.1686893513.1686893513.1686893513.1; __utmb=223695111.0.10.1686893513; __utmc=223695111; __utmz=223695111.1686893513.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __yadk_uid=taefTpduvw6IsOFf7V9JjTFWwecczwYa; __gads=ID=0cef8c254fb3a182-22f03c23a8b4003a:T=1686893514:RT=1686893514:S=ALNI_MYknllPR6OM9WEpGJHgQrbn4RPcKw; __gpi=UID=00000c4fac9682ea:T=1686893514:RT=1686893514:S=ALNI_MZTZy1PL0SgwWatT6BXyOWDos7ZKw; _vwo_uuid_v2=DFA18502BB5D102DC197B08F02321353B|12f68a5f57ebe2c037702036026f4ad5; __utmb=30149280.3.10.1686893513',
        'Referer': 'https://movie.douban.com/subject/25868125/',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    # Movie to scrape (Douban subject id).
    movie_id = 25868125
    # Number of comment pages to fetch (prompt text kept byte-identical).
    n = int(input("请输入要爬取的页数:"))
    # Page 1 comes from Selenium (the "more comments" link needs a click).
    msg, com = openWeb(movie_id)
    li = readOne(msg, com)
    # Pages 2..n go through plain requests (no browser needed).
    for page in range(1, n):
        time.sleep(2)  # throttle between pages to avoid being rate-limited
        readNext(20 * page, movie_id, li)
    # Write everything in one shot with an explicit encoding; the original
    # truncated cdata.json up front (encoding-less open('w'): pass) and
    # appended later, which risked leaving an empty file on a mid-run crash.
    with open("cdata.json", 'w', encoding='utf-8') as f:
        f.write(json.dumps(li, ensure_ascii=False))