# 导入工具包
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np
# Request header: spoof a desktop Chrome User-Agent so Douban serves the
# normal HTML page instead of rejecting the default `python-requests` UA.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36",
}
# =============================================================================
# Scrape one page of comments
# =============================================================================
# Target: first page (20 comments) of the movie's comment listing.
url = 'https://movie.douban.com/subject/34841067/comments?limit=20&status=P&sort=new_score'
# Fetch the page. `timeout` prevents the request from hanging forever;
# `raise_for_status()` surfaces HTTP errors (Douban commonly answers 403
# when it blocks scrapers) instead of silently parsing an error page.
html = requests.get(url, headers=headers, timeout=10)
html.raise_for_status()
soup = BeautifulSoup(html.text, 'lxml')
# The four selectors walk the same sequence of comment nodes, so zip()
# below keeps the fields of each comment aligned.
# User name links
names = soup.select('#comments > div > div.comment > h3 > span.comment-info > a')
# The comment-info span (contains the optional rating span)
pingjis = soup.select('#comments > div > div.comment > h3 > span.comment-info')
# Comment dates
riqis = soup.select('#comments > div > div.comment > h3 > span.comment-info > span.comment-time')
# Comment bodies
neirongs = soup.select('#comments > div > div.comment > p > span')
# Collected rows: [user, rating class, rating title, date, text]
lis = []
for name, pingji, riqi, neirong in zip(names, pingjis, riqis, neirongs):
    # Select the rating span explicitly by its 'rating' class instead of
    # positional indexing (`find_all('span')[1]`): unrated comments have no
    # rating span, in which case the positional index silently picks up the
    # date span and yields wrong data. Unrated comments get None here.
    rating = pingji.select_one('span.rating')
    lis.append([name.get_text(),
                rating['class'] if rating is not None else None,
                rating['title'] if rating is not None else None,
                riqi.get_text().strip(),
                neirong.get_text()])
result1 = pd.DataFrame(lis, columns=['用户', '评级', '等级', '日期', '内容'])
# =============================================================================
# Scrape multiple pages of comments
# =============================================================================
# First 5 pages: start = 0, 20, ..., 80 (20 comments per page).
url = ['https://movie.douban.com/subject/34841067/comments?start={}&limit=20&status=P&sort=new_score'.format(i)
       for i in range(0, 100, 20)]
# Collected rows across all pages: [user, rating class, rating title, date, text]
lis2 = []
for urli in url:
    # Fetch one page; `timeout` avoids hanging forever and
    # `raise_for_status()` fails loudly on HTTP errors (e.g. a 403 block)
    # rather than parsing an error page.
    html = requests.get(urli, headers=headers, timeout=10)
    html.raise_for_status()
    soup = BeautifulSoup(html.text, 'lxml')
    # The four selectors walk the same comment nodes, so zip() keeps the
    # per-comment fields aligned.
    # User name links
    names = soup.select('#comments > div > div.comment > h3 > span.comment-info > a')
    # The comment-info span (contains the optional rating span)
    pingjis = soup.select('#comments > div > div.comment > h3 > span.comment-info')
    # Comment dates
    riqis = soup.select('#comments > div > div.comment > h3 > span.comment-info > span.comment-time')
    # Comment bodies
    neirongs = soup.select('#comments > div > div.comment > p > span')
    for name, pingji, riqi, neirong in zip(names, pingjis, riqis, neirongs):
        # Select the rating span by its 'rating' class instead of
        # `find_all('span')[1]`: unrated comments have no rating span, and
        # the positional index would silently grab the date span instead.
        rating = pingji.select_one('span.rating')
        lis2.append([name.get_text(),
                     rating['class'] if rating is not None else None,
                     rating['title'] if rating is not None else None,
                     riqi.get_text().strip(),
                     neirong.get_text()])
    print('完成:', urli)
    # Random pause between pages to avoid triggering rate limiting.
    time.sleep(np.random.randint(5, 10))
result2 = pd.DataFrame(lis2, columns=['用户', '评级', '等级', '日期', '内容'])
# Source note (blog metadata accidentally pasted in; kept as a comment so
# the file remains valid Python):
#   python爬取《你好, 李焕英》豆瓣评论数据
#   最新推荐文章于 2022-09-01 16:44:47 发布