第一步引入库和爬虫头部:
import time
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Request headers: present a desktop-Chrome User-Agent so Douban serves
# the normal comment pages instead of blocking the scraper.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/69.0.3497.100 Safari/537.36'
    ),
}
第二步建立函数定义数据格式:
def get_data(url):
    """Fetch *url* and return the response body as text.

    Uses the module-level ``headers`` so Douban serves the normal page.

    Raises:
        requests.HTTPError: on a non-2xx response.  The original silently
            returned the HTML of the error page, which the parser would
            then scrape as if it were real data.
    """
    r = requests.get(url, headers=headers, timeout=30)
    r.raise_for_status()  # fail loudly instead of parsing an error page
    return r.text
# Result accumulators: a table of (comment, star) rows and a flat list of
# every star grade seen.
# NOTE(review): in the original loop below, ``data`` is immediately
# overwritten with raw HTML, so this table never receives any rows.
data = pd.DataFrame(columns=['评论', '星级'])
score = list()
第三步进行数据的爬取和清洗:
# Crawl 11 pages of short comments (start = 0, 20, ..., 200; 20 per page).
for page in range(0, 201, 20):
    base_url = ('https://movie.douban.com/subject/1291546/comments?start='
                + str(page) + '&limit=20&sort=new_score&status=P')
    # Keep the raw HTML under its own name: the original assigned it to
    # ``data``, clobbering the result DataFrame so nothing was ever stored.
    html = get_data(base_url)
    soup = BeautifulSoup(html, 'lxml')

    # --- star ratings -------------------------------------------------
    # The rating is the second <span> inside each 'comment-info' span;
    # its class looks like 'allstar50', so [-2:-1] extracts the digit.
    stars = []
    for info in soup.find_all('span', attrs={'class': 'comment-info'}):
        grade = info.find_all('span')[1].get('class')[0][-2:-1]
        if grade == 'm':
            # Reviewers who left no rating expose a different span there
            # (slice yields 'm'); treat them as zero stars.
            grade = '0'
        stars.append(grade)

    # --- comment texts ------------------------------------------------
    # .text already strips markup, so the original's
    # .replace('<span class="short">', '') was a no-op and is dropped.
    comments = [tag.text for tag in soup.find_all('span', attrs={'class': 'short'})]

    # Guard against a page where the two lists differ in length, which
    # would make the DataFrame constructor raise.
    n = min(len(stars), len(comments))
    page_df = pd.DataFrame({'评论': comments[:n], '星级': stars[:n]})
    print(page_df)

    # Accumulate into the module-level table — this is the storage step
    # the original script was missing.
    data = pd.concat([data, page_df], ignore_index=True)

    time.sleep(2)  # be polite to Douban and avoid rate limiting
    score.extend(stars)
第四步分解出各个星级的打分人数:
# Tally how many reviewers gave each star level.
# The original scanned score.count(i) inside a loop (O(n^2)) and its
# ``count > 1`` filter silently dropped any star level that appeared
# exactly once; Counter fixes both in one O(n) pass.
a = dict(Counter(score))
print(a)
最后一步绘图:
# Bar chart: star level on the x-axis, number of reviewers on the y-axis.
star_levels = a.keys()
vote_counts = a.values()
plt.bar(star_levels, vote_counts)
plt.show()
运行后得到结果:
这个小项目目前还存在问题:评论和分数的存储环节尚未解决,解决后会继续更新。