莴苣第三季part2实在是太棒了!!!制作和配音都绝了,看起来啊!!尤其《白夜》真的是神回!
先放个结果
自动翻页和下载评论部分的代码:
def douban_page_generator(i):
    """Build the URLs of the comment pages that follow the first page.

    Each URL requests 20 comments (``limit=20``) and selects the page via
    the ``start`` query parameter. The first page (``start=0``) is not
    generated; offsets run 20, 40, 60, ... for every multiple of 20 that
    is strictly below ``i + 20``.

    Args:
        i: Upper bound for the ``start`` offset (e.g. ``60`` yields the
            pages starting at 20, 40 and 60).

    Returns:
        A list of URL strings, empty when ``i`` is 0 or negative.
    """
    url_template = (
        'https://movie.douban.com/subject/30353357/comments'
        '?start={}&limit=20&sort=new_score&status=P'
    )
    # Use a dedicated loop name instead of re-binding the parameter ``i``.
    return [url_template.format(start) for start in range(20, i + 20, 20)]
# Download the comments once a new page has been opened.
def get_page_reviews(link):
    """Open one comment page in Chrome and scrape its reviews and votes.

    The page is expected to expose 20 comment entries at fixed absolute
    XPath positions; the text of each entry's review span and vote span
    is collected in page order.

    Args:
        link: URL of the comment page to load.

    Returns:
        A dict with two parallel lists of strings: ``'reviews'`` and
        ``'votes'``.

    Raises:
        Whatever the driver raises when an entry cannot be located
        (presumably ``NoSuchElementException`` on a page holding fewer
        than 20 comments -- TODO confirm). The browser is closed before
        the exception propagates.
    """
    # NOTE(review): ``webdriver`` must already be in scope (presumably via
    # ``from selenium import webdriver``); that import is not visible in
    # this section of the file.
    review_xpath = '/html/body/div[3]/div[1]/div/div[1]/div[4]/div[{}]/div[2]/p/span'
    vote_xpath = '/html/body/div[3]/div[1]/div/div[1]/div[4]/div[{}]/div[2]/h3/span[1]/span'

    reviews = []
    votes = []
    browser = webdriver.Chrome(
        executable_path='/Users/wang/mmodule/lib/python3.6/site-packages/selenium/webdriver/chrome/chromedriver')
    try:
        browser.get(link)
        # XPath positions are 1-based: entries 1..20.
        for position in range(1, 21):
            reviews.append(
                browser.find_element_by_xpath(review_xpath.format(position)).text)
            votes.append(
                browser.find_element_by_xpath(vote_xpath.format(position)).text)
    finally:
        # Always end the session; otherwise every scraped page leaves a
        # Chrome window and a chromedriver process behind.
        browser.quit()

    return {
        'reviews': reviews,
        'votes': votes,
    }
制作词云:
#制作词云
import matplotlib.pyplot as plt
import jieba #中文词云
from wordcloud import WordCloud
import sys
import xlrd
def return_final_dic(xlsx_path=r'/Users/wang/Desktop/attack on titan.xlsx',
                     stopwords_path=r'/Users/wang/Desktop/stopwords.txt'):
    """Build a word -> weight mapping from the scraped comment workbook.

    The first sheet of the workbook is read from its second row onwards
    (the first row is skipped). Column index 1 is read as the comment
    text and column index 2 as its vote count. Each comment is segmented
    with jieba in precise mode; segments that are stop words or shorter
    than two characters are dropped. A word's weight is the sum of the
    votes of every occurrence across all comments.

    Args:
        xlsx_path: Absolute path of the workbook to read.
        stopwords_path: Absolute path of a UTF-8 text file holding one
            stop word per line.

    Returns:
        A dict mapping each kept word (str) to its accumulated vote
        weight (float).

    Raises:
        ValueError: If a vote cell of a comment that contributes at least
            one word cannot be converted to ``float``.
    """
    # NOTE(review): xlrd 2.0+ no longer reads .xlsx files -- confirm the
    # installed xlrd version still supports this workbook.
    data = xlrd.open_workbook(xlsx_path)
    table = data.sheet_by_index(0)
    # Number of non-empty rows in the sheet.
    nrows = table.nrows
    print('一共有 {} 个非空行'.format(nrows))

    # Read-only access is enough; a set gives O(1) membership tests.
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        stop_words = set(f.read().split('\n'))
    print('成功获取停用词')

    final_dic = {}
    # Process one comment per row.
    for row in range(1, nrows):
        # Numeric cells come back as float, so coerce before text handling.
        sent = str(table.cell_value(rowx=row, colx=1))
        votes = table.cell_value(rowx=row, colx=2)
        print('{}: {}'.format(sent, votes))
        # jieba.cut returns a generator of segments.
        for key in jieba.cut(sent, cut_all=False):
            if key not in stop_words and len(key) >= 2:
                print(key)
                # Accumulate rather than overwrite, so a later low-voted
                # comment does not erase the weight from earlier ones.
                final_dic[key] = final_dic.get(key, 0.0) + float(votes)
        print()
    return final_dic
def create_cloud(dic,
                 font_path=r'/Users/wang/Downloads/Noto_Sans_SC/NotoSansSC-Light.otf',
                 output_path=r'/Users/wang/Desktop/aot word cloud.jpg'):
    """Render a word cloud from word weights, save it and display it.

    Args:
        dic: Mapping of word -> numeric weight, passed to
            ``WordCloud.generate_from_frequencies``.
        font_path: Font file used to draw the words (a font with CJK
            glyphs is presumably required for Chinese text -- confirm).
        output_path: File the rendered image is written to.

    Returns:
        None.
    """
    cloud = WordCloud(
        font_path=font_path,
        background_color='white',
        max_font_size=80,
    )
    cloud.generate_from_frequencies(dic)

    # Save before showing: plt.show() may block until the window is
    # closed, and the image should not be lost if the session is
    # interrupted there.
    cloud.to_file(output_path)

    plt.imshow(cloud)
    plt.axis('off')
    plt.show()
# Script driver: build the word -> weight mapping from the workbook,
# print it for inspection, then render the word cloud from it.
final = return_final_dic()
print(final)
create_cloud(final)