爬取豆瓣top250电影,提取评论关键词,然后将同一国家的关键词做成一个词云,轮廓是每个国家的地图轮廓
爬取数据
需要爬取电影名称、导演、年份、地区和前10条评论。除了地区,其他字段的获取都没什么问题,我们来研究一下地区信息怎么获取。
import requests
from bs4 import BeautifulSoup
import time
import pymysql
import pandas as pd
# --- Scrape Douban Top 250: title, director, year, country and the first 10
# short comments of each movie; persist to MySQL and to top250.csv. ---
# NOTE(review): credentials are hard-coded; move to config/env in real use.
# FIX: pymysql.connect() requires keyword arguments since PyMySQL 1.0
# (the positional form used originally was removed).
db = pymysql.connect(host='ip', user='QINYUYOU', password='QINyuyo!', database='homework')
cursor = db.cursor()
headers = {
    'cookie': 'bid=xiXasJy_T2s; ll="118304"; __utmz=30149280.1576307574.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmz=223695111.1576307574.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __yadk_uid=ucYJzWLxVGkxVUZzkLuOr2WKGYDQUChd; _vwo_uuid_v2=DDF040CDC39D506E32CB70680F68474E1|09b885503496bad5cd4ffc77a93035b1; _pk_ses.100001.4cf6=*; __utma=30149280.1798292817.1576307574.1576307574.1576411260.2; __utmb=30149280.0.10.1576411260; __utmc=30149280; __utma=223695111.844953453.1576307574.1576307574.1576411260.2; __utmb=223695111.0.10.1576411260; __utmc=223695111; ap_v=0,6.0; trc_cookie_storage=taboola%2520global%253Auser-id%3Da50462e2-0a35-4fe0-8d41-70f031512552-tuct4efa694; _pk_id.100001.4cf6=774b2f69656869fe.1576307574.2.1576411507.1576309794.',
    'referer': 'https://movie.douban.com/top250?start=0&filter=',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
}
# Parallel result columns; each movie contributes one row per comment.
list_name = []
list_dir = []
list_year = []
list_area = []
list_com = []
# Loop-invariant SQL hoisted out of the loops.
sql = 'INSERT INTO tp250(movie_name,movie_dir,movie_year,movie_area,movie_comment) VALUES(%s,%s,%s,%s,%s)'
for i in range(10):  # 10 listing pages x 25 movies = Top 250
    time.sleep(5)  # throttle: be polite to the server
    url = 'https://movie.douban.com/top250?start={0}&filter='.format(i * 25)
    url_r = requests.get(url, headers=headers)  # fetch one listing page
    url_b = BeautifulSoup(url_r.text, 'lxml')
    movie_list = url_b.find('ol', attrs={'class': 'grid_view'})  # movie list
    for movie_li in movie_list.find_all('li'):
        movie_url = movie_li.find('a').attrs['href']  # detail-page link
        time.sleep(4)
        movie_r = BeautifulSoup(requests.get(movie_url, headers=headers).text, 'lxml')
        movie_name = movie_r.h1.span.string  # title: <span> inside <h1>
        movie_directed = movie_r.find('a', rel='v:directedBy').string  # director
        time_ = movie_r.h1.find('span', class_='year').string  # release year
        # The production country is the plain text that follows the
        # '制片国家/地区' label inside div#info.
        # FIX: reset `area` for every movie -- the original carried over the
        # previous movie's value (or raised NameError) when the label was missing.
        area = None
        info_div = movie_r.find('div', attrs={'id': 'info'})
        for child in info_div.children:
            if child.string and child.string.startswith('制片国家/地区'):
                area = child.next_sibling.string.strip()
                break
        print(area)
        comment_url = movie_r.find('div', id='comments-section').find('div', class_='mod-hd').find('span', class_='pl').a.attrs['href']
        time.sleep(5)
        comment_req = BeautifulSoup(requests.get(comment_url, headers=headers).text, 'lxml')
        comment_item = comment_req.find_all('div', class_='comment-item')
        # FIX: slice instead of range(10) -- pages with fewer than 10 comments
        # raised IndexError in the original.
        for item in comment_item[:10]:
            # NOTE(review): .string is None when the span has nested markup --
            # preserved as-is (NULL in DB), matching original behaviour.
            comment = item.find('div', class_='comment').find('span', class_='short').string
            print(i, comment)
            list_name.append(movie_name)
            list_dir.append(movie_directed)
            list_year.append(time_)
            list_area.append(area)
            list_com.append(comment)
            cursor.execute(sql, (movie_name, movie_directed, time_, area, comment))
            print('插入成功')
        db.commit()  # commit once per movie
dict_ = pd.DataFrame({'name': list_name, 'dir': list_dir, 'time': list_year, 'area': list_area, 'comment': list_com})
dict_.to_csv('top250.csv')
db.close()
分析
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from PIL import Image
import numpy as np
# Per-movie keyword table produced by the preprocessing step.
# Assumes columns include 'area' and 'keyword' (space-separated keywords) -- TODO confirm.
data = pd.read_csv('国家关键词.csv')
print(data)
def area(string):
    """Return the first '/'-separated country in an area field, whitespace-trimmed."""
    head, _sep, _rest = string.strip().partition('/')
    return head.strip()
# Keep only the primary production country for each movie.
data['area'] = data['area'].apply(area)
# Movies per country.  FIX: agg({'cnt': 'count'}) relied on dict-renaming,
# which was removed in pandas 1.0; groupby().size() yields the same
# area/cnt frame in a supported way.
count = data.groupby('area').size().reset_index(name='cnt')
United_States = data[data['area'] == '美国']  # US reviews
# Chinese reviews: mainland, Hong Kong and Taiwan together.
China = data.loc[data['area'].isin(['中国台湾', '中国大陆', '中国香港'])]
Denmark = data[data['area'] == '丹麦']  # Danish reviews
Iran = data[data['area'] == '伊朗']
India = data[data['area'] == '印度']
# Helper: select the review rows belonging to one country.
def area_comment(area_name):
    """Return the subset of the global ``data`` frame whose 'area' equals *area_name*."""
    selected = data['area'] == area_name
    return data[selected]
def key_word_count(area_name):
    """Render a word cloud of review keywords for *area_name*.

    The cloud is masked by the country's outline image '<area_name>.jpg',
    which must exist in the working directory, and shown with matplotlib.
    """
    frame = area_comment(area_name)
    # Concatenate each movie's space-separated keyword string.
    string_key = ''.join(frame['keyword'])
    # FIX: the mask was hard-coded to '美国.jpg', so every country reused the
    # US outline; load the image named after the requested country instead.
    image = np.array(Image.open('{0}.jpg'.format(area_name)))
    # (The original also built a keyword-frequency frame via agg dict-renaming,
    # which was removed in pandas 1.0; that frame was never used, so it is
    # dropped here.)
    wordcloud = WordCloud(mask=image, font_path="/usr/local/lib/python3.7/site-packages/matplotlib/mpl-data/fonts/ttf/SimHei.ttf").generate(string_key)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.show()
if __name__ == '__main__':
    # Demo run: word cloud for US ('美国') reviews.
    key_word_count('美国')
数据处理
import pandas as pd
import thulac
import math
import string
import jieba.analyse
from collections import Counter
# Chinese word segmenter; seg_only=True segments without part-of-speech tags.
lac = thulac.thulac(seg_only=True)
# Scraper output: presumably one row per movie with name/dir/time/area/comment -- verify.
data = pd.read_csv('./top250_result.csv')
list_area = data['area'].values  # country of each row, aligned with the comments
def word_cut(string):
    """Segment *string* with the global thulac model; text=True returns a space-joined string."""
    segmented = lac.cut(string, text=True)
    return segmented
df_comment = data['comment']
# Replace each raw comment blob with its segmented (space-separated) form.
df_comment = df_comment.apply(word_cut)
# --- hand-rolled TF-IDF prototype (disabled) ---
# NOTE(review): kept for reference only; jieba.analyse.extract_tags below is
# used instead.  The dead code inside the triple-quoted string is untouched.
'''
def tf(word, count):
return count[word] / sum(count.values())
def n_containing(word, count_list):
return sum(1 for count in count_list if word in count)
def idf(word, count_list):
return math.log(len(count_list)) / (1 + n_containing(word, count_list))
def tfidf(word, count, count_list):
return tf(word, count) * idf(word, count_list)
def main():
#print(df_comment)
countlist = []
for word_list in df_comment:
for i in range(len(word_list)):
count = Counter(word_list[i])
countlist.append(count)
#print(countlist)
for i, count in enumerate(countlist):
print("Top words in document {}".format(i + 1))
scores = {word: tfidf(word, count, countlist) for word in count}
sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
for word, score in sorted_words[:]:
print("\tWord: {}, TF-IDF: {}".format(word, round(score, 10)))
'''
if __name__ == "__main__":
keyword_list = []
for sentence in df_comment:
string = ''
print(sentence)
keywords = jieba.analyse.extract_tags(sentence, topK=10, withWeight=True)
for item in keywords:
string = string + item[0]+' '
keyword_list.append(string)
dict__ = pd.DataFrame({'area':list_area,'keyword':keyword_list})
dict__.to_csv('国家关键词.csv')
处理2
import pandas as pd
tfidf = analyse.extract_tags
data = pd.read_csv('./top250.csv')
df_comment = data['comment']
list1 = []#评论
string = ''
for i in range(len(df_comment)):
print(i)
if (i+1) % 10 == 0:
string = string + df_comment[i]
list1.append(string)
string = ''
else :
string = string + df_comment[i]
data = data[['name','area','time','dir']]
data = data.drop_duplicates()#去重
print(data)
print(list1)
df_dir = data['dir'].values#导演
df_time = data['time'].values#年份
df_area = data['area'].values#国家
df_name = data['name'].values#片名
print(len(df_name),len(df_area),len(df_time),len(df_dir),len(list1))
dict_ = pd.DataFrame({'name':df_name,'dir':df_dir,'time':df_time,'area':df_area,'comment':list1})
dict_.to_csv('top250_result.csv')