Scraping Douban Top 250 Movies and Analyzing Them

Scrape the Douban Top 250 movies, extract keywords from their comments, and then build one word cloud per country from that country's keywords, shaped by the outline of the country's map.

Scraping the data

We need to scrape each movie's title, director, year, region, and its first 10 comments. Everything except the region is straightforward, so let's look at how the region information can be extracted.
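On the detail page, the region sits as bare text inside the div with id="info", right after the "制片国家/地区:" label span, so there is no tag of its own to select. A minimal sketch of that structure and how to pull the value out (the HTML snippet below is hand-written to mimic Douban's layout, not copied from the site):

from bs4 import BeautifulSoup

# Simplified stand-in for Douban's #info block:
# the region value is a bare text node right after the label span.
html = '''
<div id="info">
    <span class="pl">导演</span>: <a rel="v:directedBy">Frank Darabont</a><br/>
    <span class="pl">制片国家/地区:</span> 美国<br/>
    <span class="pl">语言:</span> 英语<br/>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
label = soup.find('span', string='制片国家/地区:')  # locate the label span
area = label.next_sibling.strip()                   # the text node that follows it is the region
print(area)  # -> 美国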

import requests
from bs4 import BeautifulSoup
import time
import pymysql
import pandas as pd
db = pymysql.connect(host='ip', user='QINYUYOU', password='QINyuyo!', database='homework')
cursor = db.cursor()
headers = {
    'cookie':'bid=xiXasJy_T2s; ll="118304"; __utmz=30149280.1576307574.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmz=223695111.1576307574.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __yadk_uid=ucYJzWLxVGkxVUZzkLuOr2WKGYDQUChd; _vwo_uuid_v2=DDF040CDC39D506E32CB70680F68474E1|09b885503496bad5cd4ffc77a93035b1; _pk_ses.100001.4cf6=*; __utma=30149280.1798292817.1576307574.1576307574.1576411260.2; __utmb=30149280.0.10.1576411260; __utmc=30149280; __utma=223695111.844953453.1576307574.1576307574.1576411260.2; __utmb=223695111.0.10.1576411260; __utmc=223695111; ap_v=0,6.0; trc_cookie_storage=taboola%2520global%253Auser-id%3Da50462e2-0a35-4fe0-8d41-70f031512552-tuct4efa694; _pk_id.100001.4cf6=774b2f69656869fe.1576307574.2.1576411507.1576309794.',
    'referer':'https://movie.douban.com/top250?start=0&filter=',
    'user-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
}
list_name = []
list_dir = []
list_year = []
list_area = []
list_com = []
for i in range(0,10):
    time.sleep(5)
    url = 'https://movie.douban.com/top250?start={0}&filter='.format(i*25)
    url_r = requests.get(url,headers=headers)  # GET the list page
    url_b = BeautifulSoup(url_r.text,'lxml')  # parse it
    movie_list = url_b.find('ol',attrs={'class':'grid_view'})  # the movie list
    #print(url_list)
    for movie_li in movie_list.find_all('li'):
        movie_url = movie_li.find('a').attrs['href']  # link to the movie detail page
        time.sleep(4)
        movie_r = BeautifulSoup(requests.get(movie_url,headers=headers).text,'lxml')
        movie_name = movie_r.h1.span.string  # the <span> inside <h1> holds the movie title
        movie_directed = movie_r.find('a',rel='v:directedBy').string  # director
        time_ = movie_r.h1.find('span',class_='year').string  # the span with class "year" holds the year
        info_div = movie_r.find('div', attrs={'id': 'info'})
        for child in info_div.children:
            if child.string and child.string.startswith('制片国家/地区'):
                area = child.next_sibling.string.strip()  # country/region of production
                print(area)
        comment_url = movie_r.find('div',id='comments-section').find('div',class_='mod-hd').find('span',class_='pl').a.attrs['href']  # URL of the comments page
        time.sleep(5)
        #print(url_)
        comment_req = BeautifulSoup(requests.get(comment_url, headers=headers).text, 'lxml')
        comment_item = comment_req.find_all('div', class_='comment-item')
        for j in range(10):
            comment = comment_item[j].find('div', class_='comment').find('span', class_='short').string
            print(i, comment)
            list_name.append(movie_name)
            list_dir.append(movie_directed)
            list_year.append(time_)
            list_area.append(area)
            list_com.append(comment)
            sql = 'INSERT INTO tp250(movie_name,movie_dir,movie_year,movie_area,movie_comment) VALUES(%s,%s,%s,%s,%s)'
            cursor.execute(sql, (movie_name, movie_directed,time_, area,comment))
            print('insert OK')
            db.commit()
dict_ = pd.DataFrame({'name': list_name, 'dir': list_dir, 'time': list_year,'area':list_area,'comment':list_com})
dict_.to_csv('top250.csv')
db.close()
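The INSERT above assumes a tp250 table already exists in the homework database. A minimal schema matching the five columns might look like the following (the column types are my assumption; they are not given in the original code):

import pymysql

db = pymysql.connect(host='ip', user='QINYUYOU', password='QINyuyo!', database='homework')
cursor = db.cursor()
# Assumed column types; adjust to your needs
cursor.execute('''
    CREATE TABLE IF NOT EXISTS tp250 (
        movie_name    VARCHAR(255),
        movie_dir     VARCHAR(255),
        movie_year    VARCHAR(16),
        movie_area    VARCHAR(255),
        movie_comment TEXT
    ) DEFAULT CHARSET = utf8mb4
''')
db.close()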

Analysis

import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from PIL import Image
import numpy as np
data = pd.read_csv('国家关键词.csv')
print(data)
def area(string):
    return string.strip().split('/')[0].strip()
data['area'] = data['area'].apply(area)
count = data.groupby('area').size().reset_index(name='cnt')  # number of entries per country
United_States = data[data['area']=='美国']  # US comments
China = data.loc[(data['area']=='中国台湾') | (data['area']=='中国大陆') | (data['area']=='中国香港')]  # Chinese comments (mainland, Hong Kong, Taiwan)
#print(United_States)
#print(China)
Denmark = data[data['area']=='丹麦']  # Danish comments
Iran = data[data['area']=='伊朗']
India = data[data['area']=='印度']
# get the keyword rows for a given country
def area_comment(area_name):
    return data[data['area']==area_name]
def key_word_count(area_name):
    data = area_comment(area_name)
    string_key = ''
    for word in data['keyword']:
        string_key = string_key + word
    list_key = string_key.split(' ')
    series_key = pd.DataFrame({'key_word':list_key})
    count = series_key.groupby('key_word').size().reset_index(name='cnt')  # keyword frequencies
    key_word = count['key_word']
    key_count = count['cnt']
    image = np.array(Image.open("美国.jpg"))  # mask image (hard-coded to the US map outline)
    wordcloud = WordCloud(mask=image,font_path="/usr/local/lib/python3.7/site-packages/matplotlib/mpl-data/fonts/ttf/SimHei.ttf").generate(string_key)
    plt.imshow(wordcloud,interpolation="bilinear")
    plt.show()
if __name__ == '__main__':
    key_word_count('美国')
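key_word_count always loads 美国.jpg as the mask, so every country would be drawn inside the US outline. A sketch of a parameterized version, assuming one outline image per country saved as "<country name>.jpg" (the file-naming convention is my assumption):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud

data = pd.read_csv('国家关键词.csv')
data['area'] = data['area'].apply(lambda s: s.strip().split('/')[0].strip())

def key_word_cloud(area_name, mask_file=None):
    # merge that country's keyword strings into one text
    text = ' '.join(data[data['area'] == area_name]['keyword'])
    # default mask: an outline image named "<country>.jpg" (assumed to exist, e.g. 丹麦.jpg)
    mask = np.array(Image.open(mask_file or '{}.jpg'.format(area_name)))
    wc = WordCloud(mask=mask, background_color='white',
                   font_path="/usr/local/lib/python3.7/site-packages/matplotlib/mpl-data/fonts/ttf/SimHei.ttf").generate(text)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.show()

key_word_cloud('丹麦')  # one cloud per country, shaped by that country's map outline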

Data processing

import pandas as pd
import thulac
import math
import string
import jieba.analyse
from collections import Counter

lac = thulac.thulac(seg_only=True)
data = pd.read_csv('./top250_result.csv')
list_area = data['area'].values
def word_cut(string):
    return lac.cut(string,text=True)
df_comment = data['comment']
df_comment = df_comment.apply(word_cut)
#tf-idf
'''
def tf(word, count):
    return count[word] / sum(count.values())
def n_containing(word, count_list):
    return sum(1 for count in count_list if word in count)
def idf(word, count_list):
    return math.log(len(count_list) / (1 + n_containing(word, count_list)))
def tfidf(word, count, count_list):
    return tf(word, count) * idf(word, count_list)
def main():
    # one Counter per comment; each segmented comment is a space-separated word string
    countlist = [Counter(word_list.split()) for word_list in df_comment]
    for i, count in enumerate(countlist):
        print("Top words in document {}".format(i + 1))
        scores = {word: tfidf(word, count, countlist) for word in count}
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        for word, score in sorted_words:
            print("\tWord: {}, TF-IDF: {}".format(word, round(score, 10)))
'''
if __name__ == "__main__":
    keyword_list = []
    for sentence in df_comment:
        string = ''
        print(sentence)
        keywords = jieba.analyse.extract_tags(sentence, topK=10, withWeight=True)
        for item in keywords:
            string = string + item[0]+' '
        keyword_list.append(string)
    dict__ = pd.DataFrame({'area':list_area,'keyword':keyword_list})
    dict__.to_csv('国家关键词.csv')
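jieba.analyse.extract_tags ranks words by TF-IDF against jieba's built-in corpus and, with withWeight=True, returns (keyword, weight) pairs, which is what the loop above concatenates. A quick standalone check (the sample sentence is made up for illustration, not taken from the scraped data):

import jieba.analyse

sample = '希望让人自由，这是一部关于希望与救赎的电影'  # made-up sample sentence
for word, weight in jieba.analyse.extract_tags(sample, topK=5, withWeight=True):
    print(word, round(weight, 3))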

Processing 2

import pandas as pd
from jieba import analyse
tfidf = analyse.extract_tags
data = pd.read_csv('./top250.csv')
df_comment = data['comment']
list1 = []  # merged comments, one string per movie (every 10 rows)
string = ''
for i in range(len(df_comment)):
    print(i)
    if (i+1) % 10 == 0:
        string = string + df_comment[i]
        list1.append(string)
        string = ''
    else :
        string = string + df_comment[i]
data = data[['name','area','time','dir']]
data = data.drop_duplicates()  # deduplicate: one row per movie
print(data)
print(list1)
df_dir = data['dir'].values  # directors
df_time = data['time'].values  # years
df_area = data['area'].values  # countries
df_name = data['name'].values  # titles
print(len(df_name),len(df_area),len(df_time),len(df_dir),len(list1))
dict_ = pd.DataFrame({'name':df_name,'dir':df_dir,'time':df_time,'area':df_area,'comment':list1})
dict_.to_csv('top250_result.csv')
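The block above merges comments in fixed groups of 10 rows, which only works because every movie contributed exactly 10 comments in order. A sketch of a more robust equivalent that groups by the movie columns instead (assuming the same top250.csv layout):

import pandas as pd

data = pd.read_csv('./top250.csv')
data['comment'] = data['comment'].astype(str)  # guard against missing comments
# one row per movie, with its comments joined into a single string
merged = (data.groupby(['name', 'dir', 'time', 'area'], sort=False)['comment']
              .agg(' '.join)
              .reset_index())
merged.to_csv('top250_result.csv', index=False)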

