python爬取豆瓣影评理论依据_Python 豆瓣影评抓取

最新推荐文章于 2021-03-26 17:47:05 发布

愚者为聪

最新推荐文章于 2021-03-26 17:47:05 发布

阅读量308

点赞数

文章标签： python爬取豆瓣影评理论依据

本文链接：https://blog.csdn.net/weixin_32049085/article/details/114445869

版权

# encoding=utf8
# NOTE: this script targets Python 2 (urllib2, print statements).
import urllib2

# Download the "now playing" page for Hangzhou.
request = urllib2.Request("https://movie.douban.com/nowplaying/hangzhou/")
response = urllib2.urlopen(request)
html_data = response.read().decode('utf-8')
# print html_data

# Extract the movies that are currently showing.
from bs4 import BeautifulSoup as bs

# 'html.parser' is the HTML parser shipped with the Python standard library.
soup = bs(html_data, 'html.parser')
nowplaying_movie = soup.find_all('div', id='nowplaying')
nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')
# print(nowplaying_movie_list[0])

# Collect the id and the name of every movie.
nowplaying_list = []
for item in nowplaying_movie_list:
    nowplaying_dict = {}
    nowplaying_dict['id'] = item['data-subject']
    # The name is taken from the <img> alt text; the last <img> wins.
    for tag_img_item in item.find_all('img'):
        nowplaying_dict['name'] = tag_img_item['alt']
    # NOTE(review): the original indentation was lost in extraction; one
    # record per <li> (append outside the <img> loop) is assumed -- confirm.
    nowplaying_list.append(nowplaying_dict)

# print(nowplaying_list)

# =================== comments ===================
# Position in nowplaying_list of the movie whose comments are fetched.
# The list must contain at least MOVIE_INDEX + 1 entries, otherwise the
# lookup below raises IndexError.
MOVIE_INDEX = 8

# URL of the comments page (first page, 30 comments).
requrl = ('https://movie.douban.com/subject/'
          + nowplaying_list[MOVIE_INDEX]['id']
          + '/comments?start=0&limit=30')
resp = urllib2.Request(requrl)
response2 = urllib2.urlopen(resp)
html_data = response2.read().decode('utf-8')
soup = bs(html_data, 'html.parser')
comment_div_lits = soup.find_all('div', class_='comment')

# Parse the comments: keep the text of the first <p> of every comment div.
eachCommentList = []
for item in comment_div_lits:
    # find() returns the first match or None, so a comment div without any
    # <p> is skipped instead of raising IndexError.
    first_paragraph = item.find('p')
    if first_paragraph is None or first_paragraph.string is None:
        continue
    # Store the text as UTF-8 encoded bytes.
    eachCommentList.append(first_paragraph.string.encode("utf8"))

# print(eachCommentList)

# ================ data cleaning ================
# 1. Concatenate all comments into one string, stripping surrounding
#    whitespace from each of them.
comments = ''.join(str(comment).strip() for comment in eachCommentList)
# print comments

# 2. Keep only the Chinese characters of the text (punctuation excluded).
import re

# u'...' yields the same pattern as the original ur'...' on Python 2 and,
# unlike ur'...', is also accepted by the Python 3 parser.
pattern = re.compile(u'[\u4e00-\u9fa5]+')
filterdata = re.findall(pattern, comments.decode("utf8"))
cleaned_comments = ''.join(filterdata)

# print cleaned_comments

# ================ word segmentation ================
# Segment the cleaned text with jieba.
import jieba
import pandas as pd

segment = jieba.lcut(cleaned_comments)
words_df = pd.DataFrame({'segment': segment})

# Stop-word filtering: stopwords.txt is read into a single 'stopword'
# column and every segment found in it is dropped.
stopwords = pd.read_csv(
    "stopwords.txt",
    index_col=False,
    quoting=3,  # quoting=3: no quoting at all
    sep="\t",
    names=['stopword'],
    encoding='utf-8',
)
words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

# print words_df.head()

# ================ word frequency ================
# numpy import kept from the original script; the counting below no longer
# needs it.
import numpy

# Count how often every segment occurs. The original
# agg({"计数": numpy.size}) relied on dict-based renaming, which pandas 1.0
# removed; size() + reset_index(name=...) builds the same two columns
# ('segment' and '计数').
words_stat = words_df.groupby(by=['segment'])['segment'].size().reset_index(name="计数")
# Most frequent words first.
words_stat = words_stat.sort_values(by=["计数"], ascending=False)

# print words_stat.head()

# ================ word cloud ================
import sys

# Python 2 only: switch the interpreter's default string encoding to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

import matplotlib.pyplot as plt
# matplotlib inline
import matplotlib

matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)

from wordcloud import WordCloud  # word cloud package

# Set the font file, the background colour and the maximum font size.
wordcloud = WordCloud(font_path="simhei.ttf", background_color="white", max_font_size=80)

# word_frequence is a dict and can be passed directly to fit_words().
# It is built from the first two values of each of the first 1000 rows
# of words_stat.
word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
wordcloud = wordcloud.fit_words(word_frequence)

plt.imshow(wordcloud)
plt.axis("off")
plt.show()