pythonallowpos_Python高级应用程序设计任务

import os
import re
import time
from collections import Counter

import jieba
import jieba.analyse
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import openpyxl
import pandas as pd
import requests
from lxml import etree
from snownlp import SnowNLP
from wordcloud import WordCloud

15

def parse_info(html):
    """Extract the rated comments from one parsed comments page.

    Args:
        html: Parsed page root. Only its ``xpath`` method is used; the caller
            in this file passes the result of ``etree.HTML``.

    Returns:
        A list of ``[user_name, star, up_time, content]`` lists, one per
        comment that carries a rating, or ``None`` when the page has no
        ``<div id="comments">`` container.
    """
    comments = html.xpath('//div[@id="comments"]')
    if not comments:
        return None

    page_info = []
    comment_items = comments[0].xpath('./div[@class="comment-item"]/div[@class="comment"]')
    for item in comment_items:
        info = item.xpath('./h3/span[@class="comment-info"]')
        # An entry without a header span cannot be parsed; indexing info[0]
        # below would raise IndexError, so skip it.
        if not info:
            continue
        header = info[0]

        # Commenting user.
        user_name = header.xpath('./a[1]/text()')[0].strip()

        # Rating: drop comments whose information is incomplete. A missing
        # class attribute raises IndexError; a class without "star<digits>"
        # makes re.search return None, hence AttributeError on .group().
        try:
            star_class = header.xpath('./span[2]/@class')[0].strip()
            star = re.search(r'star(\d+)', star_class, re.S).group(1)
        except (IndexError, AttributeError):
            continue

        # Comment time is the title attribute of the last span in the header.
        up_time = header.xpath('./span')[-1].xpath('./@title')[0].strip()
        # Comment text.
        content = item.xpath('string(./p)').strip()
        page_info.append([user_name, star, up_time, content])
    return page_info

44

def _fetch_html(url, headers, max_retries=5, retry_delay=2, timeout=10):
    """Download *url* and return the parsed HTML tree, or None after max_retries failures."""
    for _ in range(max_retries):
        try:
            response = requests.get(url=url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Network-level failure: report it and retry like a bad status.
            print(exc)
        else:
            # A successful request returns status code 200.
            if response.status_code == 200:
                print('访问成功')
                return etree.HTML(response.text)
            print(response.status_code)
        time.sleep(retry_delay)
    return None


def parse_douban():
    """Fetch 25 comment pages for one Douban movie and collect the parsed rows.

    Reads the module-level ``cookie`` string for the ``Cookie`` request
    header, so that name must be defined before this function is called.

    Returns:
        A flat list of every row ``parse_info`` returned for the pages that
        could be downloaded. Pages that keep failing are skipped instead of
        being retried forever.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Language': 'zh,en-US;q=0.9,en;q=0.8,zh-TW;q=0.7,zh-CN;q=0.6',
        'Cookie': cookie,
        'Host': 'movie.douban.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    }

    # Result set.
    datas = []
    for page in range(0, 25):
        print('当前{}页'.format(page + 1))
        # 20 comments per page; "start" is the offset of the first one.
        url = ('https://movie.douban.com/subject/22265687/comments?start={}'
               '&limit=20&sort=new_score&status=P').format(page * 20)
        html = _fetch_html(url, headers)
        if html is None:
            print('第{}页请求失败,已跳过'.format(page + 1))
            continue
        page_info = parse_info(html)
        if page_info is not None:
            datas.extend(page_info)
        # Pause briefly after every page request.
        time.sleep(0.5)

    return datas

80

def save_data(datas, path=None):
    """Save the scraped comment rows to an Excel workbook.

    Args:
        datas: Iterable of rows; each row is appended to the sheet as-is
            below the header row.
        path: Destination file. When omitted, the module-level ``file_name``
            is used, which is the original behaviour.
    """
    if path is None:
        path = file_name

    # Create the workbook and take its default sheet.
    xls = openpyxl.Workbook()
    sheet = xls.active
    # Column headers.
    sheet.append(['评论用户', '评分等级', '评论时间', '评论内容'])
    # Append every row of the result set.
    for line in datas:
        sheet.append(line)
    xls.save(path)

95

def gen_word_cloud(all_words):
    """Render a word cloud from keyword frequencies and save it as an image.

    Args:
        all_words: Mapping of keyword to frequency, passed to
            ``WordCloud.fit_words``.

    The image is written to ``词云图.jpg`` in the working directory. Nothing
    is written when ``all_words`` is empty.
    """
    # WordCloud needs at least one word to lay out; skip instead of failing.
    if not all_words:
        print('没有可用的关键词,跳过词云生成')
        return

    wordcloud = WordCloud(
        font_path='simhei.ttf',    # font file
        background_color='white',  # background colour
        max_font_size=120,         # font size of the most frequent word
        width=1000,                # image width
        height=600,                # image height
    ).fit_words(all_words)
    # Save the word cloud image locally.
    wordcloud.to_file("词云图.jpg")

108

def snow_nlp(words):
    """Score each comment with SnowNLP and export a table and two charts.

    Args:
        words: Iterable of comment texts.

    Writes ``情感值表.xlsx`` (comment and score), ``情感值直方图.jpg``
    (histogram of scores) and ``情感统计条形图.jpg`` (counts of negative,
    neutral and positive comments) to the working directory.
    """
    sentiments_list = []
    low = 0     # negative comments (score < 0.4)
    center = 0  # neutral comments
    high = 0    # positive comments (score > 0.6)
    for word in words:
        # pandas reads empty Excel cells back as NaN (a float); such rows
        # carry no text to score, so they are skipped.
        if not isinstance(word, str):
            continue
        value = SnowNLP(word).sentiments
        if value < 0.4:
            low += 1
        elif value > 0.6:
            high += 1
        else:
            center += 1
        sentiments_list.append([word, value])

    # Convert the result list to a DataFrame.
    df = pd.DataFrame(sentiments_list, columns=['word', 'sentiments'])
    # Export the scores. The ``encoding`` argument and ``.xls`` output are
    # not supported by pandas 2.x, so the table is written as .xlsx.
    df.to_excel('情感值表.xlsx', index=False, header=['评论', '情感值'])

    # Histogram. linspace includes the right edge 1.0; arange(0, 1, 0.01)
    # stopped at 0.99, so scores above 0.99 were left out of the plot.
    plt.figure()
    df['sentiments'].hist(bins=np.linspace(0, 1, 101))
    plt.xlabel('情感值')
    plt.ylabel('数量')
    plt.savefig('情感值直方图.jpg')
    plt.close()

    # Bar chart; figsize is in inches (400x500 px at the default 100 dpi).
    labels = ['消极评论', '居中评论', '积极评论']
    counts = [low, center, high]
    plt.figure(figsize=(4, 5))
    plt.bar(labels, counts, width=0.2)
    # Annotate every bar with its count.
    for x, y in zip(labels, counts):
        plt.text(x, y, '%d' % y, ha='center', va='bottom')
    plt.ylabel('数量')
    plt.savefig('情感统计条形图.jpg')
    plt.close()

153

def seg_star(stars):
    """Count comments per star level and save the result as a line chart.

    Args:
        stars: Iterable of rating values convertible with ``int``. The value
            divided by 10 is the star level (10 -> 1 star ... 50 -> 5 stars).

    The chart is written to ``评分统计折线图.jpg`` in the working directory.
    """
    x = [1, 2, 3, 4, 5]
    y_data = [0] * len(x)
    for star in stars:
        idx = int(star) // 10
        # Ignore values outside 1-5 stars: idx 0 would wrap around to
        # y_data[-1] and silently inflate the 5-star bucket.
        if 1 <= idx <= len(y_data):
            y_data[idx - 1] += 1

    plt.figure()
    plt.plot(x, y_data, ls="--", label="评分")
    plt.legend()
    plt.xlabel('评分')
    plt.ylabel('数量')
    # Save as an image, then release the figure.
    plt.savefig('评分统计折线图.jpg')
    plt.close()

170

def seg_depart(words):
    """Extract keywords from every comment and export their frequencies.

    Args:
        words: Iterable of comment texts.

    Passes the keyword counts to ``gen_word_cloud`` and writes them, sorted
    by descending frequency, to ``词频统计.xlsx`` in the working directory.
    """
    # Keyword counts across all comments, used for the frequency statistics.
    all_words = Counter()
    for content in words:
        # pandas reads empty Excel cells back as NaN (a float); there is no
        # text to segment in that case.
        if not isinstance(content, str):
            continue
        # TextRank keyword extraction restricted to fixed parts of speech:
        # place names, nouns, verbal nouns and verbs.
        keywords = jieba.analyse.textrank(content, topK=50, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))
        all_words.update(keywords)

    # Generate the word cloud image.
    gen_word_cloud(all_words)

    # Descending by frequency; ties keep first-seen order.
    ranked = all_words.most_common()
    # Naming the columns here keeps the export valid for an empty result too.
    df_words = pd.DataFrame(ranked, columns=['关键词', '频率'])
    # The ``encoding`` argument and ``.xls`` output are not supported by
    # pandas 2.x, so the table is written as .xlsx.
    df_words.to_excel('词频统计.xlsx', index=False)

194

if __name__ == "__main__":
    # Output workbook name; save_data reads this module-level variable.
    file_name = '星球9影评.xlsx'

    # Request cookie; parse_douban reads this module-level variable.
    # NOTE(review): the fallback below embeds a logged-in session cookie in
    # source control. Prefer supplying DOUBAN_COOKIE via the environment and
    # consider removing or rotating the embedded value.
    cookie = os.environ.get('DOUBAN_COOKIE') or 'bid=RA47mCn8jns; douban-fav-remind=1; __yadk_uid=KgOjFzNBFlx2BIqCAZv0sDnAcUUNFLcc; ll="108309"; trc_cookie_storage=taboola%2520global%253Auser-id%3D751098a0-eba6-4d58-9238-ffb6ce1c9886-tuct47a058c; __gads=ID=f1c44b7e3685ecb3:T=1571501126:S=ALNI_Maxii85e0vX-F9TCUt7gB8TGSw4cA; __utmc=30149280; __utmc=223695111; _vwo_uuid_v2=D397EEE8C1B4A53B7AE894D1130A06008|ab9eefd2bf71bcb86c5bf8f1e0acbb87; __utmz=30149280.1576584118.11.9.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1576738591%2C%22https%3A%2F%2Fwww.baidu.com%2Fs%3Fie%3Dutf-8%26f%3D8%26rsv_bp%3D1%26tn%3Dbaidu%26wd%3D%25E8%25B1%2586%25E7%2593%25A3%26oq%3D%2525E7%2525A5%2525A8%2525E6%252588%2525BF%2525E7%2525BD%252591%26rsv_pq%3Dffdf6fa10000d4b2%26rsv_t%3D379aFjV%252Bjy0GlPQGAbqS8HUe%252Fd%252FAYerEeU30lnTLeieUD%252Ba4XQxA3miOQM4%26rqlang%3Dcn%26rsv_enter%3D1%26rsv_dl%3Dtb%26rsv_sug3%3D7%26rsv_sug1%3D1%26rsv_sug7%3D100%26rsv_sug2%3D0%26inputT%3D899%26rsv_sug4%3D899%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.941855647.1569745576.1576584118.1576738591.12; __utmz=223695111.1576740585.10.9.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmb=223695111.0.10.1576740585; __utma=223695111.13086392.1571304138.1576738591.1576740585.10; __utmb=30149280.1.10.1576738591; dbcl2="200835379:38bnY8zREqw"; ck=o-Px; _pk_id.100001.4cf6=7b594b040ee131b3.1571304137.8.1576742234.1576584193.; push_noty_num=0; push_doumail_num=0'

    # Scrape the comments.
    datas = parse_douban()
    # Save them to the workbook.
    save_data(datas)

    # Read back the workbook that was just written (same name as above).
    df = pd.read_excel(file_name)

    # Use a Chinese-capable font and render the minus sign correctly.
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False

    # Keyword frequency statistics.
    seg_depart(df['评论内容'].values)
    # Sentiment analysis.
    snow_nlp(df['评论内容'].values)
    # Rating chart.
    seg_star(df['评分等级'].values)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值