模拟登录豆瓣
第一次登录需要输入验证码;之后再次运行时可以省去 login('username', 'password') 这一步,因为 session 已经把必要的登录信息(cookie)保存了下来。代码如下:
import requests
try:
import cookielib
except:
import http.cookiejar as cookielib
import re
import time
import os.path
import json
from bs4 import BeautifulSoup
try:
from PIL import Image
except:
pass
from mywordCloud import save_jieba_result
from mywordCloud import draw_wordcloud
import threading
import codecs
# Request headers sent with every www.douban.com request.
agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
headers = {
    "Host": "www.douban.com",
    "Referer": "https://www.douban.com/",
    'User-Agent': agent,
}
# Shared HTTP session. Its cookies live in an LWP-format file named
# "cookies" in the working directory so a previous login can be reused.
session = requests.session()
session.cookies = cookielib.LWPCookieJar(filename='cookies')
try:
    session.cookies.load(ignore_discard=True)
except (IOError, cookielib.LoadError):
    # No cookie file yet (first run) or the file cannot be parsed:
    # continue with an empty jar. Other errors are left to propagate.
    print("cookie 未能加载")
else:
    print('成功加载cookie')
# 获取验证码
def get_captcha(url):
    """Download the captcha image at *url* and return what the user types.

    The image is fetched through the module-level ``session`` and saved as
    ``captcha.jpg`` in the current working directory. If Pillow can open
    it, the image is displayed; otherwise the user is told where the file
    is so it can be opened by hand.

    Args:
        url: address of the captcha image.

    Returns:
        The captcha text entered on standard input.
    """
    print('获取验证码', url)
    response = session.get(url, headers=headers)
    # The with-statement closes the file; no explicit close() is needed.
    with open('captcha.jpg', 'wb') as f:
        f.write(response.content)
    # Showing the image is best-effort only.
    try:
        im = Image.open('captcha.jpg')
        im.show()
        im.close()
    except Exception:
        # Covers a missing Pillow install (Image is then undefined, so this
        # is a NameError) as well as failures to open or display the image.
        print(u'请到 %s 目录找到captcha.jpg 手动输入' % os.path.abspath('captcha.jpg'))
    return input("please input the captcha\n>")
def isLogin():
    """Return True when the shared session appears to be logged in.

    Requests a hard-coded profile page without following redirects and
    treats an HTTP 200 response as proof of a valid login.
    """
    profile_url = 'https://www.douban.com/people/151607908/'
    response = session.get(profile_url, headers=headers, allow_redirects=False)
    return response.status_code == 200
def login(acount, secret):
    """Log in to douban.com and persist the resulting cookies.

    Loads the home page to find out whether the login form carries a
    captcha. If it does, the user is asked to solve it via
    ``get_captcha``. The credentials are then posted, the session cookies
    are saved to disk, and the outcome of ``isLogin()`` is printed.

    Args:
        acount: value submitted as ``form_email``.
        secret: value submitted as ``form_password``.
    """
    douban = "https://www.douban.com/"
    home_html = session.get(douban, headers=headers).text
    captcha_urls = re.findall(r'id="captcha_image" src="(.*?)" alt="captcha"', home_html)
    captcha_ids = re.findall(r'type="hidden" name="captcha-id" value="(.*?)"', home_html)
    print(captcha_ids)
    post_data = {
        "source": "index_nav",
        'form_email': acount,
        'form_password': secret,
    }
    if captcha_urls:
        print('验证码连接', captcha_urls)
        post_data['captcha-solution'] = get_captcha(captcha_urls[0])
        # The hidden captcha-id field may be absent even when the image is
        # present; indexing blindly would raise IndexError after the user
        # has already typed the captcha.
        if captcha_ids:
            post_data['captcha-id'] = captcha_ids[0]
    # Echo the form for debugging, but never the clear-text password.
    print(dict(post_data, form_password='******'))
    post_url = 'https://www.douban.com/accounts/login'
    session.post(post_url, data=post_data, headers=headers)
    # Save with ignore_discard=True, matching how the jar is loaded:
    # without it, session cookies (marked "discard") are not written.
    session.cookies.save(ignore_discard=True)
    if isLogin():
        print('登录成功')
    else:
        print('登录失败')
def get_movie_sort():
    """Fetch the Douban movie chart page and print its ``<a class="nbg">`` tags."""
    time.sleep(1)
    response = session.get('https://movie.douban.com/chart', headers=headers)
    page = BeautifulSoup(response.text, 'html.parser')
    print(page.find_all('a', {'class': 'nbg'}))
#爬取短评论
def get_comment(filename):  # filename: file the scraped comments are written to
    """Crawl the short comments of one Douban movie page by page.

    Starts at the ``start=20`` page of subject 11600078 and keeps following
    the paginator's "后页" (next page) link until there is none. The inner
    text of each comment's ``<p class="">`` element is appended to
    *filename* (UTF-8; the file is truncated first).

    Args:
        filename: path of the output text file.
    """
    # Imported locally so this function brings its own dependency.
    from html import unescape

    page = 1
    comment_url = 'https://movie.douban.com/subject/11600078/comments'
    next_url = '?start=20&limit=20&sort=new_score&status=P'
    headers2 = {
        "Host": "movie.douban.com",
        "Referer": "https://www.douban.com/",
        'User-Agent': agent,
        'Connection': 'keep-alive',
    }
    open_tag = '<p class="">'
    next_link = re.compile(r'href="(.*?)">后页')
    # The with-statement closes the file even if a request or parse fails.
    with open(filename, 'w+', encoding='utf-8') as f:
        while True:
            time.sleep(6)  # throttle requests
            response = session.get(url=comment_url + next_url, headers=headers2)
            soup = BeautifulSoup(response.text, 'html.parser')
            # Extract every short comment on the current page.
            for item in soup.find_all('div', {'class': 'comment'}):
                s = str(item)
                start = s.find(open_tag)
                end = s.find('</p>')
                if start == -1 or end == -1:
                    # No comment paragraph in this block; slicing with -1
                    # would yield unrelated markup.
                    continue
                text = s[start + len(open_tag):end]
                # Skip fragments that still contain nested markup.
                if 'class' not in text:
                    f.write(text)
            # Locate the link to the next page; stop when there is none.
            paginator = soup.find_all('div', {'id': 'paginator'})
            if not paginator:
                break
            links = next_link.findall(str(paginator[0]))
            if not links:
                break
            # The href comes from serialized HTML, where "&" is written as
            # "&amp;"; unescape it so the query string is sent intact.
            next_url = unescape(links[0])
            print('%d爬取下一页评论...' % page)
            page += 1
            # Take a longer 40-second pause whenever the page counter
            # reaches a multiple of 6.
            if page % 6 == 0:
                time.sleep(40)
                print('休息...')
            print(next_url)
#多线程爬虫,爬取豆瓣影评
def thread_get_comment(filename):
    """Crawl the short comments of one Douban movie with up to 5 threads.

    Two seed pages of subject 26363254 are put on a shared work list. Each
    worker pops a URL, writes the inner text of every comment's
    ``<p class="">`` element to *filename*, and queues the page's "后页"
    (next page) link unless it has been seen before. The function returns
    once the work list is empty and all workers have finished.

    Args:
        filename: path of the output text file (UTF-8, truncated first).
    """
    # Imported locally so this function brings its own dependency.
    from html import unescape

    next_url = '?start=19&limit=20&sort=new_score&status=P'
    headers2 = {
        "Host": "movie.douban.com",
        "Referer": "https://www.douban.com/",
        'User-Agent': agent,
        'Connection': 'keep-alive',
    }
    comment_url = 'https://movie.douban.com/subject/26363254/comments'
    crawl_queue = [comment_url + next_url]
    crawl_queue.append('https://movie.douban.com/subject/26363254/comments?start=144&limit=20&sort=new_score&status=P')
    seen = set(crawl_queue)
    open_tag = '<p class="">'
    next_link = re.compile(r'href="(.*?)">后页')
    # Text file objects are not safe for concurrent writes; serialise them.
    write_lock = threading.Lock()

    f = open(filename, 'w+', encoding='utf-8')
    try:
        def process_queue():
            """Worker: drain the work list until it is empty or a chain ends."""
            begin = 1
            while True:
                try:
                    url = crawl_queue.pop()
                except IndexError:
                    break  # nothing left to do
                time.sleep(5)  # throttle requests
                response = session.get(url=url, headers=headers2)
                soup = BeautifulSoup(response.text, 'html.parser')
                # Extract every short comment on the current page.
                for item in soup.find_all('div', {'class': 'comment'}):
                    s = str(item)
                    start = s.find(open_tag)
                    end = s.find('</p>')
                    if start == -1 or end == -1:
                        # No comment paragraph here; slicing with -1 would
                        # write unrelated markup.
                        continue
                    with write_lock:
                        f.write(s[start + len(open_tag):end])
                # Locate the link to the next page; this worker stops when
                # the page has none.
                paginator = soup.find_all('div', {'id': 'paginator'})
                if not paginator:
                    break
                links = next_link.findall(str(paginator[0]))
                if not links:
                    break
                # Serialized HTML writes "&" as "&amp;"; unescape so the
                # URL matches the seed format used for de-duplication.
                following = unescape(links[0])
                print('%d爬取下一页评论...' % begin)
                begin = begin + 1
                # Take a longer 30-second pause whenever the counter
                # reaches a multiple of 6.
                if begin % 6 == 0:
                    print('休息...')
                    time.sleep(30)
                print(following)
                if comment_url + following not in seen:
                    seen.add(comment_url + following)
                    crawl_queue.append(comment_url + following)

        threads = []
        max_threads = 5
        while threads or crawl_queue:
            # Rebuild the list instead of removing during iteration, which
            # would skip the element after each removed thread.
            threads = [t for t in threads if t.is_alive()]
            while len(threads) < max_threads and crawl_queue:
                thread = threading.Thread(target=process_queue)
                print('--------下一个线程----------')
                # Daemon threads let the main thread exit on Ctrl+C.
                thread.daemon = True
                thread.start()
                threads.append(thread)
            time.sleep(2)
    finally:
        f.close()
if __name__ == '__main__':
    # Log in only when the saved cookies do not already give a valid session.
    # NOTE(review): credentials are hard-coded here; consider reading them
    # from the environment or a prompt instead.
    if not isLogin():
        print('xs')
        login('dsdz@qq.com', '5sdfsd6')
    else:
        print('您已经登录')
    file_name = 'key3.txt'
    get_comment(file_name)  # single-threaded crawler
    #thread_get_comment(file_name)  # multi-threaded crawler
    # Segment the crawled comments, then render the word cloud.
    save_jieba_result(file_name)
    draw_wordcloud('pjl_jieba.txt')
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
- 89
- 90
- 91
- 92
- 93
- 94
- 95
- 96
- 97
- 98
- 99
- 100
- 101
- 102
- 103
- 104
- 105
- 106
- 107
- 108
- 109
- 110
- 111
- 112
- 113
- 114
- 115
- 116
- 117
- 118
- 119
- 120
- 121
- 122
- 123
- 124
- 125
- 126
- 127
- 128
- 129
- 130
- 131
- 132
- 133
- 134
- 135
- 136
- 137
- 138
- 139
- 140
- 141
- 142
- 143
- 144
- 145
- 146
- 147
- 148
- 149
- 150
- 151
- 152
- 153
- 154
- 155
- 156
- 157
- 158
- 159
- 160
- 161
- 162
- 163
- 164
- 165
- 166
- 167
- 168
- 169
- 170
- 171
- 172
- 173
- 174
- 175
- 176
- 177
- 178
- 179
- 180
- 181
- 182
- 183
- 184
- 185
- 186
- 187
- 188
- 189
- 190
- 191
- 192
- 193
- 194
- 195
- 196
- 197
- 198
- 199
- 200
- 201
- 202
- 203
- 204
- 205
- 206
- 207
- 208
- 209
- 210
- 211
- 212
- 213
- 214
- 215
- 216
- 217
- 218
- 219
- 220
- 221
- 222
- 223
- 224
- 225
- 226
- 227
- 228
- 229
- 230
- 231
- 232
- 233
- 234
- 235
- 236
- 237
- 238
- 239
- 240
- 241
- 242
- 243
- 244
- 245
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
- 89
- 90
- 91
- 92
- 93
- 94
- 95
- 96
- 97
- 98
- 99
- 100
- 101
- 102
- 103
- 104
- 105
- 106
- 107
- 108
- 109
- 110
- 111
- 112
- 113
- 114
- 115
- 116
- 117
- 118
- 119
- 120
- 121
- 122
- 123
- 124
- 125
- 126
- 127
- 128
- 129
- 130
- 131
- 132
- 133
- 134
- 135
- 136
- 137
- 138
- 139
- 140
- 141
- 142
- 143
- 144
- 145
- 146
- 147
- 148
- 149
- 150
- 151
- 152
- 153
- 154
- 155
- 156
- 157
- 158
- 159
- 160
- 161
- 162
- 163
- 164
- 165
- 166
- 167
- 168
- 169
- 170
- 171
- 172
- 173
- 174
- 175
- 176
- 177
- 178
- 179
- 180
- 181
- 182
- 183
- 184
- 185
- 186
- 187
- 188
- 189
- 190
- 191
- 192
- 193
- 194
- 195
- 196
- 197
- 198
- 199
- 200
- 201
- 202
- 203
- 204
- 205
- 206
- 207
- 208
- 209
- 210
- 211
- 212
- 213
- 214
- 215
- 216
- 217
- 218
- 219
- 220
- 221
- 222
- 223
- 224
- 225
- 226
- 227
- 228
- 229
- 230
- 231
- 232
- 233
- 234
- 235
- 236
- 237
- 238
- 239
- 240
- 241
- 242
- 243
- 244
- 245
爬取到的评论保存在 key3.txt 文本文件中:
生成图云
第一步:安装必要的 Python 库,其中生成图云(即词云)需要 scipy 和 wordcloud。Python 库的安装方法,可以参考笔者的博客《安装第三方库》。一切准备就绪之后,先使用 jieba 对得到的所有评论进行分词,分词完成之后就可以绘制图云。
其中主要的代码 mywordCloud.py
from scipy.misc import imread
import codecs
from os import path
import jieba
from wordcloud import WordCloud
#暂时没有用到
def get_all_keywords(file_name):
    """Count how often each jieba token occurs in *file_name*.

    Prints the number of distinct tokens, then one numbered line per token
    with its count, most frequent first. The same "token:count次" lines are
    written to ``count_word.txt`` in the current working directory.

    Args:
        file_name: path of a UTF-8 text file to analyse.
    """
    # Imported locally so this function brings its own dependency.
    from collections import Counter

    # A single pass with Counter replaces calling list.count() for every
    # distinct token, which was quadratic in the number of tokens.
    counts = Counter()
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            counts.update(jieba.cut(line))
    print(u'共有%d个关键词' % len(counts))
    sort_count = []
    for k, (w, n) in enumerate(counts.most_common(), 1):
        sort_count.append(w + u':' + str(n) + u"次\n")
        print(u"%d---" % k + w + u":" + str(n) + u"次")
    with codecs.open('count_word.txt', 'w', encoding='utf-8') as f:
        f.writelines(sort_count)
def save_jieba_result(file_name):
    """Segment the text in *file_name* with jieba and save the tokens.

    *file_name* is resolved relative to the directory containing this
    module. The tokens are joined with single spaces and written to
    ``pjl_jieba.txt`` in the current working directory.
    """
    # Parallel segmentation is available but left disabled:
    #jieba.enable_parallel(4)
    source_path = path.join(path.dirname(__file__), file_name)
    print(source_path)
    with codecs.open(source_path, encoding='utf-8') as src:
        raw_text = src.read()
    segmented = " ".join(jieba.cut(raw_text))
    with codecs.open('pjl_jieba.txt', 'w', encoding='utf-8') as dst:
        dst.write(segmented)
def draw_wordcloud(file_name, mask_path='timg.jpg',
                   font_path=r'C:\Windows\Fonts\simfang.ttf',
                   output_path='mycloud.jpg'):
    """Render a word cloud from the space-separated tokens in *file_name*.

    Args:
        file_name: UTF-8 text file whose content is fed to WordCloud.
        mask_path: image passed to WordCloud as its mask.
        font_path: font file used for rendering. The default is a Windows
            system font; pass another font on other platforms.
        output_path: where the rendered image is written.
    """
    with codecs.open(file_name, encoding='utf-8') as f:
        comment_text = f.read()
    color_mask = imread(mask_path)  # load the mask image
    # Leftover HTML fragments and common filler words to leave out.
    stopwords = {
        'png', 'douban', 'com', 'href', 'https', 'img', 'img3', 'class',
        'source', 'icon', 'shire',
        u'有点', u'真的', u'觉得', u'还是', u'一个', u'就是', u'电影',
        u'你们', u'这么', u'不过', u'但是', u'什么', u'没有', u'这个',
        u'那个', u'大家', u'比较', u'看到', u'真是', u'除了', u'时候',
        u'已经', u'可以',
    }
    cloud = WordCloud(
        font_path=font_path,
        background_color='white',
        max_words=20000,
        max_font_size=200,
        min_font_size=10,
        mask=color_mask,
        stopwords=stopwords,
    )
    word_cloud = cloud.generate(comment_text)  # build the word cloud
    word_cloud.to_file(output_path)
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
通过上面两段代码,就可以生成漂亮的图云,直观地展示观看《战狼2》这部电影的观众在评论中最常提到的关键词:
附上笔者的github源代码地址:https://github.com/wu-yy/warWolf