【干货】python爬取《战狼2》电影短评论，生成图云

最新推荐文章于 2021-07-21 20:48:27 发布

天府云创

最新推荐文章于 2021-07-21 20:48:27 发布

阅读量4.1k

点赞数

模拟登陆豆瓣

第一次登录需要输入验证码；之后再运行时可以省略 login('username', 'password') 这一调用，因为 session 已经把必要的登录信息保存在 cookie 文件中。代码如下：

import requests
try:
    import cookielib
except:
    import http.cookiejar as cookielib
import re
import time
import os.path
import json
from bs4 import BeautifulSoup
try:
    from PIL import Image
except:
    pass

from mywordCloud import save_jieba_result
from mywordCloud import draw_wordcloud
import threading
import codecs
# Request headers sent with every www.douban.com request.
agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
headers = {
    "Host": "www.douban.com",
    "Referer": "https://www.douban.com/",
    'User-Agent': agent,
}

# Shared session whose cookie jar is backed by the local file 'cookies',
# so login state can be reused between runs.
session = requests.session()
session.cookies = cookielib.LWPCookieJar(filename='cookies')

try:
    session.cookies.load(ignore_discard=True)
except (IOError, OSError):
    # A missing or unreadable cookie file (e.g. on the first run) is not fatal:
    # the script simply continues with an empty cookie jar.
    print("cookie 未能加载")
else:
    print('成功加载cookie')

# 获取验证码
def get_captcha(url):
    """Download the captcha image at *url* and ask the user to type it in.

    The image is written to ``captcha.jpg`` in the current working directory.
    It is then displayed through Pillow when possible; otherwise the user is
    told where to find the file.

    Args:
        url: Address of the captcha image.

    Returns:
        The text the user entered on standard input.
    """
    print('获取验证码', url)
    r = session.get(url, headers=headers)
    with open('captcha.jpg', 'wb') as f:
        f.write(r.content)
    # Displaying the image is best effort: Pillow may be missing (the guarded
    # import at the top of the file leaves ``Image`` undefined) or no viewer
    # may be available. Fall back to pointing the user at the saved file.
    try:
        with Image.open('captcha.jpg') as im:
            im.show()
    except Exception:
        captcha_dir = os.path.dirname(os.path.abspath('captcha.jpg'))
        print(u'请到 %s 目录找到captcha.jpg 手动输入' % captcha_dir)
    captcha = input("please input the captcha\n>")
    return captcha

def isLogin():
    """Report whether the shared session is currently logged in.

    Requests a personal profile page without following redirects; only an
    HTTP 200 response counts as logged in.

    Returns:
        True when the status code is 200, False otherwise.
    """
    profile_url = 'https://www.douban.com/people/151607908/'
    response = session.get(profile_url, headers=headers, allow_redirects=False)
    return response.status_code == 200


def login(acount, secret):
    """Log in to Douban with the shared session and persist the cookies.

    The home page is fetched first to detect a captcha challenge. When one is
    present the user is asked to solve it and the solution is posted together
    with the credentials. The outcome is printed, not returned.

    Args:
        acount: Account e-mail, sent as ``form_email``.
        secret: Account password, sent as ``form_password``.
    """
    douban = "https://www.douban.com/"
    htmlcha = session.get(douban, headers=headers).text
    captcha_urls = re.findall(r'id="captcha_image" src="(.*?)" alt="captcha"', htmlcha)
    captcha_ids = re.findall(r'type="hidden" name="captcha-id" value="(.*?)"', htmlcha)
    print(captcha_ids)

    post_data = {
        "source": "index_nav",
        'form_email': acount,
        'form_password': secret
    }
    if captcha_urls:
        print('验证码连接', captcha_urls)
        post_data['captcha-solution'] = get_captcha(captcha_urls[0])
        # The hidden captcha id is matched separately from the image URL, so
        # it can be absent even when an image was found.
        if captcha_ids:
            post_data['captcha-id'] = captcha_ids[0]

    # Log the submitted form for debugging, but never echo the password.
    print(dict(post_data, form_password='******'))
    post_url = 'https://www.douban.com/accounts/login'
    session.post(post_url, data=post_data, headers=headers)
    # Save with ignore_discard=True to mirror how the jar is loaded at start-up;
    # otherwise cookies flagged as session-only are dropped from the file.
    session.cookies.save(ignore_discard=True)

    if isLogin():
        print('登录成功')
    else:
        print('登录失败')


def get_movie_sort():
    """Fetch the Douban movie chart page and print its ``<a class="nbg">`` tags."""
    time.sleep(1)
    chart_page = session.get('https://movie.douban.com/chart', headers=headers)
    parsed = BeautifulSoup(chart_page.text, 'html.parser')
    print(parsed.find_all('a', attrs={'class': 'nbg'}))

#爬取短评论
def get_comment(filename):
    """Crawl the short comments of one Douban movie page by page.

    Comments are appended back to back to *filename*, which is created or
    truncated and written as UTF-8. The crawl follows the "后页" (next page)
    link until the paginator or the link is missing. It sleeps 6 seconds
    before every request and an extra 40 seconds each time the page counter
    reaches a multiple of 6.

    Args:
        filename: Path of the text file that receives the crawled comments.
    """
    from html import unescape

    begin = 1
    comment_url = 'https://movie.douban.com/subject/11600078/comments'
    # NOTE(review): the crawl starts at start=20, so the first page (start=0)
    # is never requested - confirm this is intended.
    next_url = '?start=20&limit=20&sort=new_score&status=P'
    headers2 = {
        "Host": "movie.douban.com",
        "Referer": "https://www.douban.com/",
        'User-Agent': agent,
        'Connection': 'keep-alive',
    }
    open_tag = '<p class="">'
    next_page_pattern = re.compile(r'href="(.*?)">后页')

    # The context manager closes the file even when a request or parse fails.
    with open(filename, 'w+', encoding='utf-8') as f:
        while True:
            time.sleep(6)
            response = session.get(url=comment_url + next_url, headers=headers2)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract the text between '<p class="">' and the following '</p>'
            # from every comment block on the current page.
            for item in soup.find_all('div', {'class': 'comment'}):
                s = str(item)
                start = s.find(open_tag)
                if start == -1:
                    continue
                end = s.find('</p>', start)
                if end == -1:
                    continue
                s2 = s[start + len(open_tag):end]
                # Skip fragments that still contain markup.
                if 'class' not in s2:
                    f.write(s2)

            # Locate the link to the next page.
            paginator = soup.find_all('div', {'id': 'paginator'})
            if not paginator:
                break
            links = next_page_pattern.findall(str(paginator[0]))
            if not links:  # no next-page link: last page reached
                break
            # The href comes from serialised HTML, where '&' is written as
            # '&amp;'; unescape it so the query string is sent correctly.
            next_url = unescape(links[0])
            print('%d爬取下一页评论...' % begin)
            begin = begin + 1
            # Take a longer break whenever the counter hits a multiple of 6.
            if begin % 6 == 0:
                print('休息...')
                time.sleep(40)
            print(next_url)

#多线程爬虫，爬取豆瓣影评
def thread_get_comment(filename):
    """Crawl Douban short comments with up to five worker threads.

    Two seed URLs are placed on a shared queue. Each worker pops a URL,
    writes the comments found on that page to *filename* (created or
    truncated, UTF-8) and queues the page's "后页" (next page) link if it has
    not been seen before. The function returns once the queue is empty and
    every worker has finished.

    Args:
        filename: Path of the text file that receives the crawled comments.
    """
    from html import unescape

    next_url = '?start=19&limit=20&sort=new_score&status=P'
    headers2 = {
        "Host": "movie.douban.com",
        "Referer": "https://www.douban.com/",
        'User-Agent': agent,
        'Connection': 'keep-alive',
    }
    comment_url = 'https://movie.douban.com/subject/26363254/comments'
    crawl_queue = [comment_url + next_url]
    crawl_queue.append('https://movie.douban.com/subject/26363254/comments?start=144&limit=20&sort=new_score&status=P')
    seen = set(crawl_queue)
    # Serialises file writes and the check-then-add on seen/crawl_queue.
    lock = threading.Lock()
    open_tag = '<p class="">'
    next_page_pattern = re.compile(r'href="(.*?)">后页')

    # The context manager closes the file even if the scheduling loop fails.
    with open(filename, 'w+', encoding='utf-8') as f:

        def process_queue():
            """Worker loop: crawl queued pages until the queue is empty."""
            begin = 1
            while True:
                try:
                    url = crawl_queue.pop()
                except IndexError:
                    break
                time.sleep(5)
                response = session.get(url=url, headers=headers2)
                soup = BeautifulSoup(response.text, 'html.parser')

                # Collect the text between '<p class="">' and the following
                # '</p>' of every comment block, then write the page at once.
                page_comments = []
                for item in soup.find_all('div', {'class': 'comment'}):
                    s = str(item)
                    start = s.find(open_tag)
                    if start == -1:
                        continue
                    end = s.find('</p>', start)
                    if end == -1:
                        continue
                    page_comments.append(s[start + len(open_tag):end])
                with lock:
                    f.write(''.join(page_comments))

                # Locate the link to the next page. A page without one ends
                # only this chain; other queued URLs are still processed.
                paginator = soup.find_all('div', {'id': 'paginator'})
                if not paginator:
                    continue
                links = next_page_pattern.findall(str(paginator[0]))
                if not links:
                    continue
                # The href comes from serialised HTML, where '&' is written
                # as '&amp;'; unescape it before building the next URL.
                next_page = unescape(links[0])
                print('%d爬取下一页评论...' % begin)
                begin = begin + 1
                # Take a longer break whenever the counter hits a multiple of 6.
                if begin % 6 == 0:
                    print('休息...')
                    time.sleep(30)

                print(next_page)
                with lock:
                    if comment_url + next_page not in seen:
                        seen.add(comment_url + next_page)
                        crawl_queue.append(comment_url + next_page)

        threads = []
        max_threads = 5
        while threads or crawl_queue:
            # Rebuild the list instead of removing while iterating over it.
            threads = [t for t in threads if t.is_alive()]
            while len(threads) < max_threads and crawl_queue:
                thread = threading.Thread(target=process_queue)
                print('--------下一个线程----------')
                # Daemon threads let the main thread exit on Ctrl+C.
                thread.daemon = True
                thread.start()
                threads.append(thread)
            time.sleep(2)

if __name__=='__main__':
    # Log in only when the saved cookies no longer give an authenticated session.
    if not isLogin():
        print('xs')
        login('dsdz@qq.com','5sdfsd6')
    else:
        print('您已经登录')

    # Pipeline: crawl comments -> segment them with jieba -> render the word cloud.
    file_name = 'key3.txt'
    # Single-threaded crawler; thread_get_comment(file_name) is the
    # multi-threaded alternative.
    get_comment(file_name)
    save_jieba_result(file_name)
    draw_wordcloud('pjl_jieba.txt')


 
 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
 
 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245

爬取到的评论保存在 key3.txt 文本文件中：
这里写图片描述

生成图云

第一步：安装必要的 Python 库，其中生成图云需要 scipy 和 wordcloud，分词需要 jieba。Python 库的安装方法可以参考笔者的博客《安装第三方库》。一切准备就绪之后，先使用 jieba 对得到的所有评论进行分词，分词之后就可以绘制图云。

其中主要的代码 mywordCloud.py

import codecs
from collections import Counter
from os import path

import jieba
import numpy as np
from PIL import Image
from wordcloud import WordCloud

try:
    from scipy.misc import imread
except ImportError:
    # scipy.misc.imread is no longer shipped by current SciPy releases; keep
    # the module importable when it is unavailable.
    imread = None


#暂时没有用到
def get_all_keywords(file_name):
    """Count how often each jieba token occurs in a UTF-8 text file.

    The number of distinct tokens and a numbered line per token are printed,
    most frequent first. The same ``token:count次`` lines are written to
    ``count_word.txt``.

    Args:
        file_name: Path of the UTF-8 text file to analyse.
    """
    # One pass over the tokens instead of a list.count() scan per unique word.
    word_counts = Counter()
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            word_counts.update(jieba.cut(line))

    print(u'共有%d个关键词' % len(word_counts))
    sort_count = []
    for k, (w, count) in enumerate(word_counts.most_common(), 1):
        sort_count.append(w + u':' + str(count) + u"次\n")
        print(u"%d---" % k + w + u":" + str(count) + u"次")
    with codecs.open('count_word.txt', 'w', encoding='utf-8') as f:
        f.writelines(sort_count)


def save_jieba_result(file_name):
    """Segment a comment file with jieba and save the space-joined tokens.

    *file_name* is resolved relative to this module's directory. The result is
    written as UTF-8 to ``pjl_jieba.txt``.

    Args:
        file_name: Name of the UTF-8 text file holding the raw comments.
    """
    # jieba.enable_parallel(4) could be enabled here for multi-process cutting.
    source_path = path.join(path.dirname(__file__), file_name)
    print(source_path)
    with codecs.open(source_path, encoding='utf-8') as source:
        raw_text = source.read()
    segmented = " ".join(jieba.cut(raw_text))
    with codecs.open('pjl_jieba.txt', 'w', encoding='utf-8') as target:
        target.write(segmented)


def draw_wordcloud(file_name, mask_path='timg.jpg',
                   font_path=r'C:\Windows\Fonts\simfang.ttf',
                   output_path='mycloud.jpg'):
    """Render a word cloud image from a file of space-separated tokens.

    Args:
        file_name: UTF-8 text file with the segmented comments.
        mask_path: Image passed to WordCloud as the ``mask`` array.
        font_path: Font file passed to WordCloud; the default is a Windows
            system font, so other platforms must pass their own path.
        output_path: File the rendered cloud is saved to.
    """
    with codecs.open(file_name, encoding='utf-8') as f:
        comment_text = f.read()
    # Load the mask with Pillow + numpy rather than scipy.misc.imread, which
    # current SciPy releases no longer provide.
    with Image.open(mask_path) as mask_image:
        color_mask = np.array(mask_image)
    # Markup residue and filler words that should not appear in the cloud.
    stopwords = {'png','douban','com','href','https','img','img3','class','source','icon','shire',u'有点',u'真的',u'觉得',u'还是',u'一个',u'就是', u'电影', u'你们', u'这么', u'不过', u'但是', u'什么', u'没有', u'这个', u'那个', u'大家', u'比较', u'看到', u'真是',
                 u'除了', u'时候', u'已经', u'可以'}
    cloud = WordCloud(font_path=font_path, background_color='white', max_words=20000, max_font_size=200, min_font_size=10, mask=color_mask, stopwords=stopwords)
    word_cloud = cloud.generate(comment_text)  # build the word cloud
    word_cloud.to_file(output_path)

 
 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
 
 
  
  
 
 
 
 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55