学会抓包,搞到一切。
用到的模块
源代码
from urllib import request
import time
import re
import os
os.mkdir(r'C:\Users\*\Desktop\PYhomework\c800')
search_counts = 800
url = 'https://movie.douban.com/subject/2353023/reviews'
headers = {***}
headers['Referer'] = 'https://movie.douban.com/subject/***/'
i = 0
lists = []
for count in range(0, search_counts, 20):
url = url + "?start=" + str(count)
req = request.Request(url, headers=headers)
response = request.urlopen(req)
HTML = response.read()
HTML = HTML.decode("utf-8")
pattern = re.compile("<div data-cid=\"(.*)\">")
lists = pattern.findall(HTML) + lists
'''爬取实际评论'''
headers[
'Cookie'] = '***'
headers['Host'] = 'movie.douban.com'
headers['Sec-Fetch-Dest'] = 'document'
headers['Sec-Fetch-Mode'] = 'navigate'
headers['Sec-Fetch-Site'] = 'none'
headers['Sec-Fetch-User'] = '?1'
headers['Upgrade-Insecure-Requests'] = '1'
print('爬取成功!')
for id in lists:
i += 1
url = 'https://movie.douban.com/j/review/' + id + '/full'
req = request.Request(url, headers=headers)
response = request.urlopen(req)
comment = response.read()
comment = comment.decode("utf-8")
with open(r"C:\Users\*\Desktop\PYhomework\c800\comment%d.txt" % i, mode="w", encoding="utf-8") as c:
c.write(comment)
print("comment%d保存成功!" % i)
time.sleep(0) # 随缘设置
print("抓取完成!")
爬取的结果
数据清洗
import os
import re
# Clean the 800 saved review files: pull the review body out of the
# raw ',"html":"..."}' payload, normalize line breaks, and write the
# result to a sibling output directory.

OUT_DIR = r'C:\Users\*\Desktop\PYhomework\c800ok'


def clean_comment(raw):
    """Extract and normalize the review body from one raw comment file.

    The scraper saved JSON-ish text shaped like '...,"html":"<body>"}'.
    The original sliced the "html" value out by plain string search rather
    than JSON parsing; that behavior is kept. <br> tags and literal '\\t'
    escape sequences become newlines, and all spaces are stripped.
    NOTE(review): like the original, this misbehaves if ',"html":"' is
    absent (find() returns -1) — confirm all inputs contain it.
    """
    start = raw.find(',"html":"')
    end = raw.find('"}')
    body = raw[start + 9:end]  # 9 == len(',"html":"')
    body = body.replace('<br>', '\n')  # re.sub on a fixed pattern == replace
    body = body.replace('\\t', '\n')
    return body.replace(' ', '')


def main():
    # The original called os.mkdir("c800ok") — relative to the CWD — yet
    # wrote to the absolute path below, so the directory it created was not
    # the one being written to. Create the real target, tolerating re-runs.
    os.makedirs(OUT_DIR, exist_ok=True)
    for i in range(1, 801):
        # Read first, then write — don't rebind the file handle name the
        # way the original's `f = f.read()` did.
        with open(r"C:\Users\*\Desktop\PYhomework\c800\comment%d.txt" % i,
                  encoding="utf-8") as f:
            raw = f.read()
        with open(r'C:\Users\*\Desktop\PYhomework\c800ok\clear_comment%d.txt' % i,
                  mode="w", encoding="utf-8") as fs:
            fs.write(clean_comment(raw))
        print('写入成功!')


if __name__ == "__main__":
    main()
清洗结果
源码:https://tominochick.github.io/