# 爬取海王电影20180101-20190101影评
# 导包
import json
import time
from datetime import datetime, timedelta
import requests
# 发送请求,获取响应
# Send the request and return the response body
def get_data(url, timeout=10):
    """Fetch ``url`` with a browser-like User-Agent and return the body text.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get``. Without a timeout a stalled connection
            would block the caller indefinitely.

    Returns:
        The response body as text when the status code is 200,
        otherwise ``None``.

    Exceptions raised by ``requests.get`` (for example on a timeout or a
    connection error) are not caught here and propagate to the caller.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code == 200:
        return response.text
    return None
# 解析响应
# Parse the response
def parse_html(html):
    """Extract the comment records from a JSON response body.

    Args:
        html: JSON text whose top-level ``'cmts'`` entry holds a list of
            comment objects. ``None`` or an empty string (what a failed
            fetch yields) is treated as "no comments".

    Returns:
        A list of dicts with the keys ``id``, ``cityName``, ``content``,
        ``score`` and ``startTime``. ``cityName`` falls back to a single
        space when the comment has none, and newlines inside ``content``
        are replaced by spaces so each comment fits on one output line.
        An empty list is returned when there is nothing to parse or the
        ``'cmts'`` entry is missing or null.

    Raises:
        json.JSONDecodeError: If ``html`` is non-empty but not valid JSON.
        KeyError: If a comment lacks ``id``, ``content``, ``score`` or
            ``startTime``.
    """
    if not html:
        return []
    # A missing or null 'cmts' entry is treated as an empty page.
    data = json.loads(html).get('cmts') or []
    return [
        {
            'id': item['id'],
            'cityName': item.get('cityName', ' '),
            'content': item['content'].replace('\n', ' '),
            'score': item['score'],
            'startTime': item['startTime'],
        }
        for item in data
    ]
# 主函数
# Main crawl loop
def main(id, start_time, end_time):
    """Crawl a movie's comments backwards in time and append them to a file.

    Starting at ``start_time``, one page of comments is requested per
    iteration; the cursor is then moved to one second before the last
    comment on that page. The loop stops when the cursor reaches
    ``end_time``, when a page comes back empty, or after several
    consecutive failed fetches.

    Args:
        id: Movie id inserted into the request URL.
        start_time: Upper time bound, formatted ``'%Y-%m-%d %H:%M:%S'``.
        end_time: Lower time bound in the same format. The two are
            compared as strings, which matches chronological order for
            this fixed-width format.

    Each comment is appended to ``海王影评2018.txt`` as one tab-separated
    line: id, cityName, content, score, startTime.
    """
    max_failures = 3  # consecutive failed fetches tolerated before giving up
    failures = 0

    while start_time > end_time:
        url = ('http://m.maoyan.com/mmdb/comments/movie/' + str(id) + '.json?'
               '_v=yes&offset=0&startTime=' + start_time.replace(' ', '%20'))

        # Send the request and fetch the response
        try:
            html = get_data(url)
        except requests.RequestException as exc:
            print('request failed: {}'.format(exc))
            html = None
            time.sleep(0.5)
        else:
            print(html)
            time.sleep(0.1)

        if html is None:
            # Nothing to parse (network error or non-200 status): retry the
            # same page a bounded number of times instead of crashing.
            failures += 1
            if failures >= max_failures:
                print('giving up after {} failed requests'.format(failures))
                break
            continue
        failures = 0

        # Parse the response
        comments = parse_html(html)
        if not comments:
            # An empty page means there is nothing older left to fetch.
            break

        # Save the data; the file is opened once per page, not per comment.
        with open('海王影评2018.txt', 'a', encoding='utf-8') as f:
            f.writelines(
                '\t'.join([
                    str(item['id']),
                    item['cityName'],
                    item['content'],
                    str(item['score']),
                    item['startTime'],
                ]) + '\n'
                for item in comments
            )

        # Move the cursor to one second before the last comment of the page.
        # Using the last element (rather than a fixed index) also works for
        # short pages. NOTE(review): assumes each page is ordered newest
        # first - confirm against the API.
        oldest = datetime.strptime(comments[-1]['startTime'], '%Y-%m-%d %H:%M:%S')
        start_time = (oldest - timedelta(seconds=1)).strftime('%Y-%m-%d %H:%M:%S')
# 程序入口
if __name__ == '__main__':
    # Example of a request URL built by main():
    # http://m.maoyan.com/mmdb/comments/movie/249342.json?_v=yes&offset=0&startTime=2019-01-01%2000:00:00
    # Named movie_id rather than id so the builtin id() is not shadowed
    # at module scope.
    movie_id = 249342
    # The crawl walks backwards from start_time until it reaches end_time.
    start_time = '2019-01-01 00:00:00'
    end_time = '2018-01-01 00:00:00'
    main(movie_id, start_time, end_time)