小白 flag 10:用爬虫爬取豆瓣电影并写入 CSV
准备
json化
csv文件操作学习
代码
import requests
import json
from requests.packages.urllib3.exceptions import InsecureRequestWarning # 解决警告
class douban_movie():
    """Fetch Douban's "hot" movie listing and export it to a CSV file.

    Workflow (see ``run``): download the JSON listing, reduce every entry to
    its rating, title and link, order the entries by rating (highest first)
    and write them to a CSV file.
    """

    def __init__(self, rowCount=None):
        """Store how many movies to request.

        Args:
            rowCount: Number of movies to fetch. When omitted, the user is
                prompted on stdin.

        Raises:
            ValueError: If the value is not a positive integer.
        """
        if rowCount is None:
            rowCount = input('输入你想要的电影个数')
        try:
            count = int(str(rowCount).strip())
        except ValueError:
            count = 0  # funnel unparseable input into the single error path below
        if count < 1:
            raise ValueError(
                'movie count must be a positive integer, got {!r}'.format(rowCount))
        # Stored as a normalized decimal string: the attribute has always been
        # a string (it came straight from input()).
        self.rowCount = str(count)

    def movie_url(self, rowCount=None):
        """Download the raw JSON text of the movie listing.

        Args:
            rowCount: Value for the ``page_limit`` query parameter. Falls back
                to ``self.rowCount`` when omitted.

        Returns:
            The response body as text.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
        """
        # Silence the InsecureRequestWarning triggered by verify=False below.
        requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
        page_limit = self.rowCount if rowCount is None else rowCount
        url = ('https://movie.douban.com/j/search_subjects?type=movie'
               '&tag=%E7%83%AD%E9%97%A8&sort=recommend'
               '&page_limit={}&page_start=0').format(page_limit)
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.7 Safari/537.36"}
        # NOTE(review): verify=False disables TLS certificate verification;
        # kept as in the original, but confirm it is really required.
        response = requests.get(url, headers=headers, verify=False, timeout=30)
        # Fail here with a clear HTTP error instead of letting an error page
        # reach json.loads() later.
        response.raise_for_status()
        return response.text

    @staticmethod
    def _rate_value(rate):
        """Return ``rate`` as a float usable as a sort key.

        Missing, empty or non-numeric ratings map to negative infinity so
        that they end up last when sorting in descending order.
        """
        try:
            value = float(rate)
        except (TypeError, ValueError):
            return float('-inf')
        # NaN compares false against everything and would corrupt the sort.
        return value if value == value else float('-inf')

    def json_dispose(self, text):
        """Parse the listing JSON into a rating-sorted list of movies.

        Args:
            text: JSON document whose ``subjects`` key holds the movie entries.

        Returns:
            A list of dicts with the keys ``rate``, ``title`` and ``url``
            (values copied unchanged from the input), ordered by numeric
            rating from highest to lowest. Entries with equal ratings keep
            their input order. An absent or empty ``subjects`` key yields
            an empty list.
        """
        subjects = json.loads(text).get('subjects') or []
        movie_list = [
            {
                'rate': item.get('rate'),
                'title': item.get('title'),
                'url': item.get('url'),
            }
            for item in subjects
        ]
        # Ratings arrive as strings; compare them numerically so that e.g.
        # "10.0" ranks above "9.5" (a plain string sort would not).
        movie_list.sort(key=lambda movie: self._rate_value(movie['rate']),
                        reverse=True)
        return movie_list

    def output_csv(self, datalist, filename="douban_data.csv"):
        """Write the movies to a CSV file with a header row.

        Args:
            datalist: Iterable of dicts with the keys ``rate``, ``title`` and
                ``url``.
            filename: Destination path; defaults to ``douban_data.csv`` in the
                current working directory.
        """
        # Imported locally, as in the original: csv is not among the
        # module-level imports.
        import csv
        # utf-8-sig writes a BOM, added by the original author to fix garbled
        # Chinese text; newline='' lets the csv module control line endings.
        # The with-block closes the file even if a row fails to serialize.
        with open(filename, 'w', newline='', encoding='utf-8-sig') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(['评分', '作品名称', '豆瓣链接'])
            writer.writerows(
                [data['rate'], data['title'], data['url']] for data in datalist)

    def run(self):
        """Fetch, parse and export the listing, then report completion."""
        result = self.json_dispose(self.movie_url(self.rowCount))
        self.output_csv(result)
        print("爬虫完毕,文件已生成。快去查看吧")
if __name__ == '__main__':
    # Script entry point: build the crawler (which asks for the movie count)
    # and run the full fetch -> parse -> export pipeline.
    douban_movie().run()