import requests
from fake_useragent import UserAgent
import time
import csv
import json
import random
from argparse import ArgumentParser
# Candidate proxy endpoints; one is picked at random per request when the
# spider is created with use_proxy=True.
PROXY_POOL = [
    "http://proxy1.example.com:8080",
    "http://proxy2.example.com:8080",
]
class DoubanMovieSpider:
    """Fetch movie listings from a Douban JSON endpoint and store them on disk.

    Each page is requested through a shared ``requests.Session`` with a
    randomised User-Agent, reduced to four fields per movie, and appended to
    both a CSV file and a JSON-lines file.

    Args:
        use_proxy: When true, every request is routed through a proxy chosen
            at random from the module-level ``PROXY_POOL``.
    """

    # Multiplier that turns a page index into the ``start`` query offset.
    PAGE_SIZE = 20
    # Column order of the CSV output; also the keys produced by parse_data().
    FIELDNAMES = ('title', 'rating', 'url', 'id')

    def __init__(self, use_proxy=False):
        self.ua = UserAgent()
        self.session = requests.Session()
        self.use_proxy = use_proxy
        # Static headers sent with every request (User-Agent is added per call).
        self.headers = {
            'Host': 'movie.douban.com',
            'Referer': 'https://movie.douban.com/'
        }

    def get_random_header(self):
        """Return the static headers plus a freshly randomised User-Agent."""
        return {
            'User-Agent': self.ua.random,
            **self.headers
        }

    def _choose_proxies(self, pool=None):
        """Build the ``proxies`` mapping for one request.

        Args:
            pool: Sequence of proxy URLs to choose from. Defaults to the
                module-level ``PROXY_POOL``.

        Returns:
            ``None`` when proxying is disabled, otherwise a dict that maps
            both the ``http`` and ``https`` schemes to one randomly chosen
            proxy URL.

        Raises:
            ValueError: If proxying is enabled but the pool is empty.
        """
        if not self.use_proxy:
            return None
        if pool is None:
            pool = PROXY_POOL
        if not pool:
            raise ValueError('代理池为空，无法启用代理')
        proxy = random.choice(pool)
        # The proxy is selected by URL scheme, and the spider requests an
        # https:// URL, so an 'http'-only mapping would bypass the proxy.
        return {'http': proxy, 'https': proxy}

    def fetch_page(self, url, page):
        """Request one page and return its decoded JSON body.

        Args:
            url: Endpoint to query.
            page: Zero-based page index; sent as ``start = page * PAGE_SIZE``.

        Returns:
            The decoded JSON payload, or ``None`` if anything went wrong
            (the failure is printed, not raised).
        """
        # NOTE(review): only ``start`` is sent as a query parameter; confirm
        # the endpoint does not require further parameters.
        try:
            response = self.session.get(
                url,
                headers=self.get_random_header(),
                params={'start': page * self.PAGE_SIZE},
                proxies=self._choose_proxies(),
                timeout=10
            )
            response.raise_for_status()
            return response.json()
        except Exception as e:
            # Deliberate best-effort boundary: a failed page is reported and
            # skipped so the remaining pages can still be crawled.
            print(f'第{page}页抓取失败: {str(e)}')
            return None

    def parse_data(self, json_data):
        """Reduce a decoded payload to a list of flat movie records.

        Args:
            json_data: Decoded JSON payload; movie entries are read from its
                ``subjects`` key.

        Returns:
            A list of dicts with the keys in ``FIELDNAMES``. The ``rating``
            value comes from the item's ``rate`` key. Fields absent from an
            item are ``None``; a payload that is not a dict, or whose
            ``subjects`` is missing or null, yields an empty list.
        """
        if not isinstance(json_data, dict):
            return []
        subjects = json_data.get('subjects') or []
        return [
            {
                'title': item.get('title'),
                'rating': item.get('rate'),
                'url': item.get('url'),
                'id': item.get('id'),
            }
            for item in subjects
            if isinstance(item, dict)  # skip entries that are not objects
        ]

    def save_data(self, data, filename):
        """Append records to ``<filename>.csv`` and ``<filename>.json``.

        Args:
            data: Iterable of dicts keyed by ``FIELDNAMES``.
            filename: Output path without extension.

        The CSV header is written only when the CSV file is empty. The
        ``.json`` file holds one JSON object per line.
        """
        with open(f'{filename}.csv', 'a', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=self.FIELDNAMES)
            # Position 0 right after opening for append means nothing has
            # been written to this file yet.
            if f.tell() == 0:
                writer.writeheader()
            writer.writerows(data)
        with open(f'{filename}.json', 'a', encoding='utf-8') as f:
            f.writelines(
                json.dumps(item, ensure_ascii=False) + '\n' for item in data
            )

    def run(self, start_page=0, pages=5, filename='douban_movies'):
        """Crawl consecutive pages and persist every page that was fetched.

        Args:
            start_page: Index of the first page to fetch.
            pages: Number of pages to fetch.
            filename: Output path without extension, passed to save_data().
        """
        base_url = 'https://movie.douban.com/j/search_subjects'
        end_page = start_page + pages
        for page in range(start_page, end_page):
            data = self.fetch_page(base_url, page)
            if data:
                parsed_data = self.parse_data(data)
                self.save_data(parsed_data, filename)
                print(f'成功保存第{page}页{len(parsed_data)}条数据')
            # Random pause between requests; pointless after the last one.
            if page + 1 < end_page:
                time.sleep(random.uniform(1.5, 3.5))
if __name__ == '__main__':
    # Command-line entry point: read the page range and proxy switch, then
    # hand them to a spider instance.
    cli = ArgumentParser()
    for flag, options in (
        ('--start', {'type': int, 'default': 0, 'help': '起始页码'}),
        ('--pages', {'type': int, 'default': 5, 'help': '抓取页数'}),
        ('--proxy', {'action': 'store_true', 'help': '启用代理'}),
    ):
        cli.add_argument(flag, **options)
    opts = cli.parse_args()

    crawler = DoubanMovieSpider(use_proxy=opts.proxy)
    crawler.run(start_page=opts.start, pages=opts.pages)