import json
import random
import re
from typing import Optional

import scrapy

from scrapy_douban.items import ScrapyDoubanItem

# Run with: scrapy crawl reviews
#
# In Scrapy, .xpath() returns a SelectorList holding every node matched by the
# XPath query. Each element of that list is a Selector object. To pull actual
# text or HTML out of it, call .get() (first match, or None when nothing
# matched) or .getall() (every match).

# Candidate cities; one is attached at random to every scraped review.
_CITIES = (
    "北京", "上海", "广州", "深圳", "杭州", "成都", "武汉", "重庆", "天津", "苏州",
    "南京", "厦门", "青岛", "长沙", "沈阳", "西安", "济南", "郑州", "合肥", "大连",
)

# Matches any run of whitespace (spaces, tabs, newlines, ...).
_WHITESPACE_RE = re.compile(r'\s+')


def _parse_count(raw: Optional[str]) -> int:
    """Convert the raw text of a vote counter into an integer.

    Args:
        raw: Text extracted from the page, or None when the node is absent.

    Returns:
        The count as an int, or 0 when ``raw`` is None, empty, or not made up
        solely of digits once all whitespace has been removed.
    """
    if raw is None:
        # The XPath matched nothing; re.sub() would raise TypeError on None.
        return 0
    compact = _WHITESPACE_RE.sub('', raw)
    return int(compact) if compact.isdigit() else 0


class ReviewsSpider(scrapy.Spider):
    """Spider that collects review listings for Douban movie subject 30402296."""

    name = "reviews"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["https://movie.douban.com/subject/30402296/reviews"]

    # Throttle the crawl: a base delay between requests, one request at a time.
    # NOTE(review): Scrapy's RANDOMIZE_DOWNLOAD_DELAY (enabled by default)
    # scales DOWNLOAD_DELAY by a random factor, so the real wait is not a
    # fixed 2 seconds -- confirm the intended range.
    custom_settings = {
        'DOWNLOAD_DELAY': 2,  # base delay between requests, in seconds
        'CONCURRENT_REQUESTS': 1,  # one request in flight, so pages are fetched in order
    }

    def parse(self, response):
        """Schedule the paginated review-list requests.

        The start URL's own response body is not inspected; it only triggers
        the requests for offsets 0, 20, 40, 60 and 80 (the first five pages,
        assuming 20 reviews per page).

        Yields:
            scrapy.Request objects handled by :meth:`parse_reviews`.
        """
        review_url = "https://movie.douban.com/subject/30402296/reviews?start={}"
        for page_num in range(0, 100, 20):
            yield scrapy.Request(
                review_url.format(page_num),
                callback=self.parse_reviews,
            )

    def parse_reviews(self, response):
        """Extract one item per review block found on a review-list page.

        Missing fields fall back to placeholder values instead of raising, so
        a single incomplete review does not abort the rest of the page.

        Yields:
            ScrapyDoubanItem with the keys ``city``, ``username``, ``rating``,
            ``time``, ``useful_count``, ``useless_count`` and ``content``.
        """
        for review in response.xpath('//div[@class="main review-item"]'):
            item = ScrapyDoubanItem()

            # The city is not scraped from the page; it is picked at random.
            item['city'] = random.choice(_CITIES)

            # User name, stripped; placeholder when absent.
            username = review.xpath('.//a[@class="name"]/text()').get()
            item['username'] = username.strip() if username else '未知'

            # Rating: title attribute of the first <span> carrying one inside
            # the review block (presumably the star-rating span -- verify
            # against the page markup).
            rating = review.xpath('.//span/@title').get()
            item['rating'] = rating.strip() if rating else '未评分'

            # Publication time, kept as the raw page text (no reformatting).
            posted_at = review.xpath('.//span[@class="main-meta"]/text()').get()
            item['time'] = posted_at.strip() if posted_at else '未知时间'

            # "Useful" / "useless" vote counters; default to 0 when the node
            # is missing or its text is not a plain number.
            useful_raw = review.xpath('.//a[@title="有用"]/span/text()').get()
            useless_raw = review.xpath('.//a[@title="没用"]/span/text()').get()
            item['useful_count'] = _parse_count(useful_raw)
            item['useless_count'] = _parse_count(useless_raw)
            self.logger.debug(
                "vote counts: useful=%s useless=%s",
                item['useful_count'],
                item['useless_count'],
            )

            # Short review text: non-breaking spaces become regular spaces,
            # then the ends are trimmed.
            content = review.xpath('.//div[@class="short-content"]/text()').get()
            item['content'] = (
                content.replace('\xa0', ' ').strip() if content else '无内容'
            )

            yield item