What is an incremental crawler?
An incremental crawler records the last position or timestamp it reached after each crawl, so the next time the job runs it starts from that point and only fetches data that was published or updated afterwards.
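As a minimal conceptual sketch of that idea (separate from the project below, and using a made-up key name demo:last_crawl_time purely for illustration), the "last position" can simply be a value kept in Redis that each run reads at startup and updates when it finishes:

import time
from redis import Redis

r = Redis(host='localhost', port=6379, db=1, decode_responses=True)

# Read the checkpoint left by the previous run (None on the very first run)
last_time = r.get('demo:last_crawl_time')
if last_time is None:
    print('First run: crawl everything')
else:
    print(f'Incremental run: only fetch items published after {last_time}')

# When the crawl finishes, save the current position for the next run
r.set('demo:last_crawl_time', str(int(time.time())))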
Crawling process
First, configure the settings:
LOG_LEVEL = "WARNING"     # only show warnings and errors in the log output
ROBOTSTXT_OBEY = False    # do not check robots.txt before crawling
Then move to the spider's main file to write the crawler.
Import the packages:
import scrapy
from tianya.items import TianyaItem
from redis import Redis
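The TianyaItem imported above is not shown in this walkthrough; a minimal sketch of tianya/items.py, assuming it only needs the two fields the spider fills later (author and content), could look like this:

import scrapy

class TianyaItem(scrapy.Item):
    # The two fields populated in parse_detail below
    author = scrapy.Field()
    content = scrapy.Field()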
Configure the Redis database:
class YuansuSpider(scrapy.Spider):
    name = "yuansu"
    allowed_domains = ["txrpic.com"]
    start_urls = ["https://www.txrpic.com/"]

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        try:
            # Connect to Redis; decode_responses=True returns str instead of bytes
            self.redis = Redis(host='localhost', port=6379, db=1, password='666666', decode_responses=True)
            self.redis.ping()
            self.logger.warning("Connected to Redis successfully.")
        except Exception as e:
            self.logger.error(f"Failed to connect to Redis: {e}")
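To double-check the connection parameters, a quick standalone script (using the same hard-coded values and the tianya:ty:detail:url set that the parsing code below maintains) can inspect what has already been recorded:

from redis import Redis

r = Redis(host='localhost', port=6379, db=1, password='666666', decode_responses=True)
print(r.ping())                                 # True if the server is reachable
print(r.scard('tianya:ty:detail:url'))          # number of URLs recorded so far
for url in r.smembers('tianya:ty:detail:url'):  # the recorded detail-page URLs
    print(url)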
Crawl the data:
    def parse(self, response, **kwargs):
        # Extract the detail-page links from the listing page
        li_list = response.xpath("//ol[@class='user-items']/li/div/div/a[@title='打开帖子']/@href").extract()
        for li in li_list:
            li = response.urljoin(li)
            # Skip URLs that were already crawled in a previous run
            if self.redis.sismember('tianya:ty:detail:url', li):
                print(f'Already crawled {li}')
            else:
                yield scrapy.Request(li, callback=self.parse_detail)
        # The last <a> tag in the footer points to the next listing page
        next_href = response.xpath("//div[@class='stream-footer']/div/a[last()]/@href").extract_first()
        if next_href:
            yield scrapy.Request(response.urljoin(next_href), callback=self.parse)
    def parse_detail(self, response):
        item = TianyaItem()
        author = response.xpath("//div[@class='topic-user']/a/strong/text()").extract_first()
        content = response.xpath("//div[@class='topic-content']/p/text()").extract()
        item['author'] = author.strip() if author else ""
        if content:
            # extract() returns a list of text nodes: strip each one and join them with spaces
            item['content'] = ' '.join(c.strip() for c in content)
        else:
            # Fall back to an empty string so the field is always present
            item['content'] = ""
        # Record this URL in Redis so it is skipped on future runs
        self.redis.sadd('tianya:ty:detail:url', response.url)
        yield item
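The spider is then run as usual with scrapy crawl yuansu. Because every crawled detail URL ends up in the tianya:ty:detail:url set, a second run skips all of those pages and only requests posts published since the previous run, which is exactly the incremental behaviour described at the beginning.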