项目的初始化和配置与《Scrapy 初探(xpath)》一文相同。
新建文件 zwblog/spiders/lianjia_spider.py,
内容如下:
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from LjCrwaler.items import LianjiaItem
class LJCrwalerSpider(CrawlSpider):
name = 'ljcrwaler'
allowed_domains = ['lianjia.com']
start_urls = ['https://qd.lianjia.com/ershoufang/']
# 设置抓取规则
rules = {
# 房产详情链接
Rule(LinkExtractor(
restrict_xpaths="//ul[@class='sellListContent']/li/div[@class='info clear']/div[@class='title']/a"),
follow=True, callback="process_item"),
# 翻页链接
Rule(LinkExtractor(restrict_xpaths="//div[@class='pagination_group_a']/a"), follow=True)