# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from day12.items import LagouItem
class LagouSpider(CrawlSpider):
    """Crawl lagou.com: follow job-listing category pages and extract each
    job-detail page into a LagouItem (title, salary, company, advantage,
    job description)."""

    name = 'lagou'
    allowed_domains = ['lagou.com']
    start_urls = ['http://lagou.com/']

    rules = (
        # Listing/category pages: no parsing needed, just keep following
        # the links they contain (follow=True).
        Rule(LinkExtractor(allow=r'https://www\.lagou\.com/zhaopin/\w+/'),
             follow=True),
        # Job-detail pages: parse each one, do not follow further links.
        # FIX: escaped the dot before "html" (previously `\d+.html`, where
        # the bare dot matched any character) and pointed the callback at
        # parse_detail — the method that actually extracts the job fields.
        # It was previously 'parse_item', so parse_detail was never called.
        Rule(LinkExtractor(allow=r'https://www\.lagou\.com/jobs/\d+\.html'),
             callback='parse_detail', follow=False),
    )

    def parse_item(self, response):
        """Kept for backward compatibility (former — incorrect — detail-page
        callback). Only logs; extraction happens in parse_detail."""
        print('当前为列表页')

    def parse_detail(self, response):
        """Extract job fields from a detail page.

        :param response: scrapy Response for a /jobs/<id>.html page.
        :returns: a populated LagouItem.
        """
        print('当前为详情页')
        title = response.xpath('//span[@class="name"]/text()').extract_first()
        print(title)
        # FIX: was `.eatract_first()` (typo), which raised AttributeError.
        salary = response.xpath('//span[@class="salary"]/text()').extract_first()
        print(salary)
        company = response.xpath('//div[@class="company"]/text()').extract_first()
        print(company)
        # Join all text fragments of the advantage block into one string.
        advantage = ''.join(
            response.xpath('//dd[@class="job-advantage"]//text()').extract())
        print(advantage)
        # FIX: was `//dd[class="job_bt"]` (missing @) — that predicate tests
        # for a child *element* named "class", so it matched nothing.
        job_bt = ''.join(
            response.xpath('//dd[@class="job_bt"]//text()').extract())
        print(job_bt)

        item = LagouItem()
        item['title'] = title
        item['salary'] = salary
        item['company'] = company
        item['advantage'] = advantage
        item['job_bt'] = job_bt
        return item