Crawl job data from 51job and Lagou (拉勾网).
1. Write the spider file job_spider.
# -*- coding: utf-8 -*-
import scrapy
import urllib.parse
from scrapy.http import Request
from ..items import JobItem


class Job51SpiderSpider(scrapy.Spider):
    name = 'job_spider'
    allowed_domains = ['51job.com']
    start_urls = ['https://search.51job.com/list/000000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,1.html']
    lastpage = 0
    jobname = "java"

    def __init__(self, *args, **kwargs):
        super(Job51SpiderSpider, self).__init__(*args, **kwargs)
        # URL-encode the search keyword (quote lives in urllib.parse)
        self.jobname = urllib.parse.quote(self.jobname)
        # Build the initial search URL from the keyword
        self.start_urls = ['https://search.51job.com/list/%252C,000000,0000,00,9,99,' + self.jobname + ',2,1.html']

    # Pagination: read the total number of result pages, then request every list page
    def parse(self, response):
        if self.lastpage == 0:
            self.lastpage = int(response.xpath('//*[@id="resultList"]/div[55]/div/div/div/span[1]/text()').re(r'\d+')[0])
            print(self.lastpage)
        for page in range(1, self.lastpage + 1):
            nexturl = "https://search.51job.com/list/%252C,000000,0000,00,9,99," + self.jobname + ",2," + str(page) + ".html"
            yield Request(nexturl, dont_filter=True, callback=self.joblist)

    # Extract the detail-page URL of every position listed on a result page
    def joblist(self, response):
        positList = response.xpath('//*[@id="resultList"]/div')
        for posit in positList:
            url = posit.xpath('p/span/a/@href').extract_first()
            if url:
                yield Request(url, callback=self.jobdetail)

    # Parse the detail page with XPath
    def jobdetail(self, response):
        item = JobItem()
        # The XPath below is illustrative for the 51job detail-page title; adjust it to the actual page layout
        item['name'] = response.xpath('//div[@class="cn"]/h1/@title').extract_first()
        yield item
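The spider imports JobItem from the project's items.py, which is not shown in this step. Below is a minimal sketch of that file, assuming only the name field used above; in practice you would declare one scrapy.Field() per attribute that jobdetail ultimately extracts (salary, company, and so on).

# items.py -- minimal sketch, assuming only the field referenced by the spider above
import scrapy


class JobItem(scrapy.Item):
    name = scrapy.Field()   # job title filled in jobdetail
    # further fields (salary, company, city, ...) would be declared the same way

Once the item is defined, the spider can be run and its output exported with Scrapy's feed export, for example: scrapy crawl job_spider -o jobs.csv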