Big Data Hive + Sqoop Training Report (Part 1)

Scrape job-posting data from 51job and Lagou (拉勾网).

1. Write the spider file job_spider.py.

# -*- coding: utf-8 -*-
import scrapy
import urllib.parse  # quote() is provided by urllib.parse in Python 3
from scrapy.http import Request
from ..items import JobItem


class Job51SpiderSpider(scrapy.Spider):
    name = 'job_spider'
    allowed_domains = ['51job.com']
    start_urls = ['https://search.51job.com/list/000000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,1.html']
    lastpage = 0
    jobname = "java"
    def __init__(self, *args, **kwargs):
        super(Job51SpiderSpider, self).__init__(*args, **kwargs)
        self.jobname = urllib.parse.quote(self.jobname)  # URL-encode the search keyword
        # Define the initial start URL
        self.start_urls = ['https://search.51job.com/list/%252C,000000,0000,00,9,99,' + self.jobname + ',2,1.html']

    # Pagination: read the total page count once, then request every listing page
    def parse(self, response):
        if self.lastpage == 0:
            # Read the total page count from the pager on the first results page
            self.lastpage = int(response.xpath('//*[@id="resultList"]/div[55]/div/div/div/span[1]/text()').re(r'\d+')[0])
        print(self.lastpage)
        for page in range(1, self.lastpage + 1):
            nexturl = "https://search.51job.com/list/%252C,000000,0000,00,9,99," + self.jobname + ",2," + str(page) + ".html"
            yield Request(nexturl, dont_filter=True, callback=self.joblist)

    # Find the detail-page URL of each posting on a listing page
    def joblist(self, response):
        positList = response.xpath('//*[@id="resultList"]/div')
        for posit in positList:
            url = posit.xpath('p/span/a/@href').extract_first()
            if url:
                yield Request(url, callback=self.jobdetail)

    # Parse the job detail page with XPath
    def jobdetail(self, response):
        item = JobItem()
        # The original post is truncated at this point; the selector below is an
        # assumed example that pulls the job title from a 51job detail page.
        item['name'] = response.xpath('//div[@class="cn"]/h1/text()').extract_first()
        yield item
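
The spider imports JobItem from the project's items.py, which the post does not show. Below is a minimal sketch consistent with the excerpt; only the name field is actually referenced above, so any additional fields would be assumptions.

# items.py -- minimal sketch; only 'name' appears in the spider excerpt
import scrapy


class JobItem(scrapy.Item):
    name = scrapy.Field()  # job title, filled in by jobdetail()

With the item defined, the spider can be run from the Scrapy project root with scrapy crawl job_spider -o jobs.csv, which writes the scraped items to a CSV file. Passing -a jobname=hadoop should override the default "java" keyword, since the spider's __init__ forwards keyword arguments to the base Spider before URL-encoding self.jobname.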