Crawling Zhaopin (智联招聘) job listings with Scrapy

Zhaopin loads its listings dynamically, so the URL to parse first is not the web page itself but the JSON data behind it (the JSON request can be spotted in the browser's Network panel). Looking at the URLs of individual job postings reveals the pattern https://jobs.zhaopin.com/XXXXX.htm, where XXXXX is the value of the number field in the JSON file.
Steps:

  1. Parse the JSON file and collect every number value (a quick sketch of this structure is shown right after this list).
  2. Concatenate each value into a detail-page URL.
  3. Request each URL and extract the desired fields.
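
Before wiring everything into Scrapy, the API structure can be checked with a small standalone script. This is only a sketch, assuming the fe-api.zhaopin.com/c/i/sou endpoint (taken from the spider's start_urls below) still returns a data.results list whose entries carry a number field:

import requests

# Search API endpoint, same as the spider's start URL below
api = 'https://fe-api.zhaopin.com/c/i/sou?start=0&pageSize=60&cityId=489&industry=10100'
data = requests.get(api).json()
for result in data['data']['results']:
    # Each `number` maps to one job-detail page
    print('https://jobs.zhaopin.com/' + str(result['number']) + '.htm')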
Spider source code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
import json

from JobScrapy.items import Job


class ZhilianSpider(scrapy.Spider):
    name = 'zhilian'
    #allowed_domains = ['www.zhaopin.com']
    #start_urls = ['https://jobs.zhaopin.com/156244110251133.htm']
    start_urls = ['https://fe-api.zhaopin.com/c/i/sou?start=0&pageSize=60&cityId=489&industry=10100']
    def parse(self, response):
        # The start URL returns JSON, not HTML: decode it and walk the result list
        js = json.loads(response.body)
        res = js['data']['results']
        for i in res:
            num = i['number']
            # e.g. https://jobs.zhaopin.com/CZ486770730J00164642602.htm
            url = 'https://jobs.zhaopin.com/' + str(num) + '.htm'
            yield Request(url=url, callback=self.parse_job)
        # Queue further result pages (start=60, 120, ...); widen the range to crawl more pages
        for i in range(1, 2):
            url = 'https://fe-api.zhaopin.com/c/i/sou?start=' + str(i * 60) + '&pageSize=60&cityId=489&industry=10100'
            yield Request(url=url, callback=self.parse)
    def parse_job(self, response):
        # Header fields of the detail page
        job_name = response.css('.l.info-h3::text').extract_first('')
        salary = response.css('.l.info-money strong::text').extract_first('')
        company = response.css('.company.l a::text').extract_first('')
        address = response.css('.info-three.l span a::text').extract_first('')
        # The third info row lists required experience, education and headcount, in that order
        info = response.css('.info-three.l span::text').extract()
        experience = info[0]
        graduate = info[1]
        num = info[2]
        # Job description; some pages wrap each paragraph in a <span>
        position = response.css('.pos-ul p::text').extract()
        if not position:
            position = response.css('.pos-ul p span::text').extract()
        # Detailed address and company information
        address_extract = response.css('.add-txt::text').extract_first('')
        company_intro = response.css('.intro-content::text').extract()
        company_area = response.css('.promulgator-ul.cl li strong a::text').extract_first('')
        company_type = response.css('.promulgator-ul.cl li strong::text').extract()
        job = Job()
        job['job_name'] = job_name
        job['salary'] = salary
        job['company'] = company
        job['address'] = address
        job['experience'] = experience
        job['graduate'] = graduate
        job['num'] = num
        job['position'] = position
        job['address_extract'] = address_extract
        job['company_intro'] = company_intro
        job['company_area'] = company_area
        job['company_type'] = company_type
        yield job
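
The CSS selectors in parse_job are tied to the page layout at the time of writing, so it is worth checking them interactively before running a full crawl, for example with Scrapy's shell against one of the detail URLs (the sample number below comes from the comment in parse):

scrapy shell https://jobs.zhaopin.com/CZ486770730J00164642602.htm
>>> response.css('.l.info-h3::text').extract_first('')            # job title
>>> response.css('.l.info-money strong::text').extract_first('')  # salary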

Save the scraped data to a CSV file:
items.py

class Job(scrapy.Item):
    job_name = scrapy.Field()
    salary = scrapy.Field()
    company = scrapy.Field()
    address = scrapy.Field()
    position = scrapy.Field()
    address_extract = scrapy.Field()
    company_intro = scrapy.Field()
    company_area = scrapy.Field()
    company_type = scrapy.Field()
    experience = scrapy.Field()
    graduate = scrapy.Field()
    num = scrapy.Field()

pipelines.py

import os
import csv


class Pipeline_ToCSV(object):

    def __init__(self):
        # Path of the CSV file; it does not need to exist beforehand
        store_file = os.path.dirname(__file__) + '/spiders/job.csv'
        # Open (create) the file; utf-8 keeps the Chinese text readable
        self.file = open(store_file, 'w+', newline='', encoding='utf-8')
        # CSV writer
        self.writer = csv.writer(self.file)

    def process_item(self, item, spider):
        self.writer.writerow((item['job_name'], item['salary'], item['company'],
                              item['address'], item['position'], item['address_extract'],
                              item['company_intro'], item['company_area'], item['company_type'],
                              item['experience'], item['graduate'], item['num']))
        return item

    def close_spider(self, spider):
        # Close the file when the spider finishes
        self.file.close()
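
As an aside, if no other processing is needed, Scrapy's built-in feed export can produce the same CSV without a custom pipeline:

scrapy crawl zhilian -o job.csv

The custom pipeline is kept here mainly to control the column order explicitly.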

settings.py

ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    'JobScrapy.pipelines.JobscrapyPipeline': 300,
    'JobScrapy.pipelines.Pipeline_ToCSV': 100,
}
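
These two settings are the only changes made here. If the API or the detail pages start rejecting Scrapy's default user agent (an assumption, not something observed in this post), a browser-like USER_AGENT can be added in the same file, for example:

# Hypothetical value; any recent browser user-agent string works
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0 Safari/537.36'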

main.py: the program entry point

import os
import sys
from scrapy.cmdline import execute

data = os.path.abspath(__file__)   # absolute path of this file
dir_file = os.path.dirname(data)   # its parent directory (the project root)
sys.path.append(dir_file)          # make sure the project package is importable
execute(['scrapy', 'crawl', 'zhilian'])
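
Running main.py from an IDE is equivalent to launching the crawl from the command line in the project root:

scrapy crawl zhilian

The sys.path line above just ensures the project package can be found when the script is started from elsewhere.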

If you spot any mistakes, please point them out. Thanks!
