Zhaopin (智联招聘) loads its listings dynamically, so the first URL we parse is not the web page itself but a JSON API. To find that JSON endpoint, look at the URLs of individual job postings and note the pattern: https://jobs.zhaopin.com/XXXXX.htm, where XXXXX is the value of the `number` field in the JSON response.
Steps:
- Parse the JSON response and collect every `number` value
- Concatenate each value into a detail-page URL
- Visit each detail URL and extract the fields we want (a quick sanity check of the JSON structure is sketched right after this list)
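Before writing the spider, it helps to hit the search API directly and confirm that each result really carries a `number` field. A minimal sketch using requests, assuming the endpoint and parameters copied from the spider's start_urls below; the User-Agent header is my own addition, and the API may have changed since this was written:

import requests

# Quick check of the search API used by the spider below. The endpoint and
# parameters come from start_urls; the User-Agent header is an assumption,
# and the API may require other headers or have changed since this was written.
url = 'https://fe-api.zhaopin.com/c/i/sou?start=0&pageSize=60&cityId=489&industry=10100'
data = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).json()
for job in data['data']['results'][:5]:
    # Each result's "number" is the slug of the detail page.
    print(job['number'], '->', 'https://jobs.zhaopin.com/' + str(job['number']) + '.htm')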
Source code of the spider:
# -*- coding: utf-8 -*-
import json

import scrapy
from scrapy import Request

from JobScrapy.items import Job


class ZhilianSpider(scrapy.Spider):
    name = 'zhilian'
    # allowed_domains = ['www.zhaopin.com']
    # start_urls = ['https://jobs.zhaopin.com/156244110251133.htm']
    start_urls = ['https://fe-api.zhaopin.com/c/i/sou?start=0&pageSize=60&cityId=489&industry=10100']

    def parse(self, response):
        js = json.loads(response.body)
        res = js['data']['results']
        for i in res:
            num = i['number']
            # e.g. https://jobs.zhaopin.com/CZ486770730J00164642602.htm
            url = 'https://jobs.zhaopin.com/' + str(num) + '.htm'
            yield Request(url=url, callback=self.parse_job)
        # Follow further pages of the search API; start is the result offset
        # (increase the range to crawl more pages).
        for i in range(1, 2):
            url = 'https://fe-api.zhaopin.com/c/i/sou?start=' + str(i * 60) + '&pageSize=60&cityId=489&industry=10100'
            yield Request(url=url, callback=self.parse)

    def parse_job(self, response):
        job_name = response.css('.l.info-h3::text').extract_first('')
        salary = response.css('.l.info-money strong::text').extract_first('')
        company = response.css('.company.l a::text').extract_first('')
        address = response.css('.info-three.l span a::text').extract_first('')
        # The three <span> texts are experience, education and headcount.
        info = response.css('.info-three.l span::text').extract()
        experience = info[0]
        graduate = info[1]
        num = info[2]
        position = response.css('.pos-ul p::text').extract()
        if not position:
            # Some pages wrap the job description in <span> tags instead.
            position = response.css('.pos-ul p span::text').extract()
        address_extract = response.css('.add-txt::text').extract_first('')
        company_intro = response.css('.intro-content::text').extract()
        company_area = response.css('.promulgator-ul.cl li strong a::text').extract_first('')
        company_type = response.css('.promulgator-ul.cl li strong::text').extract()

        job = Job()
        job['job_name'] = job_name
        job['salary'] = salary
        job['company'] = company
        job['address'] = address
        job['experience'] = experience
        job['graduate'] = graduate
        job['num'] = num
        job['position'] = position
        job['address_extract'] = address_extract
        job['company_intro'] = company_intro
        job['company_area'] = company_area
        job['company_type'] = company_type
        yield job
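Note: on Scrapy 2.2 or newer, js = response.json() does the same job as json.loads(response.body).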
Save the scraped data to a CSV file:
items.py:
import scrapy


class Job(scrapy.Item):
    job_name = scrapy.Field()
    salary = scrapy.Field()
    company = scrapy.Field()
    address = scrapy.Field()
    position = scrapy.Field()
    address_extract = scrapy.Field()
    company_intro = scrapy.Field()
    company_area = scrapy.Field()
    company_type = scrapy.Field()
    experience = scrapy.Field()
    graduate = scrapy.Field()
    num = scrapy.Field()
pipelines.py:
import csv
import os


class Pipeline_ToCSV(object):
    def __init__(self):
        # Path of the CSV file; it does not need to exist beforehand.
        store_file = os.path.dirname(__file__) + '/spiders/job.csv'
        # Open (create) the file; newline='' avoids blank rows on Windows,
        # and encoding='utf-8' keeps the Chinese text readable.
        self.file = open(store_file, 'w+', newline='', encoding='utf-8')
        self.writer = csv.writer(self.file)

    def process_item(self, item, spider):
        self.writer.writerow((item['job_name'], item['salary'], item['company'],
                              item['address'], item['position'], item['address_extract'],
                              item['company_intro'], item['company_area'],
                              item['company_type'], item['experience'],
                              item['graduate'], item['num']))
        return item

    def close_spider(self, spider):
        # Close the file when the spider finishes so the data is flushed.
        self.file.close()
settings.py:
# Ignore robots.txt so the API requests are not filtered out
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    'JobScrapy.pipelines.JobscrapyPipeline': 300,
    'JobScrapy.pipelines.Pipeline_ToCSV': 100,
}
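As an aside, a custom pipeline is not strictly required for CSV output: Scrapy's built-in feed exports can write items to CSV directly, e.g. by running scrapy crawl zhilian -o job.csv. The column order is then up to Scrapy unless FEED_EXPORT_FIELDS is set, which is why the pipeline above writes the columns explicitly.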
main.py (the program entry point):
import os
import sys

from scrapy.cmdline import execute

data = os.path.abspath(__file__)   # absolute path of this file
dir_file = os.path.dirname(data)   # its parent directory
sys.path.append(dir_file)          # make the project importable
execute(['scrapy', 'crawl', 'zhilian'])
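Running python main.py from the project root is equivalent to typing scrapy crawl zhilian on the command line, which makes it convenient to launch and debug the crawl from an IDE.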
If you spot any problems, please point them out. Thanks!