python3 scrapy_Python3~scrapy项目之爬取当前页和详细页

Python3~scrapy项目之爬取当前页和详细页

发布时间:2018-05-30 16:00,浏览次数:299,标签:Python、scrapy

# -*- coding: utf-8 -*-
"""Scrapy spider that scrapes a job listing page and each job's detail page."""
from urllib.parse import urljoin

import scrapy

from Py06_2018_3_16.items import TencentItem


def _first(selector, query):
    """Return the first string matched by *query*, or '' when nothing matches.

    Replaces the ``extract()[0]`` idiom, which raises ``IndexError`` as soon
    as a table cell is empty.
    """
    return selector.xpath(query).extract_first(default='')


class TencentSpider(scrapy.Spider):
    """Crawl the listing table, then follow each row's link to its detail page.

    ``parse`` fills the listing fields of a ``TencentItem`` and hands the
    partially filled item to ``parse_detail`` through ``Request.meta``;
    ``parse_detail`` adds the ``duty`` and ``rq`` fields and yields the item.
    """

    name = 'tencent'
    allowed_domains = ['hr.tencent.com']
    base_url = 'https://hr.tencent.com/position.php?&start=%s#a'

    # Build the listing URLs. The offset must be computed BEFORE formatting:
    # the previous ``base_url % (i - 1) * 10`` formatted first and then
    # repeated the resulting string ten times.
    # NOTE(review): `start` looks like a row offset in steps of 10 - confirm.
    start_urls = []
    for i in range(0, 1):
        start_urls.append(base_url % (i * 10))

    def parse(self, response):
        """Yield one detail-page request per job row of a listing page.

        Rows whose link cell has no ``href`` cannot be followed, so their
        item is yielded directly with only the listing fields set.
        """
        job_even = response.xpath('//tr[@class="even"]')
        job_odd = response.xpath('//tr[@class="odd"]')
        # Concatenate both row groups (even rows first, then odd rows).
        jobs = job_even + job_odd

        for job in jobs:
            print(job)
            item = TencentItem()

            # Publish date
            date = _first(job, './/td[5]/text()')
            item['date'] = date
            # Location
            location = _first(job, './/td[4]/text()')
            item['location'] = location
            # Head count
            num = _first(job, './/td[3]/text()')
            item['num'] = num
            # Job category (local renamed so the builtin `type` is not shadowed)
            job_type = _first(job, './/td[2]/text()')
            item['type'] = job_type
            # Job title
            name = _first(job, './/td[1]/a/text()')
            item['name'] = name

            # Link to the detail page; may be relative in the markup.
            href = job.xpath('.//td[1]/a/@href').extract_first()
            if href is None:
                item['url'] = ''
                yield item
                continue

            # Resolve against the page URL instead of hard-coding the host.
            url = urljoin(response.url, href)
            item['url'] = url

            print('\t'.join([name, job_type, num, location, date, url]))
            print('~~~~~~~~~~~')

            # Second stage: fetch the detail page, carrying the item along.
            yield scrapy.Request(
                url=url,
                callback=self.parse_detail,
                meta={'data': item},
            )

    def parse_detail(self, response):
        """Add the duty and requirement text to the item passed in ``meta``."""
        item = response.meta['data']

        # Debug dump: the file name is fixed and opened with 'w', so it is
        # overwritten by every detail response and holds only the latest one.
        with open('detail.html', 'w', encoding='utf-8') as f:
            f.write(response.body.decode('utf-8'))

        # Job duties: extract() gives a list of strings, joined into one.
        duty = ''.join(
            response.xpath('//tr[@class="c"][1]//li/text()').extract()
        )
        item['duty'] = duty

        # Job requirements
        rq = ''.join(
            response.xpath('//tr[@class="c"][2]//li/text()').extract()
        )
        item['rq'] = rq

        print(duty + '\n')
        print(rq)
        print('~~~~~~~~~~~~~~~`')
        yield item

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值