# -*- coding: utf-8 -*- import scrapy from urllib import request from Py06_2018_3_16.items import TencentItem class TencentSpider(scrapy.Spider): name = 'tencent' allowed_domains = ['hr.tencent.com'] start_urls = [] base_url='https://hr.tencent.com/position.php?&start=%s#a' for i in range(0,1): url=base_url%(i-1)*10 start_urls.append(url) def parse(self, response): job_even=response.xpath('//tr[@class="even"]') job_odd=response.xpath('//tr[@class="odd"]') #合并数组 jobs=job_even+job_odd for job in jobs: print(job) item=TencentItem() #时间 date=job.xpath('.//td[5]/text()').extract()[0] item['date']=date #地点 location=job.xpath('.//td[4]/text()').extract()[0] item['location']=location #人数 num = job.xpath('.//td[3]/text()').extract()[0] item['num'] = num #职位类别 type = job.xpath('.//td[2]/text()').extract()[0] item['type'] = type #职位名称 name=job.xpath('.//td[1]/a/text()').extract()[0] item['name']=name #链接 url = job.xpath('.//td[1]/a/@href').extract()[0] # 比较低级 # url='https://hr.tencent.com/'+url #高级 #拼接全路径 url = request.urljoin(response.url,url) item['url'] = url print(name+'\t'+type+'\t'+num+'\t'+location+'\t'+date+'\t'+url) print('~~~~~~~~~~~') # yield item #请求详情页,二级流程 yield scrapy.Request(url=url,callback=self.parse_detail,meta={'data':item}) def parse_detail(self,response): # print('~~~~~~~~~~~detail~~~~~~~~~~~~`') # print(response.text) item=response.meta['data'] with open('detail.html','w',encoding='utf-8') as f: f.write(response.body.decode('utf-8')) #工作职责 duty=response.xpath('//tr[@class="c"][1]//li/text()').extract() # print(duty)#字符串列表 duty=''.join(duty) item['duty'] = duty #工作要求 rq=response.xpath('//tr[@class="c"][2]//li/text()').extract() rq=''.join(rq) item['rq']=rq print(duty+'\n') print(rq) print('~~~~~~~~~~~~~~~`') yield item
Python3~scrapy项目之爬取当前页和详细页
最新推荐文章于 2024-04-11 19:16:54 发布
本文介绍了一个使用Python Scrapy框架爬取腾讯招聘网站职位信息的项目,包括当前页面的职位名称、类型、人数、地点、日期和链接,以及通过链接深入抓取职位详情页的工作职责和要求。
摘要由CSDN通过智能技术生成