程序主要代码:(tencentPosition.py)
# -*- coding: utf-8 -*-
import scrapy

from tencent.items import TencentItem


class TencentpositionSpider(scrapy.Spider):
    """Scrape the job-posting table from the Tencent HR position listing.

    Every table row with class ``even`` or ``odd`` becomes one
    ``TencentItem``. Listing pages are addressed through the ``start`` query
    parameter; the spider walks them in steps of ``page_size`` until
    ``max_offset`` has been requested.
    """

    name = "tencent"
    allowed_domains = ["tencent.com"]

    # Base listing URL; the numeric ``start`` offset is appended to it.
    url = "https://hr.tencent.com/position.php?&start="
    # Offset of the page currently being crawled.
    offset = 0
    # Number the offset is advanced by for each following page.
    page_size = 10
    # Last offset that is requested; pagination stops once it is reached.
    max_offset = 1680

    # Only the first request comes from here; later pages are scheduled
    # from parse().
    start_urls = [url + str(offset)]

    def parse(self, response):
        """Yield one item per listing row, then schedule the next page.

        Cells that are missing or empty are stored as ``None`` instead of
        raising ``IndexError``, so a single incomplete row no longer aborts
        the rest of the page.
        """
        for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            item = TencentItem()
            # Position name
            item['positionname'] = each.xpath("./td[1]/a/text()").extract_first()
            # Link to the detail page
            item['positionlink'] = each.xpath("./td[1]/a/@href").extract_first()
            # Position category
            item['positionType'] = each.xpath("./td[2]/text()").extract_first()
            # Number of openings
            item['peopleNum'] = each.xpath("./td[3]/text()").extract_first()
            # Work location
            item['workLocation'] = each.xpath("./td[4]/text()").extract_first()
            # Publish date
            item['publishTime'] = each.xpath("./td[5]/text()").extract_first()

            yield item

        # Advance before building the next request: re-requesting the
        # current URL would fetch nothing new. Nothing is scheduled once the
        # last offset has been crawled, which lets the spider finish.
        if self.offset < self.max_offset:
            self.offset += self.page_size
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)


# Pipeline file (pipelines.py)
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json


class TencentPipeline:
    """Write every scraped item to ``tencent.json`` as one JSON object per line."""

    def __init__(self):
        # Open file object for the output (the attribute name is historical).
        # It stays None until the file is actually opened, so constructing
        # the pipeline no longer truncates an existing tencent.json.
        self.filename = None

    def open_spider(self, spider):
        """Open (and truncate) the output file as UTF-8 text."""
        self.filename = open("tencent.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        """Append ``item`` as a single JSON line and return it unchanged."""
        if self.filename is None:
            # Keeps direct use working when open_spider() was never called.
            self.open_spider(spider)
        # ensure_ascii=False writes non-ASCII text as-is instead of \u escapes.
        text = json.dumps(dict(item), ensure_ascii=False)
        self.filename.write(text + "\n")
        return item

    def close_spider(self, spider):
        """Close the output file if it was opened."""
        if self.filename is not None:
            self.filename.close()