# Crawl Tencent job postings with the Scrapy framework and save them to a database
# main.py
# Programmatic entry point: launch the "tencent_new" spider without the shell.
from scrapy import cmdline

# Equivalent to running `scrapy crawl tencent_new` from the command line;
# pass the argv list directly rather than splitting a command string.
cmdline.execute(['scrapy', 'crawl', 'tencent_new'])
# tencent_new.py
# -*- coding: utf-8 -*-
import scrapy
from urllib import request,parse
from ..items import JobItem
from datetime import datetime
class TencentNewSpider(scrapy.Spider):
    """Spider for Tencent HR job postings: paginated list pages -> job detail pages."""
    name = 'tencent_new'
    # Keep the crawl on tencent.com (and its subdomains).
    allowed_domains = ['tencent.com']
    # Seed request; parse() fans out to the paginated list URLs built from base_url.
    start_urls = ['http://hr.tencent.com']
    # %d is the result offset — the site lists 10 postings per page.
    base_url = 'https://hr.tencent.com/position.php?start=%d'
def parse(self, response):
# 构造分页请求
for i in range(0,100 + 1,10):
fullurl = self.base_url % i
yield scrapy.Request(fullurl,callback=self.parse_list)
# Parse a job-list page
def parse_list(self,response):
# print(response.url)
detail_list = response.xpath('//td[@class="l square"]/a/@href').extract()
for link in detail_list:
link = request.urljoin(self.base_url,link)
yield scrapy.Request(link,callback=self.parse_detail)
# Parse a job-detail page
def parse_detail(self,response):
item = JobItem()
p_name = response.xpath('//td[@id="sharetitle"]/text()').extract_first()
p_location = response.xpath('//tr[@class="c bottomline"]/td[1]/text()').extract_first()
p_type = response.xpath('//tr[@class="c bottomline"]/td[2]/text()').extract_first()
p_number = response.xpath('//tr[@class="c bottomline"]/td[3]/text()').extract_first()
# print(p_name,p_location,p_type,p_number)
p_info = response.xpath('//ul[@class="squareli"]')
p_duty = p_info[0]
p_require = p_inf