python爬取腾讯招聘的职位

最新推荐文章于 2021-12-24 13:54:06 发布

无悔_一叶扁舟

最新推荐文章于 2021-12-24 13:54:06 发布

阅读量1k

点赞数 1

分类专栏： python爬虫 文章标签： 爬取Tencent招聘 crawlspiders

本文链接：https://blog.csdn.net/u011662320/article/details/82725525

版权

python爬虫专栏收录该内容

2 篇文章 0 订阅

订阅专栏

1.新建项目使用命令

scrapy startproject tencentcrawl

2.进入tencentcrawl\spiders

scrapy genspider -t crawl tencent hr.tencent.com

-t是模板的意思

3.编写items.py文件

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy



class TencentItem(scrapy.Item):
    """Container for the fields scraped from one job-listing row."""
    # Position title
    position_name = scrapy.Field()
    # Link to the position's detail page
    position_link = scrapy.Field()
    # Position category
    position_type = scrapy.Field()
    # Number of people being hired
    people_num = scrapy.Field()
    # Work location
    work_location = scrapy.Field()
    # Publication date
    publish_time = scrapy.Field()

4.编写爬虫/spiders/tencent.py

# -*- coding: utf-8 -*-
import scrapy
# 导入链接匹配类，用于匹配符合规则的链接
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from tencentcrawl.items import TencentItem


class TencentSpider(CrawlSpider):
    """Follow the paginated job listing and yield one TencentItem per table row."""

    name = "tencent"
    # Scrapy's offsite filtering reads the attribute named ``allowed_domains``.
    allowed_domains = ["hr.tencent.com"]
    start_urls = ["http://hr.tencent.com/position.php?&start=0#a"]

    # Pagination links are recognised by their "start=<number>" query part.
    page_link = LinkExtractor(allow=r"start=\d+")
    rules = (
        # ``callback`` is the *name* of the method (a string), not the method
        # object; ``follow=True`` keeps extracting links from followed pages.
        Rule(page_link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Yield a TencentItem for every listing row found in ``response``.

        Rows are the ``<tr>`` elements with class ``even`` or ``odd``. A cell
        that has no matching text node yields ``None`` for that field instead
        of raising, so one incomplete row does not abort the rest of the page.
        """
        rows = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        for each in rows:
            item = TencentItem()
            # Position title
            item['position_name'] = each.xpath("./td[1]/a/text()").extract_first()
            # Link to the detail page
            item['position_link'] = each.xpath("./td[1]/a/@href").extract_first()
            # Position category
            item['position_type'] = each.xpath("./td[2]/text()").extract_first()
            # Number of people being hired
            item['people_num'] = each.xpath("./td[3]/text()").extract_first()
            # Work location
            item['work_location'] = each.xpath("./td[4]/text()").extract_first()
            # Publication date
            item['publish_time'] = each.xpath("./td[5]/text()").extract_first()
            yield item

5.写管道文件pipelines.py:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json

class TencentcrawlPipeline(object):
    """Append every scraped item to ``position.json``, one JSON object per line.

    Each line is terminated with ``",\\n"``, so the file is a sequence of
    comma-terminated JSON objects rather than a single JSON document.
    """

    def __init__(self):
        # Opened in binary mode because process_item encodes to UTF-8 itself.
        self.filename = open("position.json", "wb")

    def process_item(self, item, spider):
        """Serialize ``item`` to a JSON line, write it, and return ``item``.

        ``ensure_ascii=False`` keeps non-ASCII text readable in the output
        instead of emitting ``\\uXXXX`` escapes.
        """
        text = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.filename.write(text.encode("utf_8"))
        return item

    def close_spider(self, spider=None):
        """Close the output file.

        Scrapy invokes this hook with the spider as an argument; the default
        keeps argument-less calls working as well.
        """
        self.filename.close()

6.在settings.py中配置日志并启用pipeline

# -*- coding: utf-8 -*-

# Scrapy settings for tencentcrawl project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'tencentcrawl'

SPIDER_MODULES = ['tencentcrawl.spiders']
NEWSPIDER_MODULE = 'tencentcrawl.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tencentcrawl (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# LOG_ENABLED  default: True; enables logging
# LOG_ENCODING default: 'utf-8'; encoding used for logging
# LOG_FILE     default: None; file name of the logging output file, created in the current directory
# LOG_LEVEL    default: 'DEBUG'; minimum level to log
# LOG_STDOUT   default: False; if True, all standard output (and error) of the process is redirected to the log. For example, running print "hello" makes it show up in the Scrapy log.

LOG_FILE="position.log"
# Level of the messages saved to the log
LOG_LEVEL="INFO"
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'tencentcrawl.middlewares.TencentcrawlSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'tencentcrawl.middlewares.TencentcrawlDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Registers TencentcrawlPipeline (module tencentcrawl.pipelines) with order value 300.
ITEM_PIPELINES = {
   'tencentcrawl.pipelines.TencentcrawlPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

7.控制台执行命令:

scrapy crawl tencent

8.文件项目结构图和结果图: