1. txzhaopin_spider.py
import json
import math

import scrapy

from TxZhaopin.items import TxzhaopinItem, CountItem


class TxzhaopinSpiderSpider(scrapy.Spider):
    name = 'txzhaopin_spider'
    allowed_domains = ['careers.tencent.com']

    def start_requests(self):
        self.keyword = 'java'
        # The first request only fetches the total hit count, so that
        # parse_count can schedule one request per result page.
        start_url = f'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1650935321763&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword={self.keyword}&pageIndex=1&pageSize=10&language=zh-cn&area=cn'
        yield scrapy.Request(start_url, callback=self.parse_count)
    def parse(self, response, **kwargs):
        # Each page carries up to 10 postings under Data -> Posts.
        post_list = json.loads(response.text)['Data']['Posts']
        for post in post_list:
            # Create a fresh item per posting; reusing a single instance
            # across iterations would let later postings overwrite earlier ones.
            item = TxzhaopinItem()
            item['post_name'] = post['RecruitPostName']
            item['location'] = post['CountryName'] + post['LocationName']
            item['category'] = post['CategoryName']
            item['update_time'] = post['LastUpdateTime']
            item['url'] = 'https://careers.tencent.com/jobdesc.html?postId=' + post['PostId']
            item['keywords'] = self.keyword
            yield item
    def parse_count(self, response, **kwargs):
        count = CountItem()
        count['count_num'] = json.loads(response.text)['Data']['Count']
        # With pageSize=10, the number of result pages is ceil(count / 10).
        for pagenum in range(1, math.ceil(count['count_num'] / 10) + 1):
            url = f'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1650935321763&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword={self.keyword}&pageIndex={pagenum}&pageSize=10&language=zh-cn&area=cn'
            yield scrapy.Request(url, callback=self.parse)
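For reference, the spider assumes the Query API returns JSON shaped roughly like the sketch below; the field names come from the parsing code above, while the concrete values are elided. For example, with Count = 57 and pageSize=10, parse_count schedules ceil(57 / 10) = 6 page requests.

{
    "Data": {
        "Count": 57,
        "Posts": [
            {
                "RecruitPostName": "...",
                "CountryName": "...",
                "LocationName": "...",
                "CategoryName": "...",
                "LastUpdateTime": "...",
                "PostId": "..."
            }
        ]
    }
}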
2. items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class TxzhaopinItem(scrapy.Item):
    # Fields match the keys populated in txzhaopin_spider.py.
    post_name = scrapy.Field()
    location = scrapy.Field()
    category = scrapy.Field()
    update_time = scrapy.Field()
    url = scrapy.Field()
    keywords = scrapy.Field()


class CountItem(scrapy.Item):
    # Holds the total number of matching postings, used only for paging.
    count_num = scrapy.Field()
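One reason to declare these fields rather than pass plain dicts around: scrapy.Item rejects keys that were never declared, which catches typos early. A small illustration; the 'salary' field is deliberately one that items.py does not declare:

item = TxzhaopinItem()
item['post_name'] = 'backend engineer'  # declared above, accepted
try:
    item['salary'] = '30k'              # never declared, raises KeyError
except KeyError as err:
    print(err)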
3. pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class MysqlPipeline:
    def open_spider(self, spider):
        print('Spider started............')
        # Fill in your own connection details. charset='utf8mb4' keeps the
        # non-ASCII "、" separator used below intact.
        self.conn = pymysql.connect(host='...', port=3306, user='...',
                                    password='...', db='...', charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            # Deduplicate on URL: insert a posting only if it is new.
            # Placeholders (%s) let pymysql escape the values safely instead
            # of interpolating them into the SQL string.
            sql_select = 'SELECT id FROM tx_recruit WHERE POSITION(%s IN url);'
            self.cursor.execute(sql_select, (item['url'],))
            id_list = self.cursor.fetchall()
            if not id_list:
                sql_insert = ('INSERT INTO tx_recruit (`post_name`, `location`, `category`, '
                              '`update_time`, `url`, `keywords`) '
                              'VALUES (%s, %s, %s, %s, %s, %s);')
                self.cursor.execute(sql_insert, [item['post_name'], item['location'],
                                                 item['category'], item['update_time'],
                                                 item['url'], item['keywords']])
            else:
                # Known posting: append the current keyword to the existing
                # row unless it is already recorded there.
                for (post_id,) in id_list:
                    sql_update = ('UPDATE tx_recruit SET keywords = CONCAT(keywords, "、", %s) '
                                  'WHERE id = %s AND NOT POSITION(%s IN keywords);')
                    self.cursor.execute(sql_update,
                                        (item['keywords'], post_id, item['keywords']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        print('Spider finished............')
        self.cursor.close()
        self.conn.close()
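The pipeline assumes a tx_recruit table already exists. A minimal one-off setup script is sketched below; the column types and lengths are assumptions inferred from the scraped fields, so adjust them and the connection details to your environment:

import pymysql

conn = pymysql.connect(host='...', port=3306, user='...',
                       password='...', db='...', charset='utf8mb4')
with conn.cursor() as cursor:
    # Column types are guesses based on the fields scraped above.
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS tx_recruit (
            id          INT AUTO_INCREMENT PRIMARY KEY,
            post_name   VARCHAR(255),
            location    VARCHAR(255),
            category    VARCHAR(255),
            update_time VARCHAR(64),
            url         VARCHAR(512),
            keywords    VARCHAR(255)
        ) DEFAULT CHARSET = utf8mb4;
    ''')
conn.commit()
conn.close()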
4. middlewares.py
A custom User-Agent middleware:
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
class UserAgentMiddleware:
    def process_request(self, request, spider):
        # Attach a browser User-Agent to every outgoing request so the API
        # does not see Scrapy's default one.
        request.headers['User-Agent'] = 'Mozilla/5.0 ......'
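If a single hard-coded User-Agent proves too easy to block, a common variant is to rotate through a small pool at random. A sketch, with the UA strings left as placeholders to fill in:

import random

USER_AGENTS = [
    'Mozilla/5.0 ......',  # fill in real browser UA strings
    'Mozilla/5.0 ......',
]


class RandomUserAgentMiddleware:
    def process_request(self, request, spider):
        # Pick a different User-Agent for each request.
        request.headers['User-Agent'] = random.choice(USER_AGENTS)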
5. settings.py
# Scrapy settings for TxZhaopin project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'TxZhaopin'
SPIDER_MODULES = ['TxZhaopin.spiders']
NEWSPIDER_MODULE = 'TxZhaopin.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = ''
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 3
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'cookie': '',
# 'user-agent': USER_AGENT,
# }
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'TxZhaopin.middlewares.TxzhaopinSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
# 'TxZhaopin.middlewares.TxzhaopinDownloaderMiddleware': 543,
'TxZhaopin.middlewares.UserAgentMiddleware': 543,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
# 'TxZhaopin.pipelines.TxzhaopinPipeline': 300,
'TxZhaopin.pipelines.MysqlPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
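With the middleware and pipeline registered above, the crawl is started from the project root using the name declared in txzhaopin_spider.py:

scrapy crawl txzhaopin_spider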