【Scrapy】A hands-on Python web-scraping case study (personally tested and working)

一、Project structure
(screenshot of the project structure omitted)
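
For reference, the project generated by the two commands in the next step usually has the following layout (the exact file set can vary slightly between Scrapy versions):

```
tencent/
├── scrapy.cfg
└── tencent/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── hr.py        # created by `scrapy genspider hr careers.tencent.com`
```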

  1. Create the project
scrapy startproject tencent
  2. Generate the spider (a sketch of the generated skeleton follows)
cd .\tencent\
scrapy genspider hr careers.tencent.com
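
The genspider command creates tencent/spiders/hr.py with roughly the skeleton below (the exact template differs slightly between Scrapy versions); the finished spider in step 3 replaces the start_urls/parse stubs with a custom start_requests and two callbacks:

```python
# Approximate skeleton generated by `scrapy genspider hr careers.tencent.com`.
import scrapy


class HrSpider(scrapy.Spider):
    name = "hr"
    allowed_domains = ["careers.tencent.com"]
    start_urls = ["http://careers.tencent.com/"]

    def parse(self, response):
        pass
```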

3. hr.py source code

import scrapy
from urllib import parse
import time
import json
from tencent.items import TencentItem


class HrSpider(scrapy.Spider):
    name = "hr"
    allowed_domains = ["careers.tencent.com"]

    # start_urls = ["http://careers.tencent.com/"]
    hr_url = "https://careers.tencent.com/tencentcareer/api/post/Query"
    data = {
        'timestamp': '',
        'countryId': '',
        'cityId': 2,  # city id filter
        'bgIds': '',
        'productId': '',
        'parentCategoryId': '',
        'attrId': '',
        'keyWord': 'python',  # search keyword
        'pageIndex': 1,  # page number
        'pageSize': 10,  # page size
        'lanugage': 'zh-cn',
        'area': 'cn'
    }

    # build the initial listing request
    def start_requests(self):
        self.data['timestamp'] = int(time.time() * 1000)
        url = self.hr_url + "?" + parse.urlencode(self.data)
        yield scrapy.Request(url, callback=self.parse1)

    def parse1(self, response):
        # print(response.url)
        json_data = json.loads(response.body)
        # proceed only if posts were returned; pagination stops once Posts is no longer a list
        if isinstance(json_data['Data']['Posts'], list):
            for dd in json_data['Data']['Posts']:
                # print("dd:",dd)
                # detail API endpoint for a single post
                desc_url = "https://careers.tencent.com/tencentcareer/api/post/ByPostId"
                params = {
                    'timestamp': int(time.time() * 1000),
                    'postId': dd['PostId'],
                    'language': 'zh-cn'
                }
                durl = desc_url + "?" + parse.urlencode(params)
                yield scrapy.Request(durl, callback=self.parse2)

            # queue the request for the next page of listings
            self.data['pageIndex'] += 1
            self.data['timestamp'] = int(time.time() * 1000)
            url = self.hr_url + "?" + parse.urlencode(self.data)
            yield scrapy.Request(url, callback=self.parse1)

    # def parse(self, response):
    #     pass
    # parse the job detail response
    def parse2(self, response):
        # print("详情:" + response.url)
        json_data = json.loads(response.body)
        item = TencentItem()
        item['PostId'] = json_data['Data']['PostId']
        item['RecruitPostName'] = json_data['Data']['RecruitPostName']
        item['LocationName'] = json_data['Data']['LocationName']
        item['BGName'] = json_data['Data']['BGName']
        item['CategoryName'] = json_data['Data']['CategoryName']
        item['Responsibility'] = json_data['Data']['Responsibility']
        item['Requirement'] = json_data['Data']['Requirement']
        yield item
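
Before relying on the full crawl, it can be worth sanity-checking the Query endpoint outside Scrapy. The following standalone sketch (assuming the requests package is installed and that the API still returns the {"Data": {"Posts": [...]}} shape the spider expects) issues a single listing request with the same parameters the spider builds:

```python
# Standalone sanity check of the listing API used by HrSpider (sketch only).
import time
from urllib import parse

import requests

params = {
    'timestamp': int(time.time() * 1000),
    'cityId': 2,
    'keyWord': 'python',
    'pageIndex': 1,
    'pageSize': 10,
    'lanugage': 'zh-cn',   # parameter name kept exactly as in the spider
    'area': 'cn',
}
url = "https://careers.tencent.com/tencentcareer/api/post/Query?" + parse.urlencode(params)
resp = requests.get(url, timeout=10)
posts = (resp.json().get('Data') or {}).get('Posts') or []
for post in posts:
    print(post.get('PostId'), post.get('RecruitPostName'))
```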

4. items.py source code

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class TencentItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    PostId = scrapy.Field()  # post id
    RecruitPostName = scrapy.Field()  # job title
    LocationName = scrapy.Field()  # work location
    BGName = scrapy.Field()  # business group (department)
    CategoryName = scrapy.Field()  # job category
    Responsibility = scrapy.Field()  # responsibilities
    Requirement = scrapy.Field()  # requirements
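
Scrapy items behave like dictionaries restricted to the declared fields, which is why every column to be stored gets a Field() above. A small illustration (the values are made up):

```python
# TencentItem acts like a dict limited to its declared fields (example values are made up).
from tencent.items import TencentItem

item = TencentItem()
item['PostId'] = '1'
item['RecruitPostName'] = 'Python backend developer'
print(dict(item))        # {'PostId': '1', 'RecruitPostName': 'Python backend developer'}
# item['salary'] = 100   # would raise KeyError: only declared fields can be set
```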

5. pipelines.py source code

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql


class TencentPipeline:
    def process_item(self, item, spider):
        return item


# writes scraped items into MySQL
class MySQLPipeline(object):
    def __init__(self, host, user, password, database, port):
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASS'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        '''Open the database connection when the spider starts.'''
        self.db = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            db=self.database,
            port=self.port,
            charset='utf8',
        )
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        '''Close the database connection when the spider finishes.'''
        self.db.close()

    def process_item(self, item, spider):
        '''Insert one scraped item into the hr table.'''
        sql = "insert into hr(pid,pname,city,bgname,cname,responsibility,requirement) values(%s,%s,%s,%s,%s,%s,%s)"
        self.cursor.execute(sql, (
            item['PostId'], item['RecruitPostName'], item['LocationName'], item['BGName'], item['CategoryName'],
            item['Responsibility'], item['Requirement']))
        self.db.commit()  # commit the transaction
        return item
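
Because pid is the primary key of the hr table (see step 9), re-running the crawl will hit duplicate-key errors with a plain INSERT. A possible alternative for process_item in the MySQLPipeline above (a sketch, not part of the original project) upserts on pid and rolls back on failure so one bad row does not stop the crawl:

```python
    # Sketch of a more defensive process_item for MySQLPipeline:
    # upsert on the pid primary key and roll back on errors.
    def process_item(self, item, spider):
        sql = (
            "INSERT INTO hr(pid, pname, city, bgname, cname, responsibility, requirement) "
            "VALUES (%s, %s, %s, %s, %s, %s, %s) "
            "ON DUPLICATE KEY UPDATE pname=VALUES(pname), city=VALUES(city), "
            "bgname=VALUES(bgname), cname=VALUES(cname), "
            "responsibility=VALUES(responsibility), requirement=VALUES(requirement)"
        )
        try:
            self.cursor.execute(sql, (
                item['PostId'], item['RecruitPostName'], item['LocationName'],
                item['BGName'], item['CategoryName'],
                item['Responsibility'], item['Requirement']))
            self.db.commit()
        except pymysql.MySQLError as exc:
            self.db.rollback()
            spider.logger.error("MySQL write failed for post %s: %s", item['PostId'], exc)
        return item
```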

6. settings.py source code

# Scrapy settings for tencent project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "tencent"

SPIDER_MODULES = ["tencent.spiders"]
NEWSPIDER_MODULE = "tencent.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = "tencent (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    "tencent.middlewares.TencentSpiderMiddleware": 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    "tencent.middlewares.TencentDownloaderMiddleware": 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # "tencent.pipelines.TencentPipeline": 300,
    "tencent.pipelines.MySQLPipeline": 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'yurgqg'
MYSQL_USER = 'root'
MYSQL_PASS = 'root'
MYSQL_PORT = 3306
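
The MYSQL_* values above are examples from the author's environment; adjust them to match your own server, and make sure the yurgqg database and the hr table (step 9) exist before the crawl starts. A quick standalone connectivity check (a sketch using the same credentials) could look like this:

```python
# Standalone check that the MySQL settings above are reachable (adjust values as needed).
import pymysql

conn = pymysql.connect(host='localhost', user='root', password='root',
                       database='yurgqg', port=3306, charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("SELECT VERSION()")
    print(cursor.fetchone())
conn.close()
```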

7. Run the crawl
(screenshots of the command-line output omitted)
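
From the project root (the directory containing scrapy.cfg), the crawl is started with the spider name registered in hr.py:

scrapy crawl hr

To inspect the scraped items without MySQL, an export file can be requested instead, e.g. scrapy crawl hr -O hr.json (the -O overwrite flag needs Scrapy 2.1 or newer).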

8. Database result
(screenshot of the populated hr table omitted)
9. Table creation SQL

CREATE TABLE yurgqg.`hr` (
  `pid` varchar(50) NOT NULL,
  `pname` varchar(255) DEFAULT NULL,
  `city` varchar(32) DEFAULT NULL,
  `bgname` varchar(32) DEFAULT NULL,
  `cname` varchar(32) DEFAULT NULL,
  `responsibility` text DEFAULT NULL,
  `requirement` text DEFAULT NULL,
  PRIMARY KEY (`pid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
