【Scrapy】A hands-on Python web-scraping case study (personally tested and working)

一、Project structure
(screenshot of the project structure omitted)
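
For reference, the project generated by the two commands in the next step usually has the following layout (the exact file set can vary slightly between Scrapy versions):

```
tencent/
├── scrapy.cfg
└── tencent/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── hr.py        # created by `scrapy genspider hr careers.tencent.com`
```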

  1. Create the project
scrapy startproject tencent
  2. Generate the spider (a sketch of the generated skeleton follows)
cd .\tencent\
scrapy genspider hr careers.tencent.com
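
The genspider command creates tencent/spiders/hr.py with roughly the skeleton below (the exact template differs slightly between Scrapy versions); the finished spider in step 3 replaces the start_urls/parse stubs with a custom start_requests and two callbacks:

```python
# Approximate skeleton generated by `scrapy genspider hr careers.tencent.com`.
import scrapy


class HrSpider(scrapy.Spider):
    name = "hr"
    allowed_domains = ["careers.tencent.com"]
    start_urls = ["http://careers.tencent.com/"]

    def parse(self, response):
        pass
```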

3. hr.py source code

import scrapy
from urllib import parse
import time
import json
from tencent.items import TencentItem


class HrSpider(scrapy.Spider):
    name = "hr"
    allowed_domains = ["careers.tencent.com"]

    # start_urls = ["http://careers.tencent.com/"]
    hr_url = "https://careers.tencent.com/tencentcareer/api/post/Query"
    data = {
        'timestamp': '',
        'countryId': '',
        'cityId': 2,  # city id filter
        'bgIds': '',
        'productId': '',
        'parentCategoryId': '',
        'attrId': '',
        'keyWord': 'python',  # search keyword
        'pageIndex': 1,  # page number
        'pageSize': 10,  # page size
        'lanugage': 'zh-cn',
        'area': 'cn'
    }

    # build the initial listing request
    def start_requests(self):
        self.data['timestamp'] = int(time.time() * 1000)
        url = self.hr_url + "?" + parse.urlencode(self.data)
        yield scrapy.Request(url, callback=self.parse1)

    def parse1(self, response):
        # print(response.url)
        json_data = json.loads(response.body)
        # proceed only if posts were returned; pagination stops once Posts is no longer a list
        if isinstance(json_data['Data']['Posts'], list):
            for dd in json_data['Data']['Posts']:
                # print("dd:",dd)
                # detail API endpoint for a single post
                desc_url = "https://careers.tencent.com/tencentcareer/api/post/ByPostId"
                params = {
                    'timestamp': int(time.time() * 1000),
                    'postId': dd['PostId'],
                    'language': 'zh-cn'
                }
                durl = desc_url + "?" + parse.urlencode(params)
                yield scrapy.Request(durl, callback=self.parse2)

            # queue the request for the next page of listings
            self.data['pageIndex'] += 1
            self.data['timestamp'] = int(time.time() * 1000)
            url = self.hr_url + "?" + parse.urlencode(self.data)
            yield scrapy.Request(url, callback=self.parse1)

    # def parse(self, response):
    #     pass
    # parse the job detail response
    def parse2(self, response):
        # print("详情:" + response.url)
        json_data = json.loads(response.body)
        item = TencentItem()
        item['PostId'] = json_data['Data']['PostId']
        item['RecruitPostName'] = json_data['Data']['RecruitPostName']
        item['LocationName'] = json_data['Data']['LocationName']
        item['BGName'] = json_data['Data']['BGName']
        item['CategoryName'] = json_data['Data']['CategoryName']
        item['Responsibility'] = json_data['Data']['Responsibility']
        item['Requirement'] = json_data['Data']['Requirement']
        yield item
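
Before relying on the full crawl, it can be worth sanity-checking the Query endpoint outside Scrapy. The following standalone sketch (assuming the requests package is installed and that the API still returns the {"Data": {"Posts": [...]}} shape the spider expects) issues a single listing request with the same parameters the spider builds:

```python
# Standalone sanity check of the listing API used by HrSpider (sketch only).
import time
from urllib import parse

import requests

params = {
    'timestamp': int(time.time() * 1000),
    'cityId': 2,
    'keyWord': 'python',
    'pageIndex': 1,
    'pageSize': 10,
    'lanugage': 'zh-cn',   # parameter name kept exactly as in the spider
    'area': 'cn',
}
url = "https://careers.tencent.com/tencentcareer/api/post/Query?" + parse.urlencode(params)
resp = requests.get(url, timeout=10)
posts = (resp.json().get('Data') or {}).get('Posts') or []
for post in posts:
    print(post.get('PostId'), post.get('RecruitPostName'))
```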

4. items.py source code

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class TencentItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    PostId = scrapy.Field()  # post id
    RecruitPostName = scrapy.Field()  # job title
    LocationName = scrapy.Field()  # work location
    BGName = scrapy.Field()  # business group (department)
    CategoryName = scrapy.Field()  # job category
    Responsibility = scrapy.Field()  # responsibilities
    Requirement = scrapy.Field()  # requirements
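
Scrapy items behave like dictionaries restricted to the declared fields, which is why every column to be stored gets a Field() above. A small illustration (the values are made up):

```python
# TencentItem acts like a dict limited to its declared fields (example values are made up).
from tencent.items import TencentItem

item = TencentItem()
item['PostId'] = '1'
item['RecruitPostName'] = 'Python backend developer'
print(dict(item))        # {'PostId': '1', 'RecruitPostName': 'Python backend developer'}
# item['salary'] = 100   # would raise KeyError: only declared fields can be set
```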

5. pipelines.py source code

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql


class TencentPipeline:
    def process_item(self, item, spider):
        return item


# writes scraped items into MySQL
class MySQLPipeline(object):
    def __init__(self, host, user, password, database, port):
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASS'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        '''Open the database connection when the spider starts.'''
        self.db = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            db=self.database,
            port=self.port,
            charset='utf8',
        )
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        '''Close the database connection when the spider finishes.'''
        self.db.close()

    def process_item(self, item, spider):
        '''Insert one scraped item into the hr table.'''
        sql = "insert into hr(pid,pname,city,bgname,cname,responsibility,requirement) values(%s,%s,%s,%s,%s,%s,%s)"
        self.cursor.execute(sql, (
            item['PostId'], item['RecruitPostName'], item['LocationName'], item['BGName'], item['CategoryName'],
            item['Responsibility'], item['Requirement']))
        self.db.commit()  # commit the transaction
        return item
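
Because pid is the primary key of the hr table (see step 9), re-running the crawl will hit duplicate-key errors with a plain INSERT. A possible alternative for process_item in the MySQLPipeline above (a sketch, not part of the original project) upserts on pid and rolls back on failure so one bad row does not stop the crawl:

```python
    # Sketch of a more defensive process_item for MySQLPipeline:
    # upsert on the pid primary key and roll back on errors.
    def process_item(self, item, spider):
        sql = (
            "INSERT INTO hr(pid, pname, city, bgname, cname, responsibility, requirement) "
            "VALUES (%s, %s, %s, %s, %s, %s, %s) "
            "ON DUPLICATE KEY UPDATE pname=VALUES(pname), city=VALUES(city), "
            "bgname=VALUES(bgname), cname=VALUES(cname), "
            "responsibility=VALUES(responsibility), requirement=VALUES(requirement)"
        )
        try:
            self.cursor.execute(sql, (
                item['PostId'], item['RecruitPostName'], item['LocationName'],
                item['BGName'], item['CategoryName'],
                item['Responsibility'], item['Requirement']))
            self.db.commit()
        except pymysql.MySQLError as exc:
            self.db.rollback()
            spider.logger.error("MySQL write failed for post %s: %s", item['PostId'], exc)
        return item
```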

6. settings.py source code

# Scrapy settings for tencent project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "tencent"

SPIDER_MODULES = ["tencent.spiders"]
NEWSPIDER_MODULE = "tencent.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = "tencent (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    "tencent.middlewares.TencentSpiderMiddleware": 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    "tencent.middlewares.TencentDownloaderMiddleware": 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # "tencent.pipelines.TencentPipeline": 300,
    "tencent.pipelines.MySQLPipeline": 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'yurgqg'
MYSQL_USER = 'root'
MYSQL_PASS = 'root'
MYSQL_PORT = 3306
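
The MYSQL_* values above are examples from the author's environment; adjust them to match your own server, and make sure the yurgqg database and the hr table (step 9) exist before the crawl starts. A quick standalone connectivity check (a sketch using the same credentials) could look like this:

```python
# Standalone check that the MySQL settings above are reachable (adjust values as needed).
import pymysql

conn = pymysql.connect(host='localhost', user='root', password='root',
                       database='yurgqg', port=3306, charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("SELECT VERSION()")
    print(cursor.fetchone())
conn.close()
```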

7. Run the crawl
(screenshots of the command-line output omitted)
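
From the project root (the directory containing scrapy.cfg), the crawl is started with the spider name registered in hr.py:

scrapy crawl hr

To inspect the scraped items without MySQL, an export file can be requested instead, e.g. scrapy crawl hr -O hr.json (the -O overwrite flag needs Scrapy 2.1 or newer).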

8. Database result
(screenshot of the populated hr table omitted)
9. Table creation SQL

CREATE TABLE yurgqg.`hr` (
  `pid` varchar(50) NOT NULL,
  `pname` varchar(255) DEFAULT NULL,
  `city` varchar(32) DEFAULT NULL,
  `bgname` varchar(32) DEFAULT NULL,
  `cname` varchar(32) DEFAULT NULL,
  `responsibility` text DEFAULT NULL,
  `requirement` text DEFAULT NULL,
  PRIMARY KEY (`pid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
