Crawling 51sxue.com (51搜学网) with Python

Copyright notice: This is an original article by the author; do not reproduce it without permission. https://blog.csdn.net/u012977315/article/details/86005989

Foreword

This article is intended for technical study only; do not use it for any other purpose.

Project Overview

This article uses the Scrapy framework to crawl school data from 51sxue.com in order to analyze how schools are distributed across each province.

About the Scrapy Framework

Scrapy is an application framework written in pure Python for crawling websites and extracting structured data, and it is used for a wide range of purposes.

That is the power of a framework: a user only needs to customize a few modules to easily build a crawler that scrapes web pages and all kinds of images.

Scrapy uses the Twisted (pronounced ['twɪstɪd]) asynchronous networking framework to handle network communication, which speeds up downloads without requiring you to implement the asynchrony yourself, and it provides a variety of middleware interfaces so you can flexibly meet all kinds of requirements.

Official Scrapy documentation: http://doc.scrapy.org/en/latest

Scrapy Chinese documentation site: http://scrapy-chs.readthedocs...

A Quick Tour of Using Scrapy

  1. Create a project (scrapy startproject xxx): set up a new crawler project.
  2. Define the targets (edit items.py): declare the data you want to scrape.
  3. Build the spider (spiders/xxspider.py): write the spider that crawls the pages.
  4. Store the content (pipelines.py): design a pipeline that stores the scraped data.

Creating the Project

scrapy startproject schoolSpider

Open any project in the PyCharm IDE, run the command above in its terminal, and then open the newly generated project. (Note: the project name must be schoolSpider to match the schoolSpider package imported in the code below.)
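For reference, scrapy startproject generates the standard Scrapy skeleton shown below; the next sections fill in items.py, a spider module, pipelines.py, and settings.py.

schoolSpider/
    scrapy.cfg                # deploy configuration
    schoolSpider/
        __init__.py
        items.py              # item field definitions
        middlewares.py        # spider/downloader middlewares (unused here)
        pipelines.py          # item pipelines
        settings.py           # project settings
        spiders/
            __init__.py       # school_spider.py will be added here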

Defining the Data Fields to Crawl

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class SchoolspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # school name
    schoolName = scrapy.Field()
    # school type, e.g. middle school or primary school
    schoolType = scrapy.Field()
    # region
    schoolRegion = scrapy.Field()
    # school attribute, e.g. national key school
    schoolAttribute = scrapy.Field()
    # school nature, e.g. public
    schoolProperty = scrapy.Field()
    # school address
    schoolAddress = scrapy.Field()
    # school contact number
    schoolMobile = scrapy.Field()
    # province
    schoolProvince = scrapy.Field()
    # city
    schoolCity = scrapy.Field()
    # district
    schoolArea = scrapy.Field()
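A scrapy.Item behaves like a dict, which is exactly how the spider below fills it in. A minimal illustrative snippet (the sample value is hypothetical):

from schoolSpider.items import SchoolspiderItem

item = SchoolspiderItem()
item['schoolName'] = [u'示例中学']  # hypothetical sample value
print(item['schoolName'])
print(list(item.values()))  # the Excel pipeline later relies on this dict-style view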

Writing the Core Crawling Logic

Create a Python file named school_spider under the spiders package and add the following code.

# -*- coding: utf-8 -*-

import sys

import scrapy

from schoolSpider.items import SchoolspiderItem


class SchoolSpider(scrapy.Spider):
    # Uncomment the two lines below when running under Python 2.7
    # reload(sys)
    # sys.setdefaultencoding("utf-8")

    name = "schoolSpider"
    # URL to start crawling from
    start_urls = [
        "http://xuexiao.51sxue.com/slist/?o=&t=3&areaCodeS=&level=&sp=&score=&order=&areaS=%B8%A3%BD%A8%CA%A1&searchKey="
    ]

    def parse(self, response):
        # Crawl the list pages, from page 1 through page 1334
        for index in range(1, 1335):
            url = "http://xuexiao.51sxue.com/slist/?o=&t=3&areaCodeS=&level=&sp=&score=&order=&areaS=%B8%A3%BD%A8%CA%A1&searchKey=&page=" + str(
                index)
            yield scrapy.Request(url=url, callback=self.parse_item)

    def parse_item(self, response):
        # Select the list of school entries on the page
        schoolSelector = response.xpath("//div[@class='school_main']/div")
        # Fill one item per entry
        for sub in schoolSelector:
            # Create a fresh item for every school; a single shared item
            # would leak field values from one school into the next
            school = SchoolspiderItem()
            school['schoolName'] = sub.xpath(
                './div[@class = "school_t_con"]/div[@class="school_m_main fl"]/li/h3/a/text()').extract()
            school['schoolType'] = sub.xpath(
                './div[@class = "school_t_con"]/div[@class="school_m_main fl"]/li[4]/ol[2]/b/text()').extract()
            school['schoolAttribute'] = sub.xpath(
                './div[@class = "school_t_con"]/div[@class="school_m_main fl"]/li[3]/b/text()').extract()
            school['schoolProperty'] = sub.xpath(
                './div[@class = "school_t_con"]/div[@class="school_m_main fl"]/li[4]/ol[1]/b/text()').extract()
            school['schoolAddress'] = sub.xpath('./ul/li[1]/b/text()').extract()
            school['schoolMobile'] = sub.xpath('./ul/li[2]/b/text()').extract()
            # Extract the raw province/city/district text
            schoolAddress = sub.xpath(
                './div[@class = "school_t_con"]/div[@class="school_m_main fl"]/li[2]/b/text()').extract()
            # Default to empty strings so missing province/city/district
            # data never carries over from the previous school
            school['schoolProvince'] = ''
            school['schoolCity'] = ''
            school['schoolArea'] = ''
            for addr_text in schoolAddress:
                for address in addr_text.split(" "):
                    if '省' in address:
                        school['schoolProvince'] = address
                    if '市' in address:
                        school['schoolCity'] = address
                    if '区' in address:
                        school['schoolArea'] = address
            # Only send the item on to the pipelines if a school name was found
            if len(school['schoolName']) != 0:
                yield school
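With the spider written, it can be launched from the project root with Scrapy's crawl command, using the name defined in the spider's name attribute:

scrapy crawl schoolSpider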

Writing the Data-Processing Pipeline

After the Scrapy framework scrapes data, it sends the items to the pipelines. You can also register additional pipelines; the relevant settings are covered in the configuration section below.

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import xlwt


class SchoolspiderPipeline(object):

    def __init__(self):
        """Declare the workbook, worksheet, and row counter."""
        self.workbook = None
        self.worksheet = None
        self.row = 0

    def open_spider(self, spider):
        """Create the workbook and write the header row when the spider starts."""
        self.workbook = xlwt.Workbook(encoding="utf-8")
        self.worksheet = self.workbook.add_sheet("sheet", cell_overwrite_ok=True)
        self.worksheet.write(0, 0, u'学校性质')
        self.worksheet.write(0, 1, u'学校所在区')
        self.worksheet.write(0, 2, u'学校类型')
        self.worksheet.write(0, 3, u'学校名称')
        self.worksheet.write(0, 4, u'学校地址')
        self.worksheet.write(0, 5, u'学校属性')
        self.worksheet.write(0, 6, u'学校所在省')
        self.worksheet.write(0, 7, u'学校所在市')
        self.worksheet.write(0, 8, u'学校联系方式')
        self.row = 1

    def process_item(self, item, spider):
        """Write each item's field values into the next Excel row."""
        school_list = list(item.values())
        self.write_in_excel(school_list)
        self.row = self.row + 1
        return item

    def close_spider(self, spider):
        """Save the workbook when the spider finishes."""
        self.workbook.save('51搜学网.xls')

    def write_in_excel(self, crawl_list):
        """Write one row of crawled values, one value per column."""
        for col in range(len(crawl_list)):
            self.worksheet.write(self.row, col, crawl_list[col])
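To sanity-check the result, the saved workbook can be read back with the xlrd package (a separate install; this snippet is only an illustrative check, not part of the project):

import xlrd

workbook = xlrd.open_workbook('51搜学网.xls')
sheet = workbook.sheet_by_index(0)
print(sheet.nrows)          # total rows written, header included
print(sheet.row_values(0))  # the header row
print(sheet.row_values(1))  # the first crawled school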

Configuring the Project

In the settings file, uncomment the following entries or add them if they are missing.

# -*- coding: utf-8 -*-

# Scrapy settings for schoolSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'schoolSpider'

SPIDER_MODULES = ['schoolSpider.spiders']
NEWSPIDER_MODULE = 'schoolSpider.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'


COOKIES_ENABLED = False

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'schoolSpider.middlewares.SchoolspiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'schoolSpider.middlewares.SchoolspiderDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'schoolSpider.pipelines.SchoolspiderPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
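Since the spider issues more than 1300 page requests, it is worth throttling it to stay polite to the site. The commented-out settings above can be enabled for this; for example (the values below are only a suggestion, not part of the original project):

DOWNLOAD_DELAY = 1
AUTOTHROTTLE_ENABLED = True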

Link to the source code
