1、jobbole.py implements the crawling strategy.
2、settings.py configures the item pipelines, image downloading, whether to obey the robots.txt protocol, the database connection, and so on.
3、pipelines.py implements the data-storage logic.
4、The pages were originally parsed with XPath, but inside the per-item loop every iteration returned the first entry: an XPath expression starting with // is evaluated against the whole document rather than the current selector (it would need the relative .// form). Switching this part to CSS selectors fixed it; see the sketch after this list.
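A minimal illustration of point 4 (using a made-up two-item HTML snippet, not the real page): inside a selector loop an XPath that starts with // is evaluated against the whole document, while the relative .// form or a CSS selector stays scoped to the current block.

from scrapy.selector import Selector

html = ('<div id="news_list">'
        '<div class="news_block"><h2><a href="/n/1/">first</a></h2></div>'
        '<div class="news_block"><h2><a href="/n/2/">second</a></h2></div>'
        '</div>')
sel = Selector(text=html)
for block in sel.css('#news_list .news_block'):
    print(block.xpath('//h2/a/@href').extract_first())    # always '/n/1/': searches the whole document
    print(block.xpath('.//h2/a/@href').extract_first())   # '/n/1/' then '/n/2/': relative to the block
    print(block.css('h2 a::attr(href)').extract_first())  # same scoped result as the .// form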
一、jobbole.py (crawling strategy)
# -*- coding: utf-8 -*-
import json
import os
import re
import sys
from urllib import parse
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils.utils import Util
from items import JobbolespiderItem, ArticleItemLoader
import scrapy
from scrapy import Request
from scrapy.loader import ItemLoader
class JobboleSpider(scrapy.Spider):
name = 'jobbole'
allowed_domains = ['news.cnblogs.com']
start_urls = ['http://news.cnblogs.com/']
# def parse(self, response):
# jobbolespiderItem = JobbolespiderItem()
# jobbolespiderItem['front_image_url'] = ['https://images2018.cnblogs.com/news_topic/20180515154619133-1755088138.png']
# yield jobbolespiderItem
def parse(self, response):
item_selecters = response.css('#news_list .news_block')
# item_selecters = response.xpath('//div[@id="news_list"]/div[@class="news_block"]')
for item_selecter in item_selecters:
            # An XPath starting with // would search the whole document here, so CSS selectors are used inside the loop
print(item_selecter.extract())
front_image_url = item_selecter.css('.entry_summary a img::attr(src)').extract_first('')
if front_image_url.startswith('//'):
front_image_url = 'https:' + front_image_url
url = item_selecter.css('h2 a::attr(href)').extract_first("")
# front_image_url = item_selecter.xpath('//div[@class="entry_summary"]/a/img/@src').extract_first('')
# url = item_selecter.xpath('//div[@class="content"]/h2/a/@href').extract_first('')
            # Request the detail page
print(url)
yield Request(parse.urljoin(response.url, url), meta={"front_image_url": front_image_url},
callback=self.parse_detail)
        last_text = response.xpath('//div[@class="pager"]/a[last()]/text()').extract_first('')
        if last_text == 'Next >':
            # Request the next page
next_url = response.xpath('//div[@class="pager"]/a[last()]/@href').extract_first('')
yield Request(parse.urljoin(response.url, next_url), callback=self.parse)
def parse_detail(self, response):
# jobbolespiderItem = JobbolespiderItem()
# if response.meta.get('front_image_url'):
# jobbolespiderItem['front_image_url'] = [parse.urljoin(response.url, response.meta.get('front_image_url'))]
# else:
# jobbolespiderItem['front_image_url'] = []
# if response.xpath('//div[@id="news_title"]/a/text()').extract_first(''):
# jobbolespiderItem['title'] = response.xpath('//div[@id="news_title"]/a/text()').extract_first('')
# else:
# jobbolespiderItem['title'] = ''
# if response.xpath('//div[@id="news_info"]/span[@class="time"]/text()').extract_first(''):
# create_date_content = response.xpath('//div[@id="news_info"]/span[@class="time"]/text()').extract_first('')
# else:
# create_date_content = ''
# print(create_date_content)
# try:
# if re.match(r'发布于 (.*)', create_date_content).group(1):
# jobbolespiderItem['create_date'] = re.match(r'发布于 (.*)', create_date_content).group(1)
# else:
# jobbolespiderItem['create_date'] = '1970-01-01'
# except Exception as e:
# print(e)
# jobbolespiderItem['content'] = response.xpath('//div[@id="news_content"]/div[@id="news_body"]').extract_first(
# '')
# tag_list = response.xpath('//div[@class="news_tags"]/a/text()').extract()
# jobbolespiderItem['tags'] = ','.join(tag_list)
        # https://news.cnblogs.com/NewsAjax/GetPreNewsById?contentId=665930
# print(response.url)
# url_new = parse.urljoin(response.url, '/NewsAjax/GetAjaxNewsInfo?contentId={}'.format(id))
item_loader = ArticleItemLoader(item=JobbolespiderItem(),response=response)
item_loader.add_xpath('title','//div[@id="news_title"]/a/text()')
item_loader.add_xpath('content','//div[@id="news_content"]/div[@id="news_body"]')
item_loader.add_xpath('tags','//div[@class="news_tags"]/a/text()')
item_loader.add_xpath('create_date','//div[@id="news_info"]/span[@class="time"]/text()')
item_loader.add_value("url", response.url)
if response.meta.get('front_image_url'):
item_loader.add_value('front_image_url',parse.urljoin(response.url, response.meta.get('front_image_url')))
article_item = item_loader.load_item()
        news_id = re.match(r'.*?(\d+)', response.url).group(1)
        yield Request(parse.urljoin(response.url, '/NewsAjax/GetAjaxNewsInfo?contentId={}'.format(news_id)),
                      callback=self.parse_nums, meta={'article_item': article_item})
def parse_nums(self, response):
jobbolespiderItem = response.meta.get('article_item')
if jobbolespiderItem.get('front_image_url'):
jobbolespiderItem['image_url_id'] = Util().trans_md5(jobbolespiderItem.get('front_image_url')[0])
r_json = json.loads(response.text)
jobbolespiderItem['content_id'] = r_json.get('ContentID')
jobbolespiderItem['comment_count'] = r_json.get('CommentCount')
jobbolespiderItem['total_view'] = r_json.get('TotalView')
jobbolespiderItem['digg_count'] = r_json.get('DiggCount')
jobbolespiderItem['bury_count'] = r_json.get('BuryCount')
yield jobbolespiderItem
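jobbole.py also imports Util from utils/utils.py, which is not shown in this post. Judging from how it is used above (turning the first image URL into image_url_id), trans_md5 is presumably just an MD5 digest of the string; a minimal sketch of such a helper, as an assumption rather than the real utils.py, could be:

import hashlib

class Util:
    def trans_md5(self, value):
        # Hash an arbitrary string (e.g. an image URL) into a fixed-length hex id.
        # Guessed behaviour only; the project's actual utils.py may differ.
        if isinstance(value, str):
            value = value.encode('utf-8')
        return hashlib.md5(value).hexdigest()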
二、settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for AricleSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import os
BOT_NAME = 'AricleSpider'
SPIDER_MODULES = ['AricleSpider.spiders']
NEWSPIDER_MODULE = 'AricleSpider.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'AricleSpider (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# }
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'AricleSpider.middlewares.AriclespiderSpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'AricleSpider.middlewares.AriclespiderDownloaderMiddleware': 543,
# }
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'AricleSpider.pipelines.AricleImagePipeline': 1,
# 'AricleSpider.pipelines.AricleSaveJsonPipeline': 2,
# 'AricleSpider.pipelines.AricleSaveDBPipeline': 3,
'AricleSpider.pipelines.MysqlTwistedPipeline': 4,
# 'AricleSpider.pipelines.AriclespiderPipeline': 300
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# Image download configuration
IMAGES_URLS_FIELD = 'front_image_url'  # item field that holds the image URLs to download
img_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'images')
print(img_path)
IMAGES_STORE = img_path
MYSQL_HOST = "127.0.0.1"
MYSQL_DBNAME = "jobbole"
MYSQL_USER = "root"
MYSQL_PASSWORD = "root"
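Before starting the crawl it is worth making sure the MYSQL_* values above actually connect; a quick standalone check with pymysql (which pipelines.py already uses) might look like this, assuming the jobbole database already exists:

import pymysql

conn = pymysql.connect(host="127.0.0.1", user="root", password="root",
                       database="jobbole", charset="utf8")
try:
    with conn.cursor() as cursor:
        cursor.execute("SELECT VERSION()")
        print(cursor.fetchone())
finally:
    conn.close()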
三、pipelines.py (data storage)
# -*- coding: utf-8 -*-
import json
import os
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
from scrapy.pipelines.images import ImagesPipeline
from twisted.enterprise import adbapi
class AriclespiderPipeline(object):
def process_item(self, item, spider):
return item
class AricleImagePipeline(ImagesPipeline):
    # Record each downloaded image's local storage path on the item
def item_completed(self, results, item, info):
if 'front_image_url' in item:
for ok, value in results:
                # value holds the image url and its local storage path
image_file_path = value["path"]
item["front_image_path"] = image_file_path
return item
class AricleSaveJsonPipeline(object):
    # Save items to a local JSON file
def __init__(self):
path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'ceshi.json')
self.f = open(path, "a", encoding='utf-8')
def process_item(self, item, spider):
        item_json = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.f.write(item_json)
return item
class AricleSaveDBPipeline(object):
    # Save items to MySQL (synchronous, blocking insert)
def __init__(self):
        # Connect to the database
self.conn = pymysql.connect(
host="127.0.0.1",
user="root", password="root",
database="article_spider",
charset="utf8")
        # Get a cursor object for executing SQL statements
        self.cursor = self.conn.cursor()  # result sets are returned as tuples by default
def process_item(self, item, spider):
        sql = '''
        INSERT INTO jobbole_article
        (front_image_url, create_date, image_url_id, title, content, tags, content_id, comment_count, total_view, digg_count, bury_count)
        VALUES
        ("{}","{}","{}","{}",'{}',"{}",{},{},{},{},{}) ON DUPLICATE KEY UPDATE bury_count=VALUES(bury_count);
        '''
sql = sql.format(','.join(item.get('front_image_url')),
item.get('create_date'),
item.get('image_url_id'),
item.get('title'),
item.get('content'),
item.get('tags'),
item.get('content_id'),
item.get('comment_count'),
item.get('total_view'),
item.get('digg_count'),
item.get('bury_count'))
print(sql)
try:
self.cursor.execute(sql)
self.conn.commit()
except Exception as e:
print('=====error:{}'.format(e))
return item
class MysqlTwistedPipeline:
    # Execute SQL asynchronously through Twisted's adbapi connection pool
def __init__(self, dbpool):
self.dbpool = dbpool
    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handler_error, item, spider)
        return item
def do_insert(self, cursor, item):
sql = '''
INSERT INTO jobbole_article
(front_image_url, create_date,image_url_id,title,content,tags,content_id,comment_count,total_view,digg_count,bury_count,url_object_id)
VALUES
("{}","{}","{}","{}",'{}',"{}",{},{},{},{},{},'0') on DUPLICATE KEY UPDATE title=VALUES(title),tags=VALUES(tags),content=VALUES(content);
'''
sql = sql.format(','.join(item.get('front_image_url','')),
item.get('create_date',''),
item.get('image_url_id',''),
item.get('title',''),
# 'xxx',
item.get('content',''),
# '大象汽车',
item.get('tags',''),
item.get('content_id'),
item.get('comment_count'),
item.get('total_view'),
item.get('digg_count'),
item.get('bury_count'))
print(sql)
cursor.execute(sql)
def handler_error(self, failure, item, spider):
print(failure)
@classmethod
def from_settings(cls, settings):
from MySQLdb.cursors import DictCursor
dbparms = dict(
host=settings["MYSQL_HOST"],
db=settings["MYSQL_DBNAME"],
user=settings["MYSQL_USER"],
passwd=settings["MYSQL_PASSWORD"],
charset='utf8',
cursorclass=DictCursor,
use_unicode=True,
)
dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
return cls(dbpool)
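One caveat with both MySQL pipelines above: building the statement with str.format breaks as soon as a title or the article body contains quote characters, and it is open to SQL injection. A safer variant of do_insert (same columns, sketched here rather than taken from the project) passes the values as query parameters and lets the MySQLdb/pymysql driver do the escaping:

    def do_insert(self, cursor, item):
        # Parameterized version: the driver escapes the values itself
        insert_sql = '''
            INSERT INTO jobbole_article
            (front_image_url, create_date, image_url_id, title, content, tags,
             content_id, comment_count, total_view, digg_count, bury_count, url_object_id)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE title=VALUES(title), tags=VALUES(tags), content=VALUES(content)
        '''
        params = (
            ','.join(item.get('front_image_url', [])),
            item.get('create_date', ''),
            item.get('image_url_id', ''),
            item.get('title', ''),
            item.get('content', ''),
            item.get('tags', ''),
            item.get('content_id'),
            item.get('comment_count'),
            item.get('total_view'),
            item.get('digg_count'),
            item.get('bury_count'),
            '0',
        )
        cursor.execute(insert_sql, params)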
四、items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import re
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose, Identity, Join
class AriclespiderItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
class ArticleItemLoader(ItemLoader):
default_output_processor = TakeFirst()
def add_test(value):
return value + '++'
def add_jobbole(value):
return value + '--'
def date_convert(value):
    # Extract the date from strings like "发布于 <date>"; fall back if the pattern does not match
    match_re = re.match(r'发布于 (.*)', value)
    if match_re:
        return match_re.group(1)
    return '1970-01-01'
class JobbolespiderItem(scrapy.Item):
title = scrapy.Field(
# input_processor = MapCompose(add_jobbole,add_test),
# output_processor=TakeFirst()
)
# create_date_content = scrapy.Field()
create_date = scrapy.Field(
input_processor= MapCompose(date_convert),
output_processor=TakeFirst()
)
content = scrapy.Field()
tags = scrapy.Field(
output_processor =Join(separator=',')
)
url = scrapy.Field()
front_image_url = scrapy.Field(
output_processor=Identity()
)
image_url_id = scrapy.Field()
front_image_path = scrapy.Field()
content_id = scrapy.Field()
comment_count = scrapy.Field()
total_view = scrapy.Field()
digg_count = scrapy.Field()
bury_count = scrapy.Field()
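For reference, the create_date processors behave like this: MapCompose(date_convert) runs date_convert over every extracted value and TakeFirst keeps the first non-empty result, so a raw string in the site's "发布于 ..." format is reduced to just the date. A quick check (the sample string is made up; run it where items.py is importable):

from scrapy.loader.processors import MapCompose, TakeFirst
from items import date_convert  # same import style as jobbole.py

values = MapCompose(date_convert)(['发布于 2020-05-15 10:30'])
print(values)               # ['2020-05-15 10:30']
print(TakeFirst()(values))  # '2020-05-15 10:30'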