[Python爬虫]使用Scrapy框架爬取糗事百科

最新推荐文章于 2022-11-29 13:38:15 发布

Black_God1

最新推荐文章于 2022-11-29 13:38:15 发布

阅读量579

点赞数

分类专栏：爬虫计算机 python 文章标签： python 爬虫 scrapy

本文链接：https://blog.csdn.net/Black_God1/article/details/82155907

版权

计算机同时被 3 个专栏收录

33 篇文章 0 订阅

订阅专栏

爬虫

26 篇文章 0 订阅

订阅专栏

python

17 篇文章 0 订阅

订阅专栏

启动main.py文件

说明：qiushi 是爬虫名（即 QiushiSpider 类中 name 的值），不是文件名

from scrapy.cmdline import execute

# Run the equivalent of the shell command `scrapy crawl qiushi`.
execute(['scrapy', 'crawl', 'qiushi'])

在spiders文件夹下新建爬虫文件qiushi.py（需要自己创建）

# -*- coding: utf-8 -*-
import scrapy,re
from ..piaot import *       #导包自定义包，作用只能填个报头，手动填也可以
from ..items import qiu_mysql       #导入实例化对象


class QiushiSpider(scrapy.Spider):
    """Spider that crawls posts from qiushibaike.com.

    Request flow:
      1. ``start_requests`` fetches the home page.
      2. ``pq_yeshu`` reads the total page count, asks the user how many
         pages to crawl and requests each listing page.
      3. ``pq_nr`` requests every post linked from a listing page.
      4. ``parse`` turns a post page into a ``qiu_mysql`` item.
    """

    name = 'qiushi'
    allowed_domains = ['qiushibaike.com']

    # Whitespace and punctuation removed from the post text.  A raw string is
    # used so that ``\s``, ``\.`` etc. reach the regex engine without relying
    # on Python's handling of invalid string escapes.
    _NOISE_RE = re.compile(r"[\s+\.\!\/_,$%^*(+\"\')]+|[+——()【】~@#￥%&*]+")

    def _headers(self):
        """Return request headers whose User-Agent is supplied by ``pa()``."""
        return {"User-Agent": pa()}

    def start_requests(self):
        """Yield the initial request for the site's home page."""
        yield scrapy.Request(
            url='http://www.qiushibaike.com',
            headers=self._headers(),
            callback=self.pq_yeshu,
        )

    def pq_yeshu(self, response):
        """Read the page count, prompt for a limit and yield listing requests.

        Nothing is yielded when the page count cannot be read from the
        response.  Invalid or missing console input falls back to crawling
        every available page.
        """
        total_text = response.xpath(
            '//*[@id="content-left"]/ul/li[7]/a/span/text()'
        ).extract_first()

        # extract_first() returns None when the XPath matches nothing.
        if total_text is None:
            self.logger.error('Page count not found on %s', response.url)
            return

        try:
            total = int(total_text.strip())
        except ValueError:
            self.logger.error('Unexpected page count %r on %s',
                              total_text, response.url)
            return

        print('一共有', total, '页')

        # NOTE(review): input() blocks the crawler while waiting for the user.
        try:
            wanted = int(input('请输入页数：'))
        except (ValueError, EOFError):
            self.logger.warning('Invalid page number entered; using %d', total)
            wanted = total

        # Never request more pages than the site reports.
        wanted = min(wanted, total)

        for page in range(1, wanted + 1):
            yield scrapy.Request(
                url='https://www.qiushibaike.com/8hr/page/{}/'.format(page),
                headers=self._headers(),
                callback=self.pq_nr,
            )

    def pq_nr(self, response):
        """Yield one request per post link found on a listing page."""
        hrefs = response.xpath('//a[@class="contentHerf"]/@href').extract()
        for href in hrefs:
            yield scrapy.Request(
                url='http://www.qiushibaike.com' + href,
                headers=self._headers(),
                callback=self.parse,
            )

    def parse(self, response):
        """Build a ``qiu_mysql`` item from a single post page.

        The page is skipped (with a warning) when fewer than two
        ``<i class="number">`` counters are present, because both are
        required to fill the item.
        """
        fragments = response.xpath(
            '//*[@id="single-next-link"]/div/text()'
        ).extract()
        author = response.xpath(
            '//div[@class="author clearfix"]/a[2]/h2/text()'
        ).extract_first()
        counters = response.xpath('//i[@class="number"]/text()').extract()

        if len(counters) < 2:
            self.logger.warning('Skipping %s: counters not found', response.url)
            return

        item = qiu_mysql()
        # First counter is stored as 'haox', second as 'pingj'
        # (presumably likes and comments -- confirm against the page markup).
        item['haox'] = counters[0]
        item['pingj'] = counters[1]
        item['name'] = author
        # Strip whitespace/punctuation from each text fragment and concatenate.
        item['nr'] = ''.join(
            self._NOISE_RE.sub('', fragment) for fragment in fragments
        )

        yield item

items.py文件

# Item definition for one scraped post
class qiu_mysql(scrapy.Item):
    """Container for the four values the spider stores per post."""

    # Text of the <h2> inside the post's author block
    name = scrapy.Field()
    # Second "number" counter on the page
    pingj = scrapy.Field()
    # Post text with whitespace and punctuation removed
    nr = scrapy.Field()
    # First "number" counter on the page
    haox = scrapy.Field()

settings.py（配置文件）

导入自定义包 piaot：它的作用只是提供一个请求报头（User-Agent），不导包、手动填写报头也可以。

from piaot import *

# Whether the crawler obeys the robots.txt rules.
# Changed to False for this project.

ROBOTSTXT_OBEY = False

# 添加默认请求报头（User-Agent）
# User-Agent comes from pa() in the piaot package; a hard-coded value works too.
DEFAULT_REQUEST_HEADERS = {'User-Agent': pa()}

# Enabled item pipelines, each mapped to its priority value.
# The pass-through pipeline stays disabled:
#   'qiushibaike.pipelines.QiushibaikePipeline': 300
ITEM_PIPELINES = {'qiushibaike.pipelines.mysql': 300}

pipelines.py(存储文件)

# -*- coding: utf-8 -*-
import pymysql
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class QiushibaikePipeline:
    """Pass-through pipeline that hands every item back untouched."""

    def process_item(self, item, spider):
        """Return ``item`` unchanged; ``spider`` is not used."""
        return item

# Pipeline that stores scraped items in MySQL
class mysql(object):
    """Item pipeline that inserts each item into the ``xq_qiushi`` table."""

    # %s placeholders let the driver escape the scraped text; building the
    # statement with str.format() broke on quotes and allowed SQL injection.
    _INSERT_SQL = "insert into xq_qiushi values(NULL,%s,%s,%s,%s)"

    def process_item(self, item, spider):
        """Insert one item and pass it on to later pipelines.

        Args:
            item: mapping with the keys 'name', 'nr', 'pingj' and 'haox'.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, so that pipelines running after this one still
            receive it.

        Raises:
            KeyError: if one of the four keys is missing from ``item``.
            Any exception raised by pymysql while connecting or inserting.
        """
        params = (item['name'], item['nr'], item['pingj'], item['haox'])

        # NOTE(review): connection details are hard-coded; consider moving
        # them into the project settings.
        db = pymysql.connect(
            host="192.168.43.128",
            user="root",
            password="123456",
            database="xq",
            charset='utf8',
        )
        try:
            with db.cursor() as cursor:
                # execute() returns the number of affected rows.
                affected = cursor.execute(self._INSERT_SQL, params)
            db.commit()
        finally:
            # Always release the connection, even when the insert fails.
            db.close()

        print("Inserted rows : %s " % affected)
        return item