Scrapy: complete version, downloading all the data

Install Scrapy
pip3 install scrapy
Create a project
scrapy startproject fistBlood
Then, as the startproject output suggests:
cd fistBlood
scrapy genspider first www.xx.com
Run the spider
scrapy crawl spiderName    # run from inside the project directory; here: scrapy crawl first

Straight to the code. First the settings.py changes, including enabling the pipelines:

ROBOTSTXT_OBEY = False

LOG_LEVEL = 'ERROR'

USER_AGENT = '...'  # copy the value from the headers of a real browser request

ITEM_PIPELINES = {
   'secBlood.pipelines.SecbloodPipeline': 300,
   'secBlood.pipelines.mysqlPileLine': 301
}
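The numbers are priorities: lower values run first, so SecbloodPipeline (300) sees each item before mysqlPileLine (301).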

The generated spider file, filled in:

import scrapy
from secBlood.items import SecbloodItem

class SecbSpider(scrapy.Spider):
    name = 'secb'
    # allowed_domains = ['www.xx.com']
    start_urls = ['https://xxx.com/']
    url = 'https://xxx.com/videos/index/%d'
    page_num = 2


    def parse_detail(self, response):
        # the video URL and the large image URL are embedded in an inline
        # <script> block, so extract them with a regex over the script text
        videoUrl = response.xpath('//script[8]/text()').re(r"url: '(.*?)'")[0]
        imgBig = response.xpath('//script[8]/text()').re(r"pic : '(.*?)'")[0]
        # recover the partially filled item passed along via the request meta
        item = response.meta['item']
        item['videoUrl'] = videoUrl
        item['imgBig'] = imgBig

        yield item

    def parse(self, response):
        div_list = response.xpath('//div[@class="col-xlg videos-item"]/div[@node-type="video"]')
        prefixUrl = response.xpath('//link[@rel="canonical"]/@href')[0].extract()
        for tar in div_list:
            item = SecbloodItem()

            detailUrl = prefixUrl + tar.xpath('./div[1]/a/@href')[0].extract()
            title = tar.xpath('./div[1]/a/@alt')[0].extract()
            imgsmall = tar.xpath('./div[1]/a/img/@src')[0].extract()
            videoTime = tar.xpath('./div[1]/a/span[@class="video-duration"]/text()')[0].extract()
            numPerson = tar.xpath('./div[2]/div[2]/span[1]/text()')[0].extract()

            item['detailUrl'] = detailUrl
            item['title'] = title
            item['imgsmall'] = imgsmall
            item['videoTime'] = videoTime
            item['numPerson'] = numPerson

            # follow the detail page; pass the item along in meta so
            # parse_detail can finish filling it in
            yield scrapy.Request(detailUrl, callback=self.parse_detail, meta={'item': item})
        # pagination: also request pages 2 and 3, parsed by this same method
        if self.page_num <= 3:
            new_url = self.url % self.page_num
            self.page_num += 1
            yield scrapy.Request(url=new_url, callback=self.parse)
        print('listing page parsed')  # fires once per listing page
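As an aside: on Scrapy 1.7+ the same hand-off can use cb_kwargs instead of meta, which keeps the item out of the request metadata. A minimal sketch (the class name and placeholder item are mine, not from the original spider):

import scrapy

class SecbKwargsSpider(scrapy.Spider):
    name = 'secb_kwargs'
    start_urls = ['https://xxx.com/']

    def parse(self, response):
        item = {'title': 'example'}  # stand-in for the SecbloodItem built above
        # cb_kwargs (Scrapy 1.7+) delivers the item as a callback argument
        yield scrapy.Request(response.url, callback=self.parse_detail,
                             cb_kwargs={'item': item}, dont_filter=True)

    def parse_detail(self, response, item):
        # item arrives directly as a named parameter, no response.meta needed
        yield item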

XPath syntax notes

xpath('//a')    # all a tags (at any depth)
xpath('//a[2]')        # all a tags, pick the second by index

xpath('//a[@id]')    # all a tags that have an id attribute
xpath('//a[@id="i1"]')        # all a tags with id="i1"
xpath('//a[@href="link.html"][@id="i1"]')    # all a tags with href="link.html" AND id="i1"

xpath('//a[contains(@href, "link")]')    # all a tags whose href contains "link"
xpath('//a[starts-with(@href, "link")]')    # all a tags whose href starts with "link"
xpath('//a[re:test(@id, "i\d+")]')        # all a tags whose id matches the regex "i\d+"

xpath('//a[re:test(@id, "i\d+")]/text()').extract()        # same selection, take the text
xpath('//a[re:test(@id, "i\d+")]/@href').extract()        # same selection, take the href attribute

xpath('/html/body/ul/li/a/@href').extract()        # take all matching values
xpath('//body/ul/li/a/@href').extract_first()    # take only the first value
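These selectors can be tried outside a spider with Scrapy's Selector on a snippet of HTML. A minimal sketch (the sample markup is invented for illustration):

from scrapy.selector import Selector

html = ('<ul>'
        '<li><a id="i1" href="link.html">first</a></li>'
        '<li><a id="i2" href="llink.html">second</a></li>'
        '</ul>')
sel = Selector(text=html)

print(sel.xpath('//a[@id="i1"]/@href').extract())                            # ['link.html']
print(sel.xpath('//a[starts-with(@href, "link")]/text()').extract_first())   # 'first'
print(sel.xpath(r'//a[re:test(@id, "i\d+")]/text()').extract())              # ['first', 'second']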

items.py

import scrapy


class SecbloodItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    detailUrl = scrapy.Field()
    title = scrapy.Field()
    imgsmall = scrapy.Field()
    videoTime = scrapy.Field()
    numPerson = scrapy.Field()
    videoUrl = scrapy.Field()
    imgBig = scrapy.Field()

pipelines.py (to inspect the data, simply print the item):

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import os.path
import sqlite3

class SecbloodPipeline:
    def process_item(self, item, spider):
        # print(item)  # uncomment to dump each item to the console
        return item



class mysqlPileLine(object):
    # note: despite the name, this version writes to sqlite3; the name is
    # kept unchanged so the ITEM_PIPELINES entry above still matches
    conn = None
    cur = None

    def open_spider(self, spider):
        # open second.db next to this file; the table must already exist
        BASE_DIR = os.path.dirname(os.path.abspath(__file__))
        db_path = os.path.join(BASE_DIR, "second.db")
        self.conn = sqlite3.connect(db_path)

    def process_item(self, item, spider):
        self.cur = self.conn.cursor()
        # NULL fills the auto-increment id column; the rest are parameterized
        insert_sql = 'insert into secondAqd values (NULL, ?, ?, ?, ?, ?, ?, ?)'
        self.cur.execute(insert_sql, (item['detailUrl'], item['title'], item['imgBig'], item['videoUrl'],
                                      item['imgsmall'], item['videoTime'], item['numPerson']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        if self.cur is not None:
            self.cur.close()
        self.conn.close()



# MySQL variant, kept for reference (needs `import pymysql` at the top of the file):
# class mysqlPileLine(object):
#     conn = None
#     cur = None
#     def open_spider(self, spider):
#         self.conn = pymysql.Connect(host='127.0.0.1',port=3306,user='root',password='123456',db='course',charset='utf8')
#
#     def process_item(self, item, spider):
#         self.cur = self.conn.cursor()
#         insert_sql = 'insert into testav values (NULL ,"%s" , "%s" , "%s" , "%s" , "%s")'%(item['detailUrl'], item['title'], item['imgsmall'], item['videoTime'], item['numPerson'])
#         try:
#             self.cur.execute(insert_sql)
#             self.conn.commit()
#         except Exception as e:
#             print(e)
#             self.conn.rollback()
#         return item
#
#     def close_spider(self, spider):
#         self.cur.close()
#         self.conn.close()
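Note that the commented MySQL version interpolates values straight into the SQL string; pymysql's execute() also accepts parameters, which escapes each value safely. A minimal standalone sketch under the same assumed schema (database course, table testav with an auto-increment id plus five text columns):

import pymysql

# connection values match the commented pipeline above; adjust as needed
conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                       password='123456', db='course', charset='utf8')
cur = conn.cursor()
# %s placeholders are filled in by pymysql, which escapes each value
insert_sql = 'insert into testav values (NULL, %s, %s, %s, %s, %s)'
cur.execute(insert_sql, ('detailUrl', 'title', 'imgsmall', '12:34', '100'))
conn.commit()
cur.close()
conn.close()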

Summary:

After copy-pasting and a few small edits this can be reused to scrape another site's data; the remaining pitfalls are all database-connection details:

The sqlite3 database and its table (name and columns) must be created in advance; the NULL in the INSERT fills the extra auto-increment id column.
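A minimal sketch of creating that table up front (run once; the column names are assumptions mirroring the item fields in the INSERT order above):

import sqlite3

conn = sqlite3.connect('second.db')
conn.execute('''CREATE TABLE IF NOT EXISTS secondAqd (
    id INTEGER PRIMARY KEY AUTOINCREMENT,  -- filled by the NULL in the INSERT
    detailUrl TEXT, title TEXT, imgBig TEXT, videoUrl TEXT,
    imgsmall TEXT, videoTime TEXT, numPerson TEXT)''')
conn.commit()
conn.close()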

For MySQL, the server must be running and a test connection must succeed before use; specify the database when connecting, and create the table and its columns beforehand.
