items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class MkwItem(scrapy.Item):
    link = scrapy.Field()   # course image URL
    type = scrapy.Field()   # category labels
    title = scrapy.Field()  # title
    level = scrapy.Field()  # difficulty level
    num = scrapy.Field()    # number of students
    info = scrapy.Field()   # short description
    price = scrapy.Field()  # price
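An MkwItem behaves like a dict, so fields can be set and read by key. A quick sketch (the field values below are made up for illustration):

from mkw.items import MkwItem  # assuming the project package is named mkw

item = MkwItem()
item['title'] = 'Example Course'  # set a declared field
item['price'] = 'Free'
print(item['title'])              # read it back like a dict
print(dict(item))                 # convert to a plain dict if needed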
spiders/**.py
# -*- coding: utf-8 -*-
import scrapy

from .. import items


class MukeSpider(scrapy.Spider):
    name = 'muke'
    allowed_domains = ['imooc.com']
    start_urls = ['http://www.imooc.com/course/list/']

    def parse(self, response):
        courses = response.xpath('//*[@id="main"]/div[2]/div[2]/div[1]/div/div')
        for i in range(len(courses)):
            course = response.xpath('//*[@id="main"]/div[2]/div[2]/div[1]/div/div[{}]'.format(i + 1))
            item = items.MkwItem()  # create a fresh item for every course
            item['link'] = course.xpath('.//a/div[1]/img/@data-original').extract()[0]
            # concatenate all category labels, separated by '+'
            type_l = ''
            for label in course.xpath('.//a/div[1]/div/label/text()').extract():
                type_l = type_l + label + '+'
            item['type'] = type_l
            item['title'] = course.xpath('.//a/div[2]/h3/text()').extract()[0]
            item['level'] = course.xpath('.//a/div[2]/div/div[1]/span[1]/text()').extract()[0]
            item['num'] = course.xpath('.//a/div[2]/div/div[1]/span[2]/text()').extract()[0]
            item['info'] = course.xpath('.//a/div[2]/div/p/text()').extract()[0]
            item['price'] = course.xpath('.//a/div[2]/div/div[2]/div/span/text()').extract()[0]
            yield item
        # '下一页' is the link text of the "next page" button
        url = response.xpath("//*[@id=\"main\"]/div[2]/div[2]/div[2]/a[text()='下一页']/@href").extract()
        if url:
            # build the absolute URL of the next page
            page = 'http://www.imooc.com' + url[0]
            # schedule the next page with the same callback
            yield scrapy.Request(page, callback=self.parse)
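From the project root the spider would normally be started with scrapy crawl muke. An equivalent way to launch it from a Python script, sketched with Scrapy's CrawlerProcess:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # loads settings.py
process.crawl('muke')   # spider name as defined in MukeSpider.name
process.start()         # blocks until the crawl finishes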
pipelines.py
# -*- coding: utf-8 -*-
import pymysql


class MkwPipeline(object):
    def __init__(self):
        # the last three arguments are the database user, password and database name
        self.connect = pymysql.connect(host='88.88.88.88', user='8888', passwd='8888',
                                       db='8888')
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        # SQL statement
        insert_sql = "insert into mkw(link, type, title, level, num, info, price) VALUES (%s, %s, %s, %s, %s, %s, %s)"
        # run the insert with parameterized values
        self.cursor.execute(insert_sql, (
            item['link'], item['type'], item['title'], item['level'], item['num'], item['info'], item['price']))
        # commit, otherwise the rows are not persisted
        self.connect.commit()
        return item

    def close_spider(self, spider):
        # close the cursor and the connection
        self.cursor.close()
        self.connect.close()
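The pipeline assumes a table named mkw already exists. The original does not show the DDL, so the column types below are guesses; a sketch that creates a matching table with pymysql:

import pymysql

conn = pymysql.connect(host='88.88.88.88', user='8888', passwd='8888', db='8888')
with conn.cursor() as cursor:
    # column types are assumptions; everything scraped is stored as text here
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS mkw (
            id INT AUTO_INCREMENT PRIMARY KEY,
            link VARCHAR(255),
            type VARCHAR(255),
            title VARCHAR(255),
            level VARCHAR(64),
            num VARCHAR(64),
            info TEXT,
            price VARCHAR(64)
        ) DEFAULT CHARSET=utf8mb4
    """)
conn.commit()
conn.close()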
settings.py
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
ROBOTSTXT_OBEY = False
CONCURRENT_REQUESTS = 32
DOWNLOAD_DELAY = 0.1
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
ITEM_PIPELINES = {
    'mkw.pipelines.MkwPipeline': 300,
}
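The connection details hard-coded in MkwPipeline could instead live in settings.py and be read through Scrapy's from_crawler hook. A sketch of that variant; the MYSQL_* setting names are invented here, not part of the original project:

# In settings.py (assumed names):
# MYSQL_HOST = '88.88.88.88'
# MYSQL_USER = '8888'
# MYSQL_PASSWD = '8888'
# MYSQL_DB = '8888'

import pymysql


class MkwPipeline(object):
    def __init__(self, host, user, passwd, db):
        self.connect = pymysql.connect(host=host, user=user, passwd=passwd, db=db)
        self.cursor = self.connect.cursor()

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this with the running crawler, which exposes the settings
        s = crawler.settings
        return cls(s.get('MYSQL_HOST'), s.get('MYSQL_USER'),
                   s.get('MYSQL_PASSWD'), s.get('MYSQL_DB'))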