抓包分析,找到整页数据所在,可以看到是在"lang"下。
继续分析,找到每一本数据所在
分析完毕,进行编码爬取。
1.编写item
class BookItem(scrapy.Item):
    """Container for the fields scraped for one novel on the ranking page."""
    # Novel title
    book_name = scrapy.Field()
    # Author name
    author = scrapy.Field()
    # Genre / category
    book_type = scrapy.Field()
    # Serialization status
    book_state = scrapy.Field()
    # Title of the most recent chapter
    book_update = scrapy.Field()
    # Timestamp of the latest update
    book_time = scrapy.Field()
    # URL of the latest chapter
    new_href = scrapy.Field()
    # Short synopsis
    book_intro = scrapy.Field()
2.编写Spider
from scrapy.spiders import Spider
from scrapy.http import Request
from scrapy.selector import Selector
from Mycrawl.items import BookItem
import requests
class BookSpider(Spider):
    """Scrape the Qidian monthly-ticket ranking page (first page only)."""

    # Spider name, used by `scrapy crawl book` — important.
    name = 'book'
    # Minimal anti-bot measure: present a regular browser User-Agent.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'}

    def start_requests(self):
        """Issue the single request for the ranking page."""
        url = "https://www.qidian.com/rank/yuepiao?style=1"
        yield Request(url, headers=self.headers, callback=self.parse)

    def parse(self, response):
        """Yield one BookItem per novel listed on the page.

        Fixes vs. the original:
        - a fresh item is created per novel (the original mutated and
          re-yielded ONE shared item, leaking state between yields);
        - `extract_first(default='')` replaces `extract()[0]`, which
          raised IndexError whenever a field was missing;
        - locals no longer shadow the builtins `type` and `time`.
        """
        for book in response.xpath('//div[@class="book-mid-info"]'):
            item = BookItem()
            item['book_name'] = book.xpath('h4/a/text()').extract_first(default='')
            item['author'] = book.xpath('p[@class="author"]/a[@class="name"]/text()').extract_first(default='')
            item['book_type'] = book.xpath('p[@class="author"]/a[@data-eid="qd_C42"]/text()').extract_first(default='')
            item['book_state'] = book.xpath('p[@class="author"]/span/text()').extract_first(default='')
            item['book_update'] = book.xpath('p[@class="update"]/a[@target="_blank"]/text()').extract_first(default='')
            item['book_time'] = book.xpath('p[@class="update"]/span/text()').extract_first(default='')
            # Links on the page are protocol-relative ("//..."); prefix a scheme.
            latest_href = book.xpath('p[@class="update"]/a/@href').extract_first(default='')
            item['new_href'] = 'https:' + latest_href
            # The intro may span several text nodes; join and strip whitespace.
            intro_parts = book.xpath('p[@class="intro"]/text()').extract()
            item['book_intro'] = ''.join(intro_parts).replace(' ', '').replace('\n', '')
            yield item
3.编写pipelines连接数据库
Mysql
import pymysql
class BookPipeline(object):
    """Persist scraped BookItems into a MySQL table via pymysql."""

    def __init__(self):
        # Connect to the local MySQL server.
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='1likePython',
                                    db='TESTDB', charset='utf8')
        # Cursor used for all statements in this pipeline.
        self.cursor = self.conn.cursor()
        # Start every crawl from an empty table.
        self.cursor.execute('truncate table Book')
        self.conn.commit()

    def process_item(self, item, spider):
        """Insert one scraped book row; on failure, roll back and report it."""
        try:
            # Parameterized query — values are never interpolated into SQL.
            self.cursor.execute(
                "insert into Book (book_name,author,book_type,book_state,book_update,book_time,new_href,book_intro) "
                "VALUES (%s,%s,%s,%s,%s,%s,%s,%s)",
                (item['book_name'], item['author'], item['book_type'],
                 item['book_state'], item['book_update'], item['book_time'],
                 item['new_href'], item['book_intro']))
            self.conn.commit()
        except pymysql.Error:
            # Undo the failed statement so the connection stays usable
            # (the original left the transaction dangling).
            self.conn.rollback()
            print("Error%s,%s,%s,%s,%s,%s,%s,%s" % (item['book_name'], item['author'], item['book_type'],
                                                    item['book_state'], item['book_update'], item['book_time'],
                                                    item['new_href'], item['book_intro']))
        return item

    def close_spider(self, spider):
        # Scrapy calls this when the crawl ends; release DB resources
        # (the original never closed the connection — resource leak).
        self.cursor.close()
        self.conn.close()
MongoDB
import pymongo
class BookPipeline(object):
    """Persist scraped BookItems into a MongoDB collection."""

    def __init__(self):
        # Connect to the local MongoDB server.
        self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.test = self.client['TESTDB']
        # NOTE(review): collection is named 'movie' although it stores books —
        # presumably copied from an earlier example; verify before reuse.
        self.post = self.test['movie']

    def process_item(self, item, spider):
        """Insert one item as a plain document."""
        # Collection.insert() was deprecated and removed in pymongo 4.x;
        # insert_one() is the supported single-document API.
        self.post.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # Release the client connection when the crawl ends.
        self.client.close()
4.编写setting
# Project identity, used in logs and the default User-Agent string.
BOT_NAME = 'Mycrawl'
# Module(s) where Scrapy discovers spider classes.
SPIDER_MODULES = ['Mycrawl.spiders']
NEWSPIDER_MODULE = 'Mycrawl.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Mycrawl (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Enabled item pipelines; the number (0-1000) sets execution order, lower first.
ITEM_PIPELINES = {
    #'Mycrawl.pipelines.MoviePipeline': 100,
    'Mycrawl.pipelines.BookPipeline': 300,
}
5.结果
Mysql
MongoDB
本文只爬取了第一页的,大家可以自己尝试爬取所有页的数据。
爬虫项目地址:github