Python + Scrapy: crawling the Qidian novel site and storing the data in MySQL and MongoDB

Inspect the page traffic to find where the full page's data lives; as shown, it sits under "lang".


Keep digging to find where each individual book's data lives.




With the analysis done, we can start coding the crawler.
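Before writing the spider, it can help to sanity-check the XPath expressions against the live page. The snippet below is a minimal stand-alone sketch using requests plus Scrapy's Selector; the URL and User-Agent match the ones used later in the spider, and the div class is an assumption based on the page structure described above.

import requests
from scrapy.selector import Selector

# Fetch the monthly-ticket ranking page with a browser-like User-Agent
url = "https://www.qidian.com/rank/yuepiao?style=1"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'}
html = requests.get(url, headers=headers).text

# Each book is assumed to sit in a div with class "book-mid-info"
selector = Selector(text=html)
books = selector.xpath('//div[@class="book-mid-info"]')
print(len(books))                                      # number of books found on the page
print(books[0].xpath('h4/a/text()').extract_first())   # title of the first book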

1. Write the Item

import scrapy


class BookItem(scrapy.Item):
    # Novel title
    book_name = scrapy.Field()
    # Author
    author = scrapy.Field()
    # Genre
    book_type = scrapy.Field()
    # Status (e.g. ongoing / completed)
    book_state = scrapy.Field()
    # Latest chapter title
    book_update = scrapy.Field()
    # Time of the latest update
    book_time = scrapy.Field()
    # URL of the latest chapter
    new_href = scrapy.Field()
    # Synopsis
    book_intro = scrapy.Field()


2. Write the Spider

from scrapy.spiders import Spider
from scrapy.http import Request
from scrapy.selector import Selector

from Mycrawl.items import BookItem


class BookSpider(Spider):
    # Spider name (important: this is what "scrapy crawl" uses)
    name = 'book'
    # Browser-like User-Agent to get past basic anti-crawling checks
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'}
    #url = "https://www.qidian.com/rank/yuepiao?style=1"
    #start_urls = ['qidian.com']

    def start_requests(self):
        url = "https://www.qidian.com/rank/yuepiao?style=1"
        yield Request(url, headers=self.headers, callback=self.parse)

    def parse(self, response):
        selector = Selector(response)
        books = selector.xpath('//div[@class="book-mid-info"]')
        for book in books:
            name = book.xpath('h4/a/text()').extract()
            author = book.xpath('p[@class="author"]/a[@class="name"]/text()').extract()
            book_type = book.xpath('p[@class="author"]/a[@data-eid="qd_C42"]/text()').extract()
            state = book.xpath('p[@class="author"]/span/text()').extract()
            intro = book.xpath('p[@class="intro"]/text()').extract()
            update = book.xpath('p[@class="update"]/a[@target="_blank"]/text()').extract()
            href = book.xpath('p[@class="update"]/a/@href').extract()
            time = book.xpath('p[@class="update"]/span/text()').extract()

            # Build one item per book (creating it inside the loop avoids
            # reusing the same item instance across iterations)
            item = BookItem()
            item['book_name'] = name[0]
            item['author'] = author[0]
            item['book_type'] = book_type[0]
            item['book_state'] = state[0]
            item['book_update'] = update[0]
            item['book_time'] = time[0]
            item['new_href'] = 'https:' + href[0]
            item['book_intro'] = ''.join(intro).replace(' ', '').replace('\n', '')
            yield item
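With the item and spider in place, the crawler can be run from the project root with scrapy crawl book. Adding -o books.json will also dump the scraped items to a JSON file, which is handy for inspecting the output before wiring up the database pipelines.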


3. Write the pipelines to connect to the databases

MySQL

import pymysql


class BookPipeline(object):
    def __init__(self):
        # Connect to the database
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='1likePython',
                                    db='TESTDB', charset='utf8')
        # Create a cursor object
        self.cursor = self.conn.cursor()
        # Empty the table so each run starts from a clean slate
        self.cursor.execute('truncate table Book')
        self.conn.commit()

    def process_item(self, item, spider):
        try:
            self.cursor.execute("insert into Book (book_name,author,book_type,book_state,book_update,book_time,new_href,book_intro) \
            VALUES (%s,%s,%s,%s,%s,%s,%s,%s)", (item['book_name'], item['author'], item['book_type'],
                                                item['book_state'], item['book_update'], item['book_time'],
                                                item['new_href'], item['book_intro']))
            self.conn.commit()
        except pymysql.Error:
            print("Error%s,%s,%s,%s,%s,%s,%s,%s" % (item['book_name'], item['author'], item['book_type'],
                                                    item['book_state'], item['book_update'], item['book_time'],
                                                    item['new_href'], item['book_intro']))
        return item
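This pipeline assumes a Book table already exists in TESTDB. A one-off script like the sketch below could create it; the column types and sizes are assumptions (the original post does not show the table definition), so adjust them as needed.

import pymysql

# One-off helper: create the Book table the pipeline writes into.
# Column sizes are assumptions, not taken from the original post.
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='1likePython',
                       db='TESTDB', charset='utf8')
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS Book (
        book_name   VARCHAR(128),
        author      VARCHAR(64),
        book_type   VARCHAR(32),
        book_state  VARCHAR(32),
        book_update VARCHAR(255),
        book_time   VARCHAR(64),
        new_href    VARCHAR(255),
        book_intro  TEXT
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()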


MongoDB

import pymongo


class BookPipeline(object):
    def __init__(self):
        # Connect to the database
        self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.test = self.client['TESTDB']
        self.post = self.test['movie']

    def process_item(self, item, spider):
        data = dict(item)
        # insert_one replaces the deprecated Collection.insert
        self.post.insert_one(data)
        return item
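After a crawl, a quick check like the sketch below, using pymongo's standard query API, confirms the documents landed in the collection the pipeline writes to.

import pymongo

# Quick sanity check: count and peek at the documents written by the pipeline
client = pymongo.MongoClient(host='127.0.0.1', port=27017)
collection = client['TESTDB']['movie']
print(collection.count_documents({}))   # number of scraped books
print(collection.find_one())            # one sample document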


4. Write the settings

BOT_NAME = 'Mycrawl'

SPIDER_MODULES = ['Mycrawl.spiders']
NEWSPIDER_MODULE = 'Mycrawl.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Mycrawl (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    #'Mycrawl.pipelines.MoviePipeline': 100,
    'Mycrawl.pipelines.BookPipeline': 300,
}
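Note that ITEM_PIPELINES maps pipeline classes to priorities, and lower numbers run first. The MySQL and MongoDB pipelines above share the name BookPipeline, so to write to both databases in the same run you would need to give them distinct class names; the names below are hypothetical.

ITEM_PIPELINES = {
    'Mycrawl.pipelines.BookMysqlPipeline': 300,   # hypothetical renamed MySQL pipeline
    'Mycrawl.pipelines.BookMongoPipeline': 400,   # hypothetical renamed MongoDB pipeline
}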


5. Results

MySQL


MongoDB


This post only crawls the first page of the ranking; try extending it to crawl every page yourself, for example along the lines of the sketch below.
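One way to do that is to request the remaining ranking pages from start_requests. The page parameter below is an assumption about the site's URL scheme and should be verified against the live site first.

    def start_requests(self):
        # Pages 1..5 of the monthly-ticket ranking; the "page" query parameter
        # is an assumption about Qidian's URL scheme, verify before relying on it.
        for page in range(1, 6):
            url = "https://www.qidian.com/rank/yuepiao?style=1&page=%d" % page
            yield Request(url, headers=self.headers, callback=self.parse)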

Crawler project repository: github


