Crawling Qidian novels + MongoDB storage

Steps:

1. Connect to the database, work out the table structure, and create the table. (The scripts below actually store data with SQLite; a MongoDB variant is sketched after main.py.)
(init_db.py)

# -*- coding: utf-8 -*-
"""
@Time : 2021/1/31 15:00
@Auth : 牟晋卓
@File :init_db.py
@IDE :PyCharm

"""
import sqlite3

conn = sqlite3.connect("qidian.db")
print("连接成功!")
c = conn.cursor()
#设置存储的字段及其类型
sql = '''
    create table novel
    (id INTEGER not null PRIMARY KEY AUTOINCREMENT,
    novel_title text,#小说题目
    novel_href text,#小说链接
    novel_img text,#小说主题图片
    novel_date text,#小说数据
    novel_writer text,#小说作者
    novel_instru text
);

'''
c.execute(sql)
conn.commit()
conn.close()
print("建表结束!")

2. Crawl the pages and extract the data.
(main.py)

# -*- coding: utf-8 -*-
"""
@Time : 2021/1/31 10:51
@Auth : 牟晋卓
@File :main.py
@IDE :PyCharm

"""
import requests
import time,random
from lxml import etree
import sqlite3

def random_user_agent():
    # rotate through several User-Agent strings to look less like a bot
    agents = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
              'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/44.0.2403.155 Safari/537.36',
              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
              'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36',
              'Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36',
              'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36']
    return random.choice(agents)

# request headers (one User-Agent is picked when the module loads)
headers = {
    'User-Agent': random_user_agent(),
}
# crawl the listing pages
def getData(url, savePath):
    print("Crawling started...")
    for i in range(1, 31):
        novel_list = []
        print('Page %d' % i)
        time.sleep(random.randint(1, 3))  # wait 1-3 seconds between pages
        relUrl = url + str(i)
        try:
            response = requests.get(url=relUrl, headers=headers)  # pass headers, not params
            if response.status_code == 200:
                html = response.text
            else:
                continue  # skip pages that do not return 200
        except Exception as e:
            print(e)
            continue
        # print(html)
        selector = etree.HTML(html)  # parse the page
        items = selector.xpath('//ul[@class="cf"]//li')
        # print(len(items))
        for item in items:
            novel_img = 'https:' + item.xpath('./div[@class="focus-img"]//a//img/@src')[0]
            novel_href = item.xpath('./div[@class="info"]//p//a/@href')[0]
            novel_date = item.xpath('./div[@class="info"]//span/text()')[0]
            novel_title = item.xpath('./div[@class="info"]//p//a/text()')[0]
            # print(novel_img, novel_title, novel_href, novel_date)
            try:
                time.sleep(1)
                response = requests.get(url=novel_href, headers=headers)  # pass headers, not params
                if response.status_code == 200:
                    info = response.text
                else:
                    print("Page %d: %s returned status %d!" % (i, novel_href, response.status_code))
                    continue
            except Exception as e:
                print(e)
                print("Page %d: failed to fetch %s!" % (i, novel_href))
                continue
            sel = etree.HTML(info)
            try:
                novel_writer = sel.xpath('//a[@class="writer"]/text()')[0]
            except IndexError:
                novel_writer = '    '
            try:
                novel_instru = sel.xpath('//p[@class="intro"]/text()')[0]
            except IndexError:
                novel_instru = '    '
            # print(novel_writer,novel_instru)
            novel_list.append([novel_title,novel_href,novel_img,novel_date,novel_writer,novel_instru])
        saveSql(novel_list,savePath)

    print("爬取结束...")
    return "程序结束!"


def saveSql(novel_list, savePath):
    conn = sqlite3.connect(savePath)
    c = conn.cursor()
    print("Saving to database...")
    sqls = '''
        insert into novel (novel_title,novel_href,novel_img,novel_date,novel_writer,novel_instru)
        VALUES (?, ?, ?, ?, ?, ?)
    '''
    for data in novel_list:
        # parameterized query: no manual quoting, and titles or synopses containing quotes no longer break the insert
        c.execute(sqls, data)
    conn.commit()
    conn.close()
    print("Saved!")


if __name__ == "__main__":
    baseurl = r"https://www.qidian.com/book/coverrec?page="  # the page number is appended in getData
    savePath = r"qidian.db"
    getData(baseurl, savePath)
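
The title mentions MongoDB, while main.py writes to SQLite. For completeness, a saveSql counterpart backed by MongoDB via pymongo could look roughly like the sketch below (the connection URI, database name, and collection name are assumptions, not taken from the original scripts):

from pymongo import MongoClient

def saveMongo(novel_list, mongo_uri="mongodb://localhost:27017"):
    # assumed local MongoDB instance; database "qidian" and collection "novel" are placeholder names
    client = MongoClient(mongo_uri)
    collection = client["qidian"]["novel"]
    # store each novel as one document instead of one table row
    docs = [{"novel_title": d[0], "novel_href": d[1], "novel_img": d[2],
             "novel_date": d[3], "novel_writer": d[4], "novel_instru": d[5]}
            for d in novel_list]
    if docs:
        collection.insert_many(docs)
    client.close()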

3. Summary
1. Set up several request headers and rotate them (to get around basic anti-crawling checks).
2. If budget allows, use a paid proxy IP pool and anti-crawling becomes much less of a worry (it costs money, which is fine when the company pays); a rough example follows below.
3. For hobby crawling, avoid concurrency and multithreading to keep the load on the target site low (or at least watch out for anti-crawling IP bans).
4. That's all, an abrupt ending!
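
As a rough illustration of point 2, requests can send traffic through a proxy via the proxies argument (the proxy address below is a placeholder; a real pool would hand out rotating endpoints):

import requests

# placeholder proxy endpoint; substitute an address from your proxy provider
proxies = {
    "http": "http://127.0.0.1:8888",
    "https": "http://127.0.0.1:8888",
}
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}
response = requests.get("https://www.qidian.com", headers=headers, proxies=proxies, timeout=10)
print(response.status_code)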
