Crawling Basic Info for All Novels on 起点小说网 (qidian.com)

This is my first blog post, so I'm just testing the waters. You know how it goes with crawlers: roughly 50,000 records in three hours, built on:

  • Multithreading (a thread pool)
  • A retry mechanism for failed requests
  • Links that keep failing are written to a file for a later re-crawl (see the sketches after loadurl() and work() below)
  • A configurable crawl size (page range)

Code

Import the required packages (requests, bs4, lxml, and threadpool are third-party packages that must be installed separately):

# -*- coding: utf-8 -*-
import time
import datetime
import threadpool
from bs4 import BeautifulSoup
import csv
import requests
from urllib.parse import urlencode

Store the URL of each novel in urls.txt:

def load(i, count=0):
    """Fetch listing page i and append the 20 novel URLs on it to urls.txt."""
    try:
        url = "https://www.qidian.com/all?page=" + str(i)
        print("Crawling listing page: {}".format(url))
        page = requests.get(url)
        page.encoding = "utf-8"
        soup = BeautifulSoup(page.text, 'lxml')
        elem = soup.select(".book-mid-info h4 a")  # anchor tags linking to each novel
        urls = []
        for j in range(0, 20):
            url = 'https:' + elem[j].get('href')
            urls.append(url)
        if len(urls) != 20:  # every listing page is expected to hold exactly 20 novels
            raise Exception("incomplete listing page: " + str(i))
        with open('urls.txt', 'a', encoding='utf-8') as f:  # append the URLs to the file
            for cont in urls:
                f.write(str(cont) + '\n')
    except BaseException as e:
        if count < 5:  # retry the same page up to 5 times
            load(i, count + 1)
        else:  # give up and record the failed page for a later re-crawl
            print(str(e))
            with open('urllist.txt', 'a', encoding='utf-8') as fp:
                fp.write(url + ' ' + str(i) + '\n')

def loadurl(start, end, thrednum):
    links = []
    for i in range(start, end + 1):  # user-defined page range
        links.append(i)
    # start collecting the novel URLs
    print(len(links))
    try:
        pool = threadpool.ThreadPool(thrednum)  # thread pool
        reqs = threadpool.makeRequests(load, links)  # renamed so the requests module is not shadowed
        [pool.putRequest(req) for req in reqs]
        pool.wait()
    except KeyboardInterrupt:
        print('Paused manually')
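
Listing pages that still fail after five retries only have their page number appended to urllist.txt; the follow-up pass is not shown in the post. Below is a minimal sketch of that pass, assuming each line of urllist.txt ends with the page number written by load(); the helper name retry_failed_pages is my own and not part of the original code.

def retry_failed_pages(path='urllist.txt'):
    """Re-crawl the listing pages recorded in urllist.txt (hypothetical helper)."""
    try:
        with open(path, 'r', encoding='utf-8') as f:
            lines = [line.strip() for line in f if line.strip()]
    except FileNotFoundError:  # nothing failed, nothing to re-crawl
        return
    for line in lines:
        page_no = int(line.split()[-1])  # each line is "<last url> <page number>"
        load(page_no)                    # reuse the normal crawl-with-retry path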

Initialize the qidian.csv file (run this only once; it rewrites the file with just the header row):

def init():
    row = ['book_name', 'author', 'words_count', 'click_count', 'books_count', 'score', 'j_user_count', 'crawl_time', 'id']
    # i.e. novel name, author, word count, clicks, number of works, rating, number of raters, crawl time, id (url suffix)
    with open("qidian.csv", "w", encoding="utf-8", newline="") as f:  # utf-8, matching the rows appended later
        writer = csv.writer(f, dialect="excel")
        writer.writerow(row)

Read urls.txt and store one record per novel in qidian.csv:

def work(url, count=0):
    try:
        # fetch inside the try block so network errors are retried as well
        page = requests.get(url)
        page.encoding = "utf-8"
        soup = BeautifulSoup(page.text, 'lxml')
        # select the elements we need
        elem = soup.select(".book-info h1 em")
        book_name = elem[0].text
        author = soup.select(".writer")[0].text
        words_count = soup.select(".book-info p em")[0].text
        click_count = soup.select(".book-info p em")[1].text
        books_count = soup.select(".work-state li em")[0].text
        id = url.replace("https://book.qidian.com/info/", "")
        crawl_time = get_unix_time()
        print(url)
        # score = soup.select("#score1")[0].text + '.' + soup.select("#score2")[0].text
        # j_user_count = soup.select("#j_userCount span")[0].text
        # the rating and rater count come from an ajax endpoint rather than the page itself
        bookid = id
        data = {
            '_csrfToken': 'QpbsVhyc5zc0h21NiEweIrLMu2tFOM1RsgfZtWSS',
            'bookId': bookid,
            'pageSize': 15
        }
        other_url = 'https://book.qidian.com/ajax/comment/index?' + urlencode(data)
        page = requests.get(other_url, stream=True)
        page.encoding = "utf-8"
        cont = page.json()  # parse the JSON response; eval() would choke on true/false/null
        score = cont.get('data').get('rate')
        j_user_count = cont.get('data').get('userCount')
        # append the record to qidian.csv
        row = [book_name, author, words_count, click_count, books_count, score, j_user_count, crawl_time, id]
        with open("qidian.csv", "a", encoding="utf-8", newline='') as f:
            writer = csv.writer(f, dialect="excel")
            writer.writerow(row)
        # record the url as done
        with open("doneurl.txt", "a", newline='', encoding='utf-8') as fe:
            fe.write(url + '\n')
    except BaseException:
        if count < 5:
            print('error: failed to extract elements, retry #' + str(count))
            time.sleep(2)
            work(url, count + 1)
        else:
            with open("error_url.txt", "a", encoding='utf-8') as fe:
                fe.write(url + '\n')
                print('error: failed to extract elements, url written to error_url.txt')
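
Novel pages that still fail after five retries are recorded in error_url.txt, but the post likewise stops short of showing how to re-crawl them. Here is a minimal sketch mirroring the earlier helper, assuming error_url.txt holds one URL per line; the name retry_failed_urls is my own invention.

def retry_failed_urls(path='error_url.txt'):
    """Re-run work() on every URL recorded in error_url.txt (hypothetical helper)."""
    try:
        with open(path, 'r', encoding='utf-8') as f:
            urls = [line.strip() for line in f if line.strip()]
    except FileNotFoundError:  # no failed novels were recorded
        return
    for url in urls:
        work(url)  # reuse the normal record-crawling path, including its retries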

Other helpers and the crawler launch function

# timestamp helper
def get_unix_time():  # return the current unix timestamp
    dtime = datetime.datetime.now()
    ans_time = int(time.mktime(dtime.timetuple()))
    return ans_time
# crawler entry point
def spider(start=1, end=2500, thrednum=10):  # start page, end page, number of threads
    # collect every novel URL into urls.txt
    loadurl(start, end, thrednum)
    # read the URLs back into a list
    with open('urls.txt', 'r', encoding='utf-8') as f:
        links = []
        url = f.readline().strip('\n')
        while url:
            links.append(url)
            url = f.readline().strip('\n')
    # crawl one record per novel
    init()
    try:
        pool = threadpool.ThreadPool(thrednum)  # thread pool
        reqs = threadpool.makeRequests(work, links)  # renamed so the requests module is not shadowed
        [pool.putRequest(req) for req in reqs]
        pool.wait()
    except KeyboardInterrupt:
        print('Paused manually')

Launching the crawler

spider(1,2500,20)
Crawl from page 1 to page 2500 with 20 threads, which yields roughly 2500 × 20 = 50,000 records in total.
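
Putting it all together, a minimal entry point might look like the sketch below. The __main__ guard and the calls to the hypothetical retry helpers sketched above are my additions, not part of the original post.

if __name__ == '__main__':
    spider(1, 2500, 20)      # crawl listing pages 1-2500 with 20 threads
    retry_failed_pages()     # optional second pass over pages saved in urllist.txt
    retry_failed_urls()      # optional second pass over novel urls saved in error_url.txt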

A few words from the author

  1. This is my first blog post, so there are bound to be things I haven't explained well; please bear with me and point them out.
  2. If anyone spots a problem in the code, or something that could be improved, I'd be very grateful if you got in touch.
  3. My QQ: 289672494 (checked regularly).
  4. I hope we can all make progress together.
  5. To scrape other fields, just change the corresponding selectors.

Note: I will always abide by the law and will never steal site data or interfere with the site's operation; this article is for reference by fellow bloggers and visitors only. A green and safe Internet is everyone's responsibility.
