爬虫基础(9)

本文介绍了一个爬虫项目,旨在从笔趣网下载一部玄幻小说并保存为本地txt文件。首先,解析指定章节获取小说内容,接着分析网页获取小说目录和网址,然后遍历下载所有章节,并对比了之前的模仿练习代码进行优化。
摘要由CSDN通过智能技术生成
  • 目录
    1.项目设计
    2.代码:编程获取小说指定章节的内容
    3.完善代码:获取小说目录,遍历完成小说下载
    4.代码优化
    5.对比上次模仿练习代码

  • 项目设计
    1.功能描述
    1)下载一部玄幻小说
    2)输出:保存到本地txt
    3)数据源:小说内容静态存储于网页中,无robots限制
    2.结构设计
    1)从笔趣网选择一本玄幻小说,先解析获取指定章节小说的内容
    2)分析小说目录网页,获取小说目录及网址
    3)遍历下载小说所有章节
    4)输出到本地文档

  • 指定小说章节的内容

# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup

class spider(object):
    """Scrape a single novel chapter from biqukan.com and save it to a txt file."""

    def __init__(self, url):
        # url: chapter page to download.
        # kv: minimal User-Agent header so the site does not reject the
        # default python-requests agent string.
        self.url = url
        self.kv = {'user-agent': 'Mozilla/5.0'}
        self.lsText = []  # accumulated chapter text (one entry per parse)

    def getHtmlText(self):
        """Fetch self.url and return its HTML text, or an error string on failure."""
        try:
            req = requests.get(self.url, headers=self.kv, timeout=30)
            # BUG FIX: raise_for_status is a method — the original accessed the
            # attribute without calling it, so HTTP errors were never raised.
            req.raise_for_status()
            # Site declares the wrong charset; trust content-based detection.
            req.encoding = req.apparent_encoding
            print(req.status_code)
            return req.text
        except Exception as e:
            return "getHtmlText产生异常:{}".format(e)

    def parserHtmlText(self, html):
        """Extract the chapter body from <div id="content"> into self.lsText."""
        # Explicit parser avoids bs4's "no parser specified" warning and
        # parser-dependent behavior differences across machines.
        soup = BeautifulSoup(html, 'html.parser')
        bf = soup.find_all('div', id="content")
        # Eight consecutive &nbsp; mark a paragraph break on this site.
        # BUG FIX: the original appended to self.ls, which was never defined
        # (__init__ creates lsText) and raised AttributeError.
        text = bf[0].text.replace('\xa0' * 8, '\n').replace('\r', '').replace(' ', '')
        self.lsText.append(text)

    def writeToFile(self):
        """Write all collected chapter text to a local UTF-8 txt file."""
        # BUG FIX: the original passed the list itself to f.write (TypeError)
        # and opened the file without an explicit encoding.
        with open("黑铁之堡.txt", 'w', encoding='utf-8') as f:
            f.write('\n'.join(self.lsText))

if __name__ == '__main__':
    url = 'https://www.biqukan.com/2_2892/1254662.html'
    sp = spider(url)
    html = sp.getHtmlText()
    # print(html)
    sp.parserHtmlText(html)
    # BUG FIX: the parser stores results in lsText; the original printed
    # the nonexistent attribute `sp.ls` (AttributeError).
    print(sp.lsText)
    # sp.writeToFile()
  • 小说目录获取,遍历下载至本地
# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup

class spider(object):
    """Download a whole novel from biqukan.com: index first, then every chapter."""

    def __init__(self):
        # Minimal User-Agent so the site serves the request.
        self.kv = {'user-agent': 'Mozilla/5.0'}
        self.lsText = []      # chapter bodies, in download order
        self.lsUrls = []      # absolute chapter URLs
        self.lsChapters = []  # chapter titles, parallel to lsUrls / lsText

    def getHtmlText(self, url):
        """Fetch url and return its HTML text, or an error string on failure."""
        try:
            req = requests.get(url, headers=self.kv, timeout=30)
            # BUG FIX: raise_for_status must be CALLED — the original's bare
            # attribute access never raised on 4xx/5xx responses.
            req.raise_for_status()
            req.encoding = req.apparent_encoding  # fix mis-declared charset
            return req.text
        except Exception as e:
            return "getHtmlText产生异常:{}".format(e)

    def parserHtmlText(self, html):
        """Append the chapter body found in <div id="content"> to lsText."""
        soup = BeautifulSoup(html, 'html.parser')
        bf = soup.find_all('div', id="content")
        self.lsText.append(bf[0].text.replace('\xa0', '').replace('\r', '').replace(' ', ''))

    def parserHtmlUrl(self, html, url):
        """Collect chapter titles and URLs from the <div class="listmain"> index.

        The first 12 anchors are the "latest chapters" box (duplicates of real
        chapters), hence the [12:] slice.
        """
        soup = BeautifulSoup(html, 'html.parser')
        div = soup.find_all('div', class_="listmain")
        bf = BeautifulSoup(str(div[0]), 'html.parser')
        for i in bf.find_all('a')[12:]:
            self.lsChapters.append(i.string)
            self.lsUrls.append(url + (i.get('href').split('/'))[-1])

    def writeToFile(self):
        """Write the book title plus every downloaded chapter to a UTF-8 txt file."""
        with open("黑铁之堡.txt", 'w', encoding='utf-8') as f:
            f.write("《黑铁之堡》\n\n\n")
            # zip pairs titles with bodies and truncates safely if fewer
            # chapters were downloaded than listed.
            for title, text in zip(self.lsChapters, self.lsText):
                f.write(title + '\n')
                f.write(text + '\n\n\n')

if __name__ == '__main__':
    url = 'https://www.biqukan.com/2_2892/'
    sp = spider()
    html = sp.getHtmlText(url)
    sp.parserHtmlUrl(html, url)
    # enumerate replaces the hand-rolled counter variable
    for cnt, u in enumerate(sp.lsUrls):
        print(cnt)  # crude progress indicator
        sp.parserHtmlText(sp.getHtmlText(u))
    sp.writeToFile()

[Finished in 2945.7s]
  • 优化
# -*- coding: utf-8 -*-
'''
# 边下载边写入txt
'''

import requests
from bs4 import BeautifulSoup

class spider(object):
    """Download a novel from biqukan.com, writing each chapter as it arrives
    (streaming variant: no need to hold the whole book in memory)."""

    def __init__(self):
        # Minimal User-Agent so the site serves the request.
        self.kv = {'user-agent': 'Mozilla/5.0'}
        self.lsText = []      # unused by the streaming flow; kept for compatibility
        self.lsUrls = []      # absolute chapter URLs
        self.lsChapters = []  # chapter titles, parallel to lsUrls

    def getHtmlText(self, url):
        """Fetch url and return its HTML text, or an error string on failure."""
        try:
            req = requests.get(url, headers=self.kv, timeout=30)
            # BUG FIX: raise_for_status must be CALLED — the original's bare
            # attribute access never raised on 4xx/5xx responses.
            req.raise_for_status()
            req.encoding = req.apparent_encoding  # fix mis-declared charset
            return req.text
        except Exception as e:
            return "getHtmlText产生异常:{}".format(e)

    def parserHtmlText(self, html):
        """Return the chapter body extracted from <div id="content">."""
        soup = BeautifulSoup(html, 'html.parser')
        bf = soup.find_all('div', id="content")
        texts = bf[0].text.replace('\xa0', '').replace('\r', '').replace(' ', '')
        return texts

    def parserHtmlUrl(self, html, url):
        """Collect chapter titles and URLs from the <div class="listmain"> index.

        The first 12 anchors are the "latest chapters" box, hence [12:].
        """
        soup = BeautifulSoup(html, 'html.parser')
        div = soup.find_all('div', class_="listmain")
        bf = BeautifulSoup(str(div[0]), 'html.parser')
        for i in bf.find_all('a')[12:]:
            self.lsChapters.append(i.string)
            self.lsUrls.append(url + (i.get('href').split('/'))[-1])

    def writeToFile(self, texts, cnt):
        """Append chapter number cnt (title + body) to the output file.

        BUG FIX: the original opened with mode 'w' on EVERY call, truncating
        the file each time so only the last chapter survived. Open in append
        mode instead, and write the book title only before the first chapter.
        """
        with open("黑铁之堡.txt", 'a', encoding='utf-8') as f:
            if cnt == 0:
                f.write("《黑铁之堡》\n\n\n")
            f.write(self.lsChapters[cnt] + '\n')
            f.write(texts + '\n\n')
            
if __name__ == '__main__':
    url = 'https://www.biqukan.com/2_2892/'
    sp = spider()
    html = sp.getHtmlText(url)
    # print(html)
    sp.parserHtmlUrl(html, url)
    # enumerate replaces the hand-rolled counter variable
    for cnt, u in enumerate(sp.lsUrls):
        print(cnt)  # crude progress indicator
        html = sp.getHtmlText(u)
        texts = sp.parserHtmlText(html)
        sp.writeToFile(texts, cnt)
    
[Finished in 2453.7s]
  • 对比之前模仿练习代码
# -*- coding:utf-8 -*-
# 浏览器google

from bs4 import BeautifulSoup
import requests, sys

class downloads(object):
    """Earlier practice scraper for qu.la, kept here for comparison."""

    def __init__(self):
        self.server = 'https://www.qu.la/'            # site root, prefix for relative hrefs
        self.target = 'https://www.qu.la/book/16431/'  # book index page
        self.nums = 0      # number of chapters found
        self.names = []    # chapter titles
        self.urls = []     # absolute chapter URLs, parallel to names

    def get_urls(self):
        """Fill names/urls/nums from the book index page (<div id="list">).

        The first 12 anchors are the "latest chapters" box, hence [12:].
        """
        req = requests.get(url=self.target)
        html = req.text
        # Explicit parser avoids bs4's "no parser specified" warning and
        # parser-dependent differences across machines.
        div_bf = BeautifulSoup(html, 'html.parser')
        div = div_bf.find_all('div', id='list')
        a_bf = BeautifulSoup(str(div[0]), 'html.parser')
        a = a_bf.find_all('a')
        self.nums = len(a[12:])
        for i in a[12:]:
            self.names.append(i.string)
            self.urls.append(self.server + i.get('href'))

    def downloads(self, content_url):
        """Return the chapter text found at content_url."""
        req = requests.get(content_url)
        html = req.text
        div_bf = BeautifulSoup(html, 'html.parser')
        div = div_bf.find_all('div', id='content')
        # Four consecutive &nbsp; mark a paragraph break on this site.
        texts = div[0].text.replace('\xa0' * 4, '\n')
        return texts

    def write(self, name, path, content):
        """Append one chapter (title line + body) to the file at path."""
        with open(path, 'a', encoding='utf-8') as fout:
            # BUG FIX: the original wrote the title with no trailing newline,
            # so it ran straight into the chapter body. Also replaced the
            # misused writelines(str) with a plain write.
            fout.write(name + '\n')
            fout.write(content + '\n')

if __name__ == '__main__':

    # Drive the scraper: build the chapter list, then download the first
    # 51 chapters, appending each to test2.txt with a progress readout.
    dl = downloads()
    dl.get_urls()
    # dl.downloads(dl.urls[0])
    sys.stdout.write('开始下载:《**》')
    for idx in range(dl.nums):
        chapter_text = dl.downloads(dl.urls[idx])
        dl.write(dl.names[idx], 'test2.txt', chapter_text)
        sys.stdout.write('进度:%.3f' % (idx/dl.nums))
        sys.stdout.flush()
        if idx == 50:
            break
    sys.stdout.write('下载结束:<**>')

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值