Python full-site novel crawler for quanshuwang.com, rough code that needs improvement

[Python]

import requests
from lxml import etree
import time
import re
import os
import threading


# 9.20: reworked get_url()
def get_url():
    # URL of each category on quanshuwang.com
    for i in range(1, 12):
        url = 'http://www.quanshuwang.com/list/' + str(i) + '_1.html'
        r = requests.get(url)
        r.encoding = 'gbk'
        html = etree.HTML(r.text)
        lastpage = int(''.join(html.xpath('//a[@class="last"]/text()')))  # number of listing pages in this category
        yield url, lastpage


# Get the novel links on every listing page of every category
def get_url_page():
    dict_url = get_url()
    for i in dict_url:
        for j in range(1, i[1]):
            url = ''.join(re.findall(r'(.*\d_)', i[0]))
            url = url + str(j) + '.html'  # build the listing-page URL
            r = requests.get(url)
            r.encoding = 'gbk'
            html = etree.HTML(r.text)
            href = html.xpath('//a[@class="clearfix stitle"]/@href')  # novel URLs on this page
            yield href


def get_url_page_book(url):
    def crawl_page():
        print(url, "attempt", attempts)
        r = requests.get(url)
        r.encoding = 'gbk'
        html = etree.HTML(r.text)
        '''Known issue:
        1. The request sometimes comes back as a blank page (network problem),
           so the whole page has to be requested again, possibly several times.
        '''
        href = ''.join(html.xpath('//div[@class="detail"]/a/@href'))  # link to the novel's detail page
        title = ''.join(html.xpath('//h1/text()'))  # novel title
        title = re.sub(r"[\\/:*?<>|!\.\"]", '', title)  # strip characters that are illegal in file names
        em = ''.join(html.xpath('//*[@id="waa"]/text()'))  # novel synopsis
        path = 'F:/python/xiaoshuo'  # root folder for downloaded novels
        read_path = path + '/' + title
        r1 = requests.get(href)  # request the novel's table of contents
        r1.encoding = 'gbk'
        html_page = etree.HTML(r1.text)
        list_href = html_page.xpath('//div[@class="clearfix dirconone"]/li/a/@href')  # chapter links
        list_title = html_page.xpath('//div[@class="clearfix dirconone"]/li/a/text()')  # chapter titles
        # print(list_href, list_title)  # chapter titles and links
        if not os.path.exists(read_path):  # create the novel's folder if it does not exist
            os.mkdir(read_path)
        for j in range(len(list_href)):  # save every chapter
            r2 = requests.get(list_href[j])
            r2.encoding = 'gbk'
            html_content = etree.HTML(r2.text)
            try:
                content = ''.join(html_content.xpath('//*[@id="content"]/text()'))
            except AttributeError:
                print(list_href[j], "failed to read content")
                continue  # skip this chapter instead of writing stale content
            # strip illegal characters from the chapter title
            list_titlee = re.sub(r"[\\/:*?<>|!\"]", '', list_title[j])
            list_read_path = read_path + '/' + list_titlee
            # write the chapter to disk
            if not os.path.exists(list_read_path):  # create the chapter folder if it does not exist
                os.mkdir(list_read_path)
            file = open(list_read_path + '/' + 'text.txt', 'w', encoding='utf-8')
            file.write(content)
            file.close()

    # retry up to five times on parsing or network errors
    attempts = 1
    success = False
    while attempts < 6 and not success:
        try:
            crawl_page()
            success = True
        except Exception:
            print("failed, retrying...")
            attempts += 1
            if attempts == 6:
                break


if __name__ == '__main__':
    urls = get_url_page()
    threads = []
    for uu in urls:  # iterate over the novels on each listing page
        for u in uu:
            t1 = threading.Thread(target=get_url_page_book, args=(u,))
            t1.start()
            time.sleep(1)
            threads.append(t1)
        time.sleep(90)  # pause between listing pages
    for j in threads:
        j.join()
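
The comment inside crawl_page() already flags the main weak point: the site sometimes answers with a blank page, and the whole page is re-requested by hand up to five times. One alternative, shown as a minimal sketch below rather than as part of the original program, is to let requests retry transient HTTP failures through urllib3's Retry and keep only a small emptiness check in Python. The retry counts, backoff and timeout are assumptions, and a blank page that still returns HTTP 200 would not be caught by Retry alone.

[Python]

# Sketch: HTTP-level retries instead of re-running crawl_page() by hand.
# All numbers here (total=5, backoff_factor=1, timeout=10) are assumptions.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    session = requests.Session()
    retry = Retry(total=5, backoff_factor=1,
                  status_forcelist=[500, 502, 503, 504])
    session.mount('http://', HTTPAdapter(max_retries=retry))
    return session

session = make_session()
r = session.get('http://www.quanshuwang.com/list/1_1.html', timeout=10)
r.encoding = 'gbk'
if not r.text.strip():
    # a "blank" 200 response still needs an explicit check / re-request
    raise RuntimeError('empty page, retry at the application level')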

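The main block also starts one thread per novel and throttles only with sleep() calls, so a long run can pile up a very large number of live threads. A bounded pool is one way to cap that. The sketch below reuses get_url_page() and get_url_page_book() from the script above and is only an assumption-level rewrite of the __main__ block; max_workers=8 is an arbitrary choice, not a value from the original post.

[Python]

# Sketch: cap the number of concurrent downloads with a thread pool.
# max_workers=8 is an arbitrary assumption.
from concurrent.futures import ThreadPoolExecutor

if __name__ == '__main__':
    with ThreadPoolExecutor(max_workers=8) as pool:
        for page_hrefs in get_url_page():   # one listing page at a time
            for u in page_hrefs:
                pool.submit(get_url_page_book, u)
    # leaving the with-block waits for all submitted downloads to finish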
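A smaller detail: with many threads creating folders at once, the exists-then-mkdir pattern can race (two threads both see the folder missing and the second mkdir fails), and os.mkdir also fails if F:/python/xiaoshuo itself does not exist yet. os.makedirs with exist_ok=True covers both cases; a minimal sketch, with a hypothetical example path:

[Python]

# Sketch: race-free folder creation for the novel/chapter folders.
import os

read_path = 'F:/python/xiaoshuo/some_title'   # hypothetical example path
os.makedirs(read_path, exist_ok=True)  # creates parents too, no error if it already exists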