Python full-site novel crawler for quanshuwang.com, rough code that needs improvement

[Python]

import requests
from lxml import etree
import time
import re
import os
import threading


# 9.20: reworked get_url()
def get_url():
    # URL of each category on quanshuwang.com
    for i in range(1, 12):
        url = 'http://www.quanshuwang.com/list/' + str(i) + '_1.html'
        r = requests.get(url)
        r.encoding = 'gbk'
        html = etree.HTML(r.text)
        lastpage = int(''.join(html.xpath('//a[@class="last"]/text()')))  # number of listing pages in this category
        yield url, lastpage


# Get the novel links on every listing page of every category
def get_url_page():
    dict_url = get_url()
    for i in dict_url:
        for j in range(1, i[1]):
            url = ''.join(re.findall(r'(.*\d_)', i[0]))
            url = url + str(j) + '.html'  # build the listing-page URL
            r = requests.get(url)
            r.encoding = 'gbk'
            html = etree.HTML(r.text)
            href = html.xpath('//a[@class="clearfix stitle"]/@href')  # novel URLs on this page
            yield href


def get_url_page_book(url):
    def crawl_page():
        print(url, "attempt", attempts)
        r = requests.get(url)
        r.encoding = 'gbk'
        html = etree.HTML(r.text)
        '''Known issue:
        1. The request sometimes comes back as a blank page (network problem),
           so the whole page has to be requested again, possibly several times.
        '''
        href = ''.join(html.xpath('//div[@class="detail"]/a/@href'))  # link to the novel's detail page
        title = ''.join(html.xpath('//h1/text()'))  # novel title
        title = re.sub(r"[\\/:*?<>|!\.\"]", '', title)  # strip characters that are illegal in file names
        em = ''.join(html.xpath('//*[@id="waa"]/text()'))  # novel synopsis
        path = 'F:/python/xiaoshuo'  # root folder for downloaded novels
        read_path = path + '/' + title
        r1 = requests.get(href)  # request the novel's table of contents
        r1.encoding = 'gbk'
        html_page = etree.HTML(r1.text)
        list_href = html_page.xpath('//div[@class="clearfix dirconone"]/li/a/@href')  # chapter links
        list_title = html_page.xpath('//div[@class="clearfix dirconone"]/li/a/text()')  # chapter titles
        # print(list_href, list_title)  # chapter titles and links
        if not os.path.exists(read_path):  # create the novel's folder if it does not exist
            os.mkdir(read_path)
        for j in range(len(list_href)):  # save every chapter
            r2 = requests.get(list_href[j])
            r2.encoding = 'gbk'
            html_content = etree.HTML(r2.text)
            try:
                content = ''.join(html_content.xpath('//*[@id="content"]/text()'))
            except AttributeError:
                print(list_href[j], "failed to read content")
                continue  # skip this chapter instead of writing stale content
            # strip illegal characters from the chapter title
            list_titlee = re.sub(r"[\\/:*?<>|!\"]", '', list_title[j])
            list_read_path = read_path + '/' + list_titlee
            # write the chapter to disk
            if not os.path.exists(list_read_path):  # create the chapter folder if it does not exist
                os.mkdir(list_read_path)
            file = open(list_read_path + '/' + 'text.txt', 'w', encoding='utf-8')
            file.write(content)
            file.close()

    # retry up to five times on parsing or network errors
    attempts = 1
    success = False
    while attempts < 6 and not success:
        try:
            crawl_page()
            success = True
        except Exception:
            print("failed, retrying...")
            attempts += 1
            if attempts == 6:
                break


if __name__ == '__main__':
    urls = get_url_page()
    threads = []
    for uu in urls:  # iterate over the novels on each listing page
        for u in uu:
            t1 = threading.Thread(target=get_url_page_book, args=(u,))
            t1.start()
            time.sleep(1)
            threads.append(t1)
        time.sleep(90)  # pause between listing pages
    for j in threads:
        j.join()
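
The comment inside crawl_page() already flags the main weak point: the site sometimes answers with a blank page, and the whole page is re-requested by hand up to five times. One alternative, shown as a minimal sketch below rather than as part of the original program, is to let requests retry transient HTTP failures through urllib3's Retry and keep only a small emptiness check in Python. The retry counts, backoff and timeout are assumptions, and a blank page that still returns HTTP 200 would not be caught by Retry alone.

[Python]

# Sketch: HTTP-level retries instead of re-running crawl_page() by hand.
# All numbers here (total=5, backoff_factor=1, timeout=10) are assumptions.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    session = requests.Session()
    retry = Retry(total=5, backoff_factor=1,
                  status_forcelist=[500, 502, 503, 504])
    session.mount('http://', HTTPAdapter(max_retries=retry))
    return session

session = make_session()
r = session.get('http://www.quanshuwang.com/list/1_1.html', timeout=10)
r.encoding = 'gbk'
if not r.text.strip():
    # a "blank" 200 response still needs an explicit check / re-request
    raise RuntimeError('empty page, retry at the application level')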

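The main block also starts one thread per novel and throttles only with sleep() calls, so a long run can pile up a very large number of live threads. A bounded pool is one way to cap that. The sketch below reuses get_url_page() and get_url_page_book() from the script above and is only an assumption-level rewrite of the __main__ block; max_workers=8 is an arbitrary choice, not a value from the original post.

[Python]

# Sketch: cap the number of concurrent downloads with a thread pool.
# max_workers=8 is an arbitrary assumption.
from concurrent.futures import ThreadPoolExecutor

if __name__ == '__main__':
    with ThreadPoolExecutor(max_workers=8) as pool:
        for page_hrefs in get_url_page():   # one listing page at a time
            for u in page_hrefs:
                pool.submit(get_url_page_book, u)
    # leaving the with-block waits for all submitted downloads to finish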
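A smaller detail: with many threads creating folders at once, the exists-then-mkdir pattern can race (two threads both see the folder missing and the second mkdir fails), and os.mkdir also fails if F:/python/xiaoshuo itself does not exist yet. os.makedirs with exist_ok=True covers both cases; a minimal sketch, with a hypothetical example path:

[Python]

# Sketch: race-free folder creation for the novel/chapter folders.
import os

read_path = 'F:/python/xiaoshuo/some_title'   # hypothetical example path
os.makedirs(read_path, exist_ok=True)  # creates parents too, no error if it already exists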