小说名称《赘婿》,作者:愤怒的香蕉,侵权删
单线程代码实现:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/11/17 11:34
# @Author : huni
# @File : 爬纵横中文小说.py
# @Software: PyCharm
import requests
from lxml import etree
import Zstring
import os
import time
# Single-threaded scraper: download every chapter of the novel from the
# biquge catalogue page and save each one as ./赘婿/<title>.txt (UTF-8).
start = time.time()

# Output directory for the chapter files; exist_ok makes re-runs harmless.
SAVE_DIR = './赘婿'
os.makedirs(SAVE_DIR, exist_ok=True)

headers = {
    'Referer': 'http://www.xbiquge.la/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
}
url = 'http://www.xbiquge.la/0/885/'

# One Session reuses the underlying TCP connection for every request.
session = requests.Session()
cata = session.get(url=url, headers=headers)
print(cata.status_code)

# The site serves a legacy Chinese encoding but requests falls back to
# ISO-8859-1; re-encoding the mis-decoded text recovers the raw bytes so
# lxml can sniff the real charset from the page's <meta> tag itself.
cata = cata.text.encode('ISO-8859-1', 'ignore')
tree = etree.HTML(cata)

# Each <dd> in the catalogue holds one chapter link.
cata_url_list = tree.xpath('//*[@id="list"]/dl/dd')
for cata_url in cata_url_list:
    url1 = 'http://www.xbiquge.la' + cata_url.xpath('./a/@href')[0]
    title = cata_url.xpath('./a/text()')[0]
    time.sleep(2)  # be polite: throttle to one request every two seconds
    page_text = session.get(url=url1, headers=headers)
    print(page_text.status_code)
    # Same byte-recovery trick as for the catalogue page above.
    page_text = page_text.text.encode('ISO-8859-1', 'ignore')
    tree1 = etree.HTML(page_text)
    content_list = tree1.xpath('//*[@id="content"]/text()')
    # os.path.join replaces the original redundant "'赘婿/' + './…'" concat.
    file_name = os.path.join(SAVE_DIR, f'{title}.txt')
    # 'w' (not the original 'a') so a re-run overwrites instead of
    # appending duplicate chapter text; matches the multithreaded version.
    with open(file_name, 'w', encoding='utf-8') as fp:
        for content in content_list:
            # NOTE(review): both replace() arguments render as plain spaces
            # here — the first was most likely U+00A0 (&nbsp;) before the
            # paste mangled it; confirm against the live page markup.
            content = content.replace(' ', ' ')
            content = Zstring.String(content)
            fp.write(content.paragraph(40))
    print("%s写入完成" % title)

end = time.time()
print('用时: ', end - start)
多线程代码实现:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/11/14 18:33
# @Author : huni
# @File : xiaoshuo.py
# @Software: PyCharm
from threading import Thread #多线程的包
from queue import Queue #队列
from fake_useragent import UserAgent #模拟请求头的包,可用可不用,我这里没用,自己写的headers
import requests
from lxml import etree
import os
import Zstring
import time
class CrawlInfo(Thread):
    """Fetcher thread: pulls URLs off ``url_queue``, downloads them with
    requests, and pushes the raw page bytes onto ``html_queue``.

    Only responses with HTTP status 200 are handed downstream.
    """

    def __init__(self, url_queue, html_queue):
        Thread.__init__(self)
        # Shared work queues: input URLs and fetched HTML payloads.
        self.url_queue = url_queue
        self.html_queue = html_queue

    def run(self):
        # Local import: the file-level import only brings in Queue.
        from queue import Empty

        headers = {
            'Referer': 'http://www.xbiquge.la/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36'
        }
        while True:
            # get_nowait()/Empty avoids the check-then-get race the original
            # "while not empty(): get()" pattern has when several fetcher
            # threads drain the same queue concurrently.
            try:
                url = self.url_queue.get_nowait()
            except Empty:
                break
            cata = requests.get(url=url, headers=headers)
            # Recover the raw bytes from requests' ISO-8859-1 fallback
            # decoding so lxml can sniff the page's real charset later.
            payload = cata.text.encode('ISO-8859-1', 'ignore')
            if cata.status_code == 200:
                # Hand off only successful fetches to the parser stage.
                self.html_queue.put(payload)
class ParseInfo(Thread):
    """Parser thread: takes raw catalogue-page bytes off ``html_queue``,
    walks the chapter list, downloads each chapter and writes it to
    赘婿/<title>.txt (UTF-8, one file per chapter).
    """

    def __init__(self, html_queue):
        Thread.__init__(self)
        # Queue of raw catalogue-page bytes produced by CrawlInfo.
        self.html_queue = html_queue

    def run(self):
        # Local import: the file-level import only brings in Queue.
        from queue import Empty

        head = {
            'Referer': 'http://www.xbiquge.la/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
        }
        while True:
            # Same race-free drain pattern as CrawlInfo.run: the original
            # empty()-then-get() check can raise/lose work with >1 parser.
            try:
                html = self.html_queue.get_nowait()
            except Empty:
                break
            e = etree.HTML(html)
            # Each <dd> in the catalogue holds one chapter link. Swap the
            # base URL / XPaths to reuse this scaffold for another site.
            cata_url_list = e.xpath('//*[@id="list"]/dl/dd')
            for cata_url in cata_url_list:
                url1 = 'http://www.xbiquge.la' + cata_url.xpath('./a/@href')[0]
                title = cata_url.xpath('./a/text()')[0]
                time.sleep(2)  # be polite: one request every two seconds
                page_text = requests.get(url=url1, headers=head)
                # Recover raw bytes from requests' ISO-8859-1 fallback so
                # lxml can sniff the page's real charset itself.
                page_text = page_text.text.encode('ISO-8859-1', 'ignore')
                tree1 = etree.HTML(page_text)
                content_list = tree1.xpath('//*[@id="content"]/text()')
                # os.path.join replaces the redundant "'赘婿/' + './…'" concat.
                file_name = os.path.join('赘婿', f'{title}.txt')
                with open(file_name, 'w', encoding='utf-8') as fp:
                    for content in content_list:
                        # NOTE(review): both replace() arguments render as
                        # plain spaces here — the first was most likely
                        # U+00A0 (&nbsp;) before the paste mangled it.
                        content = content.replace(' ', ' ')
                        content = Zstring.String(content)
                        fp.write(content.paragraph(40))
                print("%s写入完成" % title)
if __name__ == '__main__':
    start = time.time()

    # Output directory for the chapter files; exist_ok makes re-runs harmless.
    os.makedirs('./赘婿', exist_ok=True)

    # Work queues shared by the two thread stages:
    # url_queue feeds CrawlInfo, html_queue feeds ParseInfo.
    url_queue = Queue()
    html_queue = Queue()

    base_url = 'http://www.xbiquge.la/0/885/'
    url_queue.put(base_url)

    # Stage 1: fetch the catalogue page(s). The original comment claimed
    # "three threads" while only one was started — the count is now an
    # explicit constant; raise it to fan out over more catalogue URLs.
    CRAWL_WORKERS = 1
    crawl_list = []
    for _ in range(CRAWL_WORKERS):
        crawler = CrawlInfo(url_queue, html_queue)
        crawl_list.append(crawler)
        crawler.start()
    # Block until every catalogue page is fetched before parsing begins.
    for crawler in crawl_list:
        crawler.join()

    # Stage 2: parse the catalogue and download/write each chapter.
    PARSE_WORKERS = 1
    parse_list = []
    for _ in range(PARSE_WORKERS):
        parser = ParseInfo(html_queue)
        parse_list.append(parser)
        parser.start()
    for parser in parse_list:
        parser.join()

    end = time.time()
    print('用时: ', end - start)
成果图: