import requests
from lxml import etree
class QiuBaiSpider(object):
    """Scrape text jokes from qiushibaike.com, skipping already-downloaded pages.

    Joke text is appended to '糗事百科.txt'; the detail-page paths already
    fetched are tracked in '已下载段子网址.txt' (one path per line) so reruns
    do not download the same joke twice.
    """

    def __init__(self):
        # Listing-page URL template; {} is the 1-based page number.
        self.url_temp = 'https://www.qiushibaike.com/text/page/{}/'
        # Browser-like User-Agent so the site does not reject the request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

    def get_url_list(self, page_num):
        """Return listing-page URLs for pages 1 .. page_num-1 (range is exclusive)."""
        return [self.url_temp.format(i) for i in range(1, page_num)]

    def paser_one_url(self, url1):
        """Fetch a listing page and return its HTML decoded as text."""
        response = requests.get(url1, headers=self.headers)
        return response.content.decode()

    def paser_two_url(self, url2):
        """Fetch a joke detail page and return the raw response bytes."""
        response = requests.get(url2, headers=self.headers)
        return response.content

    def _load_downloaded(self):
        """Return the set of already-downloaded detail-page paths.

        FIX: the original opened the tracking file in 'r' mode inside the
        download loop — it crashed with FileNotFoundError on the first run
        and re-read the whole file once per link. Read it once, and treat
        a missing file as "nothing downloaded yet".
        """
        try:
            with open('已下载段子网址.txt', 'r', encoding='utf-8') as f:
                return {line.rstrip('\n') for line in f}
        except FileNotFoundError:
            return set()

    def get_page_url(self, html_str):
        """Parse one listing page and download/save every joke not seen before."""
        num = 1
        html = etree.HTML(html_str)
        # href of the first anchor in each joke <div> under #content-left.
        self.div_list = html.xpath('//div[@id="content-left"]/div/a[1]/@href')
        # Load the downloaded-set once for the whole page (see _load_downloaded).
        downloaded = self._load_downloaded()
        # 遍历出每一个页面 (iterate over every joke link on this listing page)
        for i in self.div_list:
            # Skip anything we have already downloaded on a previous run.
            if i in downloaded:
                print('已下载')
                continue
            # Build the absolute detail-page URL and fetch it.
            url2 = 'https://www.qiushibaike.com' + i
            url2_html = self.paser_two_url(url2)
            html2 = etree.HTML(url2_html)
            data_list = html2.xpath('//div[@id="single-next-link"]/div[@class="content"]/text()')
            data = ''.join(data_list)
            print('-' * 1000)
            print('%s.' % num)
            print(data)
            # Append the joke text to the output file.
            with open('糗事百科.txt', 'a', encoding='utf-8') as f:
                f.write(str(num) + '.')
                f.write(data)
                f.write('\n\n\n')
            num += 1
            print('保存成功')
            print('-' * 1000)
            # Record this path as downloaded (file and in-memory set).
            # FIX: the original also called f.close() after the loop — the
            # 'with' block already closed it, and f was unbound (NameError)
            # when the listing page had no links.
            with open('已下载段子网址.txt', 'a', encoding='utf-8') as f:
                f.write(i)
                f.write('\n')
            downloaded.add(i)

    def run(self):
        """Prompt for a page count and scrape that many listing pages."""
        # 1.获取url_list,所有网页
        page_num = int(input('请输入要爬取的页数:'))
        # +1 because get_url_list's range upper bound is exclusive
        # (FIX: dropped the redundant second int() conversion).
        url_list = self.get_url_list(page_num + 1)
        print(url_list)
        # 2.遍历,发送请求,获取响应
        num = 1
        for url1 in url_list:
            print('第%s页' % num)
            html_str = self.paser_one_url(url1)
            self.get_page_url(html_str)
            # Blank separator between pages in the output file.
            with open('糗事百科.txt', 'a', encoding='utf-8') as f:
                f.write('\n\n\n')
            num += 1
# Entry point: build the spider and start the interactive crawl.
if __name__ == '__main__':
    spider = QiuBaiSpider()
    spider.run()
# Qiushibaike (text section) scraper
# (blog-page artifact: "latest recommended article published 2021-05-27 10:05:47")