import requests
from bs4 import BeautifulSoup
import os
import shutil
# Directory the script was launched from; list2txt() chdirs away to write
# each page's files and uses this to chdir back afterwards.
start = os.getcwd()
def getHTMLtext(url):
    """Fetch *url* and return its HTML text, or "" on any request failure.

    The target site serves GBK-encoded pages, so the response encoding is
    forced to 'gbk' before decoding.

    :param url: page URL to download
    :return: decoded HTML string, or "" if the request errored out
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = 'gbk'
        return r.text
    # Narrowed from a bare `except:`, which also swallowed
    # KeyboardInterrupt/SystemExit; all requests failures (connection,
    # timeout, bad HTTP status) derive from RequestException.
    except requests.RequestException:
        return ""
def list2txt(title_list, content_list, i):
    """Write each joke to its own .txt file inside a per-page directory.

    A directory named str(i) (the page number) is created under the
    script's start directory; any previous directory of the same name is
    removed first.  Each title becomes a file name (with characters that
    are illegal/awkward in file names stripped), and the matching entry of
    *content_list* (a list of BeautifulSoup nodes) is flattened to text
    and written out.

    :param title_list: joke titles, parallel to content_list
    :param content_list: per-joke lists of parsed content nodes
    :param i: page number, used as the output directory name
    """
    global start
    # Characters stripped from titles before they are used as file names.
    untitle = '?!@#$%^&*()+<>,.:";'
    os.chdir(start)
    dn = str(i)  # one output directory per scraped page
    if os.path.exists(dn):
        shutil.rmtree(dn)  # start fresh on a re-run
    os.mkdir(dn)
    os.chdir(dn)
    try:
        trans = str.maketrans('', '', untitle)
        for n, title in enumerate(title_list):
            fn = title.translate(trans) + '.txt'
            print(fn)
            # Filter into a new list instead of the original
            # remove-while-iterating loop, which skipped every second of
            # two consecutive '\n' entries.
            parts = [p for p in content_list[n] if p != '\n']
            joke = (str(parts)
                    .replace("<p>", "")
                    .replace("</p>", "")
                    .replace('[', "")
                    .replace(']', "")
                    .replace('\xa0', ''))
            # Explicit encoding: the text is Chinese and the platform
            # default codec may not be able to represent it.  The `with`
            # block closes the file; the original's extra f.close() was
            # redundant.
            with open(fn, "w", encoding="utf-8") as f:
                f.write(joke)
                f.write('\n\n')
    finally:
        # Always return to the start directory, even if a write fails,
        # so the next page's relative paths resolve correctly.
        os.chdir(start)
def get_list(soup, i):
    """Extract joke titles and bodies from one listing page and save them.

    Finds the <ul class="article-list"> element, collects every
    <li class="article-summary"> entry's title and summary content
    (skipping duplicates), then hands the results to list2txt() for
    page *i*.

    :param soup: BeautifulSoup document for one listing page
    :param i: page number, forwarded to list2txt()
    """
    content_list = []
    title_list = []
    data_all = soup.find('ul', {'class': "article-list"})
    if data_all is None:
        # Download failed (empty HTML) or the page layout changed:
        # nothing to save for this page.
        list2txt(title_list, content_list, i)
        return
    # The original wrapped this in `for a in data_all:`, re-running the
    # same invariant find_all once per child of the <ul> (accidental
    # O(n^2)); only the duplicate checks below kept the output correct.
    # One pass produces the identical result.
    for item in data_all.find_all('li', {'class': "article-summary"}):
        data_title = item.find('span', {'class': "article-title"})
        if data_title.string in title_list:
            continue
        title_list.append(data_title.string)
        data_content = item.find('div', {'class': "summary-text"})
        if data_content.contents in content_list:
            continue
        content_list.append(data_content.contents)
    list2txt(title_list, content_list, i)
def main():
    """Scrape the first *n* listing pages of xiaohua.zol.com.cn/new.

    For each page: download it, parse the HTML, and let get_list()
    extract the jokes and write them to per-page directories.
    """
    # Example page URL: https://xiaohua.zol.com.cn/new/2.html
    base = "https://xiaohua.zol.com.cn/new/"
    n = 5  # number of pages to scrape
    # If made interactive again, prefer int(input(...)) over eval(input(...)):
    # eval on user input executes arbitrary code.
    # The original range(1, n) fetched only n - 1 pages although n means
    # "how many pages" (per the commented-out prompt); n + 1 fixes the
    # off-by-one.
    for i in range(1, n + 1):
        url = base + str(i) + '.html'
        html = getHTMLtext(url)
        # Parse this page's jokes/titles and save them as TXT files.
        soup = BeautifulSoup(html, "html.parser")
        get_list(soup, i)
# Guarded entry point: without this, merely importing the module would
# kick off a full network scrape.
if __name__ == "__main__":
    main()