import requests
import re
import os
from bs4 import BeautifulSoup
# Parallel lists filled by the catalog scrape below:
# chapter titles, relative hrefs, and absolute chapter URLs.
catalog_list_name, catalog_list_href, catalog_list_url = [], [], []
def HTMLToSoup(url):
    """Fetch *url* and return the page parsed as a BeautifulSoup tree.

    The response encoding is reset from ``apparent_encoding`` so the
    (Chinese) page text decodes correctly before parsing.
    """
    response = requests.get(
        url,
        headers={'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95'},
    )
    response.encoding = response.apparent_encoding
    return BeautifulSoup(response.text, 'html.parser')
# Table-of-contents page for the target novel.
url = 'http://www.xbiquge.la/15/15003/'
soup = HTMLToSoup(url)
# Each chapter sits in a <dd><a> inside div#list/dl; grab the anchors once.
chapter_anchors = [
    dd.find('a')
    for dd in soup.find(name='body').find(name='div', id='list').find('dl').find_all('dd')
]
for anchor in chapter_anchors:
    catalog_list_name.append(anchor.string)
    catalog_list_href.append(anchor.attrs['href'])
# The hrefs are site-relative; prefix the host to get fetchable URLs.
catalog_list_url.extend('http://www.xbiquge.la' + href for href in catalog_list_href)
def get_content(url):
    """Return the plain text of the chapter body at *url*.

    The chapter text lives in the page's ``div#content`` element.
    """
    page = HTMLToSoup(url)
    return page.find(name='div', id='content').text
def put_all_content(mingzilist,urllist2):
    """Download every chapter and save each one as a UTF-8 .txt file.

    Args:
        mingzilist: chapter titles, parallel to ``urllist2``.
        urllist2: absolute chapter URLs, one per title.

    Side effects: creates the output directory if missing and writes one
    file per chapter; prints progress after each chapter.
    """
    out_dir = 'C:\\Users\\Administrator\\Desktop\\xiaoshuo'
    if os.path.exists(out_dir):
        print('目录已经存在')
    else:
        os.mkdir(out_dir)
        print('目录不存在,创建')
    # BUG FIX: the original used a counter s starting at 1 to index
    # mingzilist while iterating urllist2, so every file was named with the
    # NEXT chapter's title, and the last iteration raised IndexError
    # (s == len(mingzilist)). zip pairs each URL with its own title;
    # enumerate(..., start=1) keeps the 1-based progress counter.
    for s, (name, u) in enumerate(zip(mingzilist, urllist2), start=1):
        # NOTE(review): name comes from anchor.string and could be None for
        # malformed entries — confirm against the catalog page if it crashes.
        with open(os.path.join(out_dir, name + '.txt'), 'w', encoding='utf-8') as f:
            f.write(get_content(u))
        print('已爬取', s, '章')
put_all_content(catalog_list_name,catalog_list_url)