import urllib.request
import re
from bs4 import BeautifulSoup as bs
def urlopen(url):
    """Fetch *url* and return the raw response body as bytes.

    A browser-like User-Agent header is set because the target site
    rejects the default urllib User-Agent.
    """
    req = urllib.request.Request(url)
    req.add_header("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36")
    # Use a context manager so the HTTP response is always closed,
    # even if read() raises (the original leaked the connection).
    with urllib.request.urlopen(req) as resp:
        return resp.read()
def list1(url):
    """Return the absolute chapter URLs listed on a novel's index page.

    The chapter links all live inside the <div id="list"> element; each
    relative href is prefixed with the site root to make it absolute.
    """
    soup = bs(urlopen(url), 'lxml')
    # find() returns the first matching element directly, instead of the
    # original .div(id=...)[0] list-then-index dance. Local names no
    # longer shadow this function's own name.
    chapter_div = soup.find('div', id="list")
    # Build the absolute URL list in one pass over the anchor tags.
    return ['http://www.biquge.com.tw' + a.attrs['href']
            for a in chapter_div.find_all('a')]
def xia(url):
    """Download every chapter of the novel at *url* into '<title>.txt'.

    The output filename is the novel's <h1> title plus '.txt'. Each
    chapter page's <h1> (chapter name) and <div id="content"> text are
    appended to the file; progress is printed per chapter.
    """
    urllist = list1(url)
    index = bs(urlopen(url), 'lxml')
    # Filename is the novel title taken from the index page's <h1>.
    filename = index.h1.string + '.txt'
    # Open the file once (not once per chapter) and force UTF-8 so the
    # script doesn't crash on platforms whose default codec can't
    # encode the Chinese text.
    with open(filename, 'a', encoding='utf-8') as f:
        for chapter_url in urllist:
            page = bs(urlopen(chapter_url), 'lxml')
            h1 = page.h1.string          # chapter title
            content = page.div(id="content")[0].text
            # Strip non-breaking spaces; a plain str.replace is enough —
            # no regex needed for a single literal character.
            content = content.replace('\xa0', '')
            f.write(h1)       # chapter title
            f.write(content)  # chapter body
            print('已经下载' + h1)
# Guard the script entry point so importing this module does not
# immediately start a full download.
if __name__ == "__main__":
    url = 'http://www.biquge.com.tw/18_18820/'
    xia(url)
# 笔趣阁小说 python3爬虫实例  (Biquge novel — Python 3 crawler example)
# 最新推荐文章于 2024-08-14 08:38:25 发布  (blog-page residue; commented out so the file parses)