import urllib.request
import re
from bs4 import BeautifulSoup as bs
def urlopen(url):
    """Fetch *url* and return the raw, undecoded response body.

    The request carries a desktop-Chrome ``User-Agent`` header.

    Args:
        url: Address to request.

    Returns:
        The response body as ``bytes``; decoding is left to the caller.

    Raises:
        Whatever ``urllib.request.urlopen`` raises for a failed request
        (for example ``urllib.error.URLError``); nothing is caught here.
    """
    req = urllib.request.Request(url)
    req.add_header("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36")
    # The context manager closes the response (and its socket) even when
    # read() raises, instead of leaving cleanup to garbage collection.
    with urllib.request.urlopen(req) as response:
        return response.read()
def list_url(url):
    """Collect the chapter links found on a book index page.

    The page at *url* is downloaded, decoded as GBK and parsed with lxml.
    Every ``<a>`` inside the first ``<div id="list-chapterAll">`` that has
    an ``href`` attribute contributes one entry.

    Args:
        url: Address of the index page; also the base against which the
            hrefs are resolved.

    Returns:
        A list of chapter URLs, in document order.

    Raises:
        IndexError: If the page has no ``div`` with id ``list-chapterAll``.
        UnicodeDecodeError: If the page is not valid GBK.
    """
    # Function-scope import so the block does not rely on urllib.request
    # having loaded urllib.parse as a side effect.
    from urllib.parse import urljoin

    page = bs(urlopen(url).decode('gbk'), 'lxml')
    chapter_box = page.find_all('div', id="list-chapterAll")[0]
    # urljoin handles bare filenames, root-relative paths and absolute URLs;
    # plain string concatenation is only right for the first of those.
    # href=True skips anchors without an href, which would raise KeyError.
    return [
        urljoin(url, anchor['href'])
        for anchor in chapter_box.find_all('a', href=True)
    ]
def name(url):
    """Return the text of the first ``<h1>`` on the page at *url*.

    The page is downloaded, decoded as GBK and parsed with lxml.

    Args:
        url: Address of the page to inspect.

    Returns:
        The heading text as a plain ``str``.

    Raises:
        AttributeError: If the page contains no ``<h1>`` element.
        UnicodeDecodeError: If the page is not valid GBK.
    """
    page = bs(urlopen(url).decode('gbk'), 'lxml')
    heading = page.h1
    title = heading.string
    if title is None:
        # .string is None when the heading holds more than a single string
        # (e.g. nested tags); str(None) would yield the literal "None".
        title = heading.get_text()
    return str(title)
def xia(url):
    """Download every chapter of a book and append it to ``<title>.txt``.

    The output file is named after ``name(url)`` and is created in the
    current working directory. The number of chapters is printed first,
    then each chapter title as soon as that chapter has been written.

    Args:
        url: Address of the book's index page.

    Raises:
        IndexError: If a chapter page has no ``<div id="htmlContent">``.
    """
    book_file = name(url) + '.txt'
    chapter_urls = list_url(url)
    print(len(chapter_urls))

    # Compiled once instead of on every iteration.
    title_pattern = re.compile(r'(readTitle">)(.*? )(<small>)')

    for chapter_url in chapter_urls:
        page = bs(urlopen(chapter_url).decode('gbk'), 'lxml')

        heading = page.h1
        match = title_pattern.search(str(heading))
        if match is not None:
            chapter_title = match.group(2)
        elif heading is not None:
            # Heading markup differs from the expected
            # 'readTitle">... <small>' shape: use its text rather than
            # failing on match.group().
            chapter_title = heading.get_text()
        else:
            chapter_title = ''

        body = page.find_all('div', id="htmlContent")[0].text
        # Remove the non-breaking spaces from the chapter text.
        body = body.replace('\xa0', '')

        # Opened in append mode once per chapter, so chapters already written
        # are closed on disk before the next download starts. The explicit
        # encoding avoids UnicodeEncodeError on platforms whose default
        # codec cannot represent Chinese text.
        with open(book_file, 'a', encoding='utf-8') as out:
            out.write(chapter_title)
            out.write(body)
        print(chapter_title)
# Index page of the novel to download.
url = 'http://www.biqukan.net/book/48302/'
# Just put the novel's link in `url` above.
# NOTE: this call sits at module level, so the download starts as soon as
# the file is executed or imported.
xia(url)
# Biqukan (笔趣看) Python 3 web-scraper example.
# Source-page footer: "latest recommended article published 2020-08-15 19:36:20".