import re
import urllib
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup as bs
class Spider(object):
    """Crawl a novel's chapter index and cache every chapter to a text file.

    Attributes:
        base_url: Site root that relative links are appended to.
    """

    # Firefox User-Agent string sent with every request.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:69.0) "
                      "Gecko/20100101 Firefox/69.0",
    }

    def __init__(self, base_url='https://www.biquge.com.cn'):
        """Create a spider rooted at ``base_url``.

        Args:
            base_url: Prefix prepended to relative URLs given to
                :meth:`load_page`.
        """
        self.base_url = base_url

    def load_page(self, url, timeout=30):
        """Fetch a page and return its body decoded as UTF-8.

        Args:
            url: Path appended to ``base_url`` (for example
                ``'/book/23488/'``), or an absolute URL, which is used
                unchanged.
            timeout: Seconds to wait on blocking network operations.

        Returns:
            The response body as ``str``.

        Raises:
            urllib.error.URLError: If the request cannot be completed.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        # A link that already carries a scheme must not be glued onto
        # base_url; doing so would yield an invalid address.
        if urllib.parse.urlsplit(url).scheme:
            target = url
        else:
            target = self.base_url + url
        request = urllib.request.Request(target, headers=self.HEADERS)
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            return response.read().decode('utf-8')

    @staticmethod
    def _safe_filename(title):
        """Return ``title`` with path separators, control characters and
        other characters commonly rejected in file names replaced by ``_``."""
        return re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title)

    def parse_page(self, html):
        """Download every chapter linked from an index page and save it.

        Every ``<a>`` inside a ``<dd>`` is treated as a chapter link. The
        linked page is fetched, the text of its ``<div id="content">`` is
        extracted, and the UTF-8 encoded ``str()`` of a one-entry
        ``{title: text}`` dict is appended to ``<title>.txt`` in the
        current working directory.

        Links without an ``href`` and pages without a content ``<div>`` are
        reported and skipped instead of aborting the whole crawl.

        Args:
            html: Markup of the chapter index page.
        """
        index = bs(html, 'lxml')
        for link in index.select('dd a'):
            title = link.text
            href = link.get('href')
            if not href:
                print('缺少链接,已跳过:' + title)
                continue

            chapter = bs(self.load_page(href), 'lxml')
            body = chapter.find('div', attrs={'id': "content"})
            if body is None:
                print('未找到正文,已跳过:' + title)
                continue

            print('正在加载:' + title)
            # NOTE: the payload is the repr of a one-entry dict and the file
            # is opened in append mode; both are kept so the on-disk format
            # stays the same as before.
            item = {title: body.text}
            with open(self._safe_filename(title) + '.txt', 'ab') as file:
                print('正在缓存:' + title)
                file.write(str(item).encode())
if __name__ == '__main__':
    # Script entry point: download the index page at '/book/23488/' and hand
    # its markup to the spider, which fetches and caches the linked chapters.
    crawler = Spider()
    crawler.parse_page(crawler.load_page('/book/23488/'))
# python之爬取小说 (Crawling a novel with Python)
# 最新推荐文章于 2024-08-07 09:00:00 发布 (Latest recommended article published 2024-08-07 09:00:00)