本文实例讲述了使用Python实现爬虫抓取小说的功能。分享给大家供大家参考,具体如下:
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from urllib import request
import re
import os,time
#访问url,返回html页面
def get_html(url):
    """Fetch ``url`` and return the raw response body.

    The request is sent with a browser-like ``User-Agent`` header.

    Args:
        url: Address of the page to download.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the
            request cannot be completed.
    """
    req = request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0')
    # Open the prepared Request rather than the bare URL; otherwise the
    # header set above is never transmitted.
    with request.urlopen(req) as response:
        # The context manager closes the connection once the body is read.
        return response.read()
#从列表页获取小说书名和链接
def get_books(url):
    """Return a ``{book name: link}`` dict for the books on one list page.

    The page at ``url`` is downloaded with ``get_html`` and parsed with the
    ``lxml`` parser. Every ``<div class="bbox">`` element contributes one
    entry taken from its ``h3 > a`` element: the anchor's ``.string`` is the
    key and its ``href`` attribute is the value.

    Args:
        url: Address of the list page.

    Returns:
        Dict mapping book name to book link. If two books share a name,
        the later one overwrites the earlier one.
    """
    html = get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    book_dict = {}
    for book in soup.find_all('div', attrs={'class': 'bbox'}):
        heading = book.h3
        link = heading.a if heading is not None else None
        if link is None:
            # A block without a title link has nothing to record; skip it
            # instead of failing with AttributeError on the whole page.
            continue
        book_dict[link.string] = link.get('href')
    return book_dict
#根据书名链接&