Having scraped the table of contents and the chapter content, we now turn to the book's basic information.
Building on the previous post, we scrape the book's details and store them in a dict.
# -*- coding: utf-8 -*-
import urllib.request
import bs4
import re
import sqlite3
def getHtml(url):
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers = {"User-Agent": user_agent}
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    html = response.read()
    return html
# Fetch and parse the whole page
def parse(url):
    html_doc = getHtml(url)
    sp = bs4.BeautifulSoup(html_doc, 'html.parser', from_encoding="utf-8")
    return sp
# Scrape the book's basic information
def get_book_baseinfo(url):
    # everything we need lives in the div with class="info"
    info = parse(url).find('div', class_='info')
    book_info = {}
    if info:
        book_info['title'] = ''
        book_info['img'] = ''
        # title
        book_info['title'] = info.find('h2').string
        # cover image: the src is a relative path, so join it with the site root
        img = info.find('div', class_='cover')
        for im in img.children:
            book_info['img'] = 'http://www.biqukan.com' + im.attrs['src']
        # collect the raw basic-info strings
        ifo = info.find('div', class_='small')
        bkinfo = []
        for b in ifo:
            for v in b.children:
                t = v.string
                if t:
                    bkinfo.append(''.join(t))
        # re-attach the text that follows each ':' to its label
        spv = []
        cv = ''
        for v in bkinfo:
            if v.find(':') >= 0:
                if cv:
                    spv.append(cv)
                cv = v
            else:
                cv += v
        spv.append(cv)
        # turn the basic info into dict entries
        for element in spv:
            its = [v.strip() for v in element.split(':')]
            if len(its) != 2:
                continue
            nm = its[0].lower()  # normalize keys to lowercase
            vu = its[1]
            book_info[nm] = vu
        # this key collides with one scraped below, so rename it now
        book_info['auther'] = book_info.pop('作者')
        # synopsis (scraped the same way as the basic info)
        intro = info.find('div', class_='intro')
        bkurl = []
        for b in intro:
            t = b.string
            if t:
                bkurl.append(''.join(t))
        bkjj = []
        cvx = ''
        for w in bkurl:
            if w.find(':') >= 0:
                if cvx:
                    bkjj.append(cvx)
                cvx = w
            else:
                cvx += w
        bkjj.append(cvx)
        for ele in bkjj:
            itis = [n.strip() for n in ele.split(':')]
            if len(itis) != 2:
                continue
            summ = itis[0].lower()  # normalize keys to lowercase
            vux = itis[1]
            book_info[summ] = vux
        # crude but effective: rename the Chinese keys to English ones for database storage
        book_info['type'] = book_info.pop('分类')
        book_info['status'] = book_info.pop('状态')
        book_info['num'] = book_info.pop('字数')
        book_info['updatatime'] = book_info.pop('更新时间')
        book_info['newchapter'] = book_info.pop('最新章节')
        book_info['authsummery'] = book_info.pop('作者')
        book_info['summery'] = book_info.pop('简介')
        book_info['notipurl'] = book_info.pop('无弹窗推荐地址')
    return book_info
# Scrape the book's table of contents
def get_book_dir(url):
    books_dir = []
    name = parse(url).find('div', class_='listmain')
    if name:
        dd_items = name.find('dl')
        dt_num = 0
        for n in dd_items.children:
            ename = str(n.name).strip()
            if ename == 'dt':
                dt_num += 1
            if ename != 'dd':
                continue
            Catalog_info = {}
            # only keep <dd> items after the second <dt>, i.e. the full chapter list
            if dt_num == 2:
                durls = n.find_all('a')[0]
                Catalog_info['name'] = durls.get_text()
                Catalog_info['url'] = 'http://www.biqukan.com' + durls.get('href')
                books_dir.append(Catalog_info)
    return books_dir
# Scrape a chapter's content
def get_charpter_text(curl):
    # fetch and parse the chapter page, then pull the text out of the div with class="showtxt"
    text = parse(curl).find('div', class_='showtxt')
    if text:
        cont = text.get_text()
        cont = [str(cont).strip().replace('\r \xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0', '').replace('\u3000\u3000', '')]
        c = " ".join(cont)
        # keep only the text up to the first 'html', trimming the page's trailing link text
        ctext = re.findall(r'^.*?html', c)
        return ctext
    else:
        return ''
# Scrape the whole book
def get_book(burl):
    # table of contents
    book = get_book_dir(burl)
    if not book:
        return book
    # chapter content
    for d in book:
        curl = d['url']
        try:
            print('Fetching chapter [{}], content at [{}]'.format(d['name'], d['url']))
            ctext = get_charpter_text(curl)
            d['text'] = ctext
            print(d['text'])
            print()
        except Exception as err:
            d['text'] = 'get failed'
    return book
if __name__ == '__main__':
    # call get_book_baseinfo to see what it returns
    book = get_book_baseinfo('http://www.biqukan.com/1_1094/')
    print(book)
Sample output:
{'title': '一念永恒', 'img': 'http://www.biqukan.com/files/article/image/1/1094/1094s.jpg', 'auther': '耳根', 'type': '玄幻小说', 'status': '连载', 'num': '3689058', 'updatatime': '2018-02-09 18:20:00', 'newchapter': '第1314章 你的选择(终)', 'authsummery': '耳根所写的《一念永恒》无弹窗免费全文阅读为转载作品,章节由网友发布。', 'summery': '一念成沧海,一念化桑田。一念斩千魔,一念诛万仙。唯我念……永恒', 'notipurl': 'http://www.biqukan.com/1_1094/?_t_t_t=0.4355400702253367'}
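An aside on the long chain of pop() calls above: as the comment in the code admits, it is a crude way to turn the Chinese keys into English ones. Below is a minimal sketch of a mapping-table alternative; the KEY_MAP name and the rename_keys helper are my own invention, and the pairs simply collect the renames already used in this post.
# Chinese-to-English key renames gathered from get_book_baseinfo (hypothetical helper)
KEY_MAP = {
    '分类': 'type',
    '状态': 'status',
    '字数': 'num',
    '更新时间': 'updatatime',
    '最新章节': 'newchapter',
    '作者': 'authsummery',  # the first '作者' was already popped into 'auther' earlier
    '简介': 'summery',
    '无弹窗推荐地址': 'notipurl',
}
def rename_keys(book_info):
    # pop each Chinese key, if present, and re-insert it under its English name
    for zh, en in KEY_MAP.items():
        if zh in book_info:
            book_info[en] = book_info.pop(zh)
    return book_info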
In the next post we will store this information in a database.
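As a preview, here is a minimal sketch of how the book_info dict might be written into SQLite using the sqlite3 module imported above (it is not actually used yet in this post). The books table name, the all-TEXT schema, and the save_book_info helper are assumptions of mine, not necessarily what the follow-up post will use.
def save_book_info(book_info, db_path='books.db'):
    # column order matches the keys built up in get_book_baseinfo
    cols = ['title', 'img', 'auther', 'type', 'status', 'num',
            'updatatime', 'newchapter', 'authsummery', 'summery', 'notipurl']
    conn = sqlite3.connect(db_path)
    try:
        # 'books' is an assumed table name; TEXT columns keep the sketch simple
        conn.execute('CREATE TABLE IF NOT EXISTS books ({})'.format(
            ', '.join('{} TEXT'.format(c) for c in cols)))
        # parameterized INSERT so sqlite3 escapes the values for us
        conn.execute(
            'INSERT INTO books ({}) VALUES ({})'.format(
                ', '.join(cols), ', '.join('?' for _ in cols)),
            [str(book_info.get(c, '')) for c in cols])
        conn.commit()
    finally:
        conn.close()
Calling save_book_info(book) at the end of __main__ would then persist the dict shown in the sample output above.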
Writing these posts isn't easy; if you enjoyed it, please give Miaomiao a little tip~ (pleading eyes)