爬完数据目录和内容后,我们来爬取书籍的基本信息。
在上篇博客的基础上,爬取书籍信息并存入字典
# -*- coding: utf-8 -*-
import urllib.request
import bs4
import re
import sqlite3
def getHtml(url):
    """Fetch *url* over HTTP and return the raw response body as bytes.

    A desktop-browser User-Agent header is sent because many sites
    reject requests carrying urllib's default User-Agent.

    :param url: the URL to fetch
    :return: the undecoded response body (``bytes``)
    :raises urllib.error.URLError: on network or HTTP failure
    """
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers = {"User-Agent": user_agent}
    request = urllib.request.Request(url, headers=headers)
    # Context manager ensures the response/socket is closed even if
    # read() raises — the original version leaked the response object.
    with urllib.request.urlopen(request) as response:
        return response.read()
# 爬取整个网页
def parse(url):
    """Download *url* with :func:`getHtml` and return a BeautifulSoup tree.

    :param url: the page to fetch and parse
    :return: a ``bs4.BeautifulSoup`` document object
    """
    return bs4.BeautifulSoup(getHtml(url), 'html.parser', from_encoding="utf-8")
# 爬取书籍基本信息
def get_book_baseinfo(url):
# class = "info"信息获取
info = parse(url).find('div',class_ = 'info')
book_info = {}
if info:
book_info['title'] = ''
book_info['img'] = ''
# 标题
book_info['title'] = info.find('h2').string
# 图片链接
img = info.find('div',class_ = 'cover')
for im in img.children:
# 图片地址想要访问,显然需要拼接