In the interest of time, we will start by simply storing each chapter's content in a database, which requires sqlite.
Continuing from the previous post, modify the code on the existing base as follows:
# -*- coding: utf-8 -*-
import urllib.request
import bs4
import re
import sqlite3
import time
print('Connecting to the database...')
cx = sqlite3.connect('PaChong.db')
# Create the tables in this database
# Table for basic book information
cx.execute('''CREATE TABLE IF NOT EXISTS book_info(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    title varchar(128) not null,
    img varchar(512) null,
    auther varchar(64) null,
    type varchar(128) null,
    status varchar(64) null,
    num int null,
    updatatime datetime null,
    newchapter varchar(512) null,
    authsummery varchar(1024) null,
    summery varchar(1024) null,
    notipurl varchar(512) null);
''')
# Table for chapter content
cx.execute('''CREATE TABLE IF NOT EXISTS book_chapter(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    book_id int null,
    chapter_no int null,
    chapter_name varchar(128) null,
    chapter_url varchar(512) null,
    chapter_content text null);
''')
cx.commit()
print("Tables created successfully")
print("Database setup complete")
# Fetch the raw HTML of a page, sending a browser User-Agent so that
# simple anti-bot checks do not reject the request
def getHtml(url):
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers = {"User-Agent": user_agent}
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    html = response.read()
    return html
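# Note (editor's comment, not in the original post): getHtml returns raw
# bytes; BeautifulSoup below decodes them using the from_encoding hint,
# so no manual .decode() call is needed here.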
# Download a page and parse the whole document into a BeautifulSoup tree
def parse(url):
    html_doc = getHtml(url)
    sp = bs4.BeautifulSoup(html_doc, 'html.parser', from_encoding="utf-8")
    return sp
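# Usage sketch (editor's example; the URL is a placeholder, not a real
# target from the original post):
#   soup = parse('http://www.example.com/book/1/')
#   print(soup.title.string)   # prints the page's <title> text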
# Scrape a book's basic information
def get_book_baseinfo(url):
#