准备工作
安装:
# 数据库驱动
pip install pymysql
# 数据库连接池
pip install DBUtils
建表
CREATE TABLE `novel` (
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增主键',
`title` varchar(100) NOT NULL COMMENT '标题',
`content` text NOT NULL COMMENT '内容',
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=2353 DEFAULT CHARSET=utf8
脚本
# 写入数据库
def write_db(chapter, content):
sql = "INSERT INTO novel (title, content) VALUES(%(title)s, %(content)s);"
param = {"title": chapter, "content": content}
mysql.insert(sql, param)
book_db.py
# 导入requests库
import requests
# 导入文件操作库
import codecs
from bs4 import BeautifulSoup
import sys
import mysql_DBUtils
from mysql_DBUtils import MyPymysqlPool
import importlib
importlib.reload(sys)
# 给请求指定一个请求头来模拟chrome浏览器
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}
server = 'http://www.biquge.cm'
# 星辰变地址
book = 'http://www.biquge.cm/2/2042/'
# 定义DB
mysql = MyPymysqlPool("dbMysql")
# 获取章节内容
def get_contents(chapter):
req = requests.get(url=chapter)
html = req.content
html_doc = str(html, 'gbk')
bf = BeautifulSoup(html_doc, 'html.parser')
texts = bf.find_all('div', id="content")
# 获取div标签id属性content的内容 \xa0 是不间断空白符
content = texts[0].text.replace('\xa0' * 4, '\n')
return content
# 写入数据库
def write_db(chapter, content):
sql = "INSERT INTO novel (title, content) VALUES(%(title)s, %(content)s);"
param = {"title": chapter, "content": content}
mysql.insert(sql, param)
# 主方法
def main():
res = requests.get(book, headers=headers)
html = res.content
html_doc = str(html, 'gbk')
# 使用自带的html.parser解析
soup = BeautifulSoup(html_doc, 'html.parser')
# 获取所有的章节
a = soup.find('div', id='list').find_all('a')
print('总章节数: %d ' % len(a))
for each in a:
try:
chapter = server + each.get('href')
content = get_contents(chapter)
chapter = each.string
write_db(chapter, content)
except Exception as e:
print(e)
mysql.dispose()
if __name__ == '__main__':
main()
数据库配置文件:
[dbMysql]
host = localhost
port = 3306
user = root
password = 123456
db_name = book
mysql_DBUtils.py
#!/usr/bin/python3
# -*- coding:utf-8 -*-
import pymysql, os, configparser
from pymysql.cursors import DictCursor
from DBUtils.PooledDB import PooledDB
import chardet
import json
class Config(object):
"""
# Config().get_content("user_information")
配置文件里面的参数
[dbMysql]
host = 192.168.1.101
port = 3306
user = root
password = python123
"""
def __init__(self, config_filename="dbMysqlConfig.cnf"):
file_path = os.path.join(os.path.dirname(__file__), config_filename)
self.cf = configparser.ConfigParser()
self.cf.read(file_path)
def get_sections(self):
return self.cf.sections()
def get_options(self, section):
return self.cf.options(section)
def get_content(self, section):
result = {}
for option in self.get_options(section):
value = self.cf.get(section, option)
result[option] = int(value) if value.isdigit() else value
return result
class BasePymysqlPool(object):
def __init__(self, host, port, user, password, db_name):
self.db_host = host
self.db_port = int(port)
self.user = user
self.password = str(password)
self.db = db_name
self.conn = None
self.cursor = None
class MyPymysqlPool(BasePymysqlPool):
"""
MYSQL数据库对象,负责产生数据库连接 , 此类中的连接采用连接池实现
获取连接对象:conn = Mysql.getConn()
释放连接对象;conn.close()或del conn
"""
# 连接池对象
__pool = None
def __init__(self, conf_name=None):
self.conf = Config().get_content(conf_name)
super(MyPymysqlPool, self).__init__(**self.conf)
# 数据库构造函数,从连接池中取出连接,并生成操作游标
self._conn = self.__getConn()
self._cursor = self._conn.cursor()
def __getConn(self):
"""
@summary: 静态方法,从连接池中取出连接
@return MySQLdb.connection
"""
if MyPymysqlPool.__pool is None:
__pool = PooledDB(creator=pymysql,
mincached=1,
maxcached=20,
host=self.db_host,
port=self.db_port,
user=self.user,
passwd=self.password,
db=self.db,
use_unicode=False,
charset="utf8mb4",
cursorclass=DictCursor)
return __pool.connection()
def getAll(self, sql, param=None):
"""
@summary: 执行查询,并取出所有结果集
@param sql:查询SQL,如果有查询条件,请只指定条件列表,并将条件值使用参数[param]传递进来
@param param: 可选参数,条件列表值(元组/列表)
@return: result list(字典对象)/boolean 查询到的结果集
"""
if param is None:
count = self._cursor.execute(sql)
else:
count = self._cursor.execute(sql, param)
if count > 0:
result = self._cursor.fetchall()
else:
result = False
return result
def getOne(self, sql, param=None):
"""
@summary: 执行查询,并取出第一条
@param sql:查询SQL,如果有查询条件,请只指定条件列表,并将条件值使用参数[param]传递进来
@param param: 可选参数,条件列表值(元组/列表)
@return: result list/boolean 查询到的结果集
"""
if param is None:
count = self._cursor.execute(sql)
else:
count = self._cursor.execute(sql, param)
if count > 0:
result = self._cursor.fetchone()
else:
result = False
return result
def getMany(self, sql, num, param=None):
"""
@summary: 执行查询,并取出num条结果
@param sql:查询SQL,如果有查询条件,请只指定条件列表,并将条件值使用参数[param]传递进来
@param num:取得的结果条数
@param param: 可选参数,条件列表值(元组/列表)
@return: result list/boolean 查询到的结果集
"""
if param is None:
count = self._cursor.execute(sql)
else:
count = self._cursor.execute(sql, param)
if count > 0:
result = self._cursor.fetchmany(num)
else:
result = False
return result
def insertMany(self, sql, values):
"""
@summary: 向数据表插入多条记录
@param sql:要插入的SQL格式
@param values:要插入的记录数据tuple(tuple)/list[list]
@return: count 受影响的行数
"""
count = self._cursor.executemany(sql, values)
return count
def __query(self, sql, param=None):
if param is None:
count = self._cursor.execute(sql)
else:
count = self._cursor.execute(sql, param)
return count
def update(self, sql, param=None):
"""
@summary: 更新数据表记录
@param sql: SQL格式及条件,使用(%s,%s)
@param param: 要更新的 值 tuple/list
@return: count 受影响的行数
"""
return self.__query(sql, param)
def insert(self, sql, param=None):
"""
@summary: 更新数据表记录
@param sql: SQL格式及条件,使用(%s,%s)
@param param: 要更新的 值 tuple/list
@return: count 受影响的行数
"""
return self.__query(sql, param)
def delete(self, sql, param=None):
"""
@summary: 删除数据表记录
@param sql: SQL格式及条件,使用(%s,%s)
@param param: 要删除的条件 值 tuple/list
@return: count 受影响的行数
"""
return self.__query(sql, param)
def begin(self):
"""
@summary: 开启事务
"""
self._conn.autocommit(0)
def end(self, option='commit'):
"""
@summary: 结束事务
"""
if option == 'commit':
self._conn.commit()
else:
self._conn.rollback()
def dispose(self, isEnd=1):
"""
@summary: 释放连接池资源
"""
if isEnd == 1:
self.end('commit')
else:
self.end('rollback')
self._cursor.close()
self._conn.close()
if __name__ == '__main__':
mysql = MyPymysqlPool("dbMysql")
sqlAll = "select * from novel limit 1"
result = mysql.getOne(sqlAll)
print(result)
# print(result['title'])
# print(result['title'].decode('utf-8'))
# print(result.decode('utf-8'))
# print(chardet.detect(result['title']))
data = {
'no': 1,
'name': 'Runoob',
'url': 'http://www.runoob.com'
}
json_str = json.dumps(result)
print(json_str)
# 释放资源
mysql.dispose()