4、Python数据库操作
1Python 操作数据库—— MySQL篇
1-1 数据库种类概述
数据库分类:
1) SQL NoSQL
2) 单机分布式
3) 文件型内存型
4) 批处理交互式
数据库产品:
1) SQL(结构化查询语言)– MySQLPostgreSQL SQLServer oracle SQLite Access
2) NoSQL(非关系型数据库) – mongoDB
3) 单机(数据库运行在一台数据库上)- MySQL PostgreSQL
4) 分布式(数据库运行在服务器集群上)– HIVE IMPALA
5) 文件型(数据放在硬盘上)– MySQL mongoDB
6) 内存型(数据放在内存里)– redis memcached
7) 批处理(将SQL分成mapreduce任务)- HIVE
8) 交互式(分级查询之后汇总后)– IMPALA HBASE DynamoDB
AWS介绍 – amazon webservice
http://aws.amazon.com/cn/
1-2 MySQL概述及基本使用
AWS启动MySQL实例
SQL UI工具和基本语法 - workbench
1-3 MySQL Python客户端的使用
1)官方客户端mysql-connector使用
Windows
安装:mysql-connector-python-2.1.3-py2.7-win32.msi
Linux
安装:dpkg -i mysql-connector-python-cext_2.1.3-1ubuntu14.04_i386.deb
代码:
#coding=utf-8
from __future__ import print_function #导入Python3的打印函数
sql = ('SELECT * from ipdata limit 5') #数据查找
# mysql-connector
print('mysql-connector'.center(50, '='))
from mysql import connector
cnx = connector.Connect(host="127.0.0.1", user="yourusername",password="yourpassword", database="pythontest", charset="utf8")
# cnx.autocommit = True
db0 = cnx.cursor()
db0.execute(sql)
for row in db0:
print(*row) # print row[0], row[1], row[2], row[3]
2)第三方客户端MySQLdb使用
安装:pip install mysql-python
代码:
# MySQLdb
print('MySQLdb'.center(50, '='))
import MySQLdb
def connect_mysql(db_host='127.0.0.1',user='yourusername',passwd='yourpassword',db='pythontest', charset='utf8'):
conn = MySQLdb.connect(host=db_host, user=user, passwd=passwd, db=db, charset=charset)
conn.autocommit(True)
return conn.cursor()
db1 = connect_mysql()
db1.execute(sql)
for row in db1:
print(*row)
3)MySQLdb的二次封装torndb使用
安装:
pip install torndb
pip install simplejson
代码:
# torndb1
print('torndb1'.center(50, '='))
import torndb
import simplejson as json
db2 = torndb.Connection(
host='127.0.0.1',
database='pythontest',
user='yourusername',
password='yourpassword',
charset="utf8")
rows = db2.query(sql)
for row in rows:
print(json.dumps(row, ensure_ascii=False))
# torndb3
print('torndb2'.center(50, '='))
row = db2.get('SELECT * from ipdata limit 1') # 返回一行数据时,使用get方法
print(json.dumps(row, ensure_ascii=False))
4)使用这三种客户端连接方式对数据进行插入操作
代码:
# ! /usr/bin/python
# -*- coding: utf-8 -*-
sql = 'INSERT INTO `ipdata` (`startip`,`endip`,`country`,`local`) VALUES(18684928,18684928,"内蒙古赤峰市巴林左旗","联通林东镇新城区BRAS数据机房")'
sql_tmp = 'INSERT INTO `ipdata`(`startip`,`endip`,`country`,`local`) VALUES (%s, %s, %s, %s)'
values = [(16890112,16891391,"泰国","曼谷"),(16891392,16891647,"泰国","如果硅农"), (16891648,16892159,"泰国","加拉信府")]
# mysql-connector
print('mysql-connector'.center(50, '='))
from mysql import connector
cnx = connector.Connect(host="127.0.0.1", user="yourusername",password="yourpassword", database="pythontest", charset="utf8")
cnx.autocommit = True
db0 = cnx.cursor()
print db0.execute(sql)
print db0.executemany(sql_tmp, values)
# MySQLdb
print('MySQLdb'.center(50, '='))
import MySQLdb
def connect_mysql(db_host="127.0.0.1", user="yourusername",passwd="yourpassword",db="pythontest", charset="utf8"):
conn = MySQLdb.connect(host=db_host, user=user, passwd=passwd, db=db, charset=charset)
conn.autocommit(True)
return conn.cursor()
db1 = connect_mysql()
print db1.execute(sql), db1.lastrowid
print db1.executemany(sql_tmp, values),db1.lastrowid
# torndb
print('torndb1'.center(50, '='))
import torndb
db2 = torndb.Connection(
host='127.0.0.1',
database='pythontest',
user='yourusername',
password='yourpassword',
charset="utf8")
print db2.insert(sql)
print db2.insertmany(sql_tmp, values)
2 Python 操作数据库-SQLAlchemy篇
2-1 ORM 概述及 SQLAlchemy 构架解析
ORM (Object Relatonal Mapping) – 对象关系映射。实质是将关系数据库中的业务数据用对象的形式表示出来,并通过面向对象的方式将这些对象组织起来,最终在应用程序中创建一个“虚拟对象数据库“。
ORM的优点:
隐藏数据库实现,让业务代码访问对象而不是数据库
良好的数据库操作接口,简单、学习成本低
动态数据表映射,在表结构甚至数据库改变时,减少代码修改量
可以方便的引入缓存功能
SQLAlchemy概述 – 稳定性,高性能性 –openstack yelp Dropbox reddit
SQLAlchemy构架解析:
2-2 SQLAlchemy ORM 详解
1) SQLAlchemy ORM建立连接以及创建表策略
安装:
pip install flask-sqlalchemy
验证:
python
from flask.ext.sqlalchemy import SQLAlchemy
代码:
// 连接以及创建表
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer,String
from sqlalchemy.orm import sessionmaker
print(sqlalchemy.__version__)
# examples of connectionhttp://docs.sqlalchemy.org/en/rel_1_0/core/engines.html#sqlalchemy.create_engine
engine = create_engine('sqlite:///foo.db', echo=True)
Base = declarative_base()
class User(Base):
__tablename__ = 'users'
id =Column(Integer, primary_key=True)
name = Column(String)
fullname = Column(String)
password = Column(String)
def __repr__(self):
return "<User(name='%s', fullname='%s', password='%s')>" % (
self.name, self.fullname, self.password)
Base.metadata.create_all(engine)
ed_user = User(name='ed', fullname='Ed Jones', password='edspassword')
print(ed_user)
2) SQLAlchemy ORM插入和查询数据
代码:
//插入 – 用事务才可以查看表的内容
ed_user = User(name='ed', fullname='Ed Jones', password='edspassword')
print(ed_user)
Session = sessionmaker(bind=engine)
session = Session()
session.add(ed_user)
our_user = session.query(User).filter_by(name='ed').first()
# SELECT * FROM users WHERE name="ed" LIMIT 1;
session.add_all([
User(name='wendy', fullname='Wendy Williams', password='foobar'),
User(name='mary', fullname='Mary Contrary', password='xxg527'),
User(name='fred', fullname='Fred Flinstone', password='blah')])
session.commit()
//查询
print(session.query(User).filter_by(name='ed').first())
print(session.query(User).all())
for row in session.query(User).order_by(User.id):
print(row)
for row in session.query(User).filter(User.name.in_(['ed', 'wendy', 'jack'])):
print(row)
for row in session.query(User).filter(~User.name.in_(['ed', 'wendy', 'jack'])):
print(row)
print(session.query(User).filter(User.name == 'ed').count())
from sqlalchemy import and_, or_
for row in session.query(User).filter(and_(User.name == 'ed', User.fullname == 'Ed Jones')):
print(row)
for row in session.query(User).filter(or_(User.name == 'ed', User.name == 'wendy')):
print(row)
3) 高级话题– 建立带有外键的表join
代码:
//外键的表
from sqlalchemy import ForeignKey
from sqlalchemy.orm import relationship,backref
class Address(Base):
__tablename__ = 'addresses'
id =Column(Integer, primary_key=True)
email_address = Column(String, nullable=False)
user_id = Column(Integer, ForeignKey('users.id'))
user = relationship("User", backref=backref('addresses', order_by=id))
def __repr__(self):
return "<Address(email_address='%s')>" % self.email_address
Base.metadata.create_all(engine)
// join
jack = User(name='jack', fullname='Jack Bean', password='gjffdd')
jack.addresses = [
Address(email_address='jack@google.com'),
Address(email_address='j25@yahoo.com')]
session.add(jack)
session.commit()
for u, a in session.query(User, Address).\
filter(User.id==Address.user_id).\
filter(Address.email_address=='jack@google.com').\
all():
print u, a
2-3 SQLAlchemy Core 详解
代码:
import sqlalchemy
print(sqlalchemy.__version__)
from sqlalchemy import create_engine
from sqlalchemy import Table, Column,Integer, String, MetaData, ForeignKey
engine = create_engine('sqlite:///foo.db', echo=True)
metadata = MetaData()
users = Table('users', metadata,
Column('id', Integer, primary_key=True),
Column('name', String),
Column('fullname', String),
)
addresses = Table('addresses', metadata,
Column('id', Integer, primary_key=True),
Column('user_id', None, ForeignKey('users.id')),
Column('email_address', String, nullable=False)
)
metadata.create_all(engine)
conn = engine.connect()
# conn.execute(users.insert(), [dict(name='jack', fullname='Jack Jones'),
# dict(name='wendy', fullname='Wendy Williams')])
conn.execute(addresses.insert(), [
{'user_id': 1, 'email_address' : 'jack@yahoo.com'},
{'user_id': 1, 'email_address' : 'jack@msn.com'},
{'user_id': 2, 'email_address' : 'www@www.org'},
{'user_id': 2, 'email_address' : 'wendy@aol.com'},
])
from sqlalchemy.sql import select
s = select([users])
result = conn.execute(s)
for row in result:
print(row)
s = select([users, addresses]).where(users.c.id == addresses.c.user_id)
for row in conn.execute(s):
print row
from sqlalchemy.sql import text
s = text(
"SELECTusers.fullname || ', ' || addresses.email_address AS title "
"FROM users, addresses"
"WHERE users.id =addresses.user_id "
"AND users.name BETWEEN :xAND :y "
"AND(addresses.email_address LIKE :e1 "
"ORaddresses.email_address LIKE :e2)")
print(conn.execute(s, x='m', y='z', e1='%@aol.com', e2='%@msn.com').fetchall())
3Python 操作数据库——查询优化实战
3-1 使用 Python 进行 ETL
目标:使用python处理IP数据,并将数据装载到MySQL中
%head ipdata.csv
%tail ipdata.csv
//使用异常来处理错误数据
try:
assert len(ls) == 5
except:
print(l)
print(*ls)
//Python的多重赋值
c1,c2,c3,c4,c5=ls[0],ip2int(ls[1]),ip2int(ls[2]),ls[3],ls[4]
//一次性写入大量数据时,会报MySQLserver has gone away的错误,需要对数据进行分批处理
db = get_mysql_conn()
db.execute("START TRANSACTION")#由于要多次查询,可以启动事务来减少插入时间
for i in range(len(nl_p_list)/1000 + 1):
tmp_nl_p_list = nl_p_list[i*1000: (i+1)*1000]
ret = db.insertmany('insert into ipdata (id,startip,endip,country,carrier) values(%s,%s,%s,%s,%s)',tmp_nl_p_list)
db.execute("COMMIT")#结束后提交所有的变动
//数据库的查询操作
//降序
SELECT * FROM pythontest.ipdata order by id desc;
SELECT * FROM pythontest.ipdata order by -id;
//升序
SELECT * FROM pythontest.ipdata order by id asc;
SELECT * FROM pythontest.ipdata order byid;
//统计总数
SELECT count(*)FROM test.ipdata;
3-2 数据库优化技巧
1)复杂的判断语句between … and – 对索引的支持不好
SELECT * FROM pythontest.ipdata where1780997668 between startip and endip;
2)为表的startip和endip加上索引,并修改sql判断语句,来提高查询效率
结果:提升的性能有限
3)使用另外一种查询方法,假设ip段是连续的无间隔的,找出第一条>=所要查询的已经降序的startip,取出第一条记录
SELECT * FROM pythontest.ipdata where1780997668 >= startip order by startip desc limit 1;
结果:提升一千多倍
核心思想:在进行数据库查询时,要尽可能的利用索引,不要使用复杂的判断语言,因为对索引的支持不好
代码:
with open("./ipdata.csv", 'r') as fr:
lines = fr.readlines()
nl_p_list = []
for l in lines:
ls = l.strip().split(',', 4)
c1, c2, c3, c4, c5 = ls[0], ip2int(ls[1]), ip2int(ls[2]), ls[3], ls[4]
nl = [c2, c3, c4, c5]
nl_p_list.append(nl)
import random
import time
ip_list = map(lambda x: x[1], random.sample(nl_p_list, 100))
db = get_mysql_conn()
t1 = time.time()
for ip in ip_list:
ret = db.get('SELECT * FROM ipdata where%s>=startip order by startip desc limit 1', ip)
t2 = time.time()
print(t2-t1)
for ip in ip_list:
ret = db.get('SELECT * FROM ipdata where%s between startip and endip',ip)
t3 = time.time()
print(t3-t2)
3-3 结合多种优化策略
思想:将一百条语句一起来查询
select t1.* from (select * frompythontest.ipdata where 1780997668 >= startip order by startip desc limit 1)t1
union all
select t2.* from (select * frompythontest.ipdata where 1033241837 >= startip order by startip desc limit 1)t2;
核心思想:在客户端上优化Python代码的,并使用union all,来提升性能
代码:
t1 = time.time()
sql_tmp = 'select {0}.* from(SELECT * FROM ipdata where %s>=startip order by startip desc limit 1) {0}'
sql_list = []
for i in range(len(ip_list)):
# print(sql_tmp.format('t' + str(i)) %ip_list[i])
sql_list.append(sql_tmp.format('t'+str(i)) %ip_list[i])
sql = ' union all '.join(sql_list)
t0 = time.time()
# for row in db.query(sql):
# print(row)
dict(zip(ip_list, db.query(sql))) #将查询值与结果合并
t1 = time.time()
print(t1-t0)