前置基础知识
封装成类(Python3版本)
from pymysql import Connection
from my_utils import Timer # https://blog.csdn.net/Yellow_python/article/details/80723272
class Mysql(Timer):
def __init__(self, **kwargs):
super().__init__()
self.database = kwargs.pop('database')
self.db = Connection(
user=kwargs.pop('user', 'root'),
password=kwargs.pop('password'),
host=kwargs.pop('host', 'localhost'),
database=self.database,
port=kwargs.pop('port', 3306),
charset=kwargs.pop('charset', 'UTF8'),
)
self.cursor = self.db.cursor()
def __del__(self):
self.cursor.close()
self.db.close()
def commit(self, sql):
self.debug(sql)
try:
self.cursor.execute(sql)
self.db.commit()
except Exception as e:
print(e)
def fetchall(self, query):
self.cursor.execute(query)
return self.cursor.fetchall() # 有数据:tuple of tuple;无数据:()
def fetchone(self, query, n=9999999):
print(query)
self.cursor.execute(query)
for _ in range(n):
one = self.cursor.fetchone() # tuple
if one:
yield one
def fetchone_dt(self, query, n=9999999):
print(query)
rowcount = self.cursor.execute(query)
columns = [i[0] for i in self.cursor.description]
length = len(columns)
for _ in range(min(n, rowcount)):
one = self.cursor.fetchone() # tuple
yield {columns[i]: one[i] for i in range(length)}
def insert(self, dt, tb):
ls = [(k, v) for k, v in dt.items() if v is not None]
sql = 'INSERT %s (' % tb + ','.join(i[0] for i in ls) + \
') VALUES (' + ','.join('%r' % i[1] for i in ls) + ')'
self.commit(sql)
def update(self, dt, dt_condition, tb):
sql = 'UPDATE %s SET ' % tb + ','.join('%s=%r' % (k, v) for k, v in dt.items()) \
+ ' WHERE ' + ' AND '.join('%s=%r' % (k, v) for k, v in dt_condition.items())
self.commit(sql)
def insert_or_update(self, dt, dt_condition, tb):
"""有则更新,无则插入"""
query = 'SELECT * FROM ' + tb + ' WHERE ' + \
' AND '.join('%s=%r' % (k, v) for k, v in dt_condition.items())
if self.fetchall(query) == ():
dt.update(dt_condition)
self.insert(dt, tb)
else:
self.update(dt, dt_condition, tb)
def replace(self, dt, tb):
"""
若存在相同数据,会先删除,再重新插入
缺点:
1、需要考虑全部数据列,不然会丢失部分列
2、索引的原因,导致插入效率较低
"""
ls = [(k, v) for k, v in dt.items() if v is not None]
sql = 'REPLACE INTO %s (' % tb + ','.join(i[0] for i in ls) + \
') VALUES (' + ','.join('%r' % i[1] for i in ls) + ')'
self.commit(sql)
def truncate(self, tb):
sql = 'TRUNCATE TABLE %s' % tb
self.commit(sql)
生成Excel模块
import pandas as pd # conda install pandas openpyxl
class Excel(Mysql):
def postfix(self, prefix):
"""导出Excel文件后缀名"""
return prefix.replace('.xlsx', '') + '_{}.xlsx'.format(self.today)
def db2sheet(self, query, prefix=''):
"""
保存为单sheet的Excel文件
:param query: 数据库查询语句
:param prefix: 文件名前缀
"""
df = pd.read_sql_query(query, self.db)
if prefix:
df.to_excel(self.postfix(prefix), index=False)
return df
def db2sheets(self, queries, prefix):
"""
保存为多sheets的Excel文件
:param queries: 字典{sheet名:sql}
:param prefix: 文件名前缀
"""
writer = pd.ExcelWriter(self.postfix(prefix))
for sheet_name, query in queries.items():
df = pd.read_sql_query(query, self.db)
# if 'url' in df.columns:df['url'] = "'" + df['url']
df.to_excel(writer, sheet_name=sheet_name, index=False)
writer.save()
def tb2sheet(self, tb):
self.db2sheet('SELECT * FROM ' + tb, tb)
简单统计模块
class Analysis(Mysql):
def show_tables(self):
"""所有表的名字、数据量、表注释,数据量更新有延时"""
for i in self.fetchone("""SELECT TABLE_NAME,TABLE_ROWS,TABLE_COMMENT
FROM information_schema.TABLES WHERE TABLE_SCHEMA='%s' """ % self.database):
print(*i, sep=' ')
def show_create_table(self, tb=''):
"""查看表注释"""
if tb:
tb = tb.upper()
self.yellow('表注释')
sql = "SELECT TABLE_COMMENT FROM information_schema.TABLES WHERE table_name='%s'" % tb
print(self.fetchall(sql)[0][0])
self.yellow('字段注释')
sql = "SELECT COLUMN_NAME,COLUMN_COMMENT FROM information_schema.COLUMNS WHERE TABLE_NAME='%s'" % tb
for i in self.fetchall(sql):print(i)
self.yellow('建表语句')
sql = "SHOW CREATE TABLE " + tb
print(self.fetchall(sql)[0][1])
else:
sql = "SELECT TABLE_NAME,COLUMN_NAME,COLUMN_COMMENT FROM information_schema.COLUMNS"
for i in self.fetchall(sql):print(i)
def count(self, tb):
return self.fetchall('SELECT COUNT(*) FROM ' + tb)[0][0]
def proportion_not_null(self, table, field=None):
"""非空占比"""
if field:
sql = '''
SELECT
t1.f1 amount,
t1.f1/t2.f1 proportion
FROM
(SELECT COUNT(*)f1 FROM {table} WHERE "{field}" IS NOT NULL)t1,
(SELECT COUNT(*)f1 FROM {table})t2
'''.format(table=table, field=field.upper())
result = self.fetchall(sql)[0]
print(field.rjust(15, ' '), self.hot(round(result[1]*100, 2), 50)+'%', result[0], sep=' '*5)
return [field, *result]
else:
sql = "SELECT COLUMN_NAME FROM information_schema.COLUMNS WHERE table_name='%s'" % table
return [self.proportion_not_null(table, i)for i, in self.fetchall(sql)]
def proportion_group_by(self, table, field, prints=''):
"""单字段各项占比"""
sql = '''
SELECT
t1.f2 fullname,
t1.f1 amount,
t1.f1/t2.f1 proportion
FROM
(SELECT COUNT(*)f1,"{field}" f2 FROM {table} GROUP BY "{field}")t1,
(SELECT COUNT(*)f1 FROM {table})t2
ORDER BY amount DESC
'''.format(table=table, field=field.upper())
result = self.fetchall(sql)
for i, j, k in prints and result:
print('{}'.format(i).rjust(15, ' '), self.hot(round(k*100, 2), 50)+'%', j, sep=' '*5)
return result
文本挖掘模块
class TM(Mysql): # Text Mining
def highlight_print(self, field, table, keyword, n=500):
"""查询+高亮"""
sql = "SELECT %s FROM %s WHERE INSTR(%s,'%s')>0" % (field, table, field, keyword)
for text, in self.fetchone(sql, n):
print(self.highlight(text, keyword))
def highlight_cut(self, field, table, keyword, n=500, length=50):
"""查询+分词+高亮"""
from jieba import dt
sql = "SELECT %s FROM %s WHERE INSTR(%s,'%s')>0" % (field, table, field, keyword)
dt.add_word(keyword)
for text, in self.fetchone(sql, n):
for word, head, tail in dt.tokenize(text):
if word == keyword:
highlight_word = self.dark_red(keyword)
print(text[max(head-length, 0):head] + highlight_word + text[tail:tail+length])
def group_by_count(self, field, table):
"""查数量,降序"""
sql = 'SELECT %s,count(*) as cnt FROM %s GROUP BY %s ORDER BY cnt DESC' % (field, table, field)
return self.fetchall(sql)
封装成类(Python2版本)
Python2的优势是免安装
脚本执行所在的服务器上安装了MySQL,就可执行mysql
+-e
,例如:
mysql -u"用户" -p"密码" -D"库名" -e"INSERT t1(f1,f2)VALUES('a',3)"
# coding=utf-8
from subprocess import check_output
def evaluate(cmd):
"""执行Linux命令并获取返回值,Python2返回值类型为str"""
print cmd
return check_output(cmd, shell=True).strip()
class Mysql:
def __init__(self, **kwargs):
host = kwargs.pop('host', 'localhost')
user = kwargs.pop('user', 'root')
password = kwargs.pop('password')
database = kwargs.pop('database', None)
self.cmd = "mysql -h'{}' -u'{}' -p'{}' ".format(host, user, password)
if database:
self.cmd += '-D"{}" '.format(database)
def e(self, sql):
return evaluate(self.cmd + "-e'{}'".format(sql.replace("'", '"')))
def fetch_dt(self, sql):
lines = self.e(sql).split('\n')
if len(lines) > 1:
columns = lines[0].split('\t')
col_num = len(columns)
for line in lines[1:]:
cells = line.split('\t')
yield {columns[i]: cells[i] for i in range(col_num)}
def insert(self, dt, tb):
ls = [(k, v) for k, v in dt.items() if v is not None]
sql = 'INSERT %s (' % tb + ','.join(i[0] for i in ls) + \
') VALUES (' + ','.join('%r' % i[1] for i in ls) + ')'
print sql
self.e(sql)
def update(self, dt, dt_condition, tb):
sql = 'UPDATE %s SET ' % tb + ','.join('%s=%r' % (k, v) for k, v in dt.items()) \
+ ' WHERE ' + ' AND '.join('%s=%r' % (k, v) for k, v in dt_condition.items())
self.e(sql)
def replace(self, dt, tb):
"""
若存在相同数据,会先删除,再重新插入
缺点:
1、需要考虑全部数据列,不然会丢失部分列
2、索引的原因,导致插入效率较低
"""
ls = [(k, v) for k, v in dt.items() if v is not None]
sql = 'REPLACE INTO %s (' % tb + ','.join(i[0] for i in ls) + \
') VALUES (' + ','.join('%r' % i[1] for i in ls) + ')'
self.e(sql)
if __name__ == '__main__':
PWD = '123456'
Mysql(password=PWD).e('CREATE DATABASE b1;')
_m = Mysql(password=PWD, database='b1')
_m.e('''
CREATE TABLE t1(
a INT PRIMARY KEY,
b VARCHAR(255),
c DATETIME DEFAULT NOW())
''')
_m.insert({'a': 234, 'b': 'ABC', 'c': evaluate('date +"%Y-%m-%d %H:%M:%S"')}, 't1')
_m.insert({'a': 222, 'b': 'EEE'}, 't1')
_m.update({'b': 'FFF'}, {'a': 234}, 't1')
for _d in _m.fetch_dt('SELECT * FROM t1'):
print _d