Create a DataFrame
import numpy as np
import pandas as pd

dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["test", "train", "test", "train"]),
                    'F': 'foo'})
Select columns
print(df.A)            # attribute access, returns a Series
print(df['A'])         # same column as a Series
print(df[['A']])       # single-column DataFrame (note the double brackets)
print(df[['A', 'B']])  # two-column DataFrame
Select rows
print(df.iloc[3])                         # fourth row by integer position, as a Series
print(df.iloc[3:5, 0:2])                  # rows 3-4, columns 0-1, by position (end exclusive)
print(df.loc['2013-01-01':'2013-01-02'])  # label-based slice; both endpoints inclusive
print(df.loc['2013-01-01'])               # single row by index label
Add a column
f_col = pd.Series(range(1,7), index=pd.date_range('20130101', periods=6))
df['F'] = f_col
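Assignment aligns on the index, not on position; a small illustration (the column name 'G' is just for demonstration): a Series covering only part of the index leaves NaN in the unmatched rows.

partial = pd.Series([1, 2], index=dates[:2])
df['G'] = partial  # rows '2013-01-03' onward get NaN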
pivot_table
df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 3,
                   'B': ['A', 'B', 'C'] * 4,
                   'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D': np.random.randn(12),
                   'E': np.random.randn(12)})
table = pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
print(table)
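By default pivot_table aggregates with the mean; a sketch of swapping in a different aggregation and adding row/column totals via the standard aggfunc and margins parameters:

table_sum = pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'],
                           aggfunc='sum', margins=True)
print(table_sum)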
Connect to Oracle
import cx_Oracle
import pandas as pd
def pd_query():
    conn = cx_Oracle.connect('username/password@ip:port/service_name')
    sql = '''select * from table_1 where a = :code_ind'''
    a = 'aaaa'
    sql_params = {'code_ind': a}  # bind variables guard against SQL injection
    result = pd.read_sql(sql, conn, params=sql_params)
    conn.close()
    print(result)
def cx_query():
    conn = cx_Oracle.connect('username/password@ip:port/service_name')
    sql = '''select * from table_1 where a = :code_ind'''
    a = 'aaaa'
    sql_params = {'code_ind': a}
    cursor = conn.cursor()
    cursor.prepare(sql)
    cursor.execute(None, sql_params)  # passing None reuses the prepared statement
    rows = cursor.fetchall()          # a SELECT needs a fetch, not a commit
    print(rows)
    cursor.close()
    conn.close()
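pandas' read_sql also accepts a SQLAlchemy connectable, which is its documented path for non-SQLite databases; a minimal sketch, assuming SQLAlchemy is installed and reusing the same illustrative credentials (the URL is SQLAlchemy's oracle+cx_oracle dialect, and text() wraps the query so the :code_ind bind works):

from sqlalchemy import create_engine, text

engine = create_engine('oracle+cx_oracle://username:password@ip:port/?service_name=service_name')
result = pd.read_sql(text('select * from table_1 where a = :code_ind'),
                     engine, params={'code_ind': 'aaaa'})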
Database connection pool
import cx_Oracle
from dbutils.pooled_db import PooledDB
db_conf = {'dsn': 'ip:port/service_name', 'user': 'userName', 'password': 'xxx', 'encoding': 'utf-8'}
class Pool(object):
    def __init__(self, config=None):
        if config is None:
            config = db_conf
        self._pool = PooledDB(creator=cx_Oracle, mincached=5, maxcached=200,
                              blocking=True, **config)

    def get_conn(self):
        return self._pool.connection()

DBPool = Pool(db_conf)
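Usage sketch (the table name is illustrative): borrow a connection, query, then close, which with PooledDB returns the connection to the pool rather than tearing it down.

import pandas as pd

conn = DBPool.get_conn()
df = pd.read_sql('select * from table_1', conn)
conn.close()  # hands the connection back to the pool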
Thread pools: two approaches
import pandas as pd
import threadpool
import multiprocessing
from threading import RLock
global_result_frame = pd.DataFrame(columns=['A', 'B'])
lock = RLock()

def callback_fun(request, result):
    global global_result_frame
    lock.acquire(blocking=True)
    # DataFrame.append was removed in pandas 2.x; concat is the replacement
    global_result_frame = pd.concat([global_result_frame, result], ignore_index=True)
    lock.release()

def work(code=None, name=None, begin_time=None):
    print(code)
    return pd.DataFrame({'A': [code], 'B': [name]})  # scalars must be wrapped in a list

def do_something():
    count = multiprocessing.cpu_count()
    arg_list = []
    # threadpool expects each item as an (args, kwargs) tuple
    arg_list.append(([], {'code': 'code', 'name': 'name', 'begin_time': '2021-09-09'}))
    pool = threadpool.ThreadPool(count)  # the class is ThreadPool, not ThreadPoolI
    requests = threadpool.makeRequests(work, arg_list, callback_fun)
    [pool.putRequest(req) for req in requests]
    pool.wait()
    print(global_result_frame)
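do_something is defined but never invoked above; a minimal entry point, assuming the snippet runs as a script:

if __name__ == '__main__':
    do_something()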
from concurrent.futures import ThreadPoolExecutor, wait
import multiprocessing

import pandas as pd

def task_submit(fun=None, args=None, thread_count=None):
    if len(args) < thread_count:
        thread_count = len(args)  # don't start more workers than tasks
    result_list = []
    with ThreadPoolExecutor(max_workers=thread_count) as t:
        all_task = [t.submit(fun, item) for item in args]
        result = wait(all_task)   # blocks until every future is done
        for i in result.done:
            result_list.append(i.result())
    return result_list

def work(param):
    code = param['code']
    name = param['name']
    return pd.DataFrame({'A': [code], 'B': [name]})

def main():
    args = [{'code': 'code_val', 'name': 'name_val'}]
    result = task_submit(work, args, multiprocessing.cpu_count())
    print(result)
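task_submit returns a list of single-row DataFrames, one per task; a sketch of stitching them into one frame with pd.concat:

frames = task_submit(work, args, multiprocessing.cpu_count())
combined = pd.concat(frames, ignore_index=True)  # one row per completed task
print(combined)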
Logging
import logging
import os.path
import sys
import time
class Logger(object):
    def __init__(self):
        logger = logging.getLogger()
        level = logging.DEBUG
        logger.setLevel(level)
        rq = time.strftime('%Y%m%d', time.localtime(time.time()))
        log_path = os.getcwd() + '/logs/'
        log_name = log_path + rq + '.log'
        os.makedirs(log_path, exist_ok=True)
        # FileHandler creates the file itself; no need to pre-open it
        fh = logging.FileHandler(log_name, encoding='utf-8')
        fh.setLevel(level)
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setLevel(level)
        formatter = logging.Formatter("%(asctime)s - [thread:%(thread)5d-%(threadName)24s]"
                                      " - %(filename)s[line:%(lineno)d-%(funcName)s]"
                                      " - %(levelname)s: %(message)s")
        fh.setFormatter(formatter)
        console_handler.setFormatter(formatter)
        logger.addHandler(fh)
        logger.addHandler(console_handler)
        self._logger = logger

    def log(self):
        return self._logger

logger_ = Logger().log()
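Usage sketch with the module-level logger_ above (messages are illustrative); each record goes to both the dated file and stdout:

logger_.info('job started')
logger_.debug('rows loaded: %d', 6)
logger_.error('something went wrong: %s', 'example')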
statsmodels
OLS
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
def y_equals_x_plus_1():
    # y = x + 1 exactly, so OLS should recover intercept 1 and slope 1
    df = pd.DataFrame({'y': np.arange(2, 21), 'x': np.arange(1, 20)})
    x = df.loc[:, 'x']
    y = df.loc[:, 'y']
    results = sm.OLS(y, sm.add_constant(x)).fit()
    df['predicted'] = results.predict()
    df['residuals'] = y - df.predicted  # observed minus fitted
    print(results.summary())
    print(results.params)
    fig = plt.figure()
    ax = fig.add_subplot()
    ax.scatter(x, y, c='b')
    ax.plot(x, df['predicted'], c='r')
    ax.scatter(x, df['residuals'], c='y')
    plt.show()
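Beyond summary(), individual fit statistics are plain attributes of the results object; a sketch of lines that could follow fit() inside the function above:

print(results.rsquared)         # 1.0 here: the data lie exactly on y = x + 1
print(results.params['const'])  # fitted intercept, ~1.0
print(results.params['x'])      # fitted slope, ~1.0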
Standard deviation (weighted)
import pandas as pd
import statsmodels.stats.weightstats as weightstats
def weight_api(weight=None, data_frame=None):
    weight_stats = weightstats.DescrStatsW(data_frame, weights=weight)
    return weight_stats.std
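A usage sketch with made-up numbers (both the values and the weights are illustrative):

import pandas as pd

data = pd.Series([1.0, 2.0, 3.0, 4.0])
weights = [1, 1, 2, 2]
print(weight_api(weight=weights, data_frame=data))  # weighted standard deviation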
Read HDF5
import pandas as pd
import h5py
import tables
def pd_read():
hdf = pd.read_hdf("dir/xxx.h5")
print(hdf)
def get_value(g):
    # recursively walk an h5py group, printing datasets and descending into subgroups
    for key in g:
        g0 = g[key]
        if isinstance(g0, h5py.Dataset):
            print(g0.name)
            print(g0[()])  # read the whole dataset into memory
        else:
            print(g0, g0.name)
            get_value(g0)
def h5py_read():
    with h5py.File('dir/xxx.h5', 'r') as file:  # context manager closes the file
        print(list(file.keys()))
        group = file.get('group_name')
        get_value(group)
def hdfstore_read():
    hdf_store = pd.HDFStore("dir/xxx.h5")
    print(hdf_store.keys())  # the stored object paths, e.g. '/group_name'
    hdf_store.close()
def pytable_test():
    # H5FD_CORE loads the whole file into memory
    file = tables.open_file('dir/xxx.h5', driver='H5FD_CORE')
    print(file)
    file.close()
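The readers above assume dir/xxx.h5 already exists; a hedged sketch that writes a compatible file with pandas (the path and key are illustrative, and to_hdf requires the tables package):

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3]})
df.to_hdf('dir/xxx.h5', key='group_name', mode='w')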