Create a DataFrame
import numpy as np
import pandas as pd

dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["test", "train", "test", "train"]),
                    'F': 'foo'})
Select columns
print(df.A)            # attribute access, returns a Series
print(df['A'])         # same column as a Series
print(df[['A']])       # single-column DataFrame (note the double brackets)
print(df[['A', 'B']])  # two-column DataFrame
Select rows
print(df.iloc[3])                         # fourth row by integer position, as a Series
print(df.iloc[3:5, 0:2])                  # rows 3-4, columns 0-1, by position (end exclusive)
print(df.loc['2013-01-01':'2013-01-02'])  # label-based slice; both endpoints inclusive
print(df.loc['2013-01-01'])               # single row by index label
Add a column
f_col = pd.Series(range(1,7), index=pd.date_range('20130101', periods=6))
df['F'] = f_col
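Assignment aligns on the index, not on position; a small illustration (the column name 'G' is just for demonstration): a Series covering only part of the index leaves NaN in the unmatched rows.

partial = pd.Series([1, 2], index=dates[:2])
df['G'] = partial  # rows '2013-01-03' onward get NaN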
pivot_table
df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 3,
                   'B': ['A', 'B', 'C'] * 4,
                   'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D': np.random.randn(12),
                   'E': np.random.randn(12)})
table = pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
print(table)
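By default pivot_table aggregates with the mean; a sketch of swapping in a different aggregation and adding row/column totals via the standard aggfunc and margins parameters:

table_sum = pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'],
                           aggfunc='sum', margins=True)
print(table_sum)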
Connect to Oracle
import cx_Oracle
import pandas as pd
def pd_query():
    conn = cx_Oracle.connect('username/password@ip:port/service_name')
    sql = '''select * from table_1 where a = :code_ind'''
    a = 'aaaa'
    sql_params = {'code_ind': a}  # bind variables guard against SQL injection
    result = pd.read_sql(sql, conn, params=sql_params)
    conn.close()
    print(result)
def cx_query():
    conn = cx_Oracle.connect('username/password@ip:port/service_name')
    sql = '''select * from table_1 where a = :code_ind'''
    a = 'aaaa'
    sql_params = {'code_ind': a}
    cursor = conn.cursor()
    cursor.prepare(sql)
    cursor.execute(None, sql_params)  # passing None reuses the prepared statement
    rows = cursor.fetchall()          # a SELECT needs a fetch, not a commit
    print(rows)
    cursor.close()
    conn.close()
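pandas' read_sql also accepts a SQLAlchemy connectable, which is its documented path for non-SQLite databases; a minimal sketch, assuming SQLAlchemy is installed and reusing the same illustrative credentials (the URL is SQLAlchemy's oracle+cx_oracle dialect, and text() wraps the query so the :code_ind bind works):

from sqlalchemy import create_engine, text

engine = create_engine('oracle+cx_oracle://username:password@ip:port/?service_name=service_name')
result = pd.read_sql(text('select * from table_1 where a = :code_ind'),
                     engine, params={'code_ind': 'aaaa'})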
Database connection pool
import cx_Oracle
from dbutils.pooled_db import PooledDB
db_conf = {'dsn': 'ip:port/service_name', 'user': 'userName', 'password': 'xxx', 'encoding': 'utf-8'}
class Pool(object):
    def __init__(self, config=None):
        if config is None:
            config = db_conf
        self._pool = PooledDB(creator=cx_Oracle, mincached=5, maxcached=200,
                              blocking=True, **config)

    def get_conn(self):
        return self._pool.connection()

DBPool = Pool(db_conf)
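Usage sketch (the table name is illustrative): borrow a connection, query, then close, which with PooledDB returns the connection to the pool rather than tearing it down.

import pandas as pd

conn = DBPool.get_conn()
df = pd.read_sql('select * from table_1', conn)
conn.close()  # hands the connection back to the pool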
Thread pools: two approaches
import pandas as pd
import threadpool
import multiprocessing
from threading import RLock
global_result_frame = pd.DataFrame(columns=['A', 'B'])
lock = RLock()

def callback_fun(request, result):
    global global_result_frame
    lock.acquire(blocking=True)
    # DataFrame.append was removed in pandas 2.x; concat is the replacement
    global_result_frame = pd.concat([global_result_frame, result], ignore_index=True)
    lock.release()

def work(code=None, name=None, begin_time=None):
    print(code)
    return pd.DataFrame({'A': [code], 'B': [name]})  # scalars must be wrapped in a list

def do_something():
    count = multiprocessing.cpu_count()
    arg_list = []
    # threadpool expects each item as an (args, kwargs) tuple
    arg_list.append(([], {'code': 'code', 'name': 'name', 'begin_time': '2021-09-09'}))
    pool = threadpool.ThreadPool(count)  # the class is ThreadPool, not ThreadPoolI
    requests = threadpool.makeRequests(work, arg_list, callback_fun)
    [pool.putRequest(req) for req in requests]
    pool.wait()
    print(global_result_frame)
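do_something is defined but never invoked above; a minimal entry point, assuming the snippet runs as a script:

if __name__ == '__main__':
    do_something()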
from concurrent.futures import ThreadPoolExecutor, wait
import multiprocessing

import pandas as pd

def task_submit(fun=None, args=None, thread_count=None):
    if len(args) < thread_count:
        thread_count = len(args)  # don't start more workers than tasks
    result_list = []
    with ThreadPoolExecutor(max_workers=thread_count) as t:
        all_task = [t.submit(fun, item) for item in args]
        result = wait(all_task)   # blocks until every future is done
        for i in result.done:
            result_list.append(i.result())
    return result_list

def work(param):
    code = param['code']
    name = param['name']
    return pd.DataFrame({'A': [code], 'B': [name]})

def main():
    args = [{'code': 'code_val', 'name': 'name_val'}]
    result = task_submit(work, args, multiprocessing.cpu_count())
    print(result)
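task_submit returns a list of single-row DataFrames, one per task; a sketch of stitching them into one frame with pd.concat:

frames = task_submit(work, args, multiprocessing.cpu_count())
combined = pd.concat(frames, ignore_index=True)  # one row per completed task
print(combined)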
Logging
import logging
import os.path
import sys
import time
class Logger(object):
    def __init__(self):
        logger = logging.getLogger()
        level = logging.DEBUG
        logger.setLevel(level)
        rq = time.strftime('%Y%m%d', time.localtime(time.time()))
        log_path = os.getcwd() + '/logs/'
        log_name = log_path + rq + '.log'
        os.makedirs(log_path, exist_ok=True)
        # FileHandler creates the file itself; no need to pre-open it
        fh = logging.FileHandler(log_name, encoding='utf-8')
        fh.setLevel(level)
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setLevel(level)
        formatter = logging.Formatter("%(asctime)s - [thread:%(thread)5d-%(threadName)24s]"
                                      " - %(filename)s[line:%(lineno)d-%(funcName)s]"
                                      " - %(levelname)s: %(message)s")
        fh.setFormatter(formatter)
        console_handler.setFormatter(formatter)
        logger.addHandler(fh)
        logger.addHandler(console_handler)
        self._logger = logger

    def log(self):
        return self._logger

logger_ = Logger().log()
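Usage sketch with the module-level logger_ above (messages are illustrative); each record goes to both the dated file and stdout:

logger_.info('job started')
logger_.debug('rows loaded: %d', 6)
logger_.error('something went wrong: %s', 'example')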
statsmodels
OLS
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
def y_equals_x_plus_1():
    # y = x + 1 exactly, so OLS should recover intercept 1 and slope 1
    df = pd.DataFrame({'y': np.arange(2, 21), 'x': np.arange(1, 20)})
    x = df.loc[:, 'x']
    y = df.loc[:, 'y']
    results = sm.OLS(y, sm.add_constant(x)).fit()
    df['predicted'] = results.predict()
    df['residuals'] = y - df.predicted  # observed minus fitted
    print(results.summary())
    print(results.params)
    fig = plt.figure()
    ax = fig.add_subplot()
    ax.scatter(x, y, c='b')
    ax.plot(x, df['predicted'], c='r')
    ax.scatter(x, df['residuals'], c='y')
    plt.show()
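Beyond summary(), individual fit statistics are plain attributes of the results object; a sketch of lines that could follow fit() inside the function above:

print(results.rsquared)         # 1.0 here: the data lie exactly on y = x + 1
print(results.params['const'])  # fitted intercept, ~1.0
print(results.params['x'])      # fitted slope, ~1.0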
Standard deviation (weighted)
import pandas as pd
import statsmodels.stats.weightstats as weightstats
def weight_api(weight=None, data_frame=None):
    weight_stats = weightstats.DescrStatsW(data_frame, weights=weight)
    return weight_stats.std
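A usage sketch with made-up numbers (both the values and the weights are illustrative):

import pandas as pd

data = pd.Series([1.0, 2.0, 3.0, 4.0])
weights = [1, 1, 2, 2]
print(weight_api(weight=weights, data_frame=data))  # weighted standard deviation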
Read HDF5
import pandas as pd
import h5py
import tables
def pd_read():
hdf = pd.read_hdf("dir/xxx.h5")
print(hdf)
def get_value(g):
    # recursively walk an h5py group, printing datasets and descending into subgroups
    for key in g:
        g0 = g[key]
        if isinstance(g0, h5py.Dataset):
            print(g0.name)
            print(g0[()])  # read the whole dataset into memory
        else:
            print(g0, g0.name)
            get_value(g0)
def h5py_read():
    with h5py.File('dir/xxx.h5', 'r') as file:  # context manager closes the file
        print(list(file.keys()))
        group = file.get('group_name')
        get_value(group)
def hdfstore_read():
    hdf_store = pd.HDFStore("dir/xxx.h5")
    print(hdf_store.keys())  # the stored object paths, e.g. '/group_name'
    hdf_store.close()
def pytable_test():
    # H5FD_CORE loads the whole file into memory
    file = tables.open_file('dir/xxx.h5', driver='H5FD_CORE')
    print(file)
    file.close()
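The readers above assume dir/xxx.h5 already exists; a hedged sketch that writes a compatible file with pandas (the path and key are illustrative, and to_hdf requires the tables package):

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3]})
df.to_hdf('dir/xxx.h5', key='group_name', mode='w')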