# Read data from the database and save it as CSV, then convert it to HDF5
# for faster downstream processing.
from sqlalchemy import create_engine
import cx_Oracle as cx
import pandas as pd
class Analyze:
    """Export Oracle query results to CSV and convert them to HDF5.

    The HDF5 copy exists so that later analysis passes can re-read the
    data much faster than going back to the database or re-parsing CSV.
    """

    def __init__(self):
        """Open a streaming connection to the Oracle database.

        Connection setup used to run in the class body, which connected to
        the database as a side effect of merely importing this module; it
        now runs per instance instead.
        """
        dsn = cx.makedsn("ip", "port", service_name="spvdb")
        self.engine = create_engine("oracle://username:password@%s" % dsn)
        # stream_results avoids materializing the full result set in memory.
        self.conn = self.engine.connect().execution_options(stream_results=True)

    def save_csv(self, query, filename):
        """Stream the result of *query* into *filename* as CSV.

        Reads in 50 000-row chunks and appends each chunk to the file.
        The header row is written only for the first chunk (previously it
        was repeated before every chunk because ``to_csv`` defaults to
        ``header=True`` even in append mode).

        NOTE(review): the file is opened in append mode, so re-running
        against an existing file accumulates rows — remove the old file
        first if a fresh export is wanted.
        """
        write_header = True
        for chunk in pd.read_sql(sql=query, con=self.conn, chunksize=50000):
            chunk.to_csv(filename, encoding="utf_8_sig", mode="a",
                         index=False, header=write_header)
            write_header = False
            print("保存文件中...")
        print("保存CSV文件完成")

    def save_hdf5(self, csv_table, hdf_table):
        """Convert the CSV file *csv_table* into the HDF5 file *hdf_table*.

        Appends 50 000-row chunks to a single ``"df"`` key via HDFStore.
        (Calling ``chunk.to_hdf(..., mode="a")`` per chunk did not work —
        only the final chunk survived — hence the explicit store.)
        Everything is cast to str so mixed-dtype chunks can share one
        appendable table.
        """
        store = pd.HDFStore(hdf_table, mode="w")
        try:
            for chunk in pd.read_csv(csv_table, chunksize=50000):
                # chunk is already a DataFrame; astype(str) keeps column
                # dtypes consistent across appended chunks.
                store.append("df", chunk.astype(str))
                print("保存文件中...")
        finally:
            # Close the store even if a chunk fails, so the file is not
            # left open/corrupted.
            store.close()
        print("保存HDF5文件完成")

    def read(self, filename="realtime.hd5"):
        """Load an HDF5 file (default: ``realtime.hd5``), print and return it."""
        realtime = pd.read_hdf(filename)
        print(realtime)
        return realtime
if __name__ == "__main__":
    analyzer = Analyze()
    # Step 1 — export the query result to CSV:
    # analyzer.save_csv(query="select * from realtime", filename="realtime.csv")
    # Step 2 — convert the CSV to HDF5:
    # analyzer.save_hdf5(csv_table="realtime.csv", hdf_table="realtime.hd5")
    # Step 3 — read the HDF5 file back:
    analyzer.read()
# NOTE: chunked-read helper found online earlier; in practice it was too
# slow on tens of millions of rows. Kept below for reference.
def query_result(query, con=None):
    """Read *query* in 50 000-row chunks and return one concatenated DataFrame.

    Args:
        query: SQL text to execute.
        con: DB-API connection / SQLAlchemy connectable. Defaults to the
            module-level ``conn`` the original snippet assumed (which does
            not exist in this file — pass one explicitly).

    Returns:
        A single DataFrame with a fresh 0..n-1 index; empty if the query
        yields no rows (previously this case crashed in ``pd.concat``).

    NOTE(review): materializes the whole result in memory, which is why it
    was too slow/heavy for tens of millions of rows.
    """
    source = con if con is not None else conn
    frames = []
    for chunk in pd.read_sql(query, con=source, chunksize=50000):
        frames.append(chunk)
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)