# 使用 Python 的 pandas 工具包读取数据
# 直接使用 pd.read_csv() 读取少量数据
import pandas as pd
# Relative path to the sample file passed to the module-level read_csv calls.
data_path = '../DataWarehouse/TestData/test01.csv'
# 文件包含标题行
# No header argument is given, so pandas takes the column names from the
# first row of the tab-separated file.
df = pd.read_csv(
    data_path,
    sep='\t',
    encoding='utf-8',
)
# 文件不包含标题行
# header=None stops pandas from treating the first row as column names;
# the labels listed here are applied to the columns instead.
header_name = [
    'col1',
    'col2',
    'col3',
    'col4',
    'col5',
]
df = pd.read_csv(
    data_path,
    sep='\t',
    encoding='utf-8',
    header=None,
    names=header_name,
)
# 读取文件指定列
# Two ways of loading only some columns. The second assignment replaces the
# result of the first.

# Select columns by zero-based position.
df = pd.read_csv(
    data_path,
    sep='\t',
    encoding='utf-8',
    usecols=[0, 1, 2],
)

# Select columns by label.
df = pd.read_csv(
    data_path,
    sep='\t',
    encoding='utf-8',
    usecols=['col1', 'col3', 'col4'],
)
# 指定特殊分隔符
# A separator longer than one character is interpreted by pandas as a regular
# expression, which only the Python parsing engine supports. Requesting that
# engine explicitly avoids the ParserWarning pandas emits when it has to fall
# back from the default C engine.
df = pd.read_csv(
    data_path,
    sep=r'\x7f\x5e',  # regex: character 0x7F followed by a literal '^' (0x5E)
    encoding='utf-8',
    engine='python',
)
# 读取大量数据(分批次读取)
# 读取单个大文件
# With chunksize set, read_csv returns an iterator that yields DataFrames of
# at most 1,000,000 rows each instead of loading the whole file at once.
df_chunk = pd.read_csv(
    data_path,
    sep='\t',
    encoding='utf-8',
    chunksize=1000000,
)
def get_chunk_data(df_chunk):
    """Combine every chunk yielded by ``df_chunk`` into a single DataFrame.

    Parameters
    ----------
    df_chunk : iterable of pandas.DataFrame
        The chunks to combine, e.g. the reader returned by ``pd.read_csv``
        when ``chunksize`` is given. It is consumed completely.

    Returns
    -------
    pandas.DataFrame
        The chunks stacked row-wise (``axis=0``) in iteration order. The
        index labels carried by the chunks are kept unchanged.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``df_chunk`` yields no chunks.
    """
    # list() drains the iterator in one step; the temporary list is released
    # automatically when the function returns, so no explicit ``del`` is needed.
    return pd.concat(list(df_chunk), axis=0)
# Drain the chunk reader and keep the combined result as one DataFrame.
df_all = get_chunk_data(df_chunk=df_chunk)
# 读取多个大文件
import os
def get_chunk_data(df_chunk):
    """Combine every chunk yielded by ``df_chunk`` into a single DataFrame.

    Parameters
    ----------
    df_chunk : iterable of pandas.DataFrame
        The chunks to combine, e.g. the reader returned by ``pd.read_csv``
        when ``chunksize`` is given. It is consumed completely.

    Returns
    -------
    pandas.DataFrame
        The chunks stacked row-wise (``axis=0``) in iteration order. The
        index labels carried by the chunks are kept unchanged.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``df_chunk`` yields no chunks.
    """
    # list() drains the iterator in one step; the temporary list is released
    # automatically when the function returns, so no explicit ``del`` is needed.
    return pd.concat(list(df_chunk), axis=0)
def concat_files(source_path, *, sep='\t', encoding='utf-8', chunksize=1000000):
    """Read every file under ``source_path`` in chunks and merge them all.

    The folder is walked recursively with ``os.walk`` and every file found,
    whatever its extension, is parsed with ``pd.read_csv``.

    Parameters
    ----------
    source_path : str
        Path of the folder that contains the large files to read.
    sep : str, optional
        Field separator passed to ``pd.read_csv``. Defaults to a tab.
    encoding : str, optional
        Text encoding passed to ``pd.read_csv``. Defaults to ``'utf-8'``.
    chunksize : int, optional
        Number of rows per chunk passed to ``pd.read_csv``. Must be a
        positive integer so that a chunk reader is returned.

    Returns
    -------
    pandas.DataFrame
        All files stacked row-wise, in sorted traversal order. Each file's
        own index labels are kept, so labels repeat across files.

    Raises
    ------
    ValueError
        If no file is found under ``source_path``. ``os.walk`` silently
        yields nothing for a missing folder, so that case ends up here too.
    """
    file_list = []
    for root, dirs, files in os.walk(source_path):
        # os.walk returns entries in arbitrary order; sorting the directory
        # list in place and the file names makes the row order reproducible.
        dirs.sort()
        file_list.extend(os.path.join(root, name) for name in sorted(files))

    if not file_list:
        # Fail with a message that names the folder instead of pandas'
        # generic "No objects to concatenate".
        raise ValueError(f'No files found under {source_path!r}')

    frames = []
    for path in file_list:
        reader = pd.read_csv(path, sep=sep, encoding=encoding, chunksize=chunksize)
        try:
            frames.append(get_chunk_data(reader))
        finally:
            # Release the file handle even if parsing fails part-way.
            reader.close()
    return pd.concat(frames, axis=0)