机器学习--数据读取

bw876720687

已于 2024-09-29 12:43:25 修改

阅读量869

点赞数

分类专栏：人工智能文章标签： python 机器学习

于 2022-04-22 14:17:47 首次发布

本文链接：https://blog.csdn.net/bw876720687/article/details/124344731

版权

人工智能专栏收录该内容

22 篇文章 0 订阅

订阅专栏

背景：
ValueError: This sheet is too large! Your sheet size is: 1840927, 23 Max sheet size is: 1048576, 16384
当数据量超过百万行之后，读写时经常会遇到这类问题：上面的报错说明数据共有 1840927 行，超过了单个工作表（sheet）最多 1048576 行的上限。

不同来源的数据集的读取方式不同
批量读取数据或者分批次读取数据

分块读取

遇到大文件时，需要采用分块读取的方式。
在这里插入图片描述

import numpy as np
import pandas as pd

# Read only the first 1000 rows of the file.
data = pd.read_csv('data.csv', nrows=1000)

# Read only the first column. `usecols` takes the columns to keep, given
# either as positions or as header names; list whichever ones you need.
df = pd.read_csv("data.csv", usecols=[0])

# Passing `chunksize` makes read_csv return an iterator that yields the file
# in pieces of at most that many rows instead of loading it all at once.
chunker = pd.read_csv("./train.csv", chunksize=5)
for item in chunker:
    print(type(item))
    # <class 'pandas.core.frame.DataFrame'>
    print(len(item))
    # 5 (the final chunk may be shorter)

# Load the whole file.
csv_data = pd.read_csv('file1.csv')

# Keep the first half of the rows of the frame just loaded and save them
# to a new file.
half = csv_data.shape[0] // 2
df = csv_data.iloc[:half]
df.to_csv(path_or_buf='file2.csv', index=False)

# Keep only some columns, in the order listed here, and save them.
# NOTE: this writes to the same 'file2.csv' as the step above and therefore
# replaces the half-rows output; use a different path to keep both.
df = csv_data.loc[:, ['workclass', 'education', 'capital_loss', 'hours_per_week']]
df.to_csv(path_or_buf='file2.csv', index=False)

# Common layout: the target is the last column and every column before it
# is a feature. X and y must exist before they can be split.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Split into train and test sets: 25% of the rows go to the test set and the
# fixed random_state makes the split reproducible.
# NOTE(review): train_test_split is presumably
# sklearn.model_selection.train_test_split; no import for it is visible
# here, so it has to be imported before this line runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

压缩数据

这个方法的核心思想是：为每一列选用能够容纳其取值范围的最小数据类型，减少每个值占用的字节数，从而降低整体的内存消耗。

def reduce_mem_usage(df):
    """Shrink a DataFrame's memory footprint by downcasting its columns in place.

    Every column is handled according to its dtype:

    * signed / unsigned NumPy integers -> the narrowest integer type of the
      same signedness whose range contains the column's min and max;
    * NumPy floats -> ``float16`` or ``float32`` when min and max fit.
      This is lossy: the narrower types keep fewer significant digits;
    * ``object`` and pandas string columns -> ``category``;
    * anything else (bool, datetime, timedelta, category, nullable extension
      dtypes, ...) is left untouched, so the function can safely be called
      again on its own output.

    A column is never widened. The original author recommends running this
    only after preprocessing and EDA are finished, to make later processing
    cheaper.

    Args:
        df: DataFrame to optimise. It is modified in place.

    Returns:
        The same ``df`` object that was passed in.
    """

    def _narrowest(c_min, c_max, candidates, limits_of):
        """Return the first candidate type whose range holds [c_min, c_max]."""
        for candidate in candidates:
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to a type's limit still fits.
            # NaN bounds (empty or all-NaN column) fail every comparison.
            if c_min >= limits.min and c_max <= limits.max:
                return candidate
        return None

    # kind code of a NumPy dtype -> (candidate types, range lookup)
    downcast_plan = {
        'i': ((np.int8, np.int16, np.int32, np.int64), np.iinfo),
        'u': ((np.uint8, np.uint16, np.uint32, np.uint64), np.iinfo),
        'f': ((np.float16, np.float32), np.finfo),
    }

    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (category, nullable ints, ...) are not downcast.
        if not isinstance(col_type, np.dtype):
            continue

        plan = downcast_plan.get(col_type.kind)
        if plan is None:
            # bool, datetime, timedelta, complex, ...: nothing to gain.
            continue

        candidates, limits_of = plan
        target = _narrowest(df[col].min(), df[col].max(), candidates, limits_of)
        if target is not None and col_type != np.dtype(target):
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a frame that reports zero bytes before optimisation.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))
    return df

polars加速，利用cpu多线程

import pandas as pd
import polars as pl

# Build a small in-memory sample table.
data = {
    'store_nbr': [1, 2, 3],
    'item_nbr': [111, 222, 333],
    'onpromotion': [True, False, True],
    'unit_sales': [10, 20, 30],
    'date': ['2017-01-01', '2017-01-02', '2017-01-03'],
}

# Write the sample to disk so both libraries read the very same CSV file.
csv_path = "simulated_data.csv"
df = pd.DataFrame(data)
df.to_csv(csv_path, index=False)

# Read the same three columns with each library, once selected by position
# and once selected by header name.
# pandas: the selection goes in `usecols`.
df_pandas = pd.read_csv(
    csv_path,
    usecols=[1, 2, 3],
)
df_pandas_named = pd.read_csv(
    csv_path,
    usecols=['item_nbr', 'onpromotion', 'unit_sales'],
)

# polars: the selection goes in `columns`.
df_polars = pl.read_csv(
    csv_path,
    columns=[1, 2, 3],
)
df_polars_named = pl.read_csv(
    csv_path,
    columns=['item_nbr', 'onpromotion', 'unit_sales'],
)

# Show the four results, each under its own heading.
for heading, frame in (
    ("Pandas 读取列索引 [1, 2, 3]:\n", df_pandas),
    ("\nPandas 读取列名 ['item_nbr', 'onpromotion', 'unit_sales']:\n", df_pandas_named),
    ("\nPolars 读取列索引 [1, 2, 3]:\n", df_polars),
    ("\nPolars 读取列名 ['item_nbr', 'onpromotion', 'unit_sales']:\n", df_polars_named),
):
    print(heading, frame)

在这里插入图片描述
下面是一个pandas和polars相互切换的例子

## Load the training set
df_train = (
    pl.read_csv(
        path + 'train.csv',
        columns=[1, 2, 3, 4, 5],  # plays the role of pandas' usecols
        has_header=True,
        # NOTE(review): polars appears to apply skip_rows before it parses
        # the header, so the first line left after skipping is consumed as
        # the header row. That would be why the real column names are
        # supplied again through new_columns below. Confirm against the
        # polars read_csv documentation.
        skip_rows=66458908,
        new_columns=['date', 'store_nbr', 'item_nbr', 'unit_sales', 'onpromotion'],
    )
    # The three conversions touch different columns and do not depend on each
    # other, so they are handed to a single with_columns call.
    .with_columns(
        # Counterpart of pandas' dtype={'onpromotion': bool}.
        pl.col('onpromotion').cast(pl.Boolean),
        # Counterpart of the pandas converter: log1p for positive sales,
        # 0 otherwise. Uses the native polars expression instead of calling
        # a NumPy function on an expression.
        pl.when(pl.col('unit_sales') > 0)
        .then(pl.col('unit_sales').cast(pl.Float64).log1p())
        .otherwise(0)
        .alias('unit_sales'),
        # Counterpart of pandas' parse_dates=["date"].
        pl.col('date').str.strptime(pl.Date, '%Y-%m-%d'),
    )
)

对应的pandas代码如下

def _log1p_if_positive(raw):
    """Convert one raw unit_sales cell: log1p of the value if positive, else 0."""
    value = float(raw)
    if value > 0:
        return np.log1p(value)
    return 0


# Load columns 1-5 of the training file, skipping the data rows that precede
# 2016-01-01 while keeping the header line (row 0).
df_train = pd.read_csv(
    path + 'train.csv',
    usecols=[1, 2, 3, 4, 5],
    skiprows=range(1, 66458909),  # 2016-01-01
    parse_dates=["date"],
    converters={'unit_sales': _log1p_if_positive},
    dtype={'onpromotion': bool},
)