一篇文章学会 Pandas

最新推荐文章于 2024-09-29 14:13:49 发布

陈君豪

最新推荐文章于 2024-09-29 14:13:49 发布

阅读量113

点赞数

分类专栏： python 文章标签： python

本文链接：https://blog.csdn.net/howard789/article/details/107552368

版权

python 专栏收录该内容

7 篇文章 0 订阅

订阅专栏

import numpy as np
import pandas as pd
import sys
import matplotlib.pyplot as plt


# Selects which demo section of this script runs; only the matching
# `if index == N:` branch below is executed.
index = 1
if index == 1:
    # Build a 4x5 DataFrame holding 0..19, rows labelled a-d, columns A-E.
    data = pd.DataFrame(np.arange(20).reshape(4, 5), index=list('abcd'), columns=list('ABCDE'))

    print(data)
    print('取出前三行')
    d1 = data[:3]
    print(d1)

    print('取出第三行')
    # Position 2 is the third row; the 2:3 slice keeps the result a
    # one-row DataFrame rather than collapsing it to a Series.
    d2 = data.iloc[2:3]
    print(d2)

    # First row by position; the row labelled 'a' is that same row, so
    # data.loc['a'] would print the same thing.
    print(data.iloc[0])
    print('行', data.shape[0])
    print('列', data.shape[1])
    # Frequency of each value within the first row, then just the distinct values.
    print(data.iloc[0].value_counts())
    print(data.iloc[0].value_counts().index)
    # Column A selected by label and by position; the list keeps it a DataFrame.
    print(data.loc[:, ['A']])
    print(data.iloc[:, [0]])
    print('*********', 4, '***************************')
    # Label-based counterpart of the iloc call in section 5 (left disabled):
    # print(data.loc[['a','b'],['A','B']])
    print('*********', 5, '***************************')
    print(data.iloc[[0, 1], [0, 1]])
    print('*********', 6, '***************************')
    print(data.loc[:, :])
    print('*********', 7, '***************************')
    print(data.iloc[:, :])
    print('*********', 8, '***************************')
    # Boolean-mask row filtering through .loc.
    print(data.loc[data['A'] == 0])
    print('*********', 9, '***************************')
    # NOTE: in this data the row with A == 0 has B == 1, so the combined
    # A == 0 & B == 2 filters below all print an empty frame.
    print(data.loc[(data['A'] == 0) & (data['B'] == 2)])
    print('*********', 10, '***************************')

    print(data[data['A'] == 0])  # plain DataFrame[...] mask
    print('*********', 11, '***************************')
    print(data[data['A'].isin([0])])  # same filter via isin
    print('*********', 12, '***************************')
    print(data[(data['A'] == 0) & (data['B'] == 2)])  # plain DataFrame[...] mask
    print('*********', 13, '***************************')
    print(data[(data['A'].isin([0])) & (data['B'].isin([2]))])  # same filter via isin


if index == 2:
    # Series / NumPy interop demo.
    # `s` and `data` are created as examples only; nothing below reads them.
    s = pd.Series([11., 12., 13.], name='S')
    data = np.arange(21, 24)
    df = pd.DataFrame({'A': [1., 2.], 'B': [3, 4]})

    # DataFrame.as_matrix() was removed in pandas 1.0; selecting the columns
    # and calling to_numpy() is the supported replacement.
    a = df[['A', 'B']].to_numpy()
    b = df[['A', 'B']].to_numpy()
    print(a.shape)
    print(b.shape)

    # np.mat was removed in NumPy 2.0; np.asmatrix is the same conversion
    # under the name that still exists.
    c = np.asmatrix(a)
    d = np.asmatrix(b)
    e = np.matmul(c, d)
    print(type(e))
    print(e)

    # Series.item() only works on a one-element Series, so keep the whole
    # column and walk its index labels backwards to read each cell via .loc.
    A = df['A']
    print(type(A))
    print(len(A))
    for i in A.index[::-1]:
        print(df.loc[i, 'A'])


if index == 3:
    # concat demo: three 4-row frames with columns A-D whose cells are the
    # column letter followed by the row label, stacked vertically.
    frames = []
    for first_label in (0, 4, 8):
        row_labels = list(range(first_label, first_label + 4))
        cells = {col: [f'{col}{row}' for row in row_labels] for col in 'ABCD'}
        frames.append(pd.DataFrame(cells, index=row_labels))

    df1, df2, df3 = frames
    result = pd.concat(frames)

if index == 4:
    # dropna() only removes real missing values (NaN/None). An empty string
    # would count as ordinary data and its row would be kept, so the gap in
    # column D is written as np.nan.
    df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3'],
                        'C': ['C0', 'C1', 'C2', 'C3'],
                        'D': [np.nan, 'D1', 'D2', 'D3']}, index=[0, 1, 2, 3])
    df1.dropna(inplace=True)
    print(df1)

    # Convert the remaining rows to a NumPy array.
    s = df1.to_numpy()
    print(s)


if index == 6:
    # Sample frame so this section runs on its own; the column names are the
    # ones the eval/query expressions below refer to.
    df = pd.DataFrame({'气温': [20.0, 25.0, 30.0],
                       '湿度': [40.0, 55.0, 70.0],
                       'PM2P5': [35.0, 50.0, 80.0]})

    # Multi-line eval: one `name = expression` assignment per line, each
    # adding a new column in place.
    df.eval("""
    e = 气温 + 湿度
    f = 气温 - 湿度
    g = 气温 / 2.0""", inplace=True)

    # Single-expression eval adding one more derived column.
    df.eval('new1 = 气温 + 湿度 + PM2P5', inplace=True)
    print(df)

    # query() requires a boolean expression and returns the matching rows.
    print(df.query('气温 > 20'))

if index == 7:
    df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})

    # Overwrite B element-wise from A: 1 where A equals 2, otherwise -1.
    def _flag_two(value):
        if value == 2:
            return 1
        return -1

    df['B'] = df['A'].apply(_flag_two)

    # Shift every value of A up by 3 and store it back.
    df['A'] = df['A'].map(lambda value: value + 3)
    print(df)

    # Write the frame twice: first without the index column, then with it.
    csv_targets = (
        (r'D:\wks\wks_ml\demo\test.csv', False),
        (r'D:\wks\wks_ml\demo\test1.csv', True),
    )
    for csv_path, keep_index in csv_targets:
        df.to_csv(csv_path, index=keep_index)



if index == 8:
    df = pd.DataFrame({'A': [np.nan, 2], 'B': [3, 4]})
    # Replace the missing entry in column A with -1 and write the column back.
    df['A'] = df['A'].fillna(value=-1)
    print(df)

if index == 9:
    df = pd.DataFrame({'A': [np.nan, 2], 'B': [3, 4]})
    # Pull out the row labelled 1 as a Series and turn it into a
    # single-column frame (the former column names become its index).
    df = df.loc[1, :].to_frame()
    # Transpose back into a one-row frame with columns A and B, then
    # renumber that row from 0.
    df2 = df.T.reset_index(drop=True)
    print(df2)

if index == 10:
    # Histograms of samples drawn from a Gaussian distribution.
    # Mean of the distribution.
    mean = 0
    # Standard deviation: how concentrated or spread out the samples are.
    sigma = 1
    x = mean + sigma * np.random.randn(10000)
    fig, (ax0, ax1) = plt.subplots(nrows=2, figsize=(9, 6))
    # The second positional argument is the number of bins: more bins give
    # narrower, denser bars. density=True normalises the histogram so it
    # approximates a probability density; it replaces the `normed` keyword,
    # which was removed in Matplotlib 3.1.
    ax0.hist(x, 40, density=True, histtype='bar', facecolor='yellowgreen', alpha=0.75)
    ax0.set_title('pdf')
    # cumulative=True accumulates the bars left to right, giving an empirical
    # cumulative distribution (e.g. the share of samples below some value).
    ax1.hist(x, 20, density=True, histtype='bar', facecolor='pink', alpha=0.75, cumulative=True, rwidth=0.8)
    ax1.set_title("cdf")
    fig.subplots_adjust(hspace=0.4)
    plt.show()

# End the script with a success status once the selected section has run.
raise SystemExit(0)