pandas Foundations(datacamp)

1.Data ingestion & inspection

1.了解数据基础信息

type(df)
df.head()
df.tail()
df.info()
df.values
df[colname].dtype
df.shape

2.创建DataFrame

df=pd.DataFrame(...)

#示例
# Zip the 2 lists together into one list of (key,value) tuples: zipped
zipped = list(zip(list_keys, list_values))

# Inspect the list using print()
print(zipped)

# Build a dictionary with the zipped list: data
data = dict(zipped)

# Build and inspect a DataFrame from the dictionary: df
df = pd.DataFrame(data)
print(df)

# Build a list of labels: list_labels
list_labels = ['year', 'artist', 'song', 'chart weeks']

# Assign the list of labels to the columns attribute: df.columns
df.columns = list_labels

3.数据的导入与输出

df = pd.read_csv(filepath)
df.to_csv(out_csv)

#示例
# Read the raw file as-is: df1
df1 = pd.read_csv(file_messy)

# Print the output of df1.head()
print(df1.head())

# Read in the file with the correct parameters: df2
df2 = pd.read_csv(file_messy, delimiter=' ', header=3, comment='#')

# Print the output of df2.head()
print(df2.head())

# Save the cleaned up DataFrame to a CSV file without the index
df2.to_csv(file_clean, index=False)

# Save the cleaned up DataFrame to an excel file without the index
df2.to_excel('file_clean.xlsx', index=False)



4.制图

    # 注意区分
    col_array = df[colname].values
    col_series = df[colname]
    
    # 成图,kind包括['hist','box','scatter',...]
    df.plot(kind='hist')
    
    #示例
    # Plot all columns (default)
    df.plot()
    plt.show()
    
    # Plot all columns as subplots
    df.plot(subplots=True)
    plt.show()
    
    # Plot just the Dew Point data
    column_list1 = ['Dew Point (deg F)']
    df[column_list1].plot()
    plt.show()
    
    # Plot the Dew Point and Temperature data, but not the Pressure data
    column_list2 = ['Temperature (deg F)','Dew Point (deg F)']
    df[column_list2].plot()
    plt.show()
    
2.Exploratory data analysis

1.图示:lineplot、scatter、box、hist

2.统计量

#数据数量
df[colname].count()
df[[colname1, colname2]].count()

df[colname].mean()
df.mean()
df.std()
df.median()
df.min()
df.max()
df[colname].describe()
df[colname].unique()


#分位数
q=num  #[num1, num2]
df.quantile(q)
3.Time series in pandas

1.导入与选取

#导入
df = pd.read_csv(..., parse_dates=True, index_col='Date')

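#选取一整天、整个月、整年、时间段
df.loc['2015-02-05']    #一整天
df.loc['February 5,2015']    #同一天的另一种写法
df.loc['2015-Feb-5']    #同一天的另一种写法
df.loc['2015-02']    #整个月
df.loc['2015']    #整年
df.loc['2015-02-05':'2015-02-10']    #时间段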

#增添数据(按新的时间索引重排,缺失值用前一个值填充)
df.reindex(evening_2_11, method='ffill')

#转换
my_datetimes = pd.to_datetime(date_list, format=time_format)
2.resample

    #'D':daily, 'W':weekly, 'A':year
    df.resample('D').mean()
    
    #sample
    df.resample('_').first().interpolate('__')
    
    #选取
    df.loc['2015-2-2', 'Units']
    #rolling:移动窗口。window=24,也就是每24个数取一个均值
    smoothed = unsmoothed.rolling(window=24).mean()
    
    #示例
    # Extract temperature data for August: august
    august = df['Temperature']['2010-august']
    
    # Downsample to obtain only the daily highest temperatures in August: august_highs
    august_highs = august.resample('D').max()
    
    
    

3.时间序列的相关函数

#提取小时
df['Date'].dt.hour

#设置时区
central = df['Date'].dt.tz_localize(___)
#改变时区
central =central.dt.tz_convert(___)
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值