1.Data ingestion & inspection
1.了解数据基础信息
type(df)
df.head()
df.tail()
df.info()
df.values
df[colname].dtype
df.shape
2.创建DataFrame
df=pd.DataFrame(...)
#示例
# Zip the 2 lists together into one list of (key,value) tuples: zipped
zipped = list(zip(list_keys, list_values))
# Inspect the list using print()
print(zipped)
# Build a dictionary with the zipped list: data
data = dict(zipped)
# Build and inspect a DataFrame from the dictionary: df
df = pd.DataFrame(data)
print(df)
# Build a list of labels: list_labels
list_labels = ['year', 'artist', 'song', 'chart weeks']
# Assign the list of labels to the columns attribute: df.columns
df.columns = list_labels
3.数据的导入与输出
df = pd.read_csv(filepath)
df.to_csv(out_csv)
#示例
# Read the raw file as-is: df1
df1 = pd.read_csv(file_messy)
# Print the output of df1.head()
print(df1.head())
# Read in the file with the correct parameters: df2
df2 = pd.read_csv(file_messy, delimiter=' ', header=3, comment='#')
# Print the output of df2.head()
print(df2.head())
# Save the cleaned up DataFrame to a CSV file without the index
df2.to_csv(file_clean, index=False)
# Save the cleaned up DataFrame to an excel file without the index
df2.to_excel('file_clean.xlsx', index=False)
4.制图
# 注意区分
col_array = df[colname].values   # NumPy 数组
col_series = df[colname]         # pandas Series
# 成图,kind包括['hist','box','scatter',...]
df.plot(kind=...)
#示例
# Plot all columns (default)
df.plot()
plt.show()
# Plot all columns as subplots
df.plot(subplots=True)
plt.show()
# Plot just the Dew Point data
column_list1 = ['Dew Point (deg F)']
df[column_list1].plot()
plt.show()
# Plot the Dew Point and Temperature data, but not the Pressure data
column_list2 = ['Temperature (deg F)','Dew Point (deg F)']
df[column_list2].plot()
plt.show()
2.Exploratory data analysis
1.图示:lineplot、scatter、box、hist
2.统计量
#数据数量
df[colname].count()
df[[colname1, colname2]].count()
df[colname].mean()
df.mean()
df.std()
df.median()
df.min()
df.max()
df[colname].describe()
df[colname].unique()
#分位数
q=num #[num1, num2]
df.quantile(q)
3.Time series in pandas
1.导入与选取
#导入
df = pd.read_csv(..., parse_dates=True, index_col='Date')
#选取一整天、整个月,整年、时间段
df.loc['2015-02-05']
df.loc['February 5,2015']
df.loc['2015-Feb-5']
#增添数据
df.reindex(evening_2_11, method='ffill')
#转换
my_datetimes = pd.to_datetime(date_list, format=time_format)
2.resample
#'D':daily, 'W':weekly, 'A':yearly
df.resample('D').mean()
#sample
df.resample('_').first().interpolate('__')
#选取
df.loc['2015-2-2', 'Units']
#rolling:移动窗口。例如window=3,就是每3个数取一个均值(下面用window=24)
smoothed = unsmoothed.rolling(window=24).mean()
#示例
# Extract temperature data for August: august
august = df['Temperature']['2010-august']
# Downsample to obtain only the daily highest temperatures in August: august_highs
august_highs = august.resample('D').max()
3.时间序列的相关函数
#提取小时
df['Date'].dt.hour
#设置时区
central = df['Date'].dt.tz_localize(___)
#改变时区
central = central.dt.tz_convert(___)