import pandas as pd
import numpy as np
gdp1 = pd.Series([2.8,3.01,8.99,8.59,5.18])
gdp2 = pd.Series({'北京':2.8,'上海':3.01,'广东':8.99,'江苏':8.59,'浙江':5.18})
gdp3 = pd.Series(np.array((2.8,3.01,8.99,8.59,5.18)))
print(gdp1)
print(gdp2)
print(gdp3)
0 2.80
1 3.01
2 8.99
3 8.59
4 5.18
dtype: float64
北京 2.80
上海 3.01
广东 8.99
江苏 8.59
浙江 5.18
dtype: float64
0 2.80
1 3.01
2 8.99
3 8.59
4 5.18
dtype: float64
print('行号风格的序列:\n',gdp1[[0,3,4]])
print('行名称风格的序列:\n',gdp2[[0,3,4]])
print('行名称风格的序列:\n',gdp2[['上海','江苏','浙江']])
print('通过numpy函数:\n',np.log(gdp1))
print('通过numpy函数:\n',np.mean(gdp1))
print('通过序列的方法:\n',gdp1.mean())
df1 = pd.DataFrame([['张三',23,'男'],['李四',27,'女'],['王二',26,'女']])
df2 = pd.DataFrame({'姓名':['张三','李四','王二'],'年龄':[23,27,26],'性别':['男','女','女']})
df3 = pd.DataFrame(np.array([['张三',23,'男'],['李四',27,'女'],['王二',26,'女']]))
print('嵌套列表构造数据框:\n',df1)
print('字典构造数据框:\n',df2)
print('二维数组构造数据框:\n',df3)
user_income = pd.read_table(r'C:\Users\Administrator\Desktop\data_test01.txt', sep = ',',
parse_dates={'birthday':[0,1,2]},skiprows=2, skipfooter=3,
comment='#', encoding='utf8', thousands='&')
user_income
child_cloth = pd.read_excel(io = r'C:\Users\Administrator\Desktop\data_test02.xlsx', header = None,
names = ['Prod_Id','Prod_Name','Prod_Color','Prod_Price'], converters = {0:str})
child_cloth
pd.read_excel(io = r'C:\Users\Administrator\Desktop\data_test02.xlsx', header = None,
names = ['Prod_Id','Prod_Name','Prod_Color','Prod_Price'])
import pymysql
conn = pymysql.connect(host='localhost', user='root', password='1q2w3e4r',
database='test', port=3306, charset='utf8')
user = pd.read_sql('select * from topy', conn)
conn.close()
user
import pymssql
connect = pymssql.connect(server = 'localhost', user = '', password = '',
database = 'train', charset = 'utf8')
data = pd.read_sql("select * from sec_buildings where direction = '朝南'", con=connect)
connect.close()
data.head()
sec_cars = pd.read_table(r'C:\Users\Administrator\Desktop\sec_cars.csv', sep = ',')
sec_cars.head()
print('数据集的行列数:\n',sec_cars.shape)
print('各变量的数据类型:\n',sec_cars.dtypes)
sec_cars.Boarding_time = pd.to_datetime(sec_cars.Boarding_time, format = '%Y年%m月')
sec_cars.New_price = sec_cars.New_price.str[:-1].astype('float')
sec_cars.dtypes
sec_cars.describe()
num_variables = sec_cars.columns[sec_cars.dtypes !='object'][1:]
def skew_kurt(x):
skewness = x.skew()
kurtsis = x.kurt()
return pd.Series([skewness,kurtsis], index = ['Skew','Kurt'])
sec_cars[num_variables].apply(func = skew_kurt, axis = 0)
sec_cars.describe(include = ['object'])
Freq = sec_cars.Discharge.value_counts()
Freq_ratio = Freq/sec_cars.shape[0]
Freq_df = pd.DataFrame({'Freq':Freq,'Freq_ratio':Freq_ratio})
Freq_df.head()
Freq_df.reset_index(inplace = True)
Freq_df.head()
df = pd.read_excel(r'C:\Users\Administrator\Desktop\data_test03.xlsx')
print(df.dtypes)
df.birthday = pd.to_datetime(df.birthday, format = '%Y/%m/%d')
df.tel = df.tel.astype('str')
df['age'] = pd.datetime.today().year - df.birthday.dt.year
df['workage'] = pd.datetime.today().year - df.start_work.dt.year
df.tel = df.tel.apply(func = lambda x : x.replace(x[3:7], '****'))
df['email_domain'] = df.email.apply(func = lambda x : x.split('@')[1])
df['profession'] = df.other.str.findall('专业:(.*?),')
df.drop(['birthday','start_work','other'], axis = 1, inplace = True)
df.head()
dates = pd.to_datetime(pd.Series(['1989-8-18 13:14:55','1995-2-16']), format = '%Y-%m-%d %H:%M:%S')
print('返回日期值:\n',dates.dt.date)
print('返回季度:\n',dates.dt.quarter)
print('返回几点钟:\n',dates.dt.hour)
print('返回年中的天:\n',dates.dt.dayofyear)
print('返回年中的周:\n',dates.dt.weekofyear)
print('返回星期几的名称:\n',dates.dt.weekday_name)
print('返回月份的天数:\n',dates.dt.days_in_month)
df = pd.read_excel(r'C:\Users\Administrator\Desktop\data_test04.xlsx')
print('数据集中是否存在重复观测:\n',any(df.duplicated()))
df.drop_duplicates(inplace = True)
df
df = pd.read_excel(r'C:\Users\Administrator\Desktop\data_test05.xlsx')
print('数据集中是否存在缺失值:\n',any(df.isnull()))
df.dropna()
df.drop('age', axis = 1)
df.fillna(method = 'ffill')
df.fillna(method = 'bfill')
df.fillna(value = 0)
df.fillna(value = {'gender':df.gender.mode()[0], 'age':df.age.mean(), 'income':df.income.median()})
sunspots = pd.read_table(r'C:\Users\Administrator\Desktop\sunspots.csv', sep = ',')
xbar = sunspots.counts.mean()
xstd = sunspots.counts.std()
print('标准差法异常值上限检测:\n',any(sunspots.counts > xbar + 2 * xstd))
print('标准差法异常值下限检测:\n',any(sunspots.counts < xbar - 2 * xstd))
Q1 = sunspots.counts.quantile(q = 0.25)
Q3 = sunspots.counts.quantile(q = 0.75)
IQR = Q3 - Q1
print('箱线图法异常值上限检测:\n',any(sunspots.counts > Q3 + 1.5 * IQR))
print('箱线图法异常值下限检测:\n',any(sunspots.counts < Q1 - 1.5 * IQR))
import matplotlib.pyplot as plt
plt.style.use('ggplot')
sunspots.counts.plot(kind = 'hist', bins = 30, normed = True)
sunspots.counts.plot(kind = 'kde')
plt.show()
print('异常值替换前的数据统计特征:\n',sunspots.counts.describe())
UL = Q3 + 1.5 * IQR
print('判别异常值的上限临界值:\n',UL)
replace_value = sunspots.counts[sunspots.counts < UL].max()
print('用以替换异常值的数据:\n',replace_value)
sunspots.counts[sunspots.counts > UL] = replace_value
print('异常值替换后的数据统计特征:\n',sunspots.counts.describe())
df1 = pd.DataFrame({'name':['张三','李四','王二','丁一','李五'],
'gender':['男','女','女','女','男'],
'age':[23,26,22,25,27]}, columns = ['name','gender','age'])
df1
df1.iloc[1:4,[0,2]]
df1.loc[1:3, ['name','age']]
df2 = df1.set_index('name')
df2
df2.iloc[1:4,:]
df2.loc[['李四','王二','丁一'],:]
df1.loc[df1.gender == '男',['name','age']]
---------------------------------------------------------------------------
NotImplementedError Traceback (most recent call last)
<ipython-input-5-7f6adb98ff7d> in <module>()
19
20 # 使用筛选条件,取出所有男性的姓名和年龄
---> 21 df1.iloc[df1.gender == '男',]
22 df1.loc[df1.gender == '男',['name','age']]
23 # df1.ix[df1.gender == '男',['name','age']]
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
1760 except (KeyError, IndexError, AttributeError):
1761 pass
-> 1762 return self._getitem_tuple(key)
1763 else:
1764 # we by definition only have the 0th axis
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_tuple(self, tup)
2065 def _getitem_tuple(self, tup: Tuple):
2066
-> 2067 self._has_valid_tuple(tup)
2068 try:
2069 return self._getitem_lowerdim(tup)
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _has_valid_tuple(self, key)
701 raise IndexingError("Too many indexers")
702 try:
--> 703 self._validate_key(k, i)
704 except ValueError:
705 raise ValueError(
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _validate_key(self, key, axis)
1979 if key.index.inferred_type == "integer":
1980 raise NotImplementedError(
-> 1981 "iLocation based boolean "
1982 "indexing on an integer type "
1983 "is not available"
NotImplementedError: iLocation based boolean indexing on an integer type is not available
diamonds = pd.read_table(r'C:\Users\Administrator\Desktop\diamonds.csv', sep = ',')
pd.pivot_table(data = diamonds, index = 'color', values = 'price', margins = True, margins_name = '总计')
import numpy as np
pd.pivot_table(data = diamonds, index = 'clarity', columns = 'cut', values = 'carat',
aggfunc = np.size,margins = True, margins_name = '总计')
df1 = pd.DataFrame({'name':['张三','李四','王二'], 'age':[21,25,22], 'gender':['男','女','男']})
df2 = pd.DataFrame({'name':['丁一','赵五'], 'age':[23,22], 'gender':['女','女']},)
pd.concat([df1,df2], keys = ['df1','df2'], )
df2 = pd.DataFrame({'Name':['丁一','赵五'], 'age':[23,22], 'gender':['女','女']})
pd.concat([df1,df2])
df3 = pd.DataFrame({'id':[1,2,3,4,5],'name':['张三','李四','王二','丁一','赵五'],'age':[27,24,25,23,25],'gender':['男','男','男','女','女']})
df4 = pd.DataFrame({'Id':[1,2,2,4,4,4,5],'kemu':['科目1','科目1','科目2','科目1','科目2','科目3','科目1'],'score':[83,81,87,75,86,74,88]})
df5 = pd.DataFrame({'id':[1,3,5],'name':['张三','王二','赵五'],'income':[13500,18000,15000]})
merge1 = pd.merge(left = df3, right = df4, how = 'left', left_on='id', right_on='Id')
merge1
merge2 = pd.merge(left = merge1, right = df5, how = 'left')
merge2
grouped = diamonds.groupby(by = ['color','cut'])
result = grouped.aggregate({'color':np.size, 'carat':np.min, 'price':np.mean, 'face_width':np.max})
result = pd.DataFrame(result, columns=['color','carat','price','face_width'])
result.rename(columns={'color':'counts','carat':'min_weight','price':'avg_price','face_width':'max_face_width'}, inplace=True)
result.reset_index(inplace=True)
result