Pandas Basics

import numpy as np
import pandas as pd

print(pd.__version__)  # check the pandas version
print(pd.show_versions(as_json=True))  # detailed version info (prints and returns None)

# How to create a Series from a list, a numpy array and a dict
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))

# Solution
s1 = pd.Series(mylist)
s2 = pd.Series(myarr)
s3 = pd.Series(mydict)
print("*"*50)
print(s1.head())  # first five elements
print(s2.head())
print(s3.head())
0.25.1
{'system': {'commit': None, 'python': '3.7.4.final.0', 'python-bits': 64, 'OS': 'Windows', 'OS-release': '10', 'machine': 'AMD64', 'processor': 'Intel64 Family 6 Model 78 Stepping 3, GenuineIntel', 'byteorder': 'little', 'LC_ALL': 'None', 'LANG': 'None', 'LOCALE': 'None.None'}, 'dependencies': {'pandas': '0.25.1', 'numpy': '1.17.2', 'pytz': '2019.2', 'dateutil': '2.8.0', 'pip': '19.3.1', 'setuptools': '41.4.0', 'Cython': None, 'pytest': '4.3.0', 'hypothesis': None, 'sphinx': None, 'blosc': None, 'feather': None, 'xlsxwriter': '1.2.5', 'lxml.etree': '4.4.1', 'html5lib': None, 'pymysql': '0.9.2', 'psycopg2': None, 'jinja2': '2.10.1', 'IPython': '7.9.0', 'pandas_datareader': None, 'bs4': '4.8.0', 'bottleneck': None, 'fastparquet': None, 'gcsfs': None, 'matplotlib': '3.1.1', 'numexpr': None, 'odfpy': None, 'openpyxl': None, 'pandas_gbq': None, 'pyarrow': None, 'pytables': None, 's3fs': None, 'scipy': '1.3.1', 'sqlalchemy': '1.3.10', 'tables': None, 'xarray': None, 'xlrd': None, 'xlwt': '1.3.0'}}
None
**************************************************
0    a
1    b
2    c
3    e
4    d
dtype: object
0    0
1    1
2    2
3    3
4    4
dtype: int32
a    0
b    1
c    2
e    3
d    4
dtype: int64
# Convert the series ser into a DataFrame, keeping its index as a column of the DataFrame
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)

# Solution
df=ser.to_frame().reset_index()
df.head()
  index  0
0     a  0
1     b  1
2     c  2
3     e  3
4     d  4
# How to combine multiple series to form a DataFrame
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))

# Solution 1
df1 = pd.concat([ser1, ser2], axis=1)

# Solution 2
df2 = pd.DataFrame({"col_1": ser1, "col_2": ser2})
print(df2.head())
  col_1  col_2
0     a      0
1     b      1
2     c      2
3     e      3
4     d      4
# How to assign a name to the series
ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))

# Solution
ser.name = "alphabets"
print(ser.head())
0    a
1    b
2    c
3    e
4    d
Name: alphabets, dtype: object
# How to remove from ser1 the items that are present in ser2
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# Solution
print(ser1[~ser1.isin(ser2)])
0    1
1    2
2    3
dtype: int64
# How to get the items that are not common to both series A and series B
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# Solution
ser_u = pd.Series(np.union1d(ser1, ser2))  # union
print(ser_u)
ser_i = pd.Series(np.intersect1d(ser1, ser2))  # intersection
print("-"*50)
print(ser_i)
print("-"*50)
print(ser_u[~ser_u.isin(ser_i)])  # remove the intersection items from the union
0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
dtype: int64
--------------------------------------------------
0    4
1    5
dtype: int64
--------------------------------------------------
0    1
1    2
2    3
5    6
6    7
7    8
dtype: int64
# Compute the minimum, 25th percentile, median, 75th percentile and maximum of the numeric series ser
state = np.random.RandomState(100)
ser = pd.Series(state.normal(10, 5, 25))
print(ser)

# Solution
result = np.percentile(ser, q=[0, 25, 50, 75, 100])
print(result)
0      1.251173
1     11.713402
2     15.765179
3      8.737820
4     14.906604
5     12.571094
6     11.105898
7      4.649783
8      9.052521
9     11.275007
10     7.709865
11    12.175817
12     7.082025
13    14.084235
14    13.363604
15     9.477944
16     7.343598
17    15.148663
18     7.809322
19     4.408409
20    18.094908
21    17.708026
22     8.740604
23     5.787821
24    10.922593
dtype: float64
[ 1.25117263  7.70986507 10.92259345 13.36360403 18.0949083 ]
# How to get the frequency counts of the unique items in a series
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))
# print(ser)

# Solution
print(ser.value_counts())
g    7
e    6
c    5
f    5
h    3
a    2
b    2
dtype: int64
# Keep only the top 2 most frequent values and replace everything else with "Other"
np.random.RandomState(100)  # note: this seeded generator is created but not used below
ser = pd.Series(np.random.randint(1, 5, 12))  # 12 random integers from 1 to 4
# print(ser)

# Solution
# keep the two most frequent values and turn every other value into "Other"
ser[~ser.isin(ser.value_counts().index[:2])] = "Other"
print(ser)
0     Other
1         1
2         1
3     Other
4         4
5         4
6     Other
7         4
8     Other
9         1
10        4
11        1
dtype: object
# Reshape the series ser into a DataFrame with 7 rows and 5 columns
ser = pd.Series(np.random.randint(1, 10, 35))
# print(ser)

# Solution
df = pd.DataFrame(ser.values.reshape(7, 5))
print(df)
   0  1  2  3  4
0  7  9  2  3  9
1  4  8  9  9  3
2  7  7  8  6  5
3  4  2  8  4  6
4  1  9  4  2  5
5  8  4  8  3  1
6  4  4  5  2  7
# Find the positions of the values that are divisible by 3
ser = pd.Series(np.random.randint(1, 10, 7))
print(ser)

# Solution
result = np.argwhere(ser % 3 == 0)
result
0    1
1    6
2    5
3    4
4    9
5    6
6    1
dtype: int32





array([[1],
       [4],
       [5]], dtype=int64)
# From ser, extract the items at the positions given in the list pos
ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]

# Solution
result = ser.take(pos)
result
0     a
4     e
8     i
14    o
20    u
dtype: object
# How to stack two series vertically and horizontally
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))

# Solution
ser1.append(ser2)  # vertical stack; returns a new series, ser1 itself is unchanged
print(ser1)

df = pd.concat([ser1, ser2], axis=1)  # horizontal stack
df
0    0
1    1
2    2
3    3
4    4
dtype: int64
   0  1
0  0  a
1  1  b
2  2  c
3  3  d
4  4  e
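Note: Series.append is deprecated in newer pandas releases (and removed in 2.0). A minimal sketch of the same stacking done only with pd.concat, assuming the ser1 and ser2 defined above:

# vertical and horizontal stacking without Series.append
ser_vertical = pd.concat([ser1, ser2], ignore_index=True)  # vertical: 0..4 followed by a..e
df_horizontal = pd.concat([ser1, ser2], axis=1)            # horizontal: same DataFrame as above
print(ser_vertical)
print(df_horizontal)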
# For each value of ser2, find its position in ser1
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

# Solution 1
result = [np.where(i == ser1)[0].tolist()[0] for i in ser2]
print(result)
# Solution 2
result = [pd.Index(ser1).get_loc(i) for i in ser2]
result
[5, 4, 0, 8]





[5, 4, 0, 8]
# Compute the mean squared error between the truth and pred series
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)

np.mean((truth-pred)**2)
0.42318488444073726
# Change the first character of each word in ser to upper case
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

# Solution
pd.Series([i.title() for i in ser])
0     How
1      To
2    Kick
3    Ass?
dtype: object
# Count the number of characters in each word of the series
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

# Solution
ser.map(lambda x: len(x))
0    3
1    2
2    4
3    4
dtype: int64
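The two exercises above can also be written with the vectorized .str accessor; a small sketch on the same series of words:

ser = pd.Series(['how', 'to', 'kick', 'ass?'])
print(ser.str.title())  # capitalize the first character of each word
print(ser.str.len())    # number of characters in each word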
# Compute the difference between consecutive values of ser
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])

# Solution
print(ser.tolist())
print(ser.diff().tolist())
[1, 3, 6, 10, 15, 21, 27, 35]
[nan, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 8.0]
# Convert a series of date strings to a time series
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303',
                 '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

# Solution
pd.to_datetime(ser)
0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]
# Extract the day of month, week number, day of year and weekday name from a date series
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', 
                 '2013/04/04', '2015-11-27', '2019-11-27T12:20'])

# Solution
from dateutil.parser import parse
ser_ts = ser.map(lambda x: parse(x))  # parse every string into a Timestamp

print("Day of month: ", ser_ts.dt.day.tolist())
print("Week of year: ", ser_ts.dt.weekofyear.tolist())
print("Day of year:  ", ser_ts.dt.dayofyear.tolist())
print("Weekday name: ", ser_ts.dt.weekday_name.tolist())
Day of month:  [1, 2, 3, 4, 27, 27]
Week of year:  [53, 5, 9, 14, 48, 48]
Day of year:   [1, 33, 63, 94, 331, 331]
Weekday name:  ['Friday', 'Wednesday', 'Saturday', 'Thursday', 'Friday', 'Wednesday']
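Note: weekofyear and weekday_name were removed from the .dt accessor in newer pandas releases. A rough equivalent for pandas >= 1.1, assuming the same ser_ts as above:

print("Week of year: ", ser_ts.dt.isocalendar().week.tolist())  # ISO week number
print("Weekday name: ", ser_ts.dt.day_name().tolist())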
# Given year-month strings, parse them as dates with the day set to the 4th
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])

# Solution
from dateutil.parser import parse
ser.map(lambda x: parse('04 ' + x))
0   2010-01-04
1   2011-02-04
2   2012-03-04
dtype: datetime64[ns]
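An alternative sketch that avoids dateutil, assuming the strings really are in the 'Jan 2010' form shown above: prepend the day and give pd.to_datetime an explicit format.

ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])
print(pd.to_datetime('04 ' + ser, format='%d %b %Y'))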
# From ser, extract the words that contain at least two vowels
ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])

# Solution
from collections import Counter
mask = ser.map(lambda x: sum([Counter(x.lower()).get(i, 0) 
                              for i in list('aeiou')]) >= 2)
ser[mask]
0     Apple
1    Orange
4     Money
dtype: object
import re
# Filter the valid email addresses from the series
emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com',
                    'matt@t.co', 'narendra@modi.com'])
pattern = '[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'

# Solution
emails.str.findall(pattern, flags=re.IGNORECASE)
0                     []
1    [rameses@egypt.com]
2            [matt@t.co]
3    [narendra@modi.com]
dtype: object
# Compute the mean of weights for each fruit
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
weights = pd.Series(np.linspace(1, 10, 10))

# Solution
weights.groupby(fruit).mean()
apple     5.428571
banana    7.500000
carrot    2.000000
dtype: float64
# Compute the Euclidean distance between two series
p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

# Solution
result = sum((p - q)**2)**.5  # method 1
print(result)
result = np.linalg.norm(p - q)  # method 2
result
18.16590212458495





18.16590212458495
# Find all the local maxima (peaks) in a numeric series
ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])

# Solution
dd = np.diff(np.sign(np.diff(ser)))
print(dd)  # -2 marks a point whose neighbours on both sides are smaller
peak_locs = np.where(dd == -2)[0] + 1
print(peak_locs)
[-2  2  0  0 -2  2 -2]
[1 5 7]
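If SciPy is available, scipy.signal.find_peaks locates the same local maxima without the double diff; a sketch under that assumption:

from scipy.signal import find_peaks

ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])
peaks, _ = find_peaks(ser.values)  # positions of local maxima
print(peaks)                       # [1 5 7], the same positions as above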
# Replace the spaces in the string with the least frequent character
my_str = 'dbc deb abed gade'
ser = pd.Series(list('dbc deb abed gade'))

# Solution
freq = ser.value_counts()  # frequency of each character (the space is counted too)
print(freq)
least_freq = freq.dropna().index[-1]  # character with the lowest frequency
result = "".join(ser.replace(' ', least_freq))
result
d    4
     3
e    3
b    3
a    2
g    1
c    1
dtype: int64





'dbccdebcabedcgade'
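One caveat with the solution above: value_counts also counts the space character itself, so on a different input string the "least frequent character" could turn out to be the space. A sketch that drops it first, reusing the ser defined above:

freq_no_space = ser.value_counts().drop(' ', errors='ignore')  # ignore the space itself
least_freq = freq_no_space.index[-1]
print("".join(ser.replace(' ', least_freq)))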
# Create a TimeSeries that starts on 2000-01-01 and contains the following 10 weekends (Saturdays),
# using random integers as the values
ser = pd.Series(np.random.randint(1,10,10), 
                pd.date_range('2000-01-01', periods=10, freq='W-SAT'))
print(ser)

2000-01-01    7
2000-01-08    7
2000-01-15    3
2000-01-22    1
2000-01-29    4
2000-02-05    8
2000-02-12    7
2000-02-19    8
2000-02-26    3
2000-03-04    4
Freq: W-SAT, dtype: int32
# ser is missing some dates and values. Insert all the missing dates
# and fill them with the value from the previous date.
ser = pd.Series([1, 10, 3, np.nan], index=pd.to_datetime([
                    '2000-01-01', '2000-01-03', '2000-01-06', '2000-01-08']))
print(ser)
result = ser.resample('D').ffill()  # forward fill: use the previous date's value
print(result)
# ser.resample('D').bfill()          # backward fill: use the next date's value
# ser.resample('D').bfill().ffill()  # backward fill, then forward fill the trailing NaN (it becomes 3.0)
2000-01-01     1.0
2000-01-03    10.0
2000-01-06     3.0
2000-01-08     NaN
dtype: float64
2000-01-01     1.0
2000-01-02     1.0
2000-01-03    10.0
2000-01-04    10.0
2000-01-05    10.0
2000-01-06     3.0
2000-01-07     3.0
2000-01-08     NaN
Freq: D, dtype: float64
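The same daily re-indexing can also be written with asfreq; a small sketch on the ser defined above (the pre-existing NaN on 2000-01-08 is left untouched, just as with resample):

print(ser.asfreq('D', method='ffill'))  # build the full daily index and forward-fill the newly created dates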
# Import every nth row of a csv file to create a DataFrame
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', chunksize=50)

df2 = pd.concat([chunk.iloc[0] for chunk in df], axis=1)
df2 = df2.transpose()  # transpose back to one row per chunk
df2
         crim    zn  indus  chas    nox     rm    age     dis   rad    tax  ptratio       b  lstat  medv
0     0.00632  18.0   2.31   0.0  0.538  6.575   65.2  4.0900   1.0  296.0     15.3  396.90   4.98  24.0
50    0.08873  21.0   5.64   0.0  0.439  5.963   45.7  6.8147   4.0  243.0     16.8  395.56  13.45  19.7
100   0.14866   0.0   8.56   0.0  0.520  6.727   79.9  2.7778   5.0  384.0     20.9  394.76   9.42  27.5
150   1.65660   0.0  19.58   0.0  0.871  6.122   97.3  1.6180   5.0  403.0     14.7  372.80  14.10  21.5
200   0.01778  95.0   1.47   0.0  0.403  7.135   13.9  7.6534   3.0  402.0     17.0  384.30   4.45  32.9
250   0.14030  22.0   5.86   0.0  0.431  6.487   13.0  7.3967   7.0  330.0     19.1  396.28   5.90  24.4
300   0.04417  70.0   2.24   0.0  0.400  6.871   47.4  7.8278   5.0  358.0     14.8  390.86   6.07  24.8
350   0.06211  40.0   1.25   0.0  0.429  6.490   44.4  8.7921   1.0  335.0     19.7  396.90   5.98  22.9
400  25.04610   0.0  18.10   0.0  0.693  5.987  100.0  1.5888  24.0  666.0     20.2  396.90  26.77   5.6
450   6.71772   0.0  18.10   0.0  0.713  6.749   92.6  2.3236  24.0  666.0     20.2    0.32  17.44  13.4
500   0.22438   0.0   9.69   0.0  0.585  6.027   79.7  2.4982   6.0  391.0     19.2  396.90  14.33  16.8
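read_csv can also do this in a single pass with a callable skiprows; a sketch under the assumption that row 0 is the header and, as above, every 50th data row is kept (url and df2_alt are names introduced here):

url = 'https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv'
df2_alt = pd.read_csv(url, skiprows=lambda i: i != 0 and (i - 1) % 50 != 0)  # keep header + data rows 0, 50, 100, ...
print(df2_alt.shape)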
# Change column values while importing a csv into a DataFrame
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', 
                converters={'medv': lambda x: 'High' if float(x) > 25 else 'Low'})
df
        crim    zn  indus  chas    nox     rm   age     dis  rad  tax  ptratio       b  lstat  medv
0    0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296     15.3  396.90   4.98   Low
1    0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242     17.8  396.90   9.14   Low
2    0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242     17.8  392.83   4.03  High
3    0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222     18.7  394.63   2.94  High
4    0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222     18.7  396.90   5.33  High
..       ...   ...    ...   ...    ...    ...   ...     ...  ...  ...      ...     ...    ...   ...
501  0.06263   0.0  11.93     0  0.573  6.593  69.1  2.4786    1  273     21.0  391.99   9.67   Low
502  0.04527   0.0  11.93     0  0.573  6.120  76.7  2.2875    1  273     21.0  396.90   9.08   Low
503  0.06076   0.0  11.93     0  0.573  6.976  91.0  2.1675    1  273     21.0  396.90   5.64   Low
504  0.10959   0.0  11.93     0  0.573  6.794  89.3  2.3889    1  273     21.0  393.45   6.48   Low
505  0.04741   0.0  11.93     0  0.573  6.030  80.8  2.5050    1  273     21.0  396.90   7.88   Low

506 rows × 14 columns

# Create a DataFrame whose rows are strides (sliding windows) of a given series
L = pd.Series(range(15))

def gen_strides(a, stride_len=5, window_len=5):
    # number of windows of length window_len that fit when stepping by stride_len
    n_strides = ((a.size - window_len)//stride_len) + 1
    return np.array([a[s:(s + window_len)] for s in np.arange(0, a.size, stride_len)[:n_strides]])

gen_strides(L, stride_len=2, window_len=4)

array([[ 0,  1,  2,  3],
       [ 2,  3,  4,  5],
       [ 4,  5,  6,  7],
       [ 6,  7,  8,  9],
       [ 8,  9, 10, 11],
       [10, 11, 12, 13]], dtype=int64)
# Import only the specified columns from a csv file
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv',
                 usecols=['crim', 'medv'])
df.head()
      crim  medv
0  0.00632  24.0
1  0.02731  21.6
2  0.02729  34.7
3  0.03237  33.4
4  0.06905  36.2
# Get the number of rows and columns, the dtypes and summary statistics of a DataFrame
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
# print(df.shape)                  # number of rows and columns
# print(df.dtypes)                 # dtype of each column
# print(df.dtypes.value_counts())  # number of columns of each dtype
# print(df.values.tolist())        # values as a list of lists
df.describe()  # summary statistics
       Min.Price      Price  Max.Price   MPG.city  MPG.highway  EngineSize
count  86.000000  91.000000  88.000000  84.000000    91.000000   91.000000
mean   17.118605  19.616484  21.459091  22.404762    29.065934    2.658242
std     8.828290   9.724280  10.696563   5.841520     5.370293    1.045845
min     6.700000   7.400000   7.900000  15.000000    20.000000    1.000000
25%    10.825000  12.350000  14.575000  18.000000    26.000000    1.800000
50%    14.600000  17.700000  19.150000  21.000000    28.000000    2.300000
75%    20.250000  23.500000  24.825000  25.000000    31.000000    3.250000
max    45.400000  61.900000  80.000000  46.000000    50.000000    5.700000

       Horsepower          RPM  Rev.per.mile  Fuel.tank.capacity  Passengers
count   86.000000    90.000000     87.000000           85.000000   91.000000
mean   144.000000  5276.666667   2355.000000           16.683529    5.076923
std     53.455204   605.554811    486.916616            3.375748    1.045953
min     55.000000  3800.000000   1320.000000            9.200000    2.000000
25%    100.750000  4800.000000   2017.500000           14.500000    4.000000
50%    140.000000  5200.000000   2360.000000           16.500000    5.000000
75%    170.000000  5787.500000   2565.000000           19.000000    6.000000
max    300.000000  6500.000000   3755.000000           27.000000    8.000000

           Length   Wheelbase      Width  Turn.circle  Rear.seat.room
count   89.000000   92.000000  87.000000    88.000000       89.000000
mean   182.865169  103.956522  69.448276    38.954545       27.853933
std     14.792651    6.856317   3.778023     3.304157        3.018129
min    141.000000   90.000000  60.000000    32.000000       19.000000
25%    174.000000   98.000000  67.000000    36.000000       26.000000
50%    181.000000  103.000000  69.000000    39.000000       27.500000
75%    192.000000  110.000000  72.000000    42.000000       30.000000
max    219.000000  119.000000  78.000000    45.000000       36.000000

       Luggage.room       Weight
count     74.000000    86.000000
mean      13.986486  3104.593023
std        3.120824   600.129993
min        6.000000  1695.000000
25%       12.000000  2647.500000
50%       14.000000  3085.000000
75%       16.000000  3567.500000
max       22.000000  4105.000000
# Which Manufacturer, Model and Type has the highest Price?
# What are the row and column numbers of the cell with the highest Price?
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

max_price = df.loc[df.Price == np.max(df.Price), ['Manufacturer', 'Model', 'Type']]
print(max_price)

row, col = np.where(df.values == np.max(df.Price))
print(row, col)

# print(df.iat[row[0], col[0]])  # value of the first matching cell (positional access)
print(df.iloc[row[0], col[0]])   # value of the first matching cell (positional access)

print(df.at[row[0], 'Price'])    # value of the Price column in that row (label access)
     Manufacturer Model     Type
58  Mercedes-Benz  300E  Midsize
[58] [4]
61.9
61.9
# Rename specific columns in a DataFrame
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
# print(df.columns)
df.columns = df.columns.map(lambda x: x.replace('.', '_'))
print(df.columns)
Index(['Manufacturer', 'Model', 'Type', 'Min_Price', 'Price', 'Max_Price',
       'MPG_city', 'MPG_highway', 'AirBags', 'DriveTrain', 'Cylinders',
       'EngineSize', 'Horsepower', 'RPM', 'Rev_per_mile', 'Man_trans_avail',
       'Fuel_tank_capacity', 'Passengers', 'Length', 'Wheelbase', 'Width',
       'Turn_circle', 'Rear_seat_room', 'Luggage_room', 'Weight', 'Origin',
       'Make'],
      dtype='object')
# Check whether the DataFrame has any missing values
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

df.isnull().values.any()
True
# Count the missing values in each column. Which column has the most missing values?
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

count = df.apply(lambda x: x.isnull().sum())
print(count)           # number of missing values per column
print(count.idxmax())  # column with the most missing values
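The same counts can be obtained without apply; a one-line sketch on the same df:

print(df.isnull().sum())           # missing values per column
print(df.isnull().sum().idxmax())  # column with the most missing values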
# Select a specific column from a DataFrame as a DataFrame instead of as a Series
df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))
print(df)

print(type(df[['a']]))
print(type(df.loc[:, ['a']]))
print(type(df.iloc[:, [0]]))

print(type(df.a))
print(type(df['a']))
print(type(df.loc[:, 'a']))
print(type(df.iloc[:, 1]))
    a   b   c   d   e
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))
print(df)  # before swapping
# print(df[list('cbade')])  # swap two columns by reordering them (here a and c)

# Custom function that swaps two columns
def switch_columns(df, col_1=None, col_2=None):
    colnames = df.columns.tolist()
    i_1, i_2 = colnames.index(col_1), colnames.index(col_2)
    colnames[i_2], colnames[i_1] = colnames[i_1], colnames[i_2]
    return df[colnames]

df1 = switch_columns(df, 'a', 'c')  # swap columns a and c
print(df1)
print(sorted(df.columns, reverse=True))  # column names in descending order

    a   b   c   d   e
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
    c   b   a   d   e
0   2   1   0   3   4
1   7   6   5   8   9
2  12  11  10  13  14
3  17  16  15  18  19
['e', 'd', 'c', 'b', 'a']
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
pd.set_option("display.max_columns",10)
pd.set_option('display.max_rows',10)
df
   Manufacturer    Model     Type  Min.Price  Price  ...  Rear.seat.room  Luggage.room  Weight   Origin                Make
0         Acura  Integra    Small       12.9   15.9  ...            26.5           NaN  2705.0  non-USA       Acura Integra
1           NaN   Legend  Midsize       29.2   33.9  ...            30.0          15.0  3560.0  non-USA        Acura Legend
2          Audi       90  Compact       25.9   29.1  ...            28.0          14.0  3375.0  non-USA             Audi 90
3          Audi      100  Midsize        NaN   37.7  ...            31.0          17.0  3405.0  non-USA            Audi 100
4           BMW     535i  Midsize        NaN   30.0  ...            27.0          13.0  3640.0  non-USA            BMW 535i
..          ...      ...      ...        ...    ...  ...             ...           ...     ...      ...                 ...
88   Volkswagen  Eurovan      Van       16.6   19.7  ...            34.0           NaN  3960.0      NaN  Volkswagen Eurovan
89   Volkswagen   Passat  Compact       17.6   20.0  ...            31.5          14.0  2985.0  non-USA   Volkswagen Passat
90   Volkswagen  Corrado   Sporty       22.9   23.3  ...            26.0          15.0  2810.0  non-USA  Volkswagen Corrado
91        Volvo      240  Compact       21.8   22.7  ...            29.5          14.0  2985.0  non-USA           Volvo 240
92          NaN      850  Midsize       24.8   26.7  ...            30.0          15.0  3245.0  non-USA           Volvo 850

93 rows × 27 columns

df = pd.DataFrame(np.random.random(4), columns=['random'])
# Format or suppress scientific notation in a pandas DataFrame
print(df.round(4))  # show only four decimal places
# Format all the values in the DataFrame as percentages
out = df.style.format({
    'random': '{0:.2%}'.format,
})
out
   random
0  0.8620
1  0.7903
2  0.0159
3  0.5417
  random
0 86.20%
1 79.03%
2  1.59%
3 54.17%
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

df.iloc[::20, :][['Manufacturer', 'Model', 'Type']]  # take every 20th row
   Manufacturer    Model     Type
0         Acura  Integra    Small
20     Chrysler  LeBaron  Compact
40        Honda  Prelude   Sporty
60      Mercury   Cougar  Midsize
80       Subaru   Loyale    Small
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv', 
                 usecols=[0,1,2,3,5])
# df.head()
df[['Manufacturer', 'Model', 'Type']] = df[['Manufacturer', 'Model', 'Type']].fillna('missing')
# df.head()
df.index = df.Manufacturer + '_' + df.Model + '_' + df.Type
print(df.index.is_unique)  # the combination of these columns forms a unique (primary-key) index
# df.head()
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randint(1, 30, 30).reshape(10,-1), columns=list('abc'))

print(df['a'])
print(df['a'].argsort())    # for each sorted position, the row that holds that value
df['a'].argsort()[::-1][5]  # row number of the n-th largest value in column 'a'
0    25
1    16
2    12
3     8
4     6
5    17
6    15
7    24
8    16
9    28
Name: a, dtype: int32
0    4
1    3
2    2
3    6
4    1
5    8
6    5
7    7
8    0
9    9
Name: a, dtype: int64





8
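Because scalar [] lookup on an integer-indexed Series is label-based, the [::-1] above does not change which element [5] selects. A more explicit sketch for "row label of the nth largest value", with n assumed to be 6 to mirror the [5] above:

n = 6  # assumption: the 6th largest value
print(df['a'].sort_values(ascending=False).index[n - 1])  # row label holding the nth largest value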
ser = pd.Series(np.random.randint(1, 100, 15))

print('ser: ', ser.tolist(), 'mean: ', round(ser.mean()))
np.argwhere(ser > ser.mean())  # positions in ser where the value is greater than the mean
ser:  [54, 77, 49, 74, 24, 95, 94, 14, 7, 50, 69, 65, 72, 72, 58] mean:  58.0





array([[ 1],
       [ 3],
       [ 5],
       [ 6],
       [10],
       [11],
       [12],
       [13]], dtype=int64)
# Get the last two rows of the DataFrame whose row sum is greater than 100
df = pd.DataFrame(np.random.randint(10, 40, 60).reshape(-1, 4))

rowsums = df.apply(np.sum, axis=1)      # sum of each row
print(np.where(rowsums > 100)[0][-2:])  # positions of the last two rows with sum > 100
last_two_rows = df.iloc[np.where(rowsums > 100)[0][-2:], :]
last_two_rows
[11 14]
     0   1   2   3
11  27  32  22  32
14  21  35  37  30
# Replace everything below the 5th percentile and above the 95th percentile of ser
# with the 5th and 95th percentile values respectively.
ser = pd.Series(np.logspace(-2, 2, 30))

def cap_outliers(ser, low_perc, high_perc):
    low, high = ser.quantile([low_perc, high_perc])
    print(low_perc, '%ile: ', low, '|', high_perc, '%ile: ', high)
    ser[ser < low] = low
    ser[ser > high] = high
    return ser

capped_ser = cap_outliers(ser, .05, .95)
print(capped_ser)
0.05 %ile:  0.016049294076965887 | 0.95 %ile:  63.876672220183934
0      0.016049
1      0.016049
2      0.018874
3      0.025929
4      0.035622
5      0.048939
6      0.067234
7      0.092367
8      0.126896
9      0.174333
10     0.239503
11     0.329034
12     0.452035
13     0.621017
14     0.853168
15     1.172102
16     1.610262
17     2.212216
18     3.039195
19     4.175319
20     5.736153
21     7.880463
22    10.826367
23    14.873521
24    20.433597
25    28.072162
26    38.566204
27    52.983169
28    63.876672
29    63.876672
dtype: float64
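pandas can do the same capping with Series.clip; a sketch using the same 5% and 95% quantiles (on a fresh copy, since cap_outliers modified ser in place):

ser = pd.Series(np.logspace(-2, 2, 30))
low, high = ser.quantile([0.05, 0.95])
print(ser.clip(lower=low, upper=high).head())  # same effect as cap_outliers above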
# Reshape the DataFrame into the largest possible square after removing negative values
df = pd.DataFrame(np.random.randint(-20, 50, 100).reshape(10,-1))
# print(df)
arr = df[df > 0].values.flatten()    # flatten; values that are not positive become nan
# print(arr)
arr_qualified = arr[~np.isnan(arr)]  # drop the nans
print(arr_qualified)
# largest n such that n*n values are available
n = int(np.floor(arr_qualified.shape[0]**.5))
# positions that would sort the remaining values
top_indexes = np.argsort(arr_qualified)[::]
# print(top_indexes)
print(top_indexes[:n**2])
# take the first n*n of those positions, restore their original order and reshape into an n x n square
output = np.take(arr_qualified, sorted(top_indexes[:n**2])).reshape(n, -1)
print(output)

[ 6. 29. 48. 22. 14. 10. 49.  9. 18. 42. 31. 42. 16. 35. 45. 10.  2. 27.
 48.  2. 16. 48. 22. 12. 23. 13. 34. 38. 18. 10. 12. 48. 39. 18. 49. 24.
 35. 13. 16. 30. 35. 22. 44. 46.  8. 30.  1.  5. 30.  7. 15. 22.  6. 43.
 47.  8. 32. 21. 46.  5. 20. 39.  9. 17.]
[46 19 16 59 47  0 52 49 44 55 62  7  5 15 29 30 23 25 37  4 50 12 38 20
 63  8 33 28 60 57 22 51  3 41 24 35 17  1 45 39 48 10 56 26 40 13 36 27
 32 61 11  9 53 42 14 43 58 54  2 18 21 31 34  6]
[[ 6. 29. 48. 22. 14. 10. 49.  9.]
 [18. 42. 31. 42. 16. 35. 45. 10.]
 [ 2. 27. 48.  2. 16. 48. 22. 12.]
 [23. 13. 34. 38. 18. 10. 12. 48.]
 [39. 18. 49. 24. 35. 13. 16. 30.]
 [35. 22. 44. 46.  8. 30.  1.  5.]
 [30.  7. 15. 22.  6. 43. 47.  8.]
 [32. 21. 46.  5. 20. 39.  9. 17.]]
df=pd.DataFrame(np.arange(25).reshape(5,-1))
print(df)

# Swap two rows of a DataFrame
def swap_rows(df,i1,i2):
    df.iloc[i1,:],df.iloc[i2,:]=df.iloc[i2,:].copy(),df.iloc[i1,:].copy()
    return df
result=swap_rows(df,1,2)
result
    0   1   2   3   4
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
4  20  21  22  23  24
    0   1   2   3   4
0   0   1   2   3   4
1  10  11  12  13  14
2   5   6   7   8   9
3  15  16  17  18  19
4  20  21  22  23  24
df = pd.DataFrame(np.arange(25).reshape(5, -1))
print(df)
df.iloc[::-1, :]  # reverse the rows of the DataFrame
    0   1   2   3   4
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
4  20  21  22  23  24
    0   1   2   3   4
4  20  21  22  23  24
3  15  16  17  18  19
2  10  11  12  13  14
1   5   6   7   8   9
0   0   1   2   3   4
df = pd.DataFrame(np.arange(25).reshape(5,-1), columns=list('abcde'))
print(df)
# One-hot encode column 'a' of df and keep the other columns alongside it
result = pd.get_dummies(df['a'])
df_onehot = pd.concat([result, df[list('bcde')]], axis=1)  # concatenate along columns
df_onehot
    a   b   c   d   e
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
4  20  21  22  23  24
   0  5  10  15  20   b   c   d   e
0  1  0   0   0   0   1   2   3   4
1  0  1   0   0   0   6   7   8   9
2  0  0   1   0   0  11  12  13  14
3  0  0   0   1   0  16  17  18  19
4  0  0   0   0   1  21  22  23  24
# Which column contains the highest number of row-wise maximum values?
df = pd.DataFrame(np.random.randint(1,100, 40).reshape(10, -1))
print(df)
print(df.apply(np.argmax, axis=1))                          # column position of each row's maximum
print(df.apply(np.argmax, axis=1).value_counts())           # how often each column holds the row maximum
print(df.apply(np.argmax, axis=1).value_counts().index[0])  # the column that holds the row maximum most often
    0   1   2   3
0  10  87  19  43
1   5  83  50  80
2  19  24  10  77
3  36  15  95  78
4   8  20  89  48
5  17  17  81  46
6  88  74  52  72
7  91  53  36  61
8  25  53  22  90
9   3  93  86  63
0    1
1    1
2    3
3    2
4    2
5    2
6    0
7    0
8    3
9    1
dtype: int64
2    3
1    3
3    2
0    2
dtype: int64
2
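np.argmax inside apply returns positional indices; the built-in idxmax returns the column labels directly. A sketch on the same df:

row_max_col = df.idxmax(axis=1)             # column label of each row's maximum
print(row_max_col.value_counts().index[0])  # column that holds the row maximum most often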
# For each column, find the maximum absolute correlation with the other columns
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1), 
                 index=list('abcdefgh'), columns=list('pqrstuvwxy'))
print(df)
print(df.corr())                 # correlation matrix between the columns
abs_corrmat = np.abs(df.corr())  # absolute values of the correlations
print(abs_corrmat)
max_corr = abs_corrmat.apply(lambda x: sorted(x)[-2])  # second largest per column (the largest is the self-correlation of 1)
print(max_corr)
print(np.round(max_corr.tolist(), 2))  # rounded to two decimal places
    p   q   r   s   t   u   v   w   x   y
a  41  72   5  31  67  26  45  65  21  60
b  15  56  72  91  99  32  38  14  52  36
c   7  92  96  84  26  79  81  12  75  50
d  73  46  42  15  80  76  10  34  45   5
e  15  72  55  14  17  54   9  35  36  18
f  12  73  47  84  85   9  31  67  13  64
g  25  43  56  76  62  43  93  25  53  99
h  80  70  30  68  40  74   2  41   7  47
          p         q         r         s         t         u         v  \
p  1.000000 -0.348616 -0.606492 -0.406070  0.056932  0.461015 -0.543905   
q -0.348616  1.000000  0.213753  0.159442 -0.554585  0.088275  0.023752   
r -0.606492  0.213753  1.000000  0.501096 -0.190063  0.290160  0.440666   
s -0.406070  0.159442  0.501096  1.000000  0.260183 -0.243481  0.505580   
t  0.056932 -0.554585 -0.190063  0.260183  1.000000 -0.568596 -0.009954   
u  0.461015  0.088275  0.290160 -0.243481 -0.568596  1.000000 -0.118254   
v -0.543905  0.023752  0.440666  0.505580 -0.009954 -0.118254  1.000000   
w  0.207508  0.125992 -0.797659 -0.285267  0.192809 -0.562124 -0.358931   
x -0.452943 -0.029441  0.798096  0.187066 -0.122999  0.376021  0.637259   
y -0.294716 -0.043568 -0.043181  0.579817  0.083632 -0.434149  0.729595   

          w         x         y  
p  0.207508 -0.452943 -0.294716  
q  0.125992 -0.029441 -0.043568  
r -0.797659  0.798096 -0.043181  
s -0.285267  0.187066  0.579817  
t  0.192809 -0.122999  0.083632  
u -0.562124  0.376021 -0.434149  
v -0.358931  0.637259  0.729595  
w  1.000000 -0.835494  0.145546  
x -0.835494  1.000000 -0.030812  
y  0.145546 -0.030812  1.000000  
          p         q         r         s         t         u         v  \
p  1.000000  0.348616  0.606492  0.406070  0.056932  0.461015  0.543905   
q  0.348616  1.000000  0.213753  0.159442  0.554585  0.088275  0.023752   
r  0.606492  0.213753  1.000000  0.501096  0.190063  0.290160  0.440666   
s  0.406070  0.159442  0.501096  1.000000  0.260183  0.243481  0.505580   
t  0.056932  0.554585  0.190063  0.260183  1.000000  0.568596  0.009954   
u  0.461015  0.088275  0.290160  0.243481  0.568596  1.000000  0.118254   
v  0.543905  0.023752  0.440666  0.505580  0.009954  0.118254  1.000000   
w  0.207508  0.125992  0.797659  0.285267  0.192809  0.562124  0.358931   
x  0.452943  0.029441  0.798096  0.187066  0.122999  0.376021  0.637259   
y  0.294716  0.043568  0.043181  0.579817  0.083632  0.434149  0.729595   

          w         x         y  
p  0.207508  0.452943  0.294716  
q  0.125992  0.029441  0.043568  
r  0.797659  0.798096  0.043181  
s  0.285267  0.187066  0.579817  
t  0.192809  0.122999  0.083632  
u  0.562124  0.376021  0.434149  
v  0.358931  0.637259  0.729595  
w  1.000000  0.835494  0.145546  
x  0.835494  1.000000  0.030812  
y  0.145546  0.030812  1.000000  
p    0.606492
q    0.554585
r    0.798096
s    0.579817
t    0.568596
u    0.568596
v    0.729595
w    0.835494
x    0.835494
y    0.729595
dtype: float64
[0.61 0.55 0.8  0.58 0.57 0.57 0.73 0.84 0.84 0.73]
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
print(df)
# For every row of df, compute the ratio of the minimum value to the maximum value
min_by_max = np.min(df, axis=1)/np.max(df, axis=1)
min_by_max
    0   1   2   3   4   5   6   7   8   9
0  85  63  99  34  13  14  64  33  58  16
1  64  45  77  68  19  45  61   2  11  15
2  78  66  76  51  51  52  20  53  35  64
3  68  85   2  81  52  66  14  28  41  34
4  37  40  99  62  57  70  37  15  14  56
5  13  88  12  51  43   1  54  18  70  67
6  55  19  79  43  19   8  52   6  15  77
7  79  93  54  68  78  61  80  33  72  92





0    0.131313
1    0.025974
2    0.256410
3    0.023529
4    0.141414
5    0.011364
6    0.075949
7    0.354839
dtype: float64
# Create a new column 'penultimate' that holds the second largest value of each row of df
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
result = df.apply(lambda x: x.sort_values().unique()[-2], axis=1)  # second largest unique value per row
df['penultimate'] = result
df
0123456789penultimate
05012772522974940271877
114527836757717435377
29253103955346389604189
398966508844619877588
4979575509160653245995
53138481915271845781
659719334940546048459
7902177443509823843090
# Normalize all columns of a DataFrame
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
print(df)

# Normalize every column of df by subtracting the column mean and dividing by the standard deviation
result = df.apply(lambda x: (
    (x - x.mean())/x.std()
).round(2))
result
    0   1   2   3   4   5   6   7   8   9
0  73  77  53  35   9  80  96  47  35  26
1  58  72  39  80  86  57  41  98  31  90
2  45  76  22  27   5  15  78  90  87  92
3  89  84  97  78  29  70  23  95  97  90
4  55  32  83  49  99  63  22  75  44  26
5  74  42  70  49  57  26  88  77   1   5
6  56  29  42  28  75  16  21  11  38  50
7  99  26  74  74  39  50  61   3  23   3
      0     1     2     3     4     5     6     7     8     9
0  0.24  0.90 -0.28 -0.79 -1.16  1.31  1.36 -0.40 -0.30 -0.57
1 -0.57  0.70 -0.83  1.24  1.03  0.39 -0.41  0.96 -0.42  1.10
2 -1.28  0.86 -1.51 -1.15 -1.28 -1.28  0.78  0.75  1.32  1.15
3  1.10  1.18  1.47  1.15 -0.59  0.91 -0.99  0.88  1.63  1.10
4 -0.74 -0.92  0.91 -0.16  1.40  0.63 -1.02  0.35 -0.02 -0.57
5  0.29 -0.52  0.40 -0.16  0.20 -0.84  1.10  0.40 -1.35 -1.11
6 -0.68 -1.04 -0.71 -1.10  0.72 -1.24 -1.05 -1.36 -0.20  0.06
7  1.64 -1.16  0.56  0.97 -0.31  0.11  0.23 -1.57 -0.67 -1.17
# Min-max scale all columns of a DataFrame
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
print(df)

# Rescale every column of df so that its minimum becomes 0 and its maximum becomes 1
result = df.apply(lambda x: (
    1 - (x.max() - x)/(x.max() - x.min())
).round(2))
result
    0   1   2   3   4   5   6   7   8   9
0  78   7  59   4  77  81  93  66  39  28
1  60  51  88  19  23  29  70  82  10  24
2   2  80   7  59  72  51  82  28  38  25
3  36  88   3   8  43   7  87  60  28  99
4  29  69  89  84  87  15  95  87  75  54
5  82  78  60  57  15  29  41  93  57  13
6  72  28  63   2  20  25   6  72  71  32
7  60   2  13  87  82  97  41  23  81  16
      0     1     2     3     4     5     6     7     8     9
0  0.95  0.06  0.65  0.02  0.86  0.82  0.98  0.61  0.41  0.17
1  0.72  0.57  0.99  0.20  0.11  0.24  0.72  0.84  0.00  0.13
2  0.00  0.91  0.05  0.67  0.79  0.49  0.85  0.07  0.39  0.14
3  0.43  1.00  0.00  0.07  0.39  0.00  0.91  0.53  0.25  1.00
4  0.34  0.78  1.00  0.96  1.00  0.09  1.00  0.91  0.92  0.48
5  1.00  0.88  0.66  0.65  0.00  0.24  0.39  1.00  0.66  0.00
6  0.88  0.30  0.70  0.00  0.07  0.20  0.00  0.70  0.86  0.22
7  0.72  0.00  0.12  1.00  0.93  1.00  0.39  0.00  1.00  0.03
# Compute the correlation of each row with the row that follows it
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
print(df)
print([i for i in range(df.shape[0])[:-1]])  # every row index except the last

result = [df.iloc[i].corr(df.iloc[i+1]) for i in range(df.shape[0])[:-1]]
result
    0   1   2   3   4   5   6   7   8   9
0  27   3  37   9  76  68  91  31  44   7
1  11  15  20  47  33  86  65  47   9  30
2  39   1  72  19  35  42  87  77  55  40
3  60   7   8  28  37  14  17   5   3   7
4  47  99  76  28  77  57  32  57  24  16
5   2  50  95  89  84  46  59  84   1   2
6  78  27  58  67  78   1   7  28  89  20
7  12  86  54  81  20  19  77   1   8  56
[0, 1, 2, 3, 4, 5, 6]





[0.5182965633327684,
 0.2595376913412023,
 -0.23874062518280761,
 0.005261734793477499,
 0.4687394611664755,
 -0.06555011633952691,
 -0.30907671467693215]
# Set both the main diagonal and the anti-diagonal of the DataFrame to 0
df = pd.DataFrame(np.random.randint(1,100, 100).reshape(10, -1))
rows = df.shape[0]  # number of rows
for i in range(rows):
    df.iat[i, i] = 0             # main diagonal
    df.iat[rows - i - 1, i] = 0  # anti-diagonal
df
0123456789
006592821015171320
179011992868248083
234403511918304129
384725065760252564
49814210002944084
57588270023627395
62343380364307656
7809608292796406167
829096967621947204
902627659519191900
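NumPy can zero both diagonals without an explicit Python loop; a sketch that fills the diagonals on the array before wrapping it in a DataFrame (np.fliplr returns a view, so filling its diagonal also writes into arr):

arr = np.random.randint(1, 100, 100).reshape(10, -1)
np.fill_diagonal(arr, 0)             # main diagonal
np.fill_diagonal(np.fliplr(arr), 0)  # anti-diagonal
df = pd.DataFrame(arr)
print(df)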
# Get a particular group of a groupby DataFrame by its key
df = pd.DataFrame({'col1': ['apple', 'banana', 'orange'] * 3,
                   'col2': np.random.rand(9),
                   'col3': np.random.randint(0, 15, 9)})
print(df)
# group by df['col1'] and show the rows that belong to the 'apple' group
df.groupby(df['col1']).get_group('apple')
     col1      col2  col3
0   apple  0.703158    12
1  banana  0.535815    13
2  orange  0.177147     8
3   apple  0.159570     2
4  banana  0.411271    10
5  orange  0.279007    11
6   apple  0.576264     4
7  banana  0.578607     9
8  orange  0.242959     6
    col1      col2  col3
0  apple  0.703158    12
3  apple  0.159570     2
6  apple  0.576264     4
# How to get the nth largest value of a column when grouped by another column
df = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 4,
                   'taste': np.random.rand(12),
                   'price': np.random.randint(0, 15, 12)})
# for the 'banana' group of df['fruit'], look at the matching df['taste'] values
banana = df['taste'].groupby(df['fruit']).get_group('banana')
print(banana)
print("Second largest:", banana.sort_values().iloc[-2])  # sort ascending, take the second to last value
df
1     0.209485
4     0.549818
7     0.498802
10    0.006632
Name: taste, dtype: float64
Second largest: 0.4988018517868045
     fruit     taste  price
0    apple  0.510446      7
1   banana  0.209485      1
2   orange  0.632166      1
3    apple  0.865764      4
4   banana  0.549818      9
5   orange  0.744718      5
6    apple  0.069171      0
7   banana  0.498802     14
8   orange  0.011808      2
9    apple  0.103222     13
10  banana  0.006632      6
11  orange  0.017787     13
# Compute the grouped mean on a DataFrame and keep the grouping column as a column (not as the index)
df = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                   'rating': np.random.rand(9),
                   'price': np.random.randint(0, 15, 9)})
print(df)
# mean price for each value of df['fruit']
out = df.groupby(df['fruit'], as_index=False)['price'].mean()
out
    fruit    rating  price
0   apple  0.090672      2
1  banana  0.019506      0
2  orange  0.354463      5
3   apple  0.466694     14
4  banana  0.807733      8
5  orange  0.488868      4
6   apple  0.640913      8
7  banana  0.977691      8
8  orange  0.390033      0
    fruit     price
0   apple  8.000000
1  banana  5.333333
2  orange  3.000000
# Join two DataFrames on two columns so that only the common rows are kept
df1 = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                    'weight': ['high', 'medium', 'low'] * 3,
                    'price': np.random.randint(0, 15, 9)})

df2 = pd.DataFrame({'pazham': ['apple', 'orange', 'pine'] * 2,
                    'pounds': ['high', 'low'] * 3,
                    'price': np.random.randint(0, 15, 6)})
print(df1)
print(df2)
pd.merge(df1, df2, how='inner', 
         left_on=['fruit', 'weight'], 
         right_on=['pazham', 'pounds'], 
         suffixes=['_left', '_right'])
    fruit  weight  price
0   apple    high     13
1  banana  medium      5
2  orange     low     14
3   apple    high      2
4  banana  medium      2
5  orange     low      6
6   apple    high      7
7  banana  medium      1
8  orange     low      6
   pazham pounds  price
0   apple   high     10
1  orange    low     14
2    pine   high     12
3   apple    low     12
4  orange   high     11
5    pine    low      6
    fruit weight  price_left  pazham pounds  price_right
0   apple   high          13   apple   high           10
1   apple   high           2   apple   high           10
2   apple   high           7   apple   high           10
3  orange    low          14  orange    low           14
4  orange    low           6  orange    low           14
5  orange    low           6  orange    low           14
# Get the positions where the values of two columns match
df = pd.DataFrame({'fruit1': np.random.choice(['apple', 'orange', 'banana'], 10),
                    'fruit2': np.random.choice(['apple', 'orange', 'banana'], 10)})
print(df)
np.where(df.fruit1 == df.fruit2)[0]  # positions where the two columns hold the same value
   fruit1  fruit2
0  orange  orange
1  orange   apple
2  orange   apple
3  banana  banana
4   apple  banana
5  orange  orange
6  orange  banana
7  banana  orange
8   apple  banana
9  orange  banana





array([0, 3, 5], dtype=int64)
# Create lags and leads of columns in a DataFrame
df = pd.DataFrame(np.random.randint(1, 100, 20).reshape(-1, 4),
                  columns = list('abcd'))

print(df)
df['a_lag'] = df['a'].shift(1)    # lag: each row gets the previous value of column a
print(df)
df['b_lead'] = df['b'].shift(-1)  # lead: each row gets the next value of column b
df
    a   b   c   d
0  31  79  72  32
1   8  18  82  25
2  98  23  41  79
3   2  87  74  76
4  16  89  12  86
    a   b   c   d  a_lag
0  31  79  72  32    NaN
1   8  18  82  25   31.0
2  98  23  41  79    8.0
3   2  87  74  76   98.0
4  16  89  12  86    2.0
    a   b   c   d  a_lag  b_lead
0  31  79  72  32    NaN    18.0
1   8  18  82  25   31.0    23.0
2  98  23  41  79    8.0    87.0
3   2  87  74  76   98.0    89.0
4  16  89  12  86    2.0     NaN
# Get the frequency of the unique values in the entire DataFrame
df = pd.DataFrame(np.random.randint(1, 10, 20).reshape(-1, 4), 
                  columns = list('abcd'))
ravel = df.values.ravel()  # flatten all values into a 1-D array
print(ravel)
pd.value_counts(ravel)     # frequency of each value
[6 1 6 8 7 8 3 4 1 8 7 7 1 1 9 8 5 8 5 5]





8    5
1    4
7    3
5    3
6    2
9    1
4    1
3    1
dtype: int64
# Split a text column into separate columns
df = pd.DataFrame(["STD, City,State",
                    "33, Kolkata,West Bengal",
                    "44, Chennai,Tamil Nadu",
                    "40, Hyderabad  ,Telengana",
                    "80, Bangalore,Karnataka"],
                  columns=['row'])
# print(df)
# split every row on ","
df_out = df.row.str.split(",", expand=True)
# print(df_out)

new_header = df_out.iloc[0]  # the first row holds the column names
df_out = df_out[1:]          # the remaining rows hold the data

df_out.columns = new_header  # use the first row as the header
# print(new_header)
print(df_out)
0 STD          City        State
1  33       Kolkata  West Bengal
2  44       Chennai   Tamil Nadu
3  40   Hyderabad      Telengana
4  80     Bangalore    Karnataka