Pandas - Review

最新推荐文章于 2022-10-29 05:45:00 发布

HoneyGrapefruit

最新推荐文章于 2022-10-29 05:45:00 发布

阅读量803

点赞数

文章标签： python 数据挖掘 机器学习

本文链接：https://blog.csdn.net/Lemon_Review/article/details/121805899

版权

title: Pandas回顾及应用
category: 数据分析基础

Pandas 的应用

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Use a font that contains CJK glyphs so Chinese labels and titles render in plots.
plt.rcParams['font.sans-serif'] = ['STFangsong']
# Draw the minus sign as an ASCII hyphen instead of the Unicode minus
# (presumably because the CJK font lacks that glyph).
plt.rcParams['axes.unicode_minus'] = False

# IPython/Jupyter magic (not plain Python): render inline figures as SVG.
%config InlineBackend.figure_format = 'svg'

Series

创建Series对象

# Option 1: build a Series from a list of values plus an explicit index.
# Index labels (the quarter names) may repeat; looking up a repeated label
# then returns a new Series rather than a single scalar.
ser1 = pd.Series(
    data=[122, 98, 45, 79],
    index=['一季度', '二季度', '三季度', '四季度'],
)
ser1

一季度    122
二季度     98
三季度     45
四季度     79
dtype: int64

# Option 2: build a Series from a dict; the keys become the index.
# A dict literal cannot hold duplicate keys, so when a key is repeated the
# later value overwrites the earlier one.
ser2 = pd.Series({'一季度': 999, '二季度': 998, '三季度': 997, '四季度': 996})
ser2

一季度    999
二季度    998
三季度    997
四季度    996
dtype: int64

# Slicing
# Slicing by integer position is half-open: the stop position is excluded.
ser2[0:2]

一季度    999
二季度    998
dtype: int64

# Slicing by user-supplied index labels is closed on both ends: the stop label is included.
ser2['一季度':'三季度']

一季度    999
二季度    998
三季度    997
dtype: int64

# Boolean indexing: keep only the entries for which the mask is True.
ser1[ser1 > 100]

一季度    122
dtype: int64

# Fancy indexing: pick several entries at once.
# Labels go through [] directly; integer positions go through .iloc, because
# [] with integer keys on a string index relies on a deprecated positional
# fallback in Series.__getitem__.
ser1[['一季度', '四季度']], ser2.iloc[[1, 3]]

(一季度    122
 四季度     79
 dtype: int64,
 二季度    998
 四季度    996
 dtype: int64)

Series 的属性 / 方法

# The Series values as a NumPy array: obj.values
ser2.values

array([999, 998, 997, 996], dtype=int64)

# The index labels as a NumPy array: obj.index.values
ser2.index.values

array(['一季度', '二季度', '三季度', '四季度'], dtype=object)

# Whether the data is monotonically increasing (non-decreasing).
# Series.is_monotonic was deprecated in pandas 1.5 and removed in 2.0;
# is_monotonic_increasing is the equivalent spelling. Use
# is_monotonic_decreasing to test the opposite direction.
ser2.is_monotonic_increasing

False

# Descriptive statistics: count, mean, std, min, quartiles and max.
des = ser2.describe()
des

count      4.000000
mean     997.500000
std        1.290994
min      996.000000
25%      996.750000
50%      997.500000
75%      998.250000
max      999.000000
dtype: float64

# describe() returns a Series, so individual statistics are read by label.
des['min'],des['25%']

(996.0, 996.75)

# A Series with repeated values and repeated index labels, used below to
# demonstrate the de-duplication helpers.
ser3 = pd.Series(
    data=[122, 98, 45, 79, 79, 122],
    index=['一季度', '二季度', '三季度', '四季度', '四季度', '一季度'],
)
ser3

一季度    122
二季度     98
三季度     45
四季度     79
四季度     79
一季度    122
dtype: int64

# Distinct values, returned as an array in order of first appearance.
ser3.unique()

array([122,  98,  45,  79], dtype=int64)

# Number of distinct values (4 here: 122, 98, 45 and 79).
ser3.nunique()

# Drop repeated values, keeping the first occurrence of each.
ser3_ = ser3.drop_duplicates()
ser3_

一季度    122
二季度     98
三季度     45
四季度     79
dtype: int64

# Flag duplicates: True marks a value that already appeared earlier in the Series.
ser3.duplicated()

一季度    False
二季度    False
三季度    False
四季度    False
四季度     True
一季度     True
dtype: bool

# Frequency table: how many times each value occurs.
ser3.value_counts()

122    2
79     2
98     1
45     1
dtype: int64

# A Series containing missing values (NaN), used below to demonstrate the
# missing-data helpers.
ser4 = pd.Series(
    data=[122, 98, np.nan, 45, 79, np.nan],
    index=['一季度', '二季度', '三季度', '四季度', '五季度', '六季度'],
)
ser4

一季度    122.0
二季度     98.0
三季度      NaN
四季度     45.0
五季度     79.0
六季度      NaN
dtype: float64

# Element-wise missing-value masks: isnull() is True for NaN, notnull() is the inverse.
ser4.isnull(),ser4.notnull()

(一季度    False
 二季度    False
 三季度     True
 四季度    False
 五季度    False
 六季度     True
 dtype: bool,
 一季度     True
 二季度     True
 三季度    False
 四季度     True
 五季度     True
 六季度    False
 dtype: bool)

# Drop missing (NaN) values.
ser4.dropna()

一季度    122.0
二季度     98.0
四季度     45.0
五季度     79.0
dtype: float64

# Fill missing values with a fixed value.
ser4.fillna(100)

一季度    122.0
二季度     98.0
三季度    100.0
四季度     45.0
五季度     79.0
六季度    100.0
dtype: float64

# Fill missing values with the mean of the non-missing values.
ser4.fillna(ser4.mean())

一季度    122.0
二季度     98.0
三季度     86.0
四季度     45.0
五季度     79.0
六季度     86.0
dtype: float64

# Forward-fill: replace each missing value with the last valid value before
# it (use ser4.bfill() to take the next valid value instead).
# fillna(method=...) is deprecated in recent pandas; ffill()/bfill() are the
# direct replacements.
ser4.ffill()

一季度    122.0
二季度     98.0
三季度     98.0
四季度     45.0
五季度     79.0
六季度     79.0
dtype: float64

# The integers 1 through 9 with the default 0-based integer index.
ser5 = pd.Series(np.arange(1,10))
ser5

0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
8    9
dtype: int32

# where(): keep values where the condition holds and replace the others with
# the given value (NaN when no replacement is given).
ser5.where(ser5<5),ser5.where(ser5<5,99)

(0    1.0
 1    2.0
 2    3.0
 3    4.0
 4    NaN
 5    NaN
 6    NaN
 7    NaN
 8    NaN
 dtype: float64,
 0     1
 1     2
 2     3
 3     4
 4    99
 5    99
 6    99
 7    99
 8    99
 dtype: int32)

# mask(): the inverse of where() — replace values where the condition holds
# (NaN when no replacement is given) and keep the others.
ser5.mask(ser5<5),ser5.mask(ser5<5,99)

(0    NaN
 1    NaN
 2    NaN
 3    NaN
 4    5.0
 5    6.0
 6    7.0
 7    8.0
 8    9.0
 dtype: float64,
 0    99
 1    99
 2    99
 3    99
 4     5
 5     6
 6     7
 7     8
 8     9
 dtype: int32)

# 50 random integers drawn from the half-open range [30, 80).
ser6 = pd.Series(np.random.randint(30,80,50))
ser6

0     72
1     33
2     44
3     30
4     44
5     78
6     56
7     56
8     55
9     72
10    70
11    74
12    41
13    74
14    55
15    76
16    54
17    66
18    63
19    62
20    54
21    32
22    46
23    52
24    57
25    54
26    43
27    64
28    77
29    62
30    46
31    53
32    59
33    72
34    50
35    73
36    58
37    77
38    72
39    49
40    73
41    65
42    74
43    30
44    32
45    79
46    40
47    76
48    41
49    72
dtype: int32

# Rescale every value to sqrt(x) * 10, rounded to a whole number.
# Vectorized arithmetic replaces the per-element Python lambda passed to
# map(); the result is the same float64 Series.
(ser6 ** 0.5 * 10).round(0)

0     85.0
1     57.0
2     66.0
3     55.0
4     66.0
5     88.0
6     75.0
7     75.0
8     74.0
9     85.0
10    84.0
11    86.0
12    64.0
13    86.0
14    74.0
15    87.0
16    73.0
17    81.0
18    79.0
19    79.0
20    73.0
21    57.0
22    68.0
23    72.0
24    75.0
25    73.0
26    66.0
27    80.0
28    88.0
29    79.0
30    68.0
31    73.0
32    77.0
33    85.0
34    71.0
35    85.0
36    76.0
37    88.0
38    85.0
39    70.0
40    85.0
41    81.0
42    86.0
43    55.0
44    57.0
45    89.0
46    63.0
47    87.0
48    64.0
49    85.0
dtype: float64

# Sorting and top-N selection.
# The index deliberately repeats the label 'pitaya'.
ser7 = pd.Series(
    data=[99, 54, 78, 35, 86],
    index=['apple', 'banana', 'pitaya', 'pitaya', 'durian'],
)
ser7

apple     99
banana    54
pitaya    78
pitaya    35
durian    86
dtype: int64

# Sort by value; inplace=True modifies ser7 itself instead of returning a copy.
ser7.sort_values(inplace=True)
ser7

pitaya    35
banana    54
pitaya    78
durian    86
apple     99
dtype: int64

# Sort by index label; inplace=True modifies ser7 itself instead of returning a copy.
ser7.sort_index(inplace = True)
ser7

apple     99
banana    54
durian    86
pitaya    35
pitaya    78
dtype: int64

# The three largest and the three smallest values.
ser7.nlargest(3),ser7.nsmallest(3)

(apple     99
 durian    86
 pitaya    78
 dtype: int64,
 pitaya    35
 banana    54
 pitaya    78
 dtype: int64)

# Bar chart of the quarterly figures (use kind='barh' for horizontal bars).
ser1.plot(figsize=(8, 4), kind='bar', width=0.3)
# Light dashed grid lines make bar heights easier to read; alpha sets the
# transparency and axis selects which axis gets grid lines.
plt.grid(True, alpha=0.3, axis='y', linestyle='--')
plt.xticks(rotation=0)             # keep the x tick labels horizontal
plt.yticks(np.arange(0, 151, 30))  # custom y ticks: 0, 30, ..., 150
# Label each bar with its value. Iterate over the values directly: ser1[i]
# with an integer on a string index relies on the deprecated positional
# fallback of Series.__getitem__.
for i, value in enumerate(ser1):
    plt.text(i, value + 1, value, ha='center')
plt.show()

# Ring (donut) chart of each quarter's share.
# autopct formats the percentage labels; wedgeprops width is the ring width
# and edgecolor the wedge border color; pctdistance is how far from the
# center the percentage labels are drawn.
ser1.plot(kind = 'pie',autopct = '%.2f%%',wedgeprops = dict(width = 0.4,edgecolor = 'white'),pctdistance = 0.8)
# Clear the y-axis label.
plt.ylabel('')
plt.title('2021各季度销售占比')
plt.show()

DataFrame

# Create a DataFrame from a 2-D array: 5 students x 3 courses of random
# integer scores in [60, 100], with course names as columns and student ids
# as the row index.
courses = ['语文', '数学', '英语']
ids = [1001, 1002, 1003, 1004, 1005]
scores = np.random.randint(60, 101, size=(len(ids), len(courses)))
df1 = pd.DataFrame(scores, index=ids, columns=courses)
df1

	语文	数学	英语
1001	69	62	82
1002	75	80	60
1003	71	67	70
1004	90	96	65
1005	66	89	93

# Create a DataFrame from a dict: each key becomes a column, and the shared
# list of ids labels the rows.
scores = {
    '语文': [62, 72, 93, 88, 93],
    '数学': [95, 65, 86, 66, 87],
    '英语': [66, 75, 82, 69, 82],
}
ids = [1001, 1002, 1003, 1004, 1005]
df2 = pd.DataFrame(scores, index=ids)
df2

	语文	数学	英语
1001	62	95	66
1002	72	65	75
1003	93	86	82
1004	88	66	69
1005	93	87	82

# Read the CSV and use its 'id' column as the row index.
df3 = pd.read_csv('../files/data/2018年北京积分落户数据.csv',
                  index_col='id',
#                  quotechar=<char>: the character that wraps quoted fields (double quotes are stripped automatically)
                 )
df3.head()

	name	birthday	company	score
id
1	杨效丰	1972-12	北京利德华福电气技术有限公司	122.59
2	纪丰伟	1974-12	北京航天数据股份有限公司	121.25
3	王永	1974-05	品牌联盟(北京)咨询股份公司	118.96
4	杨静	1975-07	中科专利商标代理有限责任公司	118.21
5	张凯江	1974-11	北京阿里巴巴云计算技术有限公司	117.79

# Decode the file as GBK (presumably its on-disk encoding — confirm against the data file).
df4 = pd.read_csv('../files/data/bilibili.csv',encoding='GBK')
df4.head()

	title	url	watchnum	dm	uptime	upname
0	阿里云大学课程（云计算、中间件、大数据、云安全）	//www.bilibili.com/video/BV1Lv411s7wu?from=search	2954	4	2021/1/21	韭菜滚雪球
1	视觉传达设计专业的小朋友大学课程有哪些，强度怎么样，需要什么技能？学姐给新生的解答与建议	//www.bilibili.com/video/BV1Ea4y1a7CX?from=search	3526	8	2020/7/25	铧仔仔儿的奋斗史
2	CAP：适合高中生的大学课程（上大学之前提前学习大学的课程）同济大学《微积分CAP》	//www.bilibili.com/video/BV1X4411Y7u8?from=search	5597	17	2019/5/11	愚甘杂货铺
3	干货！论文读写系列｜写作风格：例文解析（1）｜人文社科｜教育学｜大学课程、知识	//www.bilibili.com/video/BV1VC4y1b7ZA?from=search	1.1万	29	2020/7/26	cici西西熙熙
4	《用户体验与心理-第1期》大学课程	//www.bilibili.com/video/BV1r7411M7gY?from=search	1373	9	2020/2/24	Luka老师

# A tab-separated file: pass the tab character as the delimiter.
df5 = pd.read_csv('../files/data/chipotle.tsv',delimiter='\t')
df5.head()

	order_id	quantity	item_name	choice_description	item_price
0	1	1	Chips and Fresh Tomato Salsa	NaN	$2.39
1	1	1	Izze	[Clementine]	$3.39
2	1	1	Nantucket Nectar	[Apple]	$3.39
3	1	1	Chips and Tomatillo-Green Chili Salsa	NaN	$2.39
4	2	2	Chicken Bowl	[Tomatillo-Red Chili Salsa (Hot), [Black Beans...	$16.98

# Read sheet 'Sheet1', taking the column names from the second row (header=1).
df6 = pd.read_excel('../files/data/2020年销售数据.xlsx',header=1,sheet_name = 'Sheet1')
# Other useful options: usecols=[...] selects columns; nrows=100 reads only
# 100 rows; skiprows=np.arange(2, 102) skips the first 100 data rows.
df6.head()

	销售日期	销售信息	销售区域	销售渠道	销售订单	品牌	售价	销售数量	销售额
0	2020-01-01	上海拼多多订单182894-455	上海	拼多多	182894-455	八匹马	99	83	8217
1	2020-01-01	上海抖音订单205635-402	上海	抖音	205635-402	八匹马	219	29	6351
2	2020-01-01	上海天猫订单205654-021	上海	天猫	205654-021	八匹马	169	85	14365
3	2020-01-01	上海天猫订单205654-519	上海	天猫	205654-519	八匹马	169	14	2366
4	2020-01-01	上海天猫订单377781-010	上海	天猫	377781-010	皮皮虾	249	61	15189

# Load the workbook with read_excel defaults and preview the first rows.
df7 = pd.read_excel('../files/data/口罩销售数据.xlsx')
df7.head()

	日期	销售城市	产品类别	销售员	数量	单价	金额
0	2020-01-01	广州	纯棉口罩	王大锤	6	1	6
1	2020-01-01	武汉	明星口罩	秦红棉	30	30	900
2	2020-01-01	深圳	明星口罩	秦红棉	2	30	60
3	2020-01-03	上海	防雾霾口罩	白元芳	63	28	1764
4	2020-01-04	武汉	明星口罩	白元芳	25	30	750

# Load the workbook with read_excel defaults and preview the first rows.
df8 = pd.read_excel('../files/data/某视频网站运营数据.xlsx')
df8.head()

	video_id	title	channel_title	tags	views	likes	dislikes	comment_count
0	2kyS6SvSYSE	WE WANT TO TALK ABOUT OUR MARRIAGE	CaseyNeistat	SHANtell martin	748374	57527	2966	15954
1	1ZAPwfrtAFY	The Trump Presidency: Last Week Tonight with J...	LastWeekTonight	last week tonight trump presidency\|"last week ...	2418783	97185	6146	12703
2	5qpjK5DgCt4	Racist Superman \| Rudy Mancuso, King Bach & Le...	Rudy Mancuso	racist superman\|"rudy"\|"mancuso"\|"king"\|"bach"...	3191434	146033	5339	8181