Pandas
[Pandas官方文档](https://pandas.pydata.org/docs/reference/frame.html)
数据结构 Series
- series 是pandas中的一维数据,类似表格中的一列
pandas.Series(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False)
- data 存放的数据 可以是列表 数组 字典
- index 索引 默认为从0开始的整数
- dtype 数据类型 可以是Numpy的数据类型
- name series的名称
# Build a Series from three kinds of input in turn: a plain list, a NumPy
# array, and a dict. With a dict, the keys become the index labels.
# After the loop, `s` is the dict-based Series.
for source in (
    [1, 2, 3, 4],
    np.array([1, 2, 3, 4]),
    {'a': 1, 'b': 2, 'c': 3, 'd': 4},
):
    s = pd.Series(source)
    print(s)

# Pull out the index labels, the underlying value array, and the descriptive
# statistics (count, mean, std, min, quartiles, max) of the Series.
index, values, stats = s.index, s.values, s.describe()
print(index)
print(values)
print(stats)
'''
0 1
1 2
2 3
3 4
dtype: int64
0 1
1 2
2 3
3 4
dtype: int32
a 1
b 2
c 3
d 4
dtype: int64
Index(['a', 'b', 'c', 'd'], dtype='object')
[1 2 3 4]
count 4.000000
mean 2.500000
std 1.290994
min 1.000000
25% 1.750000
50% 2.500000
75% 3.250000
max 4.000000
dtype: float64
'''
# Index labels at which the Series takes its largest and smallest values.
max_index, min_index = s.idxmax(), s.idxmin()
# 其他属性和方法
# print(s.dtype) # 数据类型
# print(s.shape) # 形状
# print(s.size) # 元素个数
# print(s.head()) # 前几个元素,默认是前 5 个
# print(s.tail()) # 后几个元素,默认是后 5 个
# print(s.sum()) # 求和
# print(s.mean()) # 平均值
# print(s.std()) # 标准差
# print(s.min()) # 最小值
# print(s.max()) # 最大值
数据结构 DataFrame
DataFrame是Pandas中的数据结构 用于表示二维表格的数据
存放了多个Series对象,共用相同的索引
DataFrame 的列索引对应了Series对象
pandas.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)
- data 存放的数据 可以是字典 二维数组 series 等
- index 行索引
- columns 列索引
- dtype 指定DataFrame 的数据类型
# Create a DataFrame from a list of rows; the column labels are given
# explicitly through `columns`.
li = [['math', 100], ['English', 90], ['chinese', 100]]
df = pd.DataFrame(data=li, columns=['class', 'score'])
# print(df)

# Create a DataFrame from a list of dicts: each dict is one row, and the
# union of all keys becomes the columns (a key absent from a row gives NaN).
first_row = {'a': 1, 'b': 2}
second_row = {'a': 5, 'b': 10, 'c': 20}
data = [first_row, second_row]
df = pd.DataFrame(data)
'''
class score
0 math 100
1 English 90
2 chinese 100
'''
# Create a DataFrame from a dict: every key becomes a column label and the
# list stored under it supplies that column's values.
dic = {'class': ['math', 'English'], 'score': [100, 100]}
df = pd.DataFrame(data=dic)
读取和保存csv文件
# Load the CSV file into a DataFrame.
df = pd.read_csv('./titanic.csv')
# Keep only the first five rows.
df = df.head(5)
# Save the trimmed frame as a new CSV file. By default to_csv also writes the
# index as the first column; pass index=False to leave it out.
df.to_csv('./new_titanic.csv')
JSON文件
[Pandas JSON | 菜鸟教程](https://www.runoob.com/pandas/pandas-json.html)
json文件 jso.json
[
{
"id": "A001",
"name": "百度",
"url": "www.baidu.com",
"likes": 61
},
{
"id": "A002",
"name": "Google",
"url": "www.google.com",
"likes": 124
},
{
"id": "A003",
"name": "淘宝",
"url": "www.taobao.com",
"likes": 45
}
]
# Read a local JSON file (an array of flat records) into a DataFrame.
json_path = './jso.json'
df = pd.read_json(json_path)
print(df)
'''
id name url likes
0 A001 百度 www.baidu.com 61
1 A002 Google www.google.com 124
2 A003 淘宝 www.taobao.com 45
'''
# read_json also accepts a URL and loads the JSON it points to.
sites_url = 'https://static.jyshare.com/download/sites.json'
df = pd.read_json(sites_url)
print(df)
neiqian_js.json
{
"school_name":"ABC primary school",
"class": "Year 1",
"students":[
{
"id": "A001",
"name": "Tom",
"math": 60,
"physics": 66,
"chemistry": 61
},
{
"id": "A002",
"name": "James",
"math": 89,
"physics": 76,
"chemistry": 51
},
{
"id": "A003",
"name": "Jenny",
"math": 79,
"physics": 90,
"chemistry": 78
}]
}
import json
# df = pd.read_json('./jso.json')
# print(df)
'''
id name url likes
0 A001 百度 www.baidu.com 61
1 A002 Google www.google.com 124
2 A003 淘宝 www.taobao.com 45
'''
# df = pd.read_json('https://static.jyshare.com/download/sites.json')
# print(df)
# read_json does not flatten nested records: every element of the nested
# "students" list ends up as a raw dict in one cell (see the output below).
nested_json_path = './neiqian_js.json'
df = pd.read_json(nested_json_path)
# print(df.to_string())  # to_string() renders the whole frame; without it, long output is abbreviated with "..."
'''
school_name class students
0 ABC primary school Year 1 {'id': 'A001', 'name': 'Tom', 'math': 60, 'physics': 66, 'chemistry': 61}
1 ABC primary school Year 1 {'id': 'A002', 'name': 'James', 'math': 89, 'physics': 76, 'chemistry': 51}
2 ABC primary school Year 1 {'id': 'A003', 'name': 'Jenny', 'math': 79, 'physics': 90, 'chemistry': 78}
'''
import json

# Parse the JSON file into a plain Python dict first, so that json_normalize
# can flatten the nested records. The encoding is given explicitly so the
# result does not depend on the platform's default locale encoding.
with open('./neiqian_js.json', encoding='utf-8') as f:
    data = json.load(f)
print(type(data))  # dict

# record_path: path to the nested list whose elements become the rows.
df1 = pd.json_normalize(data, record_path=['students'])
# meta: outer-level fields to repeat on every expanded row.
df2 = pd.json_normalize(
    data,
    record_path=['students'],
    meta=['class', 'school_name'],
)
print(df1)
print(df2)
'''
id name math physics chemistry
0 A001 Tom 60 66 61
1 A002 James 89 76 51
2 A003 Jenny 79 90 78
id name math physics chemistry class school_name
0 A001 Tom 60 66 61 Year 1 ABC primary school
1 A002 James 89 76 51 Year 1 ABC primary school
2 A003 Jenny 79 90 78 Year 1 ABC primary school
'''
neiqian_2.json
{
"school_name": "local primary school",
"class": "Year 1",
"info": {
"president": "John Kasich",
"address": "ABC road, London, UK",
"contacts": {
"email": "admin@e.com",
"tel": "123456789"
}
},
"students": [
{
"id": "A001",
"name": "Tom",
"math": 60,
"physics": 66,
"chemistry": 61
},
{
"id": "A002",
"name": "James",
"math": 89,
"physics": 76,
"chemistry": 51
},
{
"id": "A003",
"name": "Jenny",
"math": 79,
"physics": 90,
"chemistry": 78
}]
}
# Load the nested JSON document as a Python dict (explicit UTF-8 so the
# result does not depend on the platform's default locale encoding).
with open('./neiqian_2.json', encoding='utf-8') as f:
    data = json.load(f)

# Deeper nesting: each meta entry is a path (a list of keys) into the outer
# object, and the resulting column is named by joining the keys with '.',
# e.g. ['info', 'contacts', 'tel'] -> "info.contacts.tel".
df = pd.json_normalize(
    data,
    record_path=['students'],
    meta=[
        ['info', 'contacts', 'tel'],
        ['school_name'],
    ],
)
print(df.to_string())
'''
id name math physics chemistry info.contacts.tel school_name
0 A001 Tom 60 66 61 123456789 local primary school
1 A002 James 89 76 51 123456789 local primary school
2 A003 Jenny 79 90 78 123456789 local primary school
'''
数据清洗和数据处理
head
前n行数据 默认5行
tail
后n行数据 默认5行
info
显示DataFrame的信息 行数 列数 每一列存储的数据类型
# print(data_train.info()) #[800000 rows x 47 columns]> 熟悉数据类型
# info()函数和info 属性不同 info函数输出的是每一列的数据类型
# NOTE: pd.DataFrame(f) iterates the open file line by line, so every raw
# text line (the header line included) becomes one row of a single column
# named 0. The CSV is NOT split into fields, as the output below shows.
# Use pd.read_csv('./testA.csv') to get one column per CSV field.
with open('./testA.csv') as f:
    df = pd.DataFrame(f)
# print(df.head())
# print(df.tail())
# print(df.info())
# print(df.info)
'''
0
0 id,loanAmnt,term,interestRate,installment,grad...
1 800000,14000.0,3,10.99,458.28,B,B3,7027.0,10+ ...
2 800001,20000.0,5,14.65,472.14,C,C5,60426.0,10+...
3 800002,12000.0,3,19.99,445.91,D,D4,23547.0,2 y...
4 800003,17500.0,5,14.31,410.02,C,C4,636.0,4 yea...
0
199996 999995,7000.0,3,11.14,229.64,B,B2,330967.0,7 y...
199997 999996,6000.0,3,6.24,183.19,A,A2,38930.0,1 yea...
199998 999997,14000.0,5,15.88,339.57,C,C4,282016.0,8 ...
199999 999998,8000.0,3,18.06,289.47,D,D2,97.0,4 years...
200000 999999,8000.0,3,6.68,245.85,A,A3,320.0,7 years...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200001 entries, 0 to 200000
Data columns (total 1 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 0 200001 non-null object
dtypes: object(1)
memory usage: 1.5+ MB
None
<bound method DataFrame.info of 0
0 id,loanAmnt,term,interestRate,installment,grad...
1 800000,14000.0,3,10.99,458.28,B,B3,7027.0,10+ ...
2 800001,20000.0,5,14.65,472.14,C,C5,60426.0,10+...
3 800002,12000.0,3,19.99,445.91,D,D4,23547.0,2 y...
4 800003,17500.0,5,14.31,410.02,C,C4,636.0,4 yea...
... ...
199996 999995,7000.0,3,11.14,229.64,B,B2,330967.0,7 y...
199997 999996,6000.0,3,6.24,183.19,A,A2,38930.0,1 yea...
199998 999997,14000.0,5,15.88,339.57,C,C4,282016.0,8 ...
199999 999998,8000.0,3,18.06,289.47,D,D2,97.0,4 years...
200000 999999,8000.0,3,6.68,245.85,A,A3,320.0,7 years...
'''
清洗空值
isnull
判断是否为空值
dropna
删除包含空值的行
# Boolean mask of the same shape as df: True where a cell is null.
null_mask = df.isnull()
# Per column: True if the column contains at least one null value.
print(null_mask.any())
# Number of columns that contain at least one null value.
print(null_mask.any().sum())
# Per column: how many null values the column contains.
print(null_mask.sum())
清洗格式错误日期
to_datetime
格式化日期
清洗重复数据
duplicated
判断数据是否重复,返回布尔值
drop_duplicates
删除重复数据
判断数据是离散型还是连续型
nunique()
返回一个 Series,其中是每一列的不同值的数量
离散型数据
df['term'].value_counts()
# Count how many samples (rows) there are for each distinct value of the column.