一:
import pandas as pd
# Show up to 20 columns when a DataFrame is printed.
pd.set_option("display.max_columns", 20)
# Create: load a comma-separated text file that has no header row,
# naming its two columns explicitly.
dataset = pd.read_csv(
    "file.txt",
    names=["col1", "col2"],
    header=None,
    index_col=None,
    sep=",",
)
# Create: build a small two-column frame from an in-memory dict.
dataset = pd.DataFrame({"col1": [1, 3, 2, 4], "col2": [5, 7, 6, 7]})
"""
col1 col2
0 1 5
1 3 7
2 2 6
3 4 7
"""
# Sort the rows by col1; each row keeps its original index label.
dataset = dataset.sort_values(by=["col1"])
"""
col1 col2
0 1 5
2 2 6
1 3 7
3 4 7
"""
# Reset the index. reset_index() returns a new frame, so the result has to be
# assigned back; drop=True discards the old labels instead of inserting them
# as an extra "index" column.
dataset = dataset.reset_index(drop=True)
"""
col1 col2
0 1 5
1 2 6
2 3 7
3 4 7
"""
# Remove duplicates: keep only the first row for each distinct col2 value.
dataset = dataset.drop_duplicates(subset="col2", keep="first")
"""
col1 col2
0 1 5
1 2 6
2 3 7
"""
# Change a column's data type: casting the float column to int32 drops the
# fractional part (e.g. -0.833333 becomes 0 and -1.166667 becomes -1).
dataset["col4"] = dataset["col4"].astype("int32")
"""
col1 col2 col3 col4
0 1 5 6 -0.833333
1 2 6 8 -1.000000
2 3 7 10 -1.166667
col1 col2 col3 col4
0 1 5 6 0
1 2 6 8 -1
2 3 7 10 -1
"""
# Rename columns: map each old column name to its new name.
sub_set2_re = sub_set2.rename(
    {"col1": "col5", "col2": "col6", "col3": "col7", "col4": "col8"},
    axis="columns",
)
二:
"""
dataset["xxx"]是<class 'pandas.core.series.Series'>
dataset[["xxx"]]是<class 'pandas.core.frame.DataFrame'>
"""
##########################################################
# Single brackets with one column name select that column as a Series.
col1 = dataset["col1"]
"""
col1 col2 col3
0 1 5 6
1 2 6 8
2 3 7 10
0 1
1 2
2 3
Name: col1, dtype: int64
"""
# Convert the Series to a plain Python list.
col1_list = col1.to_list()
"""
[1, 2, 3]
"""
# Both forms take the column as a Series and skip its first row.
sub_set0 = dataset["col1"][1:]
sub_set1 = dataset["col1"].iloc[1:]
"""
col1 col2 col3
0 1 5 6
1 2 6 8
2 3 7 10
1 2
2 3
Name: col1, dtype: int64
1 2
2 3
Name: col1, dtype: int64
"""
#########################################################
# Double brackets (a list of column names) select a one-column DataFrame.
col1 = dataset[["col1"]]
"""
col1 col2 col3
0 1 5 6
1 2 6 8
2 3 7 10
col1
0 1
1 2
2 3
"""
# One-column DataFrame without its first row.
sub_set0 = dataset[["col1"]].iloc[1:]
"""
col1 col2 col3
0 1 5 6
1 2 6 8
2 3 7 10
col1
1 2
2 3
"""
# Two-column DataFrame (col1 and col3) without its first row.
sub_set1 = dataset[["col1", "col3"]].iloc[1:]
"""
col1 col2 col3
0 1 5 6
1 2 6 8
2 3 7 10
col1 col3
1 2 8
2 3 10
"""
##########################################################
# Positional slicing on both axes: drop the first row and the first column.
dataset = dataset.iloc[1:, 1:]
"""
col1 col2 col3
0 1 5 6
2 2 6 8
1 3 7 10
col2 col3
2 6 8
1 7 10
"""
# Chained slicing: both [1:] slices act on rows, so the first two rows are
# dropped and every column is kept.
dataset = dataset.iloc[1:][1:]
"""
col1 col2 col3
0 1 5 6
2 2 6 8
1 3 7 10
col1 col2 col3
1 3 7 10
"""
三:
# Keep the rows whose col1 value is one of the listed values.
nums = [1, 2]
dataset = dataset.loc[dataset["col1"].isin(nums)]
"""
col1 col2 col3
0 1 5 6
1 2 6 8
2 3 7 10
col1 col2 col3
0 1 5 6
1 2 6 8
"""
# Keep the rows whose col1 value satisfies a condition (greater than 1).
dataset = dataset[dataset["col1"].gt(1)]
"""
col1 col2 col3
0 1 5 6
1 2 6 8
2 3 7 10
col1 col2 col3
1 2 6 8
2 3 7 10
"""
# Arithmetic between columns: (col1 - col3) divided by the minimum of col3.
dataset["col4"] = dataset["col1"].sub(dataset["col3"]).div(dataset["col3"].min())
"""
col1 col2 col3
0 1 5 6
1 2 6 8
2 3 7 10
col1 col2 col3 col4
0 1 5 6 -0.833333
1 2 6 8 -1.000000
2 3 7 10 -1.166667
"""
# Using apply
def helper(dataset):
    """Return the sum of the "col1" and "col2" entries of ``dataset``.

    Despite the parameter name, when this function is passed to
    ``DataFrame.apply(..., axis=1)`` it is called once per row, so
    ``dataset`` is then a single row rather than the whole frame. Any
    mapping-like object with "col1" and "col2" keys works.
    """
    return dataset["col1"] + dataset["col2"]
# Call helper once per row and store the results as col3.
dataset["col3"] = dataset.apply(helper, axis="columns")
"""
col1 col2 col3
0 1 5 6
1 2 6 8
2 3 7 10
"""
四:
# Randomly pick half of the rows; the fixed seed makes the pick repeatable.
sub_set0 = dataset.sample(random_state=1, frac=0.5)
# The rows that were not picked.
sub_set1 = dataset.drop(index=sub_set0.index)
"""
col1 col2 col3 col4
0 1 5 6 -0.833333
1 2 6 8 -1.000000
2 3 7 10 -1.166667
col1 col2 col3 col4
2 3 7 10 -1.166667
1 2 6 8 -1.000000
col1 col2 col3 col4
0 1 5 6 -0.833333
"""
# Stack the sampled rows on top of the remaining rows. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0; pd.concat gives the same result.
sub_set2 = pd.concat([sub_set0, sub_set1])
"""
col1 col2 col3 col4
0 1 5 6 -0.833333
2 3 7 10 -1.166667
col1 col2 col3 col4
1 2 6 8 -1.0
col1 col2 col3 col4
0 1 5 6 -0.833333
2 3 7 10 -1.166667
1 2 6 8 -1.000000
"""
# Merge two data sets side by side (column-wise).
sub_sets = [sub_set2, sub_set2]
sub_set3 = pd.concat(sub_sets, axis="columns")
"""
col1 col2 col3 col4 col1 col2 col3 col4
0 1 5 6 -0.833333 1 5 6 -0.833333
2 3 7 10 -1.166667 3 7 10 -1.166667
1 2 6 8 -1.000000 2 6 8 -1.000000
"""
# Stack the same frames vertically (row-wise); the index labels repeat.
sub_set4 = pd.concat(sub_sets, axis="index")
"""
col1 col2 col3 col4
0 1 5 6 -0.833333
2 3 7 10 -1.166667
1 2 6 8 -1.000000
0 1 5 6 -0.833333
2 3 7 10 -1.166667
1 2 6 8 -1.000000
"""
五:
"""
col1 col2 col3 col4
0 1 5 6 -0.833333
1 2 6 8 -1.000000
2 3 7 10 -1.166667
"""
# Shape, as a (rows, columns) tuple.
print(dataset.shape)
"""
(3, 4)
"""
# Data type of every column.
print(dataset.dtypes)
"""
col1 int64
col2 int64
col3 int64
col4 float64
dtype: object
"""
# Data type of a single column.
print(dataset["col1"].dtype)
"""
int64
"""
# Summary statistics of the data set (count, mean, std, min, quartiles, max).
print(dataset.describe())
"""
col1 col2 col3 col4
count 3.0 3.0 3.0 3.000000
mean 2.0 6.0 8.0 -1.000000
std 1.0 1.0 2.0 0.166667
min 1.0 5.0 6.0 -1.166667
25% 1.5 5.5 7.0 -1.083333
50% 2.0 6.0 8.0 -1.000000
75% 2.5 6.5 9.0 -0.916667
max 3.0 7.0 10.0 -0.833333
"""
# First two rows / last two rows
print(dataset.head(2))
"""
col1 col2 col3 col4
0 1 5 6 -0.833333
1 2 6 8 -1.000000
"""
# Last two rows.
print(dataset.tail(2))
"""
col1 col2 col3 col4
1 2 6 8 -1.000000
2 3 7 10 -1.166667
"""
六:
# Iterate over rows: iterrows() yields (index label, row as a Series).
for index, row in dataset.iterrows():
    print(index, '---------------------')
    print(row)
    print("row: col1 col2:", row["col1"], row["col2"], '#######')
"""
0 ---------------------
col1 1
col2 5
col3 6
col4 0
Name: 0, dtype: int64
row: col1 col2: 1 5 #######
2 ---------------------
col1 2
col2 6
col3 8
col4 -1
Name: 2, dtype: int64
row: col1 col2: 2 6 #######
1 ---------------------
col1 3
col2 7
col3 10
col4 -1
Name: 1, dtype: int64
row: col1 col2: 3 7 #######
"""
# Iterate over columns: items() yields (column name, column as a Series).
# DataFrame.iteritems() was removed in pandas 2.0; items() is its replacement.
for index, col in dataset.items():
    print(index, '---------------------')
    print(col)
    # With the integer index shown in the sample output, col[0] and col[1]
    # look values up by index label, not by position.
    print("col: row0 row1:", col[0], col[1], "#######")
"""
col1 col2 col3 col4
0 1 5 6 0
2 2 6 8 -1
1 3 7 10 -1
col1 ---------------------
0 1
2 2
1 3
Name: col1, dtype: int64
col: row0 row1: 1 3 #######
col2 ---------------------
0 5
2 6
1 7
Name: col2, dtype: int64
col: row0 row1: 5 7 #######
col3 ---------------------
0 6
2 8
1 10
Name: col3, dtype: int64
col: row0 row1: 6 10 #######
col4 ---------------------
0 0
2 -1
1 -1
Name: col4, dtype: int32
col: row0 row1: 0 -1 #######
"""