Pandas Basics

import numpy as np
import pandas as pd

print(pd.__version__)  # check the pandas version
print(pd.show_versions(as_json=True))  # detailed version info (prints and returns None)

# How to create a Series from a list, a numpy array and a dict
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))

# Solution
s1 = pd.Series(mylist)
s2 = pd.Series(myarr)
s3 = pd.Series(mydict)
print("*"*50)
print(s1.head())  # first five elements
print(s2.head())
print(s3.head())
0.25.1
{'system': {'commit': None, 'python': '3.7.4.final.0', 'python-bits': 64, 'OS': 'Windows', 'OS-release': '10', 'machine': 'AMD64', 'processor': 'Intel64 Family 6 Model 78 Stepping 3, GenuineIntel', 'byteorder': 'little', 'LC_ALL': 'None', 'LANG': 'None', 'LOCALE': 'None.None'}, 'dependencies': {'pandas': '0.25.1', 'numpy': '1.17.2', 'pytz': '2019.2', 'dateutil': '2.8.0', 'pip': '19.3.1', 'setuptools': '41.4.0', 'Cython': None, 'pytest': '4.3.0', 'hypothesis': None, 'sphinx': None, 'blosc': None, 'feather': None, 'xlsxwriter': '1.2.5', 'lxml.etree': '4.4.1', 'html5lib': None, 'pymysql': '0.9.2', 'psycopg2': None, 'jinja2': '2.10.1', 'IPython': '7.9.0', 'pandas_datareader': None, 'bs4': '4.8.0', 'bottleneck': None, 'fastparquet': None, 'gcsfs': None, 'matplotlib': '3.1.1', 'numexpr': None, 'odfpy': None, 'openpyxl': None, 'pandas_gbq': None, 'pyarrow': None, 'pytables': None, 's3fs': None, 'scipy': '1.3.1', 'sqlalchemy': '1.3.10', 'tables': None, 'xarray': None, 'xlrd': None, 'xlwt': '1.3.0'}}
None
**************************************************
0    a
1    b
2    c
3    e
4    d
dtype: object
0    0
1    1
2    2
3    3
4    4
dtype: int32
a    0
b    1
c    2
e    3
d    4
dtype: int64
# Convert the series ser into a DataFrame, keeping its index as a column of the DataFrame
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)

# Solution
df=ser.to_frame().reset_index()
df.head()
  index  0
0     a  0
1     b  1
2     c  2
3     e  3
4     d  4
# How to combine multiple series to form a DataFrame
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))

# Solution 1
df1 = pd.concat([ser1, ser2], axis=1)

# Solution 2
df2 = pd.DataFrame({"col_1": ser1, "col_2": ser2})
print(df2.head())
  col_1  col_2
0     a      0
1     b      1
2     c      2
3     e      3
4     d      4
# How to assign a name to the series
ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))

# Solution
ser.name = "alphabets"
print(ser.head())
0    a
1    b
2    c
3    e
4    d
Name: alphabets, dtype: object
# How to remove from ser1 the items that are present in ser2
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# Solution
print(ser1[~ser1.isin(ser2)])
0    1
1    2
2    3
dtype: int64
# How to get the items that are not common to both series A and series B
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# Solution
ser_u = pd.Series(np.union1d(ser1, ser2))  # union
print(ser_u)
ser_i = pd.Series(np.intersect1d(ser1, ser2))  # intersection
print("-"*50)
print(ser_i)
print("-"*50)
print(ser_u[~ser_u.isin(ser_i)])  # remove the intersection items from the union
0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
dtype: int64
--------------------------------------------------
0    4
1    5
dtype: int64
--------------------------------------------------
0    1
1    2
2    3
5    6
6    7
7    8
dtype: int64
# Compute the minimum, 25th percentile, median, 75th percentile and maximum of the numeric series ser
state = np.random.RandomState(100)
ser = pd.Series(state.normal(10, 5, 25))
print(ser)

# Solution
result = np.percentile(ser, q=[0, 25, 50, 75, 100])
print(result)
0      1.251173
1     11.713402
2     15.765179
3      8.737820
4     14.906604
5     12.571094
6     11.105898
7      4.649783
8      9.052521
9     11.275007
10     7.709865
11    12.175817
12     7.082025
13    14.084235
14    13.363604
15     9.477944
16     7.343598
17    15.148663
18     7.809322
19     4.408409
20    18.094908
21    17.708026
22     8.740604
23     5.787821
24    10.922593
dtype: float64
[ 1.25117263  7.70986507 10.92259345 13.36360403 18.0949083 ]
# How to get the frequency counts of the unique items in a series
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))
# print(ser)

# Solution
print(ser.value_counts())
g    7
e    6
c    5
f    5
h    3
a    2
b    2
dtype: int64
# Keep only the top 2 most frequent values and replace everything else with "Other"
np.random.RandomState(100)  # note: this seeded generator is created but not used below
ser = pd.Series(np.random.randint(1, 5, 12))  # 12 random integers from 1 to 4
# print(ser)

# Solution
# keep the two most frequent values and turn every other value into "Other"
ser[~ser.isin(ser.value_counts().index[:2])] = "Other"
print(ser)
0     Other
1         1
2         1
3     Other
4         4
5         4
6     Other
7         4
8     Other
9         1
10        4
11        1
dtype: object
# Reshape the series ser into a DataFrame with 7 rows and 5 columns
ser = pd.Series(np.random.randint(1, 10, 35))
# print(ser)

# Solution
df = pd.DataFrame(ser.values.reshape(7, 5))
print(df)
   0  1  2  3  4
0  7  9  2  3  9
1  4  8  9  9  3
2  7  7  8  6  5
3  4  2  8  4  6
4  1  9  4  2  5
5  8  4  8  3  1
6  4  4  5  2  7
# Find the positions of the values that are divisible by 3
ser = pd.Series(np.random.randint(1, 10, 7))
print(ser)

# Solution
result = np.argwhere(ser % 3 == 0)
result
0    1
1    6
2    5
3    4
4    9
5    6
6    1
dtype: int32





array([[1],
       [4],
       [5]], dtype=int64)
# From ser, extract the items at the positions given in the list pos
ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]

# Solution
result = ser.take(pos)
result
0     a
4     e
8     i
14    o
20    u
dtype: object
# How to stack two series vertically and horizontally
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))

# Solution
ser1.append(ser2)  # vertical stack; returns a new series, ser1 itself is unchanged
print(ser1)

df = pd.concat([ser1, ser2], axis=1)  # horizontal stack
df
0    0
1    1
2    2
3    3
4    4
dtype: int64
   0  1
0  0  a
1  1  b
2  2  c
3  3  d
4  4  e
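Note: Series.append is deprecated in newer pandas releases (and removed in 2.0). A minimal sketch of the same stacking done only with pd.concat, assuming the ser1 and ser2 defined above:

# vertical and horizontal stacking without Series.append
ser_vertical = pd.concat([ser1, ser2], ignore_index=True)  # vertical: 0..4 followed by a..e
df_horizontal = pd.concat([ser1, ser2], axis=1)            # horizontal: same DataFrame as above
print(ser_vertical)
print(df_horizontal)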
# For each value of ser2, find its position in ser1
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

# Solution 1
result = [np.where(i == ser1)[0].tolist()[0] for i in ser2]
print(result)
# Solution 2
result = [pd.Index(ser1).get_loc(i) for i in ser2]
result
[5, 4, 0, 8]





[5, 4, 0, 8]
# Compute the mean squared error between the truth and pred series
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)

np.mean((truth-pred)**2)
0.42318488444073726
# Change the first character of each word in ser to upper case
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

# Solution
pd.Series([i.title() for i in ser])
0     How
1      To
2    Kick
3    Ass?
dtype: object
# Count the number of characters in each word of the series
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

# Solution
ser.map(lambda x: len(x))
0    3
1    2
2    4
3    4
dtype: int64
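The two exercises above can also be written with the vectorized .str accessor; a small sketch on the same series of words:

ser = pd.Series(['how', 'to', 'kick', 'ass?'])
print(ser.str.title())  # capitalize the first character of each word
print(ser.str.len())    # number of characters in each word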
# Compute the difference between consecutive values of ser
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])

# Solution
print(ser.tolist())
print(ser.diff().tolist())
[1, 3, 6, 10, 15, 21, 27, 35]
[nan, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 8.0]
# Convert a series of date strings to a time series
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303',
                 '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

# Solution
pd.to_datetime(ser)
0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]
# Extract the day of month, week number, day of year and weekday name from a date series
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', 
                 '2013/04/04', '2015-11-27', '2019-11-27T12:20'])

# Solution
from dateutil.parser import parse
ser_ts = ser.map(lambda x: parse(x))  # parse every string into a Timestamp

print("Day of month: ", ser_ts.dt.day.tolist())
print("Week of year: ", ser_ts.dt.weekofyear.tolist())
print("Day of year:  ", ser_ts.dt.dayofyear.tolist())
print("Weekday name: ", ser_ts.dt.weekday_name.tolist())
Day of month:  [1, 2, 3, 4, 27, 27]
Week of year:  [53, 5, 9, 14, 48, 48]
Day of year:   [1, 33, 63, 94, 331, 331]
Weekday name:  ['Friday', 'Wednesday', 'Saturday', 'Thursday', 'Friday', 'Wednesday']
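Note: weekofyear and weekday_name were removed from the .dt accessor in newer pandas releases. A rough equivalent for pandas >= 1.1, assuming the same ser_ts as above:

print("Week of year: ", ser_ts.dt.isocalendar().week.tolist())  # ISO week number
print("Weekday name: ", ser_ts.dt.day_name().tolist())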
# Given year-month strings, parse them as dates with the day set to the 4th
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])

# Solution
from dateutil.parser import parse
ser.map(lambda x: parse('04 ' + x))
0   2010-01-04
1   2011-02-04
2   2012-03-04
dtype: datetime64[ns]
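An alternative sketch that avoids dateutil, assuming the strings really are in the 'Jan 2010' form shown above: prepend the day and give pd.to_datetime an explicit format.

ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])
print(pd.to_datetime('04 ' + ser, format='%d %b %Y'))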
# From ser, extract the words that contain at least two vowels
ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])

# Solution
from collections import Counter
mask = ser.map(lambda x: sum([Counter(x.lower()).get(i, 0) 
                              for i in list('aeiou')]) >= 2)
ser[mask]
0     Apple
1    Orange
4     Money
dtype: object
import re
# Filter the valid email addresses from the series
emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com',
                    'matt@t.co', 'narendra@modi.com'])
pattern = '[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'

# Solution
emails.str.findall(pattern, flags=re.IGNORECASE)
0                     []
1    [rameses@egypt.com]
2            [matt@t.co]
3    [narendra@modi.com]
dtype: object
# Compute the mean of weights for each fruit
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
weights = pd.Series(np.linspace(1, 10, 10))

# Solution
weights.groupby(fruit).mean()
apple     5.428571
banana    7.500000
carrot    2.000000
dtype: float64
# Compute the Euclidean distance between two series
p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

# Solution
result = sum((p - q)**2)**.5  # method 1
print(result)
result = np.linalg.norm(p - q)  # method 2
result
18.16590212458495





18.16590212458495
# Find all the local maxima (peaks) in a numeric series
ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])

# Solution
dd = np.diff(np.sign(np.diff(ser)))
print(dd)  # -2 marks a point whose neighbours on both sides are smaller
peak_locs = np.where(dd == -2)[0] + 1
print(peak_locs)
[-2  2  0  0 -2  2 -2]
[1 5 7]
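If SciPy is available, scipy.signal.find_peaks locates the same local maxima without the double diff; a sketch under that assumption:

from scipy.signal import find_peaks

ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])
peaks, _ = find_peaks(ser.values)  # positions of local maxima
print(peaks)                       # [1 5 7], the same positions as above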
# Replace the spaces in the string with the least frequent character
my_str = 'dbc deb abed gade'
ser = pd.Series(list('dbc deb abed gade'))

# Solution
freq = ser.value_counts()  # frequency of each character (the space is counted too)
print(freq)
least_freq = freq.dropna().index[-1]  # character with the lowest frequency
result = "".join(ser.replace(' ', least_freq))
result
d    4
     3
e    3
b    3
a    2
g    1
c    1
dtype: int64





'dbccdebcabedcgade'
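One caveat with the solution above: value_counts also counts the space character itself, so on a different input string the "least frequent character" could turn out to be the space. A sketch that drops it first, reusing the ser defined above:

freq_no_space = ser.value_counts().drop(' ', errors='ignore')  # ignore the space itself
least_freq = freq_no_space.index[-1]
print("".join(ser.replace(' ', least_freq)))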
# Create a TimeSeries that starts on 2000-01-01 and contains the following 10 weekends (Saturdays),
# using random integers as the values
ser = pd.Series(np.random.randint(1,10,10), 
                pd.date_range('2000-01-01', periods=10, freq='W-SAT'))
print(ser)

2000-01-01    7
2000-01-08    7
2000-01-15    3
2000-01-22    1
2000-01-29    4
2000-02-05    8
2000-02-12    7
2000-02-19    8
2000-02-26    3
2000-03-04    4
Freq: W-SAT, dtype: int32
# ser is missing some dates and values. Insert all the missing dates
# and fill them with the value from the previous date.
ser = pd.Series([1, 10, 3, np.nan], index=pd.to_datetime([
                    '2000-01-01', '2000-01-03', '2000-01-06', '2000-01-08']))
print(ser)
result = ser.resample('D').ffill()  # forward fill: use the previous date's value
print(result)
# ser.resample('D').bfill()          # backward fill: use the next date's value
# ser.resample('D').bfill().ffill()  # backward fill, then forward fill the trailing NaN (it becomes 3.0)
2000-01-01     1.0
2000-01-03    10.0
2000-01-06     3.0
2000-01-08     NaN
dtype: float64
2000-01-01     1.0
2000-01-02     1.0
2000-01-03    10.0
2000-01-04    10.0
2000-01-05    10.0
2000-01-06     3.0
2000-01-07     3.0
2000-01-08     NaN
Freq: D, dtype: float64
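The same daily re-indexing can also be written with asfreq; a small sketch on the ser defined above (the pre-existing NaN on 2000-01-08 is left untouched, just as with resample):

print(ser.asfreq('D', method='ffill'))  # build the full daily index and forward-fill the newly created dates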
# Import every nth row of a csv file to create a DataFrame
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', chunksize=50)

df2 = pd.concat([chunk.iloc[0] for chunk in df], axis=1)
df2 = df2.transpose()  # transpose back to one row per chunk
df2
         crim    zn  indus  chas    nox     rm    age     dis   rad    tax  ptratio       b  lstat  medv
0     0.00632  18.0   2.31   0.0  0.538  6.575   65.2  4.0900   1.0  296.0     15.3  396.90   4.98  24.0
50    0.08873  21.0   5.64   0.0  0.439  5.963   45.7  6.8147   4.0  243.0     16.8  395.56  13.45  19.7
100   0.14866   0.0   8.56   0.0  0.520  6.727   79.9  2.7778   5.0  384.0     20.9  394.76   9.42  27.5
150   1.65660   0.0  19.58   0.0  0.871  6.122   97.3  1.6180   5.0  403.0     14.7  372.80  14.10  21.5
200   0.01778  95.0   1.47   0.0  0.403  7.135   13.9  7.6534   3.0  402.0     17.0  384.30   4.45  32.9
250   0.14030  22.0   5.86   0.0  0.431  6.487   13.0  7.3967   7.0  330.0     19.1  396.28   5.90  24.4
300   0.04417  70.0   2.24   0.0  0.400  6.871   47.4  7.8278   5.0  358.0     14.8  390.86   6.07  24.8
350   0.06211  40.0   1.25   0.0  0.429  6.490   44.4  8.7921   1.0  335.0     19.7  396.90   5.98  22.9
400  25.04610   0.0  18.10   0.0  0.693  5.987  100.0  1.5888  24.0  666.0     20.2  396.90  26.77   5.6
450   6.71772   0.0  18.10   0.0  0.713  6.749   92.6  2.3236  24.0  666.0     20.2    0.32  17.44  13.4
500   0.22438   0.0   9.69   0.0  0.585  6.027   79.7  2.4982   6.0  391.0     19.2  396.90  14.33  16.8
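read_csv can also do this in a single pass with a callable skiprows; a sketch under the assumption that row 0 is the header and, as above, every 50th data row is kept (url and df2_alt are names introduced here):

url = 'https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv'
df2_alt = pd.read_csv(url, skiprows=lambda i: i != 0 and (i - 1) % 50 != 0)  # keep header + data rows 0, 50, 100, ...
print(df2_alt.shape)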
# Change column values while importing a csv into a DataFrame
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', 
                converters={'medv': lambda x: 'High' if float(x) > 25 else 'Low'})
df
        crim    zn  indus  chas    nox     rm   age     dis  rad  tax  ptratio       b  lstat  medv
0    0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296     15.3  396.90   4.98   Low
1    0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242     17.8  396.90   9.14   Low
2    0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242     17.8  392.83   4.03  High
3    0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222     18.7  394.63   2.94  High
4    0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222     18.7  396.90   5.33  High
..       ...   ...    ...   ...    ...    ...   ...     ...  ...  ...      ...     ...    ...   ...
501  0.06263   0.0  11.93     0  0.573  6.593  69.1  2.4786    1  273     21.0  391.99   9.67   Low
502  0.04527   0.0  11.93     0  0.573  6.120  76.7  2.2875    1  273     21.0  396.90   9.08   Low
503  0.06076   0.0  11.93     0  0.573  6.976  91.0  2.1675    1  273     21.0  396.90   5.64   Low
504  0.10959   0.0  11.93     0  0.573  6.794  89.3  2.3889    1  273     21.0  393.45   6.48   Low
505  0.04741   0.0  11.93     0  0.573  6.030  80.8  2.5050    1  273     21.0  396.90   7.88   Low

506 rows × 14 columns

# Create a DataFrame whose rows are strides (sliding windows) of a given series
L = pd.Series(range(15))

def gen_strides(a, stride_len=5, window_len=5):
    # number of windows of length window_len that fit when stepping by stride_len
    n_strides = ((a.size - window_len)//stride_len) + 1
    return np.array([a[s:(s + window_len)] for s in np.arange(0, a.size, stride_len)[:n_strides]])

gen_strides(L, stride_len=2, window_len=4)

array([[ 0,  1,  2,  3],
       [ 2,  3,  4,  5],
       [ 4,  5,  6,  7],
       [ 6,  7,  8,  9],
       [ 8,  9, 10, 11],
       [10, 11, 12, 13]], dtype=int64)
# Import only the specified columns from a csv file
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv',
                 usecols=['crim', 'medv'])
df.head()
      crim  medv
0  0.00632  24.0
1  0.02731  21.6
2  0.02729  34.7
3  0.03237  33.4
4  0.06905  36.2
# Get the number of rows and columns, the dtypes and summary statistics of a DataFrame
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
# print(df.shape)                  # number of rows and columns
# print(df.dtypes)                 # dtype of each column
# print(df.dtypes.value_counts())  # number of columns of each dtype
# print(df.values.tolist())        # values as a list of lists
df.describe()  # summary statistics
       Min.Price      Price  Max.Price   MPG.city  MPG.highway  EngineSize
count  86.000000  91.000000  88.000000  84.000000    91.000000   91.000000
mean   17.118605  19.616484  21.459091  22.404762    29.065934    2.658242
std     8.828290   9.724280  10.696563   5.841520     5.370293    1.045845
min     6.700000   7.400000   7.900000  15.000000    20.000000    1.000000
25%    10.825000  12.350000  14.575000  18.000000    26.000000    1.800000
50%    14.600000  17.700000  19.150000  21.000000    28.000000    2.300000
75%    20.250000  23.500000  24.825000  25.000000    31.000000    3.250000
max    45.400000  61.900000  80.000000  46.000000    50.000000    5.700000

       Horsepower          RPM  Rev.per.mile  Fuel.tank.capacity  Passengers
count   86.000000    90.000000     87.000000           85.000000   91.000000
mean   144.000000  5276.666667   2355.000000           16.683529    5.076923
std     53.455204   605.554811    486.916616            3.375748    1.045953
min     55.000000  3800.000000   1320.000000            9.200000    2.000000
25%    100.750000  4800.000000   2017.500000           14.500000    4.000000
50%    140.000000  5200.000000   2360.000000           16.500000    5.000000
75%    170.000000  5787.500000   2565.000000           19.000000    6.000000
max    300.000000  6500.000000   3755.000000           27.000000    8.000000

           Length   Wheelbase      Width  Turn.circle  Rear.seat.room
count   89.000000   92.000000  87.000000    88.000000       89.000000
mean   182.865169  103.956522  69.448276    38.954545       27.853933
std     14.792651    6.856317   3.778023     3.304157        3.018129
min    141.000000   90.000000  60.000000    32.000000       19.000000
25%    174.000000   98.000000  67.000000    36.000000       26.000000
50%    181.000000  103.000000  69.000000    39.000000       27.500000
75%    192.000000  110.000000  72.000000    42.000000       30.000000
max    219.000000  119.000000  78.000000    45.000000       36.000000

       Luggage.room       Weight
count     74.000000    86.000000
mean      13.986486  3104.593023
std        3.120824   600.129993
min        6.000000  1695.000000
25%       12.000000  2647.500000
50%       14.000000  3085.000000
75%       16.000000  3567.500000
max       22.000000  4105.000000
# Which Manufacturer, Model and Type has the highest Price?
# What are the row and column numbers of the cell with the highest Price?
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

max_price = df.loc[df.Price == np.max(df.Price), ['Manufacturer', 'Model', 'Type']]
print(max_price)

row, col = np.where(df.values == np.max(df.Price))
print(row, col)

# print(df.iat[row[0], col[0]])  # value of the first matching cell (positional access)
print(df.iloc[row[0], col[0]])   # value of the first matching cell (positional access)

print(df.at[row[0], 'Price'])    # value of the Price column in that row (label access)
     Manufacturer Model     Type
58  Mercedes-Benz  300E  Midsize
[58] [4]
61.9
61.9
# Rename specific columns in a DataFrame
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
# print(df.columns)
df.columns = df.columns.map(lambda x: x.replace('.', '_'))
print(df.columns)
Index(['Manufacturer', 'Model', 'Type', 'Min_Price', 'Price', 'Max_Price',
       'MPG_city', 'MPG_highway', 'AirBags', 'DriveTrain', 'Cylinders',
       'EngineSize', 'Horsepower', 'RPM', 'Rev_per_mile', 'Man_trans_avail',
       'Fuel_tank_capacity', 'Passengers', 'Length', 'Wheelbase', 'Width',
       'Turn_circle', 'Rear_seat_room', 'Luggage_room', 'Weight', 'Origin',
       'Make'],
      dtype='object')
# Check whether the DataFrame has any missing values
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

df.isnull().values.any()
True
# Count the missing values in each column. Which column has the most missing values?
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

count = df.apply(lambda x: x.isnull().sum())
print(count)           # number of missing values per column
print(count.idxmax())  # column with the most missing values
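The same counts can be obtained without apply; a one-line sketch on the same df:

print(df.isnull().sum())           # missing values per column
print(df.isnull().sum().idxmax())  # column with the most missing values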
# Select a specific column from a DataFrame as a DataFrame instead of as a Series
df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))
print(df)

print(type(df[['a']]))
print(type(df.loc[:, ['a']]))
print(type(df.iloc[:, [0]]))

print(type(df.a))
print(type(df['a']))
print(type(df.loc[:, 'a']))
print(type(df.iloc[:, 1]))
    a   b   c   d   e
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))
print(df)  # before swapping
# print(df[list('cbade')])  # swap two columns by reordering them (here a and c)

# Custom function that swaps two columns
def switch_columns(df, col_1=None, col_2=None):
    colnames = df.columns.tolist()
    i_1, i_2 = colnames.index(col_1), colnames.index(col_2)
    colnames[i_2], colnames[i_1] = colnames[i_1], colnames[i_2]
    return df[colnames]

df1 = switch_columns(df, 'a', 'c')  # swap columns a and c
print(df1)
print(sorted(df.columns, reverse=True))  # column names in descending order

    a   b   c   d   e
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
    c   b   a   d   e
0   2   1   0   3   4
1   7   6   5   8   9
2  12  11  10  13  14
3  17  16  15  18  19
['e', 'd', 'c', 'b', 'a']
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
pd.set_option("display.max_columns",10)
pd.set_option('display.max_rows',10)
df
   Manufacturer    Model     Type  Min.Price  Price  ...  Rear.seat.room  Luggage.room  Weight   Origin                Make
0         Acura  Integra    Small       12.9   15.9  ...            26.5           NaN  2705.0  non-USA       Acura Integra
1           NaN   Legend  Midsize       29.2   33.9  ...            30.0          15.0  3560.0  non-USA        Acura Legend
2          Audi       90  Compact       25.9   29.1  ...            28.0          14.0  3375.0  non-USA             Audi 90
3          Audi      100  Midsize        NaN   37.7  ...            31.0          17.0  3405.0  non-USA            Audi 100
4           BMW     535i  Midsize        NaN   30.0  ...            27.0          13.0  3640.0  non-USA            BMW 535i
..          ...      ...      ...        ...    ...  ...             ...           ...     ...      ...                 ...
88   Volkswagen  Eurovan      Van       16.6   19.7  ...            34.0           NaN  3960.0      NaN  Volkswagen Eurovan
89   Volkswagen   Passat  Compact       17.6   20.0  ...            31.5          14.0  2985.0  non-USA   Volkswagen Passat
90   Volkswagen  Corrado   Sporty       22.9   23.3  ...            26.0          15.0  2810.0  non-USA  Volkswagen Corrado
91        Volvo      240  Compact       21.8   22.7  ...            29.5          14.0  2985.0  non-USA           Volvo 240
92          NaN      850  Midsize       24.8   26.7  ...            30.0          15.0  3245.0  non-USA           Volvo 850

93 rows × 27 columns

df = pd.DataFrame(np.random.random(4), columns=['random'])
# Format or suppress scientific notation in a pandas DataFrame
print(df.round(4))  # show only four decimal places
# Format all the values in the DataFrame as percentages
out = df.style.format({
    'random': '{0:.2%}'.format,
})
out
   random
0  0.8620
1  0.7903
2  0.0159
3  0.5417
  random
0 86.20%
1 79.03%
2  1.59%
3 54.17%
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

df.iloc[::20, :][['Manufacturer', 'Model', 'Type']]  # take every 20th row
   Manufacturer    Model     Type
0         Acura  Integra    Small
20     Chrysler  LeBaron  Compact
40        Honda  Prelude   Sporty
60      Mercury   Cougar  Midsize
80       Subaru   Loyale    Small
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv', 
                 usecols=[0,1,2,3,5])
# df.head()
df[['Manufacturer', 'Model', 'Type']] = df[['Manufacturer', 'Model', 'Type']].fillna('missing')
# df.head()
df.index = df.Manufacturer + '_' + df.Model + '_' + df.Type
print(df.index.is_unique)  # the combination of these columns forms a unique (primary-key) index
# df.head()
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randint(1, 30, 30).reshape(10,-1), columns=list('abc'))

print(df['a'])
print(df['a'].argsort())    # for each sorted position, the row that holds that value
df['a'].argsort()[::-1][5]  # row number of the n-th largest value in column 'a'
0    25
1    16
2    12
3     8
4     6
5    17
6    15
7    24
8    16
9    28
Name: a, dtype: int32
0    4
1    3
2    2
3    6
4    1
5    8
6    5
7    7
8    0
9    9
Name: a, dtype: int64





8
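Because scalar [] lookup on an integer-indexed Series is label-based, the [::-1] above does not change which element [5] selects. A more explicit sketch for "row label of the nth largest value", with n assumed to be 6 to mirror the [5] above:

n = 6  # assumption: the 6th largest value
print(df['a'].sort_values(ascending=False).index[n - 1])  # row label holding the nth largest value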
ser = pd.Series(np.random.randint(1, 100, 15))

print('ser: ', ser.tolist(), 'mean: ', round(ser.mean()))
np.argwhere(ser > ser.mean())  # positions in ser where the value is greater than the mean
ser:  [54, 77, 49, 74, 24, 95, 94, 14, 7, 50, 69, 65, 72, 72, 58] mean:  58.0





array([[ 1],
       [ 3],
       [ 5],
       [ 6],
       [10],
       [11],
       [12],
       [13]], dtype=int64)
# Get the last two rows of the DataFrame whose row sum is greater than 100
df = pd.DataFrame(np.random.randint(10, 40, 60).reshape(-1, 4))

rowsums = df.apply(np.sum, axis=1)      # sum of each row
print(np.where(rowsums > 100)[0][-2:])  # positions of the last two rows with sum > 100
last_two_rows = df.iloc[np.where(rowsums > 100)[0][-2:], :]
last_two_rows
[11 14]
     0   1   2   3
11  27  32  22  32
14  21  35  37  30
# Replace everything below the 5th percentile and above the 95th percentile of ser
# with the 5th and 95th percentile values respectively.
ser = pd.Series(np.logspace(-2, 2, 30))

def cap_outliers(ser, low_perc, high_perc):
    low, high = ser.quantile([low_perc, high_perc])
    print(low_perc, '%ile: ', low, '|', high_perc, '%ile: ', high)
    ser[ser < low] = low
    ser[ser > high] = high
    return ser

capped_ser = cap_outliers(ser, .05, .95)
print(capped_ser)
0.05 %ile:  0.016049294076965887 | 0.95 %ile:  63.876672220183934
0      0.016049
1      0.016049
2      0.018874
3      0.025929
4      0.035622
5      0.048939
6      0.067234
7      0.092367
8      0.126896
9      0.174333
10     0.239503
11     0.329034
12     0.452035
13     0.621017
14     0.853168
15     1.172102
16     1.610262
17     2.212216
18     3.039195
19     4.175319
20     5.736153
21     7.880463
22    10.826367
23    14.873521
24    20.433597
25    28.072162
26    38.566204
27    52.983169
28    63.876672
29    63.876672
dtype: float64
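pandas can do the same capping with Series.clip; a sketch using the same 5% and 95% quantiles (on a fresh copy, since cap_outliers modified ser in place):

ser = pd.Series(np.logspace(-2, 2, 30))
low, high = ser.quantile([0.05, 0.95])
print(ser.clip(lower=low, upper=high).head())  # same effect as cap_outliers above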
# Reshape the DataFrame into the largest possible square after removing negative values
df = pd.DataFrame(np.random.randint(-20, 50, 100).reshape(10,-1))
# print(df)
arr = df[df > 0].values.flatten()    # flatten; values that are not positive become nan
# print(arr)
arr_qualified = arr[~np.isnan(arr)]  # drop the nans
print(arr_qualified)
# largest n such that n*n values are available
n = int(np.floor(arr_qualified.shape[0]**.5))
# positions that would sort the remaining values
top_indexes = np.argsort(arr_qualified)[::]
# print(top_indexes)
print(top_indexes[:n**2])
# take the first n*n of those positions, restore their original order and reshape into an n x n square
output = np.take(arr_qualified, sorted(top_indexes[:n**2])).reshape(n, -1)
print(output)

[ 6. 29. 48. 22. 14. 10. 49.  9. 18. 42. 31. 42. 16. 35. 45. 10.  2. 27.
 48.  2. 16. 48. 22. 12. 23. 13. 34. 38. 18. 10. 12. 48. 39. 18. 49. 24.
 35. 13. 16. 30. 35. 22. 44. 46.  8. 30.  1.  5. 30.  7. 15. 22.  6. 43.
 47.  8. 32. 21. 46.  5. 20. 39.  9. 17.]
[46 19 16 59 47  0 52 49 44 55 62  7  5 15 29 30 23 25 37  4 50 12 38 20
 63  8 33 28 60 57 22 51  3 41 24 35 17  1 45 39 48 10 56 26 40 13 36 27
 32 61 11  9 53 42 14 43 58 54  2 18 21 31 34  6]
[[ 6. 29. 48. 22. 14. 10. 49.  9.]
 [18. 42. 31. 42. 16. 35. 45. 10.]
 [ 2. 27. 48.  2. 16. 48. 22. 12.]
 [23. 13. 34. 38. 18. 10. 12. 48.]
 [39. 18. 49. 24. 35. 13. 16. 30.]
 [35. 22. 44. 46.  8. 30.  1.  5.]
 [30.  7. 15. 22.  6. 43. 47.  8.]
 [32. 21. 46.  5. 20. 39.  9. 17.]]
df=pd.DataFrame(np.arange(25).reshape(5,-1))
print(df)

# Swap two rows of a DataFrame
def swap_rows(df,i1,i2):
    df.iloc[i1,:],df.iloc[i2,:]=df.iloc[i2,:].copy(),df.iloc[i1,:].copy()
    return df
result=swap_rows(df,1,2)
result
    0   1   2   3   4
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
4  20  21  22  23  24
    0   1   2   3   4
0   0   1   2   3   4
1  10  11  12  13  14
2   5   6   7   8   9
3  15  16  17  18  19
4  20  21  22  23  24
df = pd.DataFrame(np.arange(25).reshape(5, -1))
print(df)
df.iloc[::-1, :]  # reverse the rows of the DataFrame
    0   1   2   3   4
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
4  20  21  22  23  24
    0   1   2   3   4
4  20  21  22  23  24
3  15  16  17  18  19
2  10  11  12  13  14
1   5   6   7   8   9
0   0   1   2   3   4
df = pd.DataFrame(np.arange(25).reshape(5,-1), columns=list('abcde'))
print(df)
# One-hot encode column 'a' of df and keep the other columns alongside it
result = pd.get_dummies(df['a'])
df_onehot = pd.concat([result, df[list('bcde')]], axis=1)  # concatenate along columns
df_onehot
    a   b   c   d   e
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
4  20  21  22  23  24
   0  5  10  15  20   b   c   d   e
0  1  0   0   0   0   1   2   3   4
1  0  1   0   0   0   6   7   8   9
2  0  0   1   0   0  11  12  13  14
3  0  0   0   1   0  16  17  18  19
4  0  0   0   0   1  21  22  23  24
# Which column contains the highest number of row-wise maximum values?
df = pd.DataFrame(np.random.randint(1,100, 40).reshape(10, -1))
print(df)
print(df.apply(np.argmax, axis=1))                          # column position of each row's maximum
print(df.apply(np.argmax, axis=1).value_counts())           # how often each column holds the row maximum
print(df.apply(np.argmax, axis=1).value_counts().index[0])  # the column that holds the row maximum most often
    0   1   2   3
0  10  87  19  43
1   5  83  50  80
2  19  24  10  77
3  36  15  95  78
4   8  20  89  48
5  17  17  81  46
6  88  74  52  72
7  91  53  36  61
8  25  53  22  90
9   3  93  86  63
0    1
1    1
2    3
3    2
4    2
5    2
6    0
7    0
8    3
9    1
dtype: int64
2    3
1    3
3    2
0    2
dtype: int64
2
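np.argmax inside apply returns positional indices; the built-in idxmax returns the column labels directly. A sketch on the same df:

row_max_col = df.idxmax(axis=1)             # column label of each row's maximum
print(row_max_col.value_counts().index[0])  # column that holds the row maximum most often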
# For each column, find the maximum absolute correlation with the other columns
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1), 
                 index=list('abcdefgh'), columns=list('pqrstuvwxy'))
print(df)
print(df.corr())                 # correlation matrix between the columns
abs_corrmat = np.abs(df.corr())  # absolute values of the correlations
print(abs_corrmat)
max_corr = abs_corrmat.apply(lambda x: sorted(x)[-2])  # second largest per column (the largest is the self-correlation of 1)
print(max_corr)
print(np.round(max_corr.tolist(), 2))  # rounded to two decimal places
    p   q   r   s   t   u   v   w   x   y
a  41  72   5  31  67  26  45  65  21  60
b  15  56  72  91  99  32  38  14  52  36
c   7  92  96  84  26  79  81  12  75  50
d  73  46  42  15  80  76  10  34  45   5
e  15  72  55  14  17  54   9  35  36  18
f  12  73  47  84  85   9  31  67  13  64
g  25  43  56  76  62  43  93  25  53  99
h  80  70  30  68  40  74   2  41   7  47
          p         q         r         s         t         u         v  \
p  1.000000 -0.348616 -0.606492 -0.406070  0.056932  0.461015 -0.543905   
q -0.348616  1.000000  0.213753  0.159442 -0.554585  0.088275  0.023752   
r -0.606492  0.213753  1.000000  0.501096 -0.190063  0.290160  0.440666   
s -0.406070  0.159442  0.501096  1.000000  0.260183 -0.243481  0.505580   
t  0.056932 -0.554585 -0.190063  0.260183  1.000000 -0.568596 -0.009954   
u  0.461015  0.088275  0.290160 -0.243481 -0.568596  1.000000 -0.118254   
v -0.543905  0.023752  0.440666  0.505580 -0.009954 -0.118254  1.000000   
w  0.207508  0.125992 -0.797659 -0.285267  0.192809 -0.562124 -0.358931   
x -0.452943 -0.029441  0.798096  0.187066 -0.122999  0.376021  0.637259   
y -0.294716 -0.043568 -0.043181  0.579817  0.083632 -0.434149  0.729595   

          w         x         y  
p  0.207508 -0.452943 -0.294716  
q  0.125992 -0.029441 -0.043568  
r -0.797659  0.798096 -0.043181  
s -0.285267  0.187066  0.579817  
t  0.192809 -0.122999  0.083632  
u -0.562124  0.376021 -0.434149  
v -0.358931  0.637259  0.729595  
w  1.000000 -0.835494  0.145546  
x -0.835494  1.000000 -0.030812  
y  0.145546 -0.030812  1.000000  
          p         q         r         s         t         u         v  \
p  1.000000  0.348616  0.606492  0.406070  0.056932  0.461015  0.543905   
q  0.348616  1.000000  0.213753  0.159442  0.554585  0.088275  0.023752   
r  0.606492  0.213753  1.000000  0.501096  0.190063  0.290160  0.440666   
s  0.406070  0.159442  0.501096  1.000000  0.260183  0.243481  0.505580   
t  0.056932  0.554585  0.190063  0.260183  1.000000  0.568596  0.009954   
u  0.461015  0.088275  0.290160  0.243481  0.568596  1.000000  0.118254   
v  0.543905  0.023752  0.440666  0.505580  0.009954  0.118254  1.000000   
w  0.207508  0.125992  0.797659  0.285267  0.192809  0.562124  0.358931   
x  0.452943  0.029441  0.798096  0.187066  0.122999  0.376021  0.637259   
y  0.294716  0.043568  0.043181  0.579817  0.083632  0.434149  0.729595   

          w         x         y  
p  0.207508  0.452943  0.294716  
q  0.125992  0.029441  0.043568  
r  0.797659  0.798096  0.043181  
s  0.285267  0.187066  0.579817  
t  0.192809  0.122999  0.083632  
u  0.562124  0.376021  0.434149  
v  0.358931  0.637259  0.729595  
w  1.000000  0.835494  0.145546  
x  0.835494  1.000000  0.030812  
y  0.145546  0.030812  1.000000  
p    0.606492
q    0.554585
r    0.798096
s    0.579817
t    0.568596
u    0.568596
v    0.729595
w    0.835494
x    0.835494
y    0.729595
dtype: float64
[0.61 0.55 0.8  0.58 0.57 0.57 0.73 0.84 0.84 0.73]
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
print(df)
# For every row of df, compute the ratio of the minimum value to the maximum value
min_by_max = np.min(df, axis=1)/np.max(df, axis=1)
min_by_max
    0   1   2   3   4   5   6   7   8   9
0  85  63  99  34  13  14  64  33  58  16
1  64  45  77  68  19  45  61   2  11  15
2  78  66  76  51  51  52  20  53  35  64
3  68  85   2  81  52  66  14  28  41  34
4  37  40  99  62  57  70  37  15  14  56
5  13  88  12  51  43   1  54  18  70  67
6  55  19  79  43  19   8  52   6  15  77
7  79  93  54  68  78  61  80  33  72  92





0    0.131313
1    0.025974
2    0.256410
3    0.023529
4    0.141414
5    0.011364
6    0.075949
7    0.354839
dtype: float64
# Create a new column 'penultimate' that holds the second largest value of each row of df
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
result = df.apply(lambda x: x.sort_values().unique()[-2], axis=1)  # second largest unique value per row
df['penultimate'] = result
df
0123456789penultimate
05012772522974940271877
114527836757717435377
29253103955346389604189
398966508844619877588
4979575509160653245995
53138481915271845781
659719334940546048459
7902177443509823843090
# Normalize all columns of a DataFrame
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
print(df)

# Normalize every column of df by subtracting the column mean and dividing by the standard deviation
result = df.apply(lambda x: (
    (x - x.mean())/x.std()
).round(2))
result
    0   1   2   3   4   5   6   7   8   9
0  73  77  53  35   9  80  96  47  35  26
1  58  72  39  80  86  57  41  98  31  90
2  45  76  22  27   5  15  78  90  87  92
3  89  84  97  78  29  70  23  95  97  90
4  55  32  83  49  99  63  22  75  44  26
5  74  42  70  49  57  26  88  77   1   5
6  56  29  42  28  75  16  21  11  38  50
7  99  26  74  74  39  50  61   3  23   3
      0     1     2     3     4     5     6     7     8     9
0  0.24  0.90 -0.28 -0.79 -1.16  1.31  1.36 -0.40 -0.30 -0.57
1 -0.57  0.70 -0.83  1.24  1.03  0.39 -0.41  0.96 -0.42  1.10
2 -1.28  0.86 -1.51 -1.15 -1.28 -1.28  0.78  0.75  1.32  1.15
3  1.10  1.18  1.47  1.15 -0.59  0.91 -0.99  0.88  1.63  1.10
4 -0.74 -0.92  0.91 -0.16  1.40  0.63 -1.02  0.35 -0.02 -0.57
5  0.29 -0.52  0.40 -0.16  0.20 -0.84  1.10  0.40 -1.35 -1.11
6 -0.68 -1.04 -0.71 -1.10  0.72 -1.24 -1.05 -1.36 -0.20  0.06
7  1.64 -1.16  0.56  0.97 -0.31  0.11  0.23 -1.57 -0.67 -1.17
# Min-max scale all columns of a DataFrame
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
print(df)

# Rescale every column of df so that its minimum becomes 0 and its maximum becomes 1
result = df.apply(lambda x: (
    1 - (x.max() - x)/(x.max() - x.min())
).round(2))
result
    0   1   2   3   4   5   6   7   8   9
0  78   7  59   4  77  81  93  66  39  28
1  60  51  88  19  23  29  70  82  10  24
2   2  80   7  59  72  51  82  28  38  25
3  36  88   3   8  43   7  87  60  28  99
4  29  69  89  84  87  15  95  87  75  54
5  82  78  60  57  15  29  41  93  57  13
6  72  28  63   2  20  25   6  72  71  32
7  60   2  13  87  82  97  41  23  81  16
      0     1     2     3     4     5     6     7     8     9
0  0.95  0.06  0.65  0.02  0.86  0.82  0.98  0.61  0.41  0.17
1  0.72  0.57  0.99  0.20  0.11  0.24  0.72  0.84  0.00  0.13
2  0.00  0.91  0.05  0.67  0.79  0.49  0.85  0.07  0.39  0.14
3  0.43  1.00  0.00  0.07  0.39  0.00  0.91  0.53  0.25  1.00
4  0.34  0.78  1.00  0.96  1.00  0.09  1.00  0.91  0.92  0.48
5  1.00  0.88  0.66  0.65  0.00  0.24  0.39  1.00  0.66  0.00
6  0.88  0.30  0.70  0.00  0.07  0.20  0.00  0.70  0.86  0.22
7  0.72  0.00  0.12  1.00  0.93  1.00  0.39  0.00  1.00  0.03
# Compute the correlation of each row with the row that follows it
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
print(df)
print([i for i in range(df.shape[0])[:-1]])  # every row index except the last

result = [df.iloc[i].corr(df.iloc[i+1]) for i in range(df.shape[0])[:-1]]
result
    0   1   2   3   4   5   6   7   8   9
0  27   3  37   9  76  68  91  31  44   7
1  11  15  20  47  33  86  65  47   9  30
2  39   1  72  19  35  42  87  77  55  40
3  60   7   8  28  37  14  17   5   3   7
4  47  99  76  28  77  57  32  57  24  16
5   2  50  95  89  84  46  59  84   1   2
6  78  27  58  67  78   1   7  28  89  20
7  12  86  54  81  20  19  77   1   8  56
[0, 1, 2, 3, 4, 5, 6]





[0.5182965633327684,
 0.2595376913412023,
 -0.23874062518280761,
 0.005261734793477499,
 0.4687394611664755,
 -0.06555011633952691,
 -0.30907671467693215]
# Set both the main diagonal and the anti-diagonal of the DataFrame to 0
df = pd.DataFrame(np.random.randint(1,100, 100).reshape(10, -1))
rows = df.shape[0]  # number of rows
for i in range(rows):
    df.iat[i, i] = 0             # main diagonal
    df.iat[rows - i - 1, i] = 0  # anti-diagonal
df
0123456789
006592821015171320
179011992868248083
234403511918304129
384725065760252564
49814210002944084
57588270023627395
62343380364307656
7809608292796406167
829096967621947204
902627659519191900
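NumPy can zero both diagonals without an explicit Python loop; a sketch that fills the diagonals on the array before wrapping it in a DataFrame (np.fliplr returns a view, so filling its diagonal also writes into arr):

arr = np.random.randint(1, 100, 100).reshape(10, -1)
np.fill_diagonal(arr, 0)             # main diagonal
np.fill_diagonal(np.fliplr(arr), 0)  # anti-diagonal
df = pd.DataFrame(arr)
print(df)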
# Get a particular group of a groupby DataFrame by its key
df = pd.DataFrame({'col1': ['apple', 'banana', 'orange'] * 3,
                   'col2': np.random.rand(9),
                   'col3': np.random.randint(0, 15, 9)})
print(df)
# group by df['col1'] and show the rows that belong to the 'apple' group
df.groupby(df['col1']).get_group('apple')
     col1      col2  col3
0   apple  0.703158    12
1  banana  0.535815    13
2  orange  0.177147     8
3   apple  0.159570     2
4  banana  0.411271    10
5  orange  0.279007    11
6   apple  0.576264     4
7  banana  0.578607     9
8  orange  0.242959     6
    col1      col2  col3
0  apple  0.703158    12
3  apple  0.159570     2
6  apple  0.576264     4
# How to get the nth largest value of a column when grouped by another column
df = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 4,
                   'taste': np.random.rand(12),
                   'price': np.random.randint(0, 15, 12)})
# for the 'banana' group of df['fruit'], look at the matching df['taste'] values
banana = df['taste'].groupby(df['fruit']).get_group('banana')
print(banana)
print("Second largest:", banana.sort_values().iloc[-2])  # sort ascending, take the second to last value
df
1     0.209485
4     0.549818
7     0.498802
10    0.006632
Name: taste, dtype: float64
Second largest: 0.4988018517868045
     fruit     taste  price
0    apple  0.510446      7
1   banana  0.209485      1
2   orange  0.632166      1
3    apple  0.865764      4
4   banana  0.549818      9
5   orange  0.744718      5
6    apple  0.069171      0
7   banana  0.498802     14
8   orange  0.011808      2
9    apple  0.103222     13
10  banana  0.006632      6
11  orange  0.017787     13
# Compute the grouped mean on a DataFrame and keep the grouping column as a column (not as the index)
df = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                   'rating': np.random.rand(9),
                   'price': np.random.randint(0, 15, 9)})
print(df)
# mean price for each value of df['fruit']
out = df.groupby(df['fruit'], as_index=False)['price'].mean()
out
    fruit    rating  price
0   apple  0.090672      2
1  banana  0.019506      0
2  orange  0.354463      5
3   apple  0.466694     14
4  banana  0.807733      8
5  orange  0.488868      4
6   apple  0.640913      8
7  banana  0.977691      8
8  orange  0.390033      0
    fruit     price
0   apple  8.000000
1  banana  5.333333
2  orange  3.000000
# Join two DataFrames on two columns so that only the common rows are kept
df1 = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                    'weight': ['high', 'medium', 'low'] * 3,
                    'price': np.random.randint(0, 15, 9)})

df2 = pd.DataFrame({'pazham': ['apple', 'orange', 'pine'] * 2,
                    'pounds': ['high', 'low'] * 3,
                    'price': np.random.randint(0, 15, 6)})
print(df1)
print(df2)
pd.merge(df1, df2, how='inner', 
         left_on=['fruit', 'weight'], 
         right_on=['pazham', 'pounds'], 
         suffixes=['_left', '_right'])
    fruit  weight  price
0   apple    high     13
1  banana  medium      5
2  orange     low     14
3   apple    high      2
4  banana  medium      2
5  orange     low      6
6   apple    high      7
7  banana  medium      1
8  orange     low      6
   pazham pounds  price
0   apple   high     10
1  orange    low     14
2    pine   high     12
3   apple    low     12
4  orange   high     11
5    pine    low      6
    fruit weight  price_left  pazham pounds  price_right
0   apple   high          13   apple   high           10
1   apple   high           2   apple   high           10
2   apple   high           7   apple   high           10
3  orange    low          14  orange    low           14
4  orange    low           6  orange    low           14
5  orange    low           6  orange    low           14
# Get the positions where the values of two columns match
df = pd.DataFrame({'fruit1': np.random.choice(['apple', 'orange', 'banana'], 10),
                    'fruit2': np.random.choice(['apple', 'orange', 'banana'], 10)})
print(df)
np.where(df.fruit1 == df.fruit2)[0]  # positions where the two columns hold the same value
   fruit1  fruit2
0  orange  orange
1  orange   apple
2  orange   apple
3  banana  banana
4   apple  banana
5  orange  orange
6  orange  banana
7  banana  orange
8   apple  banana
9  orange  banana





array([0, 3, 5], dtype=int64)
# Create lags and leads of columns in a DataFrame
df = pd.DataFrame(np.random.randint(1, 100, 20).reshape(-1, 4),
                  columns = list('abcd'))

print(df)
df['a_lag'] = df['a'].shift(1)    # lag: each row gets the previous value of column a
print(df)
df['b_lead'] = df['b'].shift(-1)  # lead: each row gets the next value of column b
df
    a   b   c   d
0  31  79  72  32
1   8  18  82  25
2  98  23  41  79
3   2  87  74  76
4  16  89  12  86
    a   b   c   d  a_lag
0  31  79  72  32    NaN
1   8  18  82  25   31.0
2  98  23  41  79    8.0
3   2  87  74  76   98.0
4  16  89  12  86    2.0
    a   b   c   d  a_lag  b_lead
0  31  79  72  32    NaN    18.0
1   8  18  82  25   31.0    23.0
2  98  23  41  79    8.0    87.0
3   2  87  74  76   98.0    89.0
4  16  89  12  86    2.0     NaN
# Get the frequency of the unique values in the entire DataFrame
df = pd.DataFrame(np.random.randint(1, 10, 20).reshape(-1, 4), 
                  columns = list('abcd'))
ravel = df.values.ravel()  # flatten all values into a 1-D array
print(ravel)
pd.value_counts(ravel)     # frequency of each value
[6 1 6 8 7 8 3 4 1 8 7 7 1 1 9 8 5 8 5 5]





8    5
1    4
7    3
5    3
6    2
9    1
4    1
3    1
dtype: int64
# Split a text column into separate columns
df = pd.DataFrame(["STD, City,State",
                    "33, Kolkata,West Bengal",
                    "44, Chennai,Tamil Nadu",
                    "40, Hyderabad  ,Telengana",
                    "80, Bangalore,Karnataka"],
                  columns=['row'])
# print(df)
# split every row on ","
df_out = df.row.str.split(",", expand=True)
# print(df_out)

new_header = df_out.iloc[0]  # the first row holds the column names
df_out = df_out[1:]          # the remaining rows hold the data

df_out.columns = new_header  # use the first row as the header
# print(new_header)
print(df_out)
0 STD          City        State
1  33       Kolkata  West Bengal
2  44       Chennai   Tamil Nadu
3  40   Hyderabad      Telengana
4  80     Bangalore    Karnataka