# -*- coding: utf-8 -*-
"""
Created on Tue Feb 26 13:32:28 2019
@author: Administrator
"""
"""
pandas 数据处理
"""
from pandas import DataFrame
from pandas import Series
df=DataFrame({
'age':Series([26,85,64,85,85]),
'name':Series(['ben','john','jerry','john','john'])
})
df
#判断整行重复
df.duplicated()
#判断一列重复
df.duplicated('name')
df.drop_duplicates('age')
df1=df.drop_duplicates('name')
df1
#删除字符串左、右或首、尾指定的字符
df['name'].str.strip()
df['name'].str.rstrip('n')
#重置索引
df.set_index('name')
df4=df.set_index('name')
df4.ix['john']#索引查找
df4.loc['john']#索引查找
from pandas import DataFrame
from pandas import read_excel
df2=read_excel(r'K:\\demodata\\rz.xlsx',sheetname='Sheet2')
df2
"""
rz.xlsx的Sheet2数据:
学号 姓名 英语 数分 高代 解几
0 2308024241 成龙 76 40.0 23.0 60
1 2308024244 周怡 66 47.0 47.0 44
2 2308024251 张波 85 NaN 45.0 60
3 2308024249 朱浩 65 72.0 62.0 71
4 2308024219 封印 73 61.0 47.0 46
5 2308024201 迟培 60 71.0 76.0 71
6 2308024347 李华 67 61.0 65.0 78
7 2308024307 陈田 76 69.0 NaN 69
8 2308024326 余皓 66 65.0 61.0 71
9 2308024219 封印 73 61.0 47.0 46
"""
#缺失数据识别
df2.isnull()
df2.notnull()
#缺失数据删除
df2.dropna()
newDF=df2.dropna()
newDF
#缺失值填充
df2.fillna('101')
#用前一个值替代空值
df2.fillna(method='pad')
#用后一个值代替空值
df2.fillna(method='bfill')
#用均值来填充
df2.fillna(df.mean())
#用选定列的均值来填充
df2.fillna(df.mean()['高代':'解几'])#用解几的平均值填充高代的空值
#用不同列用指定的值填充
df2.fillna({'数分':100,'高代':0})
"""
i_nuc.xls的数据:
学号 电话 IP
2308024241 18922254812 221.205.98.55
2308024244 13522255003 183.184.226.205
2308024251 13422259938 221.205.98.55
2308024249 18822256753 222.31.51.200
2308024219 18922253721 120.207.64.3
2308024201 222.31.51.200
2308024347 13822254373 222.31.59.220
2308024307 13322252452 221.205.98.55
2308024326 18922257681 183.184.230.38
2308024320 13322252452 221.205.98.55
2308024342 18922257681 183.184.230.38
2308024310 19934210999 183.184.230.39
2308024435 19934210911 185.184.230.40
2308024432 19934210912 183.154.230.41
2308024446 19934210913 183.184.231.42
2308024421 19934210914 183.154.230.43
2308024433 19934210915 173.184.230.44
2308024428 19934210916
2308024402 19934210917 183.184.230.4
2308024422 19934210918 153.144.230.7
"""
df3=read_excel(r'K:\\demodata\\i_nuc.xls',sheetname='Sheet4')
df3.head()#前5行数据
df3['电话']#抽取某列
df3['电话']=df3['电话'].astype(str)#电话列转换成字符串
#slice操作,抽取
df3['电话'].str.slice(0,3)#抽取电话前三位
df3['电话'].str.slice(3,7)
df3['电话'].str.slice(7,11)
#split操作,字段拆分
df3['IP'].str.strip()
df3['IP'].str.split('.',1,True)#按第一"."切,新增1列
newDF3=df3['IP'].str.split('.',1,True)
newDF3.columns=['IP1','IP2-4']#给新dataframe添加列名
#条件过滤
df3=read_excel(r'K:\\demodata\\i_nuc.xls',sheetname='Sheet4')
df3[df3.电话>13500000000]
df3[df3.电话.between(1340000000,13999999999)]
df3[(df3.电话>=13422259938)&(df3.电话<138222254373)]
import numpy
r=numpy.random.randint(0,10,3)
df3.loc[r,:]#抽取r行数据
df5=df3.set_index('学号')
df5.loc[2308024241:2308024201]#按索引抽取范围内数据
df5.loc[:,'电话'].head()#选取“电话”列的数据
import pandas as pd
from pandas import DataFrame
df6=DataFrame({
'a':[1,2,3],
'b':['a','b','c'],
'c':['A','B','C']
})
df6.loc[[1,2]]
index_loc=['a','b']
index_iloc=[1,2]
data=[[1,2,3,4],[5,6,7,8]]
columns=['one','two','three','four']
df7=pd.DataFrame(data=data,index=index_loc,columns=columns)
df8=pd.DataFrame(data=data,index=index_iloc,columns=columns)
df7.ix['a']
df8.ix[1]
df7
df8
#记录合并
pd.concat([df7,df8])
#追加,忽略索引
df7.append(df8,ignore_index=True)
df7['one']+df7['two']
#数学运算
"""
df2
Out[249]:
学号 姓名 英语 数分 高代 解几
0 2308024241 成龙 76 40.0 23.0 60
1 2308024244 周怡 66 47.0 47.0 44
2 2308024251 张波 85 NaN 45.0 60
3 2308024249 朱浩 65 72.0 62.0 71
4 2308024219 封印 73 61.0 47.0 46
5 2308024201 迟培 60 71.0 76.0 71
6 2308024347 李华 67 61.0 65.0 78
7 2308024307 陈田 76 69.0 NaN 69
8 2308024326 余皓 66 65.0 61.0 71
9 2308024219 封印 73 61.0 47.0 46
"""
df2['解几'].astype(int).min()
df2['解几'].astype(int).max()
#scale测量,标准化
from sklearn import preprocessing
import numpy as np
df9=df2['数分'].fillna(80)
df9_scaled=preprocessing.scale(df9)