python中读取txt文件、统计其中所有字母出现的频度_python——pandas练习题6-10

前五套练习小rrrrr:python——pandas练习题1-5​zhuanlan.zhihu.com

练习6-统计

探索风速数据

import pandas as pd

import datetime

# Load the whitespace-separated wind data. The separator regex is written as a
# raw string so that "\s" is not interpreted as an (invalid) string escape.
# The nested list in parse_dates merges columns 0-2 into one 'Yr_Mo_Dy' column.
# NOTE(review): the nested-list form of parse_dates is deprecated in pandas 2.x
# - confirm against the pandas version in use.
data = pd.read_table(r"exercise_data/wind.data", sep=r"\s+", parse_dates=[[0, 1, 2]])

data.head()

#创建一个函数并用它去修复2061年这个bug

def fix_year(x):
    """Return the date of *x* with a mis-parsed century corrected.

    Years later than 1989 are shifted back by 100 (e.g. 2061 becomes 1961);
    all other years are kept unchanged.

    Args:
        x: Any object exposing ``year``, ``month`` and ``day`` attributes.

    Returns:
        A ``datetime.date`` carrying the corrected year and the original
        month and day.
    """
    year = x.year - 100 if x.year > 1989 else x.year
    return datetime.date(year, x.month, x.day)

data['Yr_Mo_Dy']=data['Yr_Mo_Dy'].apply(fix_year)

data.head()

#将日期设为索引,注意数据类型,应该是datetime64[ns]

data['Yr_Mo_Dy']=pd.to_datetime(data['Yr_Mo_Dy'])

data=data.set_index('Yr_Mo_Dy')

data.head()

#对应每个location 有多少个缺失值

pd.isnull(data).sum()

RPT 6

VAL 3

ROS 2

KIL 5

SHA 2

BIR 0

DUB 3

CLA 2

MUL 3

CLO 1

BEL 0

MAL 4

dtype: int64

#对应每一个location,一共有多少完整的数据值

data.shape[0]-pd.isnull(data).sum()

RPT 6568

VAL 6571

ROS 6572

KIL 6569

SHA 6572

BIR 6574

DUB 6571

CLA 6572

MUL 6571

CLO 6573

BEL 6574

MAL 6570

dtype: int64

#对于全体数据,计算风速的平均值

data.mean().mean()

#df.mean()每一列均值。每一列的平均值再求平均

10.227982360836924

# Build loc_stats: for every location (column) store the minimum, maximum,
# mean and standard deviation of the wind speed.
# Each reduction runs down the columns (axis=0, the default).
loc_stats = pd.DataFrame({
    'min': data.min(),
    'max': data.max(),
    'mean': data.mean(),
    'std': data.std(),
})

loc_stats

# Build day_stats: for every date (row) store the minimum, maximum, mean and
# standard deviation of the wind speed across all locations.
# axis=1 makes each reduction run along a row.
day_stats = pd.DataFrame({
    'min': data.min(axis=1),
    'max': data.max(axis=1),
    'mean': data.mean(axis=1),
    'std': data.std(axis=1),
})

day_stats.head()

#对于每一个location,计算一月份的平均风速

#注意:1961年的1月和1962年的1月都视为1月(不区分年份,合并计算)

# Keep the index values available as an ordinary column
data['date'] = data.index

# Derive the calendar-part columns directly from the datetime index
# (vectorized) instead of calling a Python lambda once per row
data['month'] = data.index.month
data['year'] = data.index.year
data['day'] = data.index.day

# Select every row that falls in January, whatever the year
janurary_winds = data.query('month == 1')

# Mean wind speed per location; the label slice 'RPT':'MAL' leaves out the
# helper date/month/year/day columns added above
janurary_winds.loc[:, 'RPT':'MAL'].mean()

RPT 14.847325

VAL 12.914560

ROS 13.299624

KIL 7.199498

SHA 11.667734

BIR 8.054839

DUB 11.819355

CLA 9.512047

MUL 9.543208

CLO 10.053566

BEL 14.550520

MAL 18.028763

dtype: float64

取样 data.query(**) 参数为取样标准

#对于数据记录按照年为频率取样

data.query('month==1 and day==1')

#对于数据记录按照月为频率取样

data.query('day==1')

data.head()

练习7-可视化

探索泰坦尼克灾难数据

相应数据集:train.csv

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

titanic=pd.read_csv(r'exercise_data/train.csv')

titanic.head()

# Set PassengerId as the index. set_index returns a new DataFrame, so the
# result is assigned back; otherwise titanic itself would stay unchanged.
titanic = titanic.set_index('PassengerId')

titanic.head()

# Count the male and female passengers
males = (titanic['Sex'] == 'male').sum()
females = (titanic['Sex'] == 'female').sum()

# Slice sizes for the pie chart, in the same order as the labels below
proportions = [males, females]

# Pie chart of the two counts: no shadow, the first (male) slice pulled out,
# drawing starts at 90 degrees, and each slice is annotated with its
# percentage to one decimal place
plt.pie(
    proportions,
    labels=['Males', 'Females'],
    shadow=False,
    colors=['blue', 'red'],
    explode=(0.15, 0),
    startangle=90,
    autopct='%1.1f%%',
)

# Use equal scaling on both axes
plt.axis('equal')

# Chart title
plt.title("Sex Proportion")

# Tidy the layout and display the figure
plt.tight_layout()
plt.show()

#绘制一个展示船票Fare, 与乘客年龄和性别的散点图

# creates the plot using

lm = sns.lmplot(x = 'Age', y = 'Fare', data = titanic, hue = 'Sex', fit_reg=False)

# set title

lm.set(title = 'Fare x Age')

# get the axes object and tweak it

axes = lm.axes

axes[0,0].set_ylim(-5,)

axes[0,0].set_xlim(-5,85)

plt.tight_layout()

plt.show()

#有多少人生还?

titanic['Survived'].sum()

342

# sort the fares from the highest to the lowest value (no slicing is done here)

df = titanic.Fare.sort_values(ascending = False)

df

# create the histogram bin edges with numpy: 0, 10, ..., 590

binsVal = np.arange(0,600,10)

binsVal

# draw the histogram of the fares using those bin edges

plt.hist(df, bins = binsVal)

# Set the title and labels

plt.xlabel('Fare')

plt.ylabel('Frequency')

plt.title('Fare Payed Histrogram')

# show the plot

plt.show()

练习8-创建数据框

探索Pokemon数据

相应数据集:自定义数据框

import pandas as pd

raw_data = {"name": ['Bulbasaur', 'Charmander','Squirtle','Caterpie'],

"evolution": ['Ivysaur','Charmeleon','Wartortle','Metapod'],

"type": ['grass', 'fire', 'water', 'bug'],

"hp": [45, 39, 44, 45],

"pokedex": ['yes', 'no','yes','no']

}

pokemon=pd.DataFrame(raw_data)

pokemon

#数据框的列排序是字母顺序,请重新修改为name, type, hp, evolution, pokedex这个顺序

pokemon=pokemon[['name', 'type', 'hp', 'evolution', 'pokedex']]

pokemon

#添加一个列place

pokemon['place']=['park','street','lake','forest']

pokemon

#查看每个列的数据类型

pokemon.dtypes

name object

type object

hp int64

evolution object

pokedex object

place object

dtype: object

练习9-时间序列

探索Apple公司股价数据

相应数据集:Apple_stock.csv

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

%matplotlib inline

apple=pd.read_csv(r'exercise_data/Apple_stock.csv')

apple.head()

#查看每一列的数据类型

apple.dtypes

Date object

Open float64

High float64

Low float64

Close float64

Volume int64

Adj Close float64

dtype: object

#将Date这个列转换为datetime类型

apple['Date']=pd.to_datetime(apple['Date'])

apple['Date'].head()

0 2014-07-08

1 2014-07-07

2 2014-07-03

3 2014-07-02

4 2014-07-01

Name: Date, dtype: datetime64[ns]

#将Date设置为索引

apple=apple.set_index('Date')

apple.head()

#有重复日期吗?

apple.index.is_unique

True

#将index设置为升序

apple.sort_index(ascending = True).head()

pandas resample重采样参数

resample(self, rule, how=None, axis=0, fill_method=None, closed=None, label=None, convention='start', kind=None, loffset=None, limit=None, base=0, on=None, level=None)

BM业务月末频度

# Find the last business day of each month.
# 'BM' is the business-month-end frequency. resample() on its own only groups
# the rows; an aggregation (here the mean) is applied so the result is a
# DataFrame with one row per business month end that head() can display.
# NOTE(review): recent pandas releases rename the 'BM' alias to 'BME' -
# confirm against the pandas version in use.
apple_month = apple.resample('BM').mean()

apple_month.head()

Index索引的用途总结:

更方便的数据查询;

使用index可以获得性能提升;

自动的数据对齐功能;

更多更强大的数据结构支持;

#数据集中最早的日期和最晚的日期相差多少天?

(apple.index.max()-apple.index.min()).days

12261

# How many months does the data cover?
# Aggregate the business-month-end groups so the result has an index with one
# entry per month, then count those entries.
apple_month = apple.resample('BM').mean()

len(apple_month.index)

404

#按照时间顺序可视化Adj Close值

# makes the plot and assign it to a variable

appl_open = apple['Adj Close'].plot(title = "Apple Stock")

# changes the size of the graph

fig = appl_open.get_figure()

fig.set_size_inches(13.5, 9)

练习10-删除数据

探索Iris鸢尾花数据

相应数据集:iris.csv

import pandas as pd

iris=pd.read_csv(r'exercise_data/iris.csv')

iris.head()

#创建列名称

iris=pd.read_csv(r'exercise_data/iris.csv',names=['sepal_length','sepal_width', 'petal_length', 'petal_width', 'class'])

iris.head()

#数据框中有空值吗?

pd.isnull(iris).sum()

Out[6]:

sepal_length 0

sepal_width 0

petal_length 0

petal_width 0

class 0

dtype: int64

#将列petal_length的第10到19行设置为缺失值

iris.iloc[10:20,2:3]=np.nan

iris.head(20)

# Replace every missing petal_length value with 1.0.
# The filled column is assigned back rather than calling
# fillna(inplace=True) on the selected column, which may act on a temporary
# copy and leave iris itself unchanged.
iris['petal_length'] = iris['petal_length'].fillna(1)

iris.head(20)

#删除列class

del iris['class']

iris.head()

#将数据框前三行设置为缺失值

iris.iloc[0:3,:]=np.nan

iris.head()

#删除所有缺失行

iris=iris.dropna(how='any')

iris.head()

#重新设置索引

iris=iris.reset_index(drop=True)

iris.head()

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值