Pandas玩转数据(六) -- 通过apply对数据进行处理

数据分析汇总学习

https://blog.csdn.net/weixin_39778570/article/details/81157884

import pandas as pd
import numpy as np
from pandas import Series, DataFrame

# 打开一个csv文件,把其中的data列拆分成几列
# Read the CSV inside a context manager so the file handle is closed as
# soon as parsing finishes instead of staying open for the whole session.
with open('apply_demo.csv') as f:
    df = pd.read_csv(f)
df.head()
Out[8]: 
         time                                data
0  1473411962   Symbol: APPL Seqno: 0 Price: 1623
1  1473411962   Symbol: APPL Seqno: 0 Price: 1623
2  1473411963   Symbol: APPL Seqno: 0 Price: 1623
3  1473411963   Symbol: APPL Seqno: 0 Price: 1623
4  1473411963   Symbol: APPL Seqno: 1 Price: 1649

# 简单的apply应用

# Build the constant column from the frame's own length and index rather
# than a hard-coded row count, so every row receives 'a' whatever the size
# of the CSV (a mismatch would leave NaN rows that break str.upper later).
s1 = Series(['a'] * len(df), index=df.index)

df['A'] = s1

df.head()
Out[12]: 
         time                                data  A
0  1473411962   Symbol: APPL Seqno: 0 Price: 1623  a
1  1473411962   Symbol: APPL Seqno: 0 Price: 1623  a
2  1473411963   Symbol: APPL Seqno: 0 Price: 1623  a
3  1473411963   Symbol: APPL Seqno: 0 Price: 1623  a
4  1473411963   Symbol: APPL Seqno: 1 Price: 1649  a

# apply传入一个函数,对某一列进行处理
df['A'] = df['A'].apply(str.upper)
df.head()
Out[14]: 
         time                                data  A
0  1473411962   Symbol: APPL Seqno: 0 Price: 1623  A
1  1473411962   Symbol: APPL Seqno: 0 Price: 1623  A
2  1473411963   Symbol: APPL Seqno: 0 Price: 1623  A
3  1473411963   Symbol: APPL Seqno: 0 Price: 1623  A
4  1473411963   Symbol: APPL Seqno: 1 Price: 1649  A

# 通过分析,我们需要的是如下数据
df['data'][0]
Out[15]: ' Symbol: APPL Seqno: 0 Price: 1623'

df['data'][0].strip().split(' ')
Out[16]: ['Symbol:', 'APPL', 'Seqno:', '0', 'Price:', '1623']

l1 = df['data'][0].strip().split(' ')

l1[1], l1[3], l1[5]
Out[18]: ('APPL', '0', '1623')

# 定义一个处理函数
def foo(line):
    """Extract the values from one ``Symbol: X Seqno: Y Price: Z`` entry.

    Args:
        line: A string of whitespace-separated ``label value`` pairs, e.g.
            ``' Symbol: APPL Seqno: 0 Price: 1623'``.

    Returns:
        A Series holding the second, fourth and sixth tokens (the symbol,
        sequence number and price in the example above), as strings.

    Raises:
        IndexError: If the line contains fewer than six tokens.
    """
    # split() with no argument drops leading/trailing whitespace and treats
    # any run of spaces or tabs as one separator, so repeated spaces cannot
    # inject empty tokens that would shift the positions picked below.
    items = line.split()
    return Series([items[1], items[3], items[5]])

# 使用apply处理data列
df_temp = df['data'].apply(foo)
df_temp.head()
Out[21]: 
      0  1     2
0  APPL  0  1623
1  APPL  0  1623
2  APPL  0  1623
3  APPL  0  1623
4  APPL  1  1649

# 修改列名
df_temp = df_temp.rename(columns = {0:'Symbol', 1:'Seqno', 2:'Price'})
df_temp.head()
Out[28]: 
  Symbol Seqno Price
0   APPL     0  1623
1   APPL     0  1623
2   APPL     0  1623
3   APPL     0  1623
4   APPL     1  1649

# 添加到原df中
# combine_first yields the union of both frames' columns, with df's values
# taking precedence. As the output below shows, the columns come back sorted
# by name and Price/Seqno are displayed as floats rather than the original
# strings.
df_new = df.combine_first(df_temp)

df_new.head()
Out[30]: 
   A   Price  Seqno Symbol                                data        time
0  A  1623.0    0.0   APPL   Symbol: APPL Seqno: 0 Price: 1623  1473411962
1  A  1623.0    0.0   APPL   Symbol: APPL Seqno: 0 Price: 1623  1473411962
2  A  1623.0    0.0   APPL   Symbol: APPL Seqno: 0 Price: 1623  1473411963
3  A  1623.0    0.0   APPL   Symbol: APPL Seqno: 0 Price: 1623  1473411963
4  A  1649.0    1.0   APPL   Symbol: APPL Seqno: 1 Price: 1649  1473411963

# 删除掉无关列并生成csv
del df_new['data'], df_new[ 'A']

df_new.head()
Out[33]: 
    Price  Seqno Symbol        time
0  1623.0    0.0   APPL  1473411962
1  1623.0    0.0   APPL  1473411962
2  1623.0    0.0   APPL  1473411963
3  1623.0    0.0   APPL  1473411963
4  1649.0    1.0   APPL  1473411963

df_new.to_csv('demo_duplicate.csv')
发布了266 篇原创文章 · 获赞 417 · 访问量 34万+
展开阅读全文

没有更多推荐了,返回首页

©️2019 CSDN 皮肤主题: 技术黑板 设计师: CSDN官方博客

分享到微信朋友圈

×

扫一扫,手机浏览