python中的pandas包的数据清洗能力

pandas很强大,前几天公司要求利用已知用户身份证、手机号知道客户的星座、性别、年龄等相关信息做用户画像,刚刚开始想到用R语言来实现,后来想到以后性能的问题,放弃了,由于公司没sas,就用pandas快速实现,参考代码:

# coding: UTF-8
'''
Created on 2015年8月25日

@author: ZHOUMEIXU204
'''
import  MySQLdb
import pandas as pd
import time
import  datetime
Table_id =pd.read_excel(u'D:\\Users\\zhoumeixu204\\Desktop\\全国身份证归属地数据库.xlsx')
print(Table_id.head())
num_str = lambda x: str(x)
num_sub = lambda x:x[0:6]
id_dict = dict(zip(Table_id['BM'].apply(num_str).apply(num_sub), Table_id['DQ']))
con = MySQLdb.connect(host="202.69.27.239", port=8443, user="root", passwd="Pa123456!", db="analyse", use_unicode=True, charset="utf8")
con_dev = MySQLdb.connect(host="202.69.27.239", port=8443, user="root", passwd="Pa123456!", db="analyse_dev", use_unicode=True, charset="utf8")
table_id_decode = pd.read_sql("select * from table_id", con)
f = lambda x: x[0:6]
table_id_decode['cert_address'] = table_id_decode['cert_id'].apply(f).map(id_dict).fillna(u'无法匹配')
def constellation(x):
    if  len(x)>=15:
        monthday=int(x[10:12]+x[12:14])
        if monthday>=321 and monthday<=419:
            constellation=u'白羊座'
        elif monthday>=420 and monthday<=520:
            constellation=u'金牛座'
        elif monthday>=521 and monthday<=621:
            constellation=u'双子座'
        elif monthday>=622 and monthday<=722:
            constellation=u'巨蟹座'
        elif monthday>=723 and monthday<=822:
            constellation=u'狮子座'
        elif monthday>=823 and monthday<=922:
            constellation=u'处女座'
        elif monthday>=823 and monthday<=1023:
            constellation=u'天枰座'
        elif monthday>=1024 and monthday<=1121:
            constellation=u'天蝎座'
        elif monthday>=1122 and monthday<=1221:
            constellation=u'射手座'
        elif (monthday>=1222 and monthday<=1231) or (monthday>=101 and monthday<=119) :
            constellation=u'魔蝎座'
        elif  monthday>=120 and monthday<=218:
            constellation=u'水瓶座'
        elif monthday>=219 and monthday<=320:
            constellation=u'双鱼座'
        else:
            constellation=u'其他'
    else:
        constellation=u'无法识别'

    return(constellation)  

def  zodiac(x):
    if  len(x)>=15:
        year=int(x[6:10])
        if year==11:
            zodiac=u'羊'
        elif year==10:
            zodiac=u'马'
        elif year==9:
            zodiac=u'蛇'
        elif year==8:
            zodiac=u'龙'
        elif year==7:
            zodiac=u'兔'
        elif year==6:
            zodiac=u'虎'
        elif year==5:
            zodiac=u'牛'
        elif year==4:
            zodiac=u'鼠'
        elif year==3:
            zodiac=u'猪'
        elif year==2:
            zodiac=u'狗'
        elif year==1:
            zodiac=u'鸡'
        elif year==0:
            zodiac=u'猴'
        else:
            zodiac=u'其他'
    else:
        zodiac=u'无法获得'
    return(zodiac)
def   sex(x):
    if len(x)==15:
        if int(x[len(x)-1])%2==1:
            sex=u'男'
        else:
            sex=u'女'
    elif len(x)==18:
        if int(x[len(x)-2])%2==1:
            sex=u'男'
        else:
            sex=u'女'
    else:
        sex=u'无法识别'
    return(sex)
time.sleep(0.5)
def  birthday(x):
    if len(x)>=15:
        birthday=x[6:10]+"-"+x[10:12]+"-"+x[12:14]
    else:
        birthday=u'无法获得'
    return(birthday)
def  age(x):
    if len(x)>=15:
        now = datetime.datetime.now()
        otherStyleTime =now.strftime("%Y-%m-%d %H:%M:%S")[0:4]
        age=str(int(otherStyleTime)-int(x[6:10]))
    else:
        age=u'无法获得'
    return(age)
table_id_decode['zodiac']=table_id_decode['cert_id'].apply(zodiac)
table_id_decode['constellation']=table_id_decode['cert_id'].apply(constellation)
table_id_decode['sex']=table_id_decode['cert_id'].apply(sex)
table_id_decode['birthday']=table_id_decode['cert_id'].apply(birthday)
table_id_decode['age']=table_id_decode['cert_id'].apply(age)    
cert_address = table_id_decode.drop('usr_name',axis=1)
# print(cert_address)
cert_address.to_sql("cert_info", con_dev, flavor="mysql", if_exists='replace', index=False) 
con.commit()
con.close()
con_dev.commit()
con_dev.close()
print("suceess")


  • 2
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值