Pandas——练习题一

练习一:(使用jupyter notebook 工具)

import pandas as pd
import numpy as np
from pandas import Series,DataFrame
Step 1. 给定的原始数据集
raw_data = {'regiment':['Nighks','Nighks','Nighks','Nighks','Dragns','Dragns','Dragns','Dragns','Scts','Scts','Scts','Scts'],
            'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd','1st', '1st', '2nd', '2nd'],
            'deaths': [523, 52, 25, 616, 43, 234, 523, 62, 62, 73, 37, 35],
            'battles': [5, 42, 2, 2, 4, 7, 8, 3, 4, 7, 8, 9],
            'size': [1045, 957, 1099, 1400, 1592, 1006, 987, 849, 973, 1005, 1099, 1523],
            'veterans': [1, 5, 62, 26, 73, 37, 949, 48, 48, 435, 63, 345],
            'readiness': [1, 2, 3, 3, 2, 1, 2, 3, 2, 1, 2, 3],
            'armored': [1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1],
            'deserters': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
            'origin':['Arizona','Calif','Texas','Florida','Maine','Iowa','Alaska','Washiton','Oregon','Wyoming','Louina','Gegia']}
Step 2. 创建一个DataFrame,并赋值给变量army
army = DataFrame(raw_data)
army.head()
regimentcompanydeathsbattlessizeveteransreadinessarmoreddesertersorigin
0Nighks1st523510451114Arizona
1Nighks1st524295752024Calif
2Nighks2nd2521099623131Texas
3Nighks2nd6162140026312Florida
4Dragns1st434159273203Maine
Step 3. 指定列为索引:设定数据中的origin字段为索引
army1 = army.set_index(["origin"])
army1.head()
regimentcompanydeathsbattlessizeveteransreadinessarmoreddeserters
origin
ArizonaNighks1st523510451114
CalifNighks1st524295752024
TexasNighks2nd2521099623131
FloridaNighks2nd6162140026312
MaineDragns1st434159273203
Step 4. 筛选出列 regiments 的值不为"Dragns"的所有数据
army1.loc[army1["regiment"] != "Dragns"]
regimentcompanydeathsbattlessizeveteransreadinessarmoreddeserters
origin
ArizonaNighks1st523510451114
CalifNighks1st524295752024
TexasNighks2nd2521099623131
FloridaNighks2nd6162140026312
OregonScts1st62497348202
WyomingScts1st7371005435103
LouinaScts2nd378109963212
GegiaScts2nd3591523345313

Step 5. 获取指定行与列,区域行与区域列(单独切片–连续切片)
  1. 筛选出 第 3–6行,第3列、 第7列这两列的所有数据(如下左侧表)
  2. 筛选出 第 3–6行,第3–6列的所有数据(如下中部表)
  3. 筛选出 第 3、5、7、9行,第3、7列这两列的所有数据(自由切片、如下右侧表)
army1.iloc[2:6,[2,6]]       # 1.筛选出 第 3-6行,第3列、 第7列这两列的所有数据
army1.iloc[2:6,2:6]         # 2.筛选出 第 3--6行,第3--6列的所有数据
army1.iloc[[2,4,6,8],[2,6]] # 2.筛选出 第 3、5、7、9行,第3列、第7列的所有数据
deathsreadinessdeathsbattlessizeveteransdeathsreadiness
originoriginorigin
Texas253Texas252109962Texas253
Florida6163Florida6162140026Maine432
Maine432Maine434159273Alaska5232
Iowa2341Iowa2347100637Oregon622

练习二:在校生饮酒消费数据分析

数据集下载地址:https://download.csdn.net/download/wsp_1138886114/10563032

Step 1. 导入相关的模块
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
Step 2. 导入数据,并赋值给变量df
df = pd.read_csv("./datasets/Student_Alcohol.csv") 
df.head()
  ~  schoolsexageadd
ress
famsizePstatusMeduFeduMjobFjobabse
nces
G1G2G3
0GPF18UGT3A44at_hometeacher6566
1GPF17UGT3T11at_homeother4556
2GPF15ULE3T11at_homeother107810
3GPF15UGT3T42healthservices2151415
4GPF16UGT3T33otherother461010

395 rows × 33 columns


Step 3. 连续切片(获取[school:guardian]两列以及中间的所有数据)
df.iloc[:,0:12]    
  ~  schoolsexageaddressfamsizePstatusMeduFeduMjobFjobreasonguardian
0GPF18UGT3A44at_hometeachercoursemother
1GPF17UGT3T11at_homeothercoursefather
2GPF15ULE3T11at_homeotherothermother
3GPF15UGT3T42healthserviceshomemother
4GPF16UGT3T33otherotherhomefather
391MSM17ULE3T31servicesservicescoursemother
392MSM21RGT3T11otherothercourseother
393MSM18RLE3T32servicesothercoursemother
394MSM19ULE3T11otherat_homecoursefather

395 rows × 12 columns


Step 4. 将数据列 Mjob 和 Fjob中所有数据实现首字母大写
data2 = df.iloc[:,[8,9]]                         #获取 "Mjob","Fjob" 两列
data21 = Series(data2["Mjob"])                   #将两列转成Series格式
data22 = Series(data2["Fjob"])
df["Mjob"] =data21.map(lambda x:x.capitalize())  #将"Mjob"列所有值 首字母大写
df["Fjob"] =data22.map(lambda x:x.capitalize())  #将"Fjob"列所有值 首字母大写
df         #查看执行效果
  ~  schoolsexageadd
ress
famsizePsta
tus
MeduFeduMjobFjobabse
nces
G1G2G3
0GPF18UGT3A44At_homeTeacher6566
1GPF17UGT3T11At_homeOther4556
2GPF15ULE3T11At_homeOther107810
3GPF15UGT3T42HealthServices2151415
4GPF16UGT3T33OtherOther461010
5GPM16ULE3T43ServicesOther10151515
390MSM20ULE3A22ServicesServices11999
391MSM17ULE3T31ServicesServices3141616
392MSM21RGT3T11OtherOther31087
393MSM18RLE3T32ServicesOther0111210
394MSM19ULE3T11OtherAt_home5899

395 rows × 12 columns


Step 5.创建一个名为majority函数,并根据age列数据返回一个布尔值添加到新的数据列,列名为 legal_drinker (根据年龄这一列数据,大于17岁为合法饮酒)
majority = lambda x:["合法" if x>17 else "不合法"] 
df["legal_drinker"] = df["age"].map(majority)
df.head()
  ~  sch
ool
sexageaddr
ess
famsizePsta
tus
MeduFeduMjobFjobG1G2G3legal_
drinker
0GPF18UGT3A44At_homeTeacher566[合法]
1GPF17UGT3T11At_homeOther556[不合法]
2GPF15ULE3T11At_homeOther7810[不合法]
3GPF15UGT3T42HealthServices151415[不合法]
4GPF16UGT3T33OtherOther61010[不合法]

练习三:身份证数据筛选

正则测试网站:https://c.runoob.com/front-end/854

1. 导入数据并筛选出不同类型的身份证数据
import pandas as pd
import re
import numpy as np
from pandas import DataFrame,Series


excel_path = '././hehe_re_id_card2.xls'
All_data = pd.read_excel(excel_path)
idcard_face = All_data[All_data.type == "身份证人像面"]
idcard_back = All_data[All_data.type == "身份证国徽面"]
idcard_face_back = All_data[All_data.type == "身份证正反两面"]
idcard_face_back.head()
filenametypenamesexpeoplebirthdayaddressid_numberissue_authorityvalidityerror_msg
0SFZ00001.jpg身份证正反两面阳七曲云汉91900年5月12日山东省青岛市市南区台西753203190777122963晋岛市公安局市南分局2025.09:17-2035.09.17OK
1SFZ00002.jpg身份证正反两面虹游燕玲NaN1909年09月10日河南面国始县柳树店乡五河村中心组75302619923916园给县公安局1017.03.03-203703.03OK
2SFZ00003.jpg身份证正反两面谢薇薇女.1989年09月20日江苏省徐州市贾汪区汴塘镇399号753305198711993741徐州市公安局贾汪分局2027.02.17-2047.02.17OK
15SFZ00016.jpg身份证正反两面张楚云1990年09月16日广东省佛山市顺德区龙江镇如意巷100号75362319.a-8964741佛山市公安尽顺德分局2096.06.01-2166.06.01OK
16SFZ00017.jpg身份证正反两面宋军1939年12月18日北京市西城区德宝新团4963102199948783741北京市公安局西城分局2074.03.22-长明OK
2 筛选身份证所有字段
# 姓名: 长度[2,4]的中文(无其它任意字符)
def Name(idcard_face):
    idcard_face = idcard_face[(idcard_face.name.str.len()>=2) & (idcard_face.name.str.len()<=4)]
    idcard_face['name_zh'] = idcard_face.name.str.findall('[\u4e00-\u9fa5]')
    idcard_face['name_zh'] = idcard_face['name_zh'].str.join("")
    idcard_face = idcard_face.loc[idcard_face['name_zh'].str.len()==idcard_face['name'].str.len()]
    return idcard_face

# 出生:符合正则:\d{4}年\d{1,2}月\d{1,2}日
def Birthday(idcard_face):
    idcard_face = idcard_face[(idcard_face.birthday.str.len() >= 9) & (idcard_face.birthday.str.len() <= 11)]
    re_date = re.compile("\d{4}年\d{1,2}月\d{1,2}日")
    idcard_face = idcard_face.loc[idcard_face["birthday"].str.match(re_date)]
    return idcard_face

# 性别:男或女
def Sex(idcard_face):
    return idcard_face.loc[(idcard_face.sex == '男') | (idcard_face.sex == '女')]

# 民族:属于peoples 列表中的五十六个民族
def Peoples(idcard_face):
    peoples = ['汉', '蒙古', '回', '藏', '维吾尔', '苗', '彝', '壮', '布依', '朝鲜', '满', '侗', '瑶',\
               '白', '土家', '哈尼', '哈萨克', '傣', '黎', '傈僳', '佤', '畲', '高山', '拉祜', '水',\
               '东乡', '纳西', '景颇', '柯尔克孜', '土', '达斡尔', '仫佬', '羌', '布朗', '撒拉', '毛南',\
               '仡佬', '锡伯', '阿昌', '普米', '塔吉克', '怒', '乌孜别克', '俄罗斯', '鄂温克', '德昂',\
               '保安', '裕固', '京', '塔塔尔', '独龙', '鄂伦春', '赫哲', '门巴', '珞巴', '基诺族']
    return idcard_face[idcard_face.people.isin(peoples)]

def re_adress(x):
    pattern = re.compile(r'^([\u4E00-\u9FA5A-Z0-9]+(省|市|区|县|乡|镇|村|路|弄|街|楼|户|房|号|组|队|室)){2,}$')
    result = pattern.search(x)
    if result!=None:
        return result.group()
    else:
        return ""

def Address(idcard_face):
    idcard_face = idcard_face[(idcard_face.address.str.len() >= 11)]
    idcard_face["address"] = idcard_face["address"].apply(re_adress)
    idcard_face = idcard_face[(idcard_face.address.str.len() >= 9)]

    idcard_face["address_pro"] = idcard_face['address'].str.extract("([\u4E00-\u9FA5]+)省", expand=False)
    idcard_face_pro = idcard_face[idcard_face["address_pro"].str.len() >= 1]
    provice = ['河北', '山西', '辽宁', '吉林', '黑龙江', '江苏', '浙江', '安徽',\
               '福建', '江西', '山东', '河南', '湖北', '湖南', '广东', '海南',\
               '四川', '贵州', '云南', '陕西', '甘肃', '青海']
    idcard_face_pro_ok = idcard_face_pro[idcard_face_pro.address_pro.isin(provice)]

    idcard_face_pro = idcard_face_pro.append(idcard_face_pro_ok)
    idcard_face_pro = idcard_face_pro.append(idcard_face_pro_ok)
    idcard_face_diff = idcard_face_pro.drop_duplicates(subset=['filename', 'type', 'name', 'sex', 'people', \
                                                               'birthday', 'address', 'id_number', 'issue_authority', \
                                                               'validity', 'error_msg', 'eep_id', 'address_pro'],
                                                       keep=False)
    idcard_face = idcard_face.append(idcard_face_diff)
    idcard_face = idcard_face.append(idcard_face_diff)
    idcard_face = idcard_face.drop_duplicates(subset=['filename', 'type', 'name', 'sex', 'people',\
                                                      'birthday', 'address', 'id_number', 'issue_authority',\
                                                      'validity', 'error_msg', 'eep_id'], keep=False)
    return idcard_face




def Id_number(idcard_face):
    idcard_face = idcard_face.loc[idcard_face["id_number"].str.len() == 18]
    idcard_face["id_number_start"] = idcard_face.id_number.str[:17]
    idcard_face["id_number_end"] = idcard_face.id_number.str[-1]
    x_number = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'X']
    idcard_face = idcard_face[(idcard_face.id_number_start.str.isdigit()) & \
                              (idcard_face.id_number_end.isin(x_number))]
    return idcard_face


def Issue_authority(idcard_back):
    idcard_back = idcard_back[(idcard_back.issue_authority.str.len() >= 5) & \
                              (idcard_back.issue_authority.str.len() <= 15)]
    idcard_back['issue_authority_zh'] = idcard_back.issue_authority.str.findall('[\u4e00-\u9fa5]')
    idcard_back['issue_authority_zh'] = idcard_back['issue_authority_zh'].str.join("")
    idcard_back = idcard_back.loc[idcard_back['issue_authority_zh'].str.len() ==\
                                  idcard_back['issue_authority'].str.len()]
    idcard_back = idcard_back[(idcard_back.issue_authority.str.endswith("分局")) | \
                              (idcard_back.issue_authority.str.endswith("公安局"))]
    return idcard_back


def Validity(idcard_back):
    re_valid1 = re.compile("\d{4}.\d{2}.\d{2}-\d{4}.\d{2}.\d{2}")
    re_valid2 = re.compile("\d{4}.\d{2}.\d{2}-长期")
    idcard_back = idcard_back.loc[(idcard_back["validity"].str.match(re_valid1)) | \
                                  (idcard_back["validity"].str.match(re_valid2))]
    return idcard_back


def Idcard_face(idcard_face):
    idcard_face = Id_number(idcard_face)
    idcard_face = Birthday(idcard_face)
    idcard_face = Sex(idcard_face)
    idcard_face = Name(idcard_face)
    idcard_face = Peoples(idcard_face)
    idcard_face = Address(idcard_face)
    return idcard_face

def Idcard_back(idcard_back):
    idcard_back = Issue_authority(idcard_back)
    idcard_back = Validity(idcard_back)
    return idcard_back
评论 8
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

SongpingWang

你的鼓励是我创作的最大动力!

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值