练习一:(使用jupyter notebook 工具)
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
Step 1. 给定的原始数据集
raw_data = {'regiment':['Nighks','Nighks','Nighks','Nighks','Dragns','Dragns','Dragns','Dragns','Scts','Scts','Scts','Scts'],
'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd','1st', '1st', '2nd', '2nd'],
'deaths': [523, 52, 25, 616, 43, 234, 523, 62, 62, 73, 37, 35],
'battles': [5, 42, 2, 2, 4, 7, 8, 3, 4, 7, 8, 9],
'size': [1045, 957, 1099, 1400, 1592, 1006, 987, 849, 973, 1005, 1099, 1523],
'veterans': [1, 5, 62, 26, 73, 37, 949, 48, 48, 435, 63, 345],
'readiness': [1, 2, 3, 3, 2, 1, 2, 3, 2, 1, 2, 3],
'armored': [1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1],
'deserters': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
'origin':['Arizona','Calif','Texas','Florida','Maine','Iowa','Alaska','Washiton','Oregon','Wyoming','Louina','Gegia']}
Step 2. 创建一个DataFrame,并赋值给变量army
army = DataFrame(raw_data)
army.head()
regiment | company | deaths | battles | size | veterans | readiness | armored | deserters | origin | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Nighks | 1st | 523 | 5 | 1045 | 1 | 1 | 1 | 4 | Arizona |
1 | Nighks | 1st | 52 | 42 | 957 | 5 | 2 | 0 | 24 | Calif |
2 | Nighks | 2nd | 25 | 2 | 1099 | 62 | 3 | 1 | 31 | Texas |
3 | Nighks | 2nd | 616 | 2 | 1400 | 26 | 3 | 1 | 2 | Florida |
4 | Dragns | 1st | 43 | 4 | 1592 | 73 | 2 | 0 | 3 | Maine |
Step 3. 指定列为索引:设定数据中的origin字段为索引
army1 = army.set_index(["origin"])
army1.head()
regiment | company | deaths | battles | size | veterans | readiness | armored | deserters | |
---|---|---|---|---|---|---|---|---|---|
origin | |||||||||
Arizona | Nighks | 1st | 523 | 5 | 1045 | 1 | 1 | 1 | 4 |
Calif | Nighks | 1st | 52 | 42 | 957 | 5 | 2 | 0 | 24 |
Texas | Nighks | 2nd | 25 | 2 | 1099 | 62 | 3 | 1 | 31 |
Florida | Nighks | 2nd | 616 | 2 | 1400 | 26 | 3 | 1 | 2 |
Maine | Dragns | 1st | 43 | 4 | 1592 | 73 | 2 | 0 | 3 |
Step 4. 筛选出列 regiments 的值不为"Dragns"的所有数据
army1.loc[army1["regiment"] != "Dragns"]
regiment | company | deaths | battles | size | veterans | readiness | armored | deserters | |
---|---|---|---|---|---|---|---|---|---|
origin | |||||||||
Arizona | Nighks | 1st | 523 | 5 | 1045 | 1 | 1 | 1 | 4 |
Calif | Nighks | 1st | 52 | 42 | 957 | 5 | 2 | 0 | 24 |
Texas | Nighks | 2nd | 25 | 2 | 1099 | 62 | 3 | 1 | 31 |
Florida | Nighks | 2nd | 616 | 2 | 1400 | 26 | 3 | 1 | 2 |
Oregon | Scts | 1st | 62 | 4 | 973 | 48 | 2 | 0 | 2 |
Wyoming | Scts | 1st | 73 | 7 | 1005 | 435 | 1 | 0 | 3 |
Louina | Scts | 2nd | 37 | 8 | 1099 | 63 | 2 | 1 | 2 |
Gegia | Scts | 2nd | 35 | 9 | 1523 | 345 | 3 | 1 | 3 |
Step 5. 获取指定行与列,区域行与区域列(单独切片–连续切片)
- 筛选出 第 3–6行,第3列、 第7列这两列的所有数据(如下左侧表)
- 筛选出 第 3–6行,第3–6列的所有数据(如下中部表)
- 筛选出 第 3、5、7、9行,第3、7列这两列的所有数据(自由切片、如下右侧表)
army1.iloc[2:6,[2,6]] # 1.筛选出 第 3-6行,第3列、 第7列这两列的所有数据
army1.iloc[2:6,2:6] # 2.筛选出 第 3--6行,第3--6列的所有数据
army1.iloc[[2,4,6,8],[2,6]] # 2.筛选出 第 3、5、7、9行,第3列、第7列的所有数据
deaths | readiness | deaths | battles | size | veterans | deaths | readiness | |||||
origin | origin | origin | ||||||||||
Texas | 25 | 3 | Texas | 25 | 2 | 1099 | 62 | Texas | 25 | 3 | ||
Florida | 616 | 3 | Florida | 616 | 2 | 1400 | 26 | Maine | 43 | 2 | ||
Maine | 43 | 2 | Maine | 43 | 4 | 1592 | 73 | Alaska | 523 | 2 | ||
Iowa | 234 | 1 | Iowa | 234 | 7 | 1006 | 37 | Oregon | 62 | 2 |
练习二:在校生饮酒消费数据分析
数据集下载地址:https://download.csdn.net/download/wsp_1138886114/10563032
Step 1. 导入相关的模块
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
Step 2. 导入数据,并赋值给变量df
df = pd.read_csv("./datasets/Student_Alcohol.csv")
df.head()
~ | school | sex | age | add ress | famsize | Pstatus | Medu | Fedu | Mjob | Fjob | … | abse nces | G1 | G2 | G3 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | GP | F | 18 | U | GT3 | A | 4 | 4 | at_home | teacher | … | 6 | 5 | 6 | 6 |
1 | GP | F | 17 | U | GT3 | T | 1 | 1 | at_home | other | … | 4 | 5 | 5 | 6 |
2 | GP | F | 15 | U | LE3 | T | 1 | 1 | at_home | other | … | 10 | 7 | 8 | 10 |
3 | GP | F | 15 | U | GT3 | T | 4 | 2 | health | services | … | 2 | 15 | 14 | 15 |
4 | GP | F | 16 | U | GT3 | T | 3 | 3 | other | other | … | 4 | 6 | 10 | 10 |
395 rows × 33 columns
Step 3. 连续切片(获取[school:guardian]两列以及中间的所有数据)
df.iloc[:,0:12]
~ | school | sex | age | address | famsize | Pstatus | Medu | Fedu | Mjob | Fjob | reason | guardian |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | GP | F | 18 | U | GT3 | A | 4 | 4 | at_home | teacher | course | mother |
1 | GP | F | 17 | U | GT3 | T | 1 | 1 | at_home | other | course | father |
2 | GP | F | 15 | U | LE3 | T | 1 | 1 | at_home | other | other | mother |
3 | GP | F | 15 | U | GT3 | T | 4 | 2 | health | services | home | mother |
4 | GP | F | 16 | U | GT3 | T | 3 | 3 | other | other | home | father |
… | … | … | … | … | … | … | … | … | … | … | … | … |
391 | MS | M | 17 | U | LE3 | T | 3 | 1 | services | services | course | mother |
392 | MS | M | 21 | R | GT3 | T | 1 | 1 | other | other | course | other |
393 | MS | M | 18 | R | LE3 | T | 3 | 2 | services | other | course | mother |
394 | MS | M | 19 | U | LE3 | T | 1 | 1 | other | at_home | course | father |
395 rows × 12 columns
Step 4. 将数据列 Mjob 和 Fjob中所有数据实现首字母大写
data2 = df.iloc[:,[8,9]] #获取 "Mjob","Fjob" 两列
data21 = Series(data2["Mjob"]) #将两列转成Series格式
data22 = Series(data2["Fjob"])
df["Mjob"] =data21.map(lambda x:x.capitalize()) #将"Mjob"列所有值 首字母大写
df["Fjob"] =data22.map(lambda x:x.capitalize()) #将"Fjob"列所有值 首字母大写
df #查看执行效果
~ | school | sex | age | add ress | famsize | Psta tus | Medu | Fedu | Mjob | Fjob | … | abse nces | G1 | G2 | G3 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | GP | F | 18 | U | GT3 | A | 4 | 4 | At_home | Teacher | … | 6 | 5 | 6 | 6 |
1 | GP | F | 17 | U | GT3 | T | 1 | 1 | At_home | Other | … | 4 | 5 | 5 | 6 |
2 | GP | F | 15 | U | LE3 | T | 1 | 1 | At_home | Other | … | 10 | 7 | 8 | 10 |
3 | GP | F | 15 | U | GT3 | T | 4 | 2 | Health | Services | … | 2 | 15 | 14 | 15 |
4 | GP | F | 16 | U | GT3 | T | 3 | 3 | Other | Other | … | 4 | 6 | 10 | 10 |
5 | GP | M | 16 | U | LE3 | T | 4 | 3 | Services | Other | … | 10 | 15 | 15 | 15 |
… | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
390 | MS | M | 20 | U | LE3 | A | 2 | 2 | Services | Services | … | 11 | 9 | 9 | 9 |
391 | MS | M | 17 | U | LE3 | T | 3 | 1 | Services | Services | … | 3 | 14 | 16 | 16 |
392 | MS | M | 21 | R | GT3 | T | 1 | 1 | Other | Other | … | 3 | 10 | 8 | 7 |
393 | MS | M | 18 | R | LE3 | T | 3 | 2 | Services | Other | … | 0 | 11 | 12 | 10 |
394 | MS | M | 19 | U | LE3 | T | 1 | 1 | Other | At_home | … | 5 | 8 | 9 | 9 |
395 rows × 12 columns
Step 5.创建一个名为majority函数,并根据age列数据返回一个布尔值添加到新的数据列,列名为 legal_drinker (根据年龄这一列数据,大于17岁为合法饮酒)
majority = lambda x:["合法" if x>17 else "不合法"]
df["legal_drinker"] = df["age"].map(majority)
df.head()
~ | sch ool | sex | age | addr ess | famsize | Psta tus | Medu | Fedu | Mjob | Fjob | … | G1 | G2 | G3 | legal_ drinker |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | GP | F | 18 | U | GT3 | A | 4 | 4 | At_home | Teacher | … | 5 | 6 | 6 | [合法] |
1 | GP | F | 17 | U | GT3 | T | 1 | 1 | At_home | Other | … | 5 | 5 | 6 | [不合法] |
2 | GP | F | 15 | U | LE3 | T | 1 | 1 | At_home | Other | … | 7 | 8 | 10 | [不合法] |
3 | GP | F | 15 | U | GT3 | T | 4 | 2 | Health | Services | … | 15 | 14 | 15 | [不合法] |
4 | GP | F | 16 | U | GT3 | T | 3 | 3 | Other | Other | … | 6 | 10 | 10 | [不合法] |
练习三:身份证数据筛选
正则测试网站:https://c.runoob.com/front-end/854
1. 导入数据并筛选出不同类型的身份证数据
import pandas as pd
import re
import numpy as np
from pandas import DataFrame,Series
excel_path = '././hehe_re_id_card2.xls'
All_data = pd.read_excel(excel_path)
idcard_face = All_data[All_data.type == "身份证人像面"]
idcard_back = All_data[All_data.type == "身份证国徽面"]
idcard_face_back = All_data[All_data.type == "身份证正反两面"]
idcard_face_back.head()
filename | type | name | sex | people | birthday | address | id_number | issue_authority | validity | error_msg | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | SFZ00001.jpg | 身份证正反两面 | 阳七曲云 | 女 | 汉9 | 1900年5月12日 | 山东省青岛市市南区台西 | 753203190777122963 | 晋岛市公安局市南分局 | 2025.09:17-2035.09.17 | OK |
1 | SFZ00002.jpg | 身份证正反两面 | 虹游燕玲 | 女 | NaN | 1909年09月10日 | 河南面国始县柳树店乡五河村中心组 | 75302619923916 | 园给县公安局 | 1017.03.03-203703.03 | OK |
2 | SFZ00003.jpg | 身份证正反两面 | 谢薇薇 | 女. | 汉 | 1989年09月20日 | 江苏省徐州市贾汪区汴塘镇399号 | 753305198711993741 | 徐州市公安局贾汪分局 | 2027.02.17-2047.02.17 | OK |
15 | SFZ00016.jpg | 身份证正反两面 | 张楚云 | 女 | 汉 | 1990年09月16日 | 广东省佛山市顺德区龙江镇如意巷100号 | 75362319.a-8964741 | 佛山市公安尽顺德分局 | 2096.06.01-2166.06.01 | OK |
16 | SFZ00017.jpg | 身份证正反两面 | 宋军 | 男 | 汉 | 1939年12月18日 | 北京市西城区德宝新团4 | 963102199948783741 | 北京市公安局西城分局 | 2074.03.22-长明 | OK |
2 筛选身份证所有字段
# 姓名: 长度[2,4]的中文(无其它任意字符)
def Name(idcard_face):
idcard_face = idcard_face[(idcard_face.name.str.len()>=2) & (idcard_face.name.str.len()<=4)]
idcard_face['name_zh'] = idcard_face.name.str.findall('[\u4e00-\u9fa5]')
idcard_face['name_zh'] = idcard_face['name_zh'].str.join("")
idcard_face = idcard_face.loc[idcard_face['name_zh'].str.len()==idcard_face['name'].str.len()]
return idcard_face
# 出生:符合正则:\d{4}年\d{1,2}月\d{1,2}日
def Birthday(idcard_face):
idcard_face = idcard_face[(idcard_face.birthday.str.len() >= 9) & (idcard_face.birthday.str.len() <= 11)]
re_date = re.compile("\d{4}年\d{1,2}月\d{1,2}日")
idcard_face = idcard_face.loc[idcard_face["birthday"].str.match(re_date)]
return idcard_face
# 性别:男或女
def Sex(idcard_face):
return idcard_face.loc[(idcard_face.sex == '男') | (idcard_face.sex == '女')]
# 民族:属于peoples 列表中的五十六个民族
def Peoples(idcard_face):
peoples = ['汉', '蒙古', '回', '藏', '维吾尔', '苗', '彝', '壮', '布依', '朝鲜', '满', '侗', '瑶',\
'白', '土家', '哈尼', '哈萨克', '傣', '黎', '傈僳', '佤', '畲', '高山', '拉祜', '水',\
'东乡', '纳西', '景颇', '柯尔克孜', '土', '达斡尔', '仫佬', '羌', '布朗', '撒拉', '毛南',\
'仡佬', '锡伯', '阿昌', '普米', '塔吉克', '怒', '乌孜别克', '俄罗斯', '鄂温克', '德昂',\
'保安', '裕固', '京', '塔塔尔', '独龙', '鄂伦春', '赫哲', '门巴', '珞巴', '基诺族']
return idcard_face[idcard_face.people.isin(peoples)]
def re_adress(x):
pattern = re.compile(r'^([\u4E00-\u9FA5A-Z0-9]+(省|市|区|县|乡|镇|村|路|弄|街|楼|户|房|号|组|队|室)){2,}$')
result = pattern.search(x)
if result!=None:
return result.group()
else:
return ""
def Address(idcard_face):
idcard_face = idcard_face[(idcard_face.address.str.len() >= 11)]
idcard_face["address"] = idcard_face["address"].apply(re_adress)
idcard_face = idcard_face[(idcard_face.address.str.len() >= 9)]
idcard_face["address_pro"] = idcard_face['address'].str.extract("([\u4E00-\u9FA5]+)省", expand=False)
idcard_face_pro = idcard_face[idcard_face["address_pro"].str.len() >= 1]
provice = ['河北', '山西', '辽宁', '吉林', '黑龙江', '江苏', '浙江', '安徽',\
'福建', '江西', '山东', '河南', '湖北', '湖南', '广东', '海南',\
'四川', '贵州', '云南', '陕西', '甘肃', '青海']
idcard_face_pro_ok = idcard_face_pro[idcard_face_pro.address_pro.isin(provice)]
idcard_face_pro = idcard_face_pro.append(idcard_face_pro_ok)
idcard_face_pro = idcard_face_pro.append(idcard_face_pro_ok)
idcard_face_diff = idcard_face_pro.drop_duplicates(subset=['filename', 'type', 'name', 'sex', 'people', \
'birthday', 'address', 'id_number', 'issue_authority', \
'validity', 'error_msg', 'eep_id', 'address_pro'],
keep=False)
idcard_face = idcard_face.append(idcard_face_diff)
idcard_face = idcard_face.append(idcard_face_diff)
idcard_face = idcard_face.drop_duplicates(subset=['filename', 'type', 'name', 'sex', 'people',\
'birthday', 'address', 'id_number', 'issue_authority',\
'validity', 'error_msg', 'eep_id'], keep=False)
return idcard_face
def Id_number(idcard_face):
idcard_face = idcard_face.loc[idcard_face["id_number"].str.len() == 18]
idcard_face["id_number_start"] = idcard_face.id_number.str[:17]
idcard_face["id_number_end"] = idcard_face.id_number.str[-1]
x_number = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'X']
idcard_face = idcard_face[(idcard_face.id_number_start.str.isdigit()) & \
(idcard_face.id_number_end.isin(x_number))]
return idcard_face
def Issue_authority(idcard_back):
idcard_back = idcard_back[(idcard_back.issue_authority.str.len() >= 5) & \
(idcard_back.issue_authority.str.len() <= 15)]
idcard_back['issue_authority_zh'] = idcard_back.issue_authority.str.findall('[\u4e00-\u9fa5]')
idcard_back['issue_authority_zh'] = idcard_back['issue_authority_zh'].str.join("")
idcard_back = idcard_back.loc[idcard_back['issue_authority_zh'].str.len() ==\
idcard_back['issue_authority'].str.len()]
idcard_back = idcard_back[(idcard_back.issue_authority.str.endswith("分局")) | \
(idcard_back.issue_authority.str.endswith("公安局"))]
return idcard_back
def Validity(idcard_back):
re_valid1 = re.compile("\d{4}.\d{2}.\d{2}-\d{4}.\d{2}.\d{2}")
re_valid2 = re.compile("\d{4}.\d{2}.\d{2}-长期")
idcard_back = idcard_back.loc[(idcard_back["validity"].str.match(re_valid1)) | \
(idcard_back["validity"].str.match(re_valid2))]
return idcard_back
def Idcard_face(idcard_face):
idcard_face = Id_number(idcard_face)
idcard_face = Birthday(idcard_face)
idcard_face = Sex(idcard_face)
idcard_face = Name(idcard_face)
idcard_face = Peoples(idcard_face)
idcard_face = Address(idcard_face)
return idcard_face
def Idcard_back(idcard_back):
idcard_back = Issue_authority(idcard_back)
idcard_back = Validity(idcard_back)
return idcard_back