利用 ChatGPT 进行数据分类分级
问题描述:写一段代码并打包为测试工具,要求可以自动扫描 Excel 内容,并识别出属于手机号、身份证号、中文姓名、银行卡号、统一社会信用代码、经纬度地址等类别的敏感数据:
import pandas as pd
# Sensitive-data extraction for an Excel sheet whose columns have known headers.
#
# Every pattern below contains exactly ONE capturing group (everything else is
# non-capturing).  ``Series.str.extract`` raises ValueError when a pattern has
# no capturing group and returns one column per group when there are several,
# so a single group is what allows the result to be stored in one new column.

# 11 digits starting with 1 followed by 3-9, not embedded in a longer digit run.
regex_mobile = r"(?<!\d)(1[3-9]\d{9})(?!\d)"
# 17 digits followed by a digit or X/x, not embedded in a longer digit run.
regex_idcard = r"(?<!\d)(\d{17}[\dXx])(?!\d)"
# The whole cell must consist of CJK characters in the range U+4E00-U+9FA5.
regex_name = r"^([\u4e00-\u9fa5]+)$"
# A run of exactly 12 to 19 digits (longer runs are rejected, not truncated).
regex_bankcard = r"(?<!\d)(\d{12,19})(?!\d)"
# 18 characters: 2 code characters, 6 digits, 10 code characters.
regex_creditcode = (
    r"(?<![0-9A-Za-z])"
    r"([0-9A-HJ-NPQRTUWXY]{2}\d{6}[0-9A-HJ-NPQRTUWXY]{10})"
    r"(?![0-9A-Za-z])"
)
# One signed decimal in [-180, 180] with at most 6 fractional digits.
_COORDINATE = r"[+-]?(?:180(?:\.0{1,6})?|(?:1[0-7]\d|[1-9]?\d)(?:\.\d{1,6})?)"
# The whole cell is either a single coordinate or two coordinates separated by
# an ASCII or full-width comma.
regex_address = (
    r"^\s*(" + _COORDINATE + r"(?:\s*[,，]\s*" + _COORDINATE + r")?)\s*$"
)

# Headers of the source columns to scan for each category.
mobile_columns = ['手机号']
idcard_columns = ['身份证号']
name_columns = ['姓名']
bankcard_columns = ['银行卡号']
creditcode_columns = ['统一社会信用代码']
address_columns = ['经纬度地址']

# Output column -> (source headers, pattern).  Order mirrors the original
# if/elif chain: the first category that lists a header wins.
_RULES = {
    'mobile': (mobile_columns, regex_mobile),
    'idcard': (idcard_columns, regex_idcard),
    'name': (name_columns, regex_name),
    'bankcard': (bankcard_columns, regex_bankcard),
    'creditcode': (creditcode_columns, regex_creditcode),
    'address': (address_columns, regex_address),
}


def _as_text(series: pd.Series) -> pd.Series:
    """Return *series* with every non-missing value converted to ``str``.

    Numeric cells would otherwise make the ``.str`` accessor raise.  Missing
    values are kept as-is so they stay missing in the extraction result.
    """
    return series.map(lambda v: v if pd.isna(v) else str(v)).astype(object)


def extract_sensitive(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with one extra column per recognised category.

    For every column whose header appears in one of the ``*_columns`` lists,
    the first match of the corresponding pattern in each cell is stored in a
    new column (``mobile``, ``idcard``, ``name``, ``bankcard``, ``creditcode``
    or ``address``).  Cells without a match are left missing.  The input
    frame is not modified.
    """
    result = df.copy()
    for column in df.columns:
        for target, (source_columns, pattern) in _RULES.items():
            if column in source_columns:
                result[target] = _as_text(df[column]).str.extract(
                    pattern, expand=False
                )
                break
    return result


def scan_workbook(input_path: str = 'data.xlsx',
                  output_path: str = 'data_output.xlsx') -> None:
    """Read *input_path*, extract sensitive values and write *output_path*."""
    # dtype=str makes pandas hand cells over as text instead of int/float.
    df = pd.read_excel(input_path, dtype=str)
    extract_sensitive(df).to_excel(output_path, index=False)


if __name__ == '__main__':
    scan_workbook()
- 问题描述:写一段代码并打包为测试工具,要求可以自动扫描excel内容,识别出属于手机号、身份证号、中文姓名等类别的敏感数据,并在excel中该数据的后一列写上所属类别
# python
import xlrd
import re
# Classify every cell of the first worksheet and record its category in a
# column placed directly after the data.
#
# xlrd only READS workbooks (its sheets have no write() and its books have no
# save()), and xlrd >= 2.0 no longer opens .xlsx files, so the workbook I/O is
# done with pandas instead.  The `import xlrd` above is no longer used by
# this code.

# Patterns are applied with fullmatch(), i.e. against the whole cell text.
# 11 digits starting with 1 followed by 3-9.
phone_pattern = re.compile(r'^1[3-9]\d{9}$')
# 18 characters: 6-digit prefix, yyyymmdd-shaped date, 3 digits, digit or X/x.
ID_pattern = re.compile(r'^[1-9]\d{5}(18|19|([23]\d))\d{2}((0[1-9])|(10|11|12))(([0-2][1-9])|10|20|30|31)\d{3}[0-9Xx]$')
# 2 to 4 CJK characters in the range U+4E00-U+9FA5.
name_pattern = re.compile(r'^[\u4e00-\u9fa5]{2,4}$')

# Checked in this order; the first pattern that matches decides the label.
_CATEGORY_PATTERNS = (
    ('手机号', phone_pattern),
    ('身份证号', ID_pattern),
    ('中文姓名', name_pattern),
)


def _cell_text(value):
    """Return the text form of a cell value ('' for None/NaN)."""
    if value is None:
        return ''
    if isinstance(value, float):
        if value != value:  # NaN is the only float unequal to itself
            return ''
        if value.is_integer():
            # Numeric cells arrive as floats; drop the spurious ".0".
            return str(int(value))
    return str(value).strip()


def classify(value):
    """Return the category label for one cell value, or None if none applies."""
    text = _cell_text(value)
    if not text:
        return None
    for label, pattern in _CATEGORY_PATTERNS:
        if pattern.fullmatch(text):
            return label
    return None


def annotate_rows(rows):
    """Return a copy of *rows* with a label column after each flagged column.

    *rows* is a list of row lists.  A column is "flagged" when at least one of
    its cells is classified; a new column holding the label (or '' when the
    cell has none) is then inserted right after it.  Inserting, rather than
    writing into the existing next column, keeps neighbouring data intact.
    Short rows are padded with None so every output row has the same width.
    """
    rows = [list(row) for row in rows]
    width = max((len(row) for row in rows), default=0)
    padded = [row + [None] * (width - len(row)) for row in rows]
    labels = [[classify(cell) for cell in row] for row in padded]
    flagged = [
        any(row_labels[col] for row_labels in labels) for col in range(width)
    ]

    annotated = []
    for row, row_labels in zip(padded, labels):
        new_row = []
        for col, cell in enumerate(row):
            new_row.append(cell)
            if flagged[col]:
                new_row.append(row_labels[col] or '')
        annotated.append(new_row)
    return annotated


def annotate_workbook(input_path='data.xlsx', output_path='data.xlsx'):
    """Annotate the first sheet of *input_path* and write it to *output_path*.

    NOTE: the output file contains only the annotated first sheet; with the
    default arguments the input workbook is overwritten by it.
    """
    import pandas as pd  # local import keeps this snippet self-contained

    # header=None: the first row is scanned like any other row.
    # dtype=object: keep each cell's own Python type.
    frame = pd.read_excel(input_path, sheet_name=0, header=None, dtype=object)
    rows = annotate_rows(frame.values.tolist())
    pd.DataFrame(rows).to_excel(output_path, header=False, index=False)


if __name__ == '__main__':
    annotate_workbook()