3. 数据处理阶段
分析
为方便后续进行回归分析,需要将所有数据转换为数字的形式。在这里我们需要完成的功能如下:
- 将被判年限(如"四年")转换为 4*12(以月份为单位)
- 将赔偿金额(如"五万八千元")转换为 58000(以元为单位)
- 将法院地址、被告籍贯、被告的文化水平通过字典的方式映射为对应的数字,如:{'东阳市人民检察院': 0, '乐安县人民检察院': 1, ...}
- 将被告生日转为当前年龄
- 将性别转为 0|1 数字
代码实现
先说一下"将法院地址、被告籍贯、被告的文化水平通过字典的方式映射为对应的数字"这一功能的实现。因为我当时没有反应过来 numpy 有一个专门的去重方法,所以自己将它实现了一遍:
import csv
import pandas as pd
# Normalise every defendant's place of origin into numeric codes.
# (ChineseAdminiDivisionsDict.py -- Chinese administrative-division codes.)

# Number of data rows appended to location.csv by save_data_csv().
count = 0

# Province name -> numeric code, filled in by init_locations().
ProvinceCode = dict()

# City name -> numeric code, filled in by init_locations().
CityCode = dict()

# Raw location string -> "province.city" id, filled in by parseLocation().
LOCATIONDICT = dict()
def ETLHome(home):
    """Return the id stored for ``home`` in ``LOCATIONDICT``.

    Returns the literal string ``"NaN"`` when ``home`` has no entry.
    """
    return LOCATIONDICT.get(home, "NaN")
def getLocationCode(homes):
    """Build the location-code tables for ``homes`` and return the coded entries.

    Returns a list of ``{"location": ..., "location_id": ...}`` dicts, one
    per distinct value in ``homes``.
    """
    # Drop duplicate locations.
    unique_homes = set(homes)
    # Populate the province / city code dictionaries.
    init_locations(unique_homes)
    # Assign every distinct location its "province.city" id.
    coded_locations = []
    parseLocation(unique_homes, coded_locations)
    return coded_locations
def parseLocation(locations, location_list):
    """Compute a ``"<province code>.<city code>"`` id for each location.

    Each result is appended to ``location_list`` as a dict with keys
    ``location`` and ``location_id`` and is also stored in ``LOCATIONDICT``.
    """
    for place in locations:
        code = "{}.{}".format(parseProvince(place), parseCity(place))
        location_list.append({"location": place, "location_id": code})
        LOCATIONDICT[place] = code
    # To dump the table for later reference, call save_title_csv() once and
    # then save_data_csv(entry) for every entry of location_list.
def parseProvince(home):
    """Return the code of the first ``ProvinceCode`` key contained in ``home``.

    Locations mentioning "龙岩市" are prefixed with "福建省" before matching.
    Returns ``None`` when no province name matches.
    """
    lookup = "福建省" + home if "龙岩市" in home else home
    for name, code in ProvinceCode.items():
        if name in lookup:
            return code
    return None
def parseCity(home):
    """Return the code of the first ``CityCode`` key contained in ``home``.

    Returns ``None`` when no city name matches.
    """
    for name, code in CityCode.items():
        if name in home:
            return code
    return None
def init_locations(locations):
    """Fill ``ProvinceCode`` and ``CityCode`` from the given location strings.

    Each location is split into a province part and a city part:

    * "...省..."     -> text before / after the first "省"
    * "...自治区..." -> text before / after the first "自治区"
    * "重庆市..."    -> text before / after the first "市"
    * "龙岩市..."    -> province "福建", city "龙岩市" (the source data omits
      the province for this city)
    * anything else  -> the whole string is used as the province

    Codes are assigned by enumerating the sorted names, so the same input
    always yields the same codes.
    """
    province_set = set()
    city_set = set()
    for place in locations:
        # partition() splits on the first separator only, so a repeated
        # separator (e.g. "重庆市合川市") no longer breaks the unpacking.
        if "省" in place:
            province, _, city = place.partition("省")
        elif "自治区" in place:
            province, _, city = place.partition("自治区")
        elif "重庆市" in place:
            province, _, city = place.partition("市")
        elif "龙岩市" in place:
            # Use the same key ("福建") that "福建省..." rows produce, so the
            # province does not end up with two different codes.
            province, city = "福建", "龙岩市"
        else:
            province, city = place, ""
        # Never register an empty name: the lookups use substring matching
        # and "" is contained in every string.
        if province:
            province_set.add(province)
        if city:
            city_set.add(city)
    # Sort before numbering; set iteration order changes between runs.
    for index, province in enumerate(sorted(province_set)):
        ProvinceCode[province] = index
    for index, city in enumerate(sorted(city_set)):
        CityCode[city] = index
def save_title_csv():
    """Append the header row ``location,location_id`` to ``location.csv``."""
    columns = ['location', 'location_id']
    with open('location.csv', 'a', encoding='utf-8-sig', newline='') as handle:
        csv.DictWriter(handle, columns).writeheader()
def save_data_csv(data):
    """Append the values of ``data`` as one row to ``location.csv``.

    Also increments the module-level ``count`` and prints a progress line.
    """
    global count
    with open('location.csv', 'a', encoding='utf-8-sig', newline='') as handle:
        csv.writer(handle).writerow(list(data.values()))
    count += 1
    print('=' * 20 + '第{}条csv写入成功'.format(count) + '=' * 20)
if __name__ == '__main__':
    # Build the code tables from the "home" column, then print each raw
    # value next to its code.
    home_column = pd.read_csv("../ETL.csv")['home']
    getLocationCode(home_column)
    for place in home_column:
        print(place, ETLHome(place))
而这么多代码,用 numpy 的 unique 函数就可以实现,好气啊!
import csv
import pandas as pd
import numpy as np
def get_court_map(court_df):
    """Map each distinct value of ``court_df`` to its index in ``np.unique``.

    Example result: {'东阳市人民检察院': 0, '乐安县人民检察院': 1, ...}
    """
    return {name: index for index, name in enumerate(np.unique(court_df))}
def savecourtmap(court_df):
    """Write the court-name -> code table for ``court_df`` via the CSV helpers.

    The header is written first, then one row per distinct court.
    """
    save_title_csv()
    row = {}
    for court_name, court_code in get_court_map(court_df).items():
        row["courtName"] = court_name
        row["courtCode"] = court_code
        save_data_csv(row)
# Running total of data rows appended to court.csv; incremented by save_data_csv().
count = 0
def save_title_csv():
    """Append the header row ``court,court_id`` to ``court.csv``."""
    columns = ['court', 'court_id']
    with open('court.csv', 'a', encoding='utf-8-sig', newline='') as handle:
        csv.DictWriter(handle, columns).writeheader()
def save_data_csv(data):
    """Append the values of ``data`` as one row to ``court.csv``.

    Also increments the module-level ``count`` and prints a progress line.
    """
    global count
    with open('court.csv', 'a', encoding='utf-8-sig', newline='') as handle:
        csv.writer(handle).writerow(list(data.values()))
    count += 1
    print('=' * 20 + '第{}条csv写入成功'.format(count) + '=' * 20)
if __name__ == '__main__':
    # Dump the court -> code table for the "court" column of the input file.
    court_column = pd.read_csv("../ETL.csv")["court"]
    savecourtmap(court_column)
所以利用这个方法,最终的代码如下:
import csv
import numpy as np
from 无讼爬虫.util.AgeTransfer import ageTransfer
from 无讼爬虫.util.MoneyTransfer import ETLPayment
from 无讼爬虫.util.YearTrasfer import ETLYear
class DataNormalize():
    """Turn the records of ``ETL.csv`` into numeric rows in ``NormalizeData.csv``.

    Categorical columns (home, degree, court) are replaced by the index of
    the value among that column's sorted distinct values; age, payment and
    sentence length are converted by ``ageTransfer``, ``ETLPayment`` and
    ``ETLYear``.
    """

    def __init__(self):
        # Converted values of the row currently being processed.
        self.information = dict()
        # Number of rows written to NormalizeData.csv so far.
        self.count = 0
        # Not used by any method of this class.
        self.len = []

    def tranfer_data(self):
        """Convert ``ETL.csv`` row by row and append each result to the output.

        Rows are converted and written one at a time, so the rows already
        processed are on disk if a later row fails.
        """
        self.save_csv()
        home_map_list = self.map(mode='home')
        degree_map_list = self.map(mode='degree')
        court_map_list = self.map(mode='court')
        with open('ETL.csv', encoding='utf-8-sig', newline='') as f:
            for row in csv.DictReader(f):
                # Defendant name: the text following the "被告人" marker.
                # Fall back to the raw value when the marker is missing
                # instead of raising IndexError.
                name = row['name']
                name_parts = name.split("被告人")
                self.information['name'] = (
                    name_parts[1] if len(name_parts) > 1 else name
                )
                # Gender: 1 for '男', 0 for anything else.
                self.information['gender'] = 1 if row['gender'] == '男' else 0
                # Age relative to the year 2020.
                self.information['age'] = ageTransfer(2020, row['birthday'])
                # Categorical columns -> numeric codes.
                self.information['home'] = home_map_list[row['home']]
                self.information['degree'] = degree_map_list[row['degree']]
                self.information['court'] = court_map_list[row['court']]
                # Fine / compensation amount.
                self.information['payment'] = ETLPayment(row['payment'])
                # Sentence length; written under the 'prisonYear' header.
                self.information['judge_time'] = ETLYear(row['year'])
                self.save_data(self.information)

    def map(self, mode):
        """Return ``{value: index}`` for the distinct values of column ``mode``.

        The index is the value's position in ``np.unique`` of the column.
        """
        with open('ETL.csv', encoding='utf-8-sig', newline='') as f:
            column = [row[mode] for row in csv.DictReader(f)]
        return {value: index for index, value in enumerate(np.unique(column))}

    def save_csv(self):
        """Append the header row to ``NormalizeData.csv``."""
        data_title = ['name', 'gender', 'age', 'home', 'degree', 'court', 'payment', 'prisonYear']
        with open('NormalizeData.csv', 'a', encoding='utf-8-sig', newline='') as f:
            csv.DictWriter(f, data_title).writeheader()

    def save_data(self, data):
        """Append the values of ``data`` as one row to ``NormalizeData.csv``."""
        with open('NormalizeData.csv', 'a', encoding='utf-8-sig', newline='') as f:
            csv.writer(f).writerow(list(data.values()))
        self.count += 1
        print('=' * 20 + '第{}条csv写入成功'.format(self.count) + '=' * 20)

    def genComparison(self):
        """Write the value -> code lookup tables.

        Produces ``home.csv``, ``degree.csv`` and ``court.csv``, each with
        the header ``<column>,<column>Number``.
        """
        for mode in ('home', 'degree', 'court'):
            csv_name = mode + '.csv'
            number_key = mode + 'Number'
            mapping = self.map(mode=mode)
            self.save_comparision_tile(csv_name, [mode, number_key])
            for label, number in mapping.items():
                self.save_comparision_data(csv_name, {mode: label, number_key: number})

    def save_comparision_tile(self, csv_name, data_title):
        """Append the header row ``data_title`` to ``csv_name``."""
        with open(csv_name, 'a', encoding='utf-8-sig', newline='') as f:
            csv.DictWriter(f, data_title).writeheader()

    def save_comparision_data(self, csv_name, data):
        """Append the values of ``data`` as one row to ``csv_name``."""
        with open(csv_name, 'a', encoding='utf-8-sig', newline='') as f:
            csv.writer(f).writerow(list(data.values()))
if __name__ == '__main__':
    # Write the normalised rows first, then the lookup tables.
    normalizer = DataNormalize()
    normalizer.tranfer_data()
    normalizer.genComparison()
为避免太冗余,将年龄、罚款、被判年限处理提到外部的工具包中实现
年龄处理
import re
import pandas as pd
def ageTransfer(now_year, birthday):
    """Return ``now_year`` minus the birth year found in ``birthday``.

    ``birthday`` is expected in the form "1990年5月3日"; "年" and "月" are
    replaced by "-" and "日" is removed before the date is parsed with
    pandas. The placeholder '暂无' is returned unchanged.
    """
    normalized = re.sub(r'(年|月)', r'-', birthday)
    normalized = re.sub(r'(日)', r'', normalized)
    if normalized == '暂无':
        return normalized
    return now_year - pd.to_datetime(normalized).year
if __name__ == '__main__':
    # Print the computed age (relative to 2020) for every birthday in the file.
    birthday_column = pd.read_csv("D:\\Python_Code\\net_scrapy\\无讼爬虫\\ETL.csv")['birthday']
    for birthday in birthday_column:
        print(ageTransfer(2020, birthday))
罚款处理
import pandas as pd
# Convert every fine / compensation amount to Arabic digits.
#
# The tables below support turning Chinese numerals (everyday and formal
# forms) found in a sentence into Arabic digits; "百分之" (percentages) is
# not recognised.

# Numeral character -> value.
common_used_numerals_tmp = {
    '零': 0, '一': 1, '二': 2, '两': 2, '三': 3, '四': 4, '五': 5,
    '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
    '〇': 0, '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5,
    '陆': 6, '柒': 7, '捌': 8, '玖': 9, '拾': 10,
    '百': 100, '千': 1000, '貮': 2, '俩': 2, '佰': 100, '仟': 1000,
    '萬': 10000, '万': 10000,
    '亿': 100000000, '億': 100000000,
    '兆': 1000000000000,
}

# Characters that may begin a number.
num_str_start_symbol = [
    '一', '二', '两', '三', '四', '五', '六', '七', '八', '九', '十',
    '壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾', '貮', '俩',
]

# Characters that may continue a number once it has started.
more_num_str_symbol = [
    '零', '一', '二', '两', '三', '四', '五', '六', '七', '八', '九', '十',
    '百', '千', '万', '亿',
    '〇', '壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾',
    '貮', '俩', '佰', '仟', '萬', '億', '兆',
]

# Working copy of the table, used by chinese2digits().
common_used_numerals = dict(common_used_numerals_tmp)
def chinese2digits(uchars_chinese):
    """Convert a string of Chinese numeral characters to an ``int``.

    Every character must be a key of ``common_used_numerals``. The string is
    split at the right-most occurrence of its largest unit of 万 or above;
    the part before it is the multiplier and the part after it is added.
    Parts below 万 are read as digit / 十 / 百 / 千 groups. Examples:
    "五万八千" -> 58000, "十万" -> 100000, "一百二十万" -> 1200000.

    Raises:
        ValueError: if a character is not a known numeral.
    """
    values = []
    for char in uchars_chinese:
        val = common_used_numerals.get(char)
        if val is None:
            raise ValueError("unsupported numeral character: %r" % char)
        values.append(val)

    def below_ten_thousand(vals):
        # Evaluate a group that contains only digits and 十/百/千.
        total = 0
        pending = 0  # digits read but not yet scaled by a unit
        for val in vals:
            if val < 10:
                # Consecutive digits are read positionally ("二〇" -> 20).
                pending = pending * 10 + val
            else:
                # A bare unit counts once ("十三" -> 10 + 3).
                total += (pending or 1) * val
                pending = 0
        return total + pending

    def evaluate(vals):
        if not vals:
            return 0
        biggest = max(vals)
        if biggest < 10000:
            return below_ten_thousand(vals)
        # Split at the right-most occurrence of the largest unit.
        split = len(vals) - 1 - vals[::-1].index(biggest)
        multiplier = evaluate(vals[:split]) or 1
        return multiplier * biggest + evaluate(vals[split + 1:])

    return evaluate(values)
def ChineseNumToArab(oriStr):
    """Return the Chinese numbers found in ``oriStr`` as one digit string.

    A number starts at a character from ``num_str_start_symbol`` and
    continues while characters belong to either symbol list. Each number is
    converted with ``chinese2digits`` and the results are concatenated; all
    other characters are dropped. Returns ``''`` for empty input.
    """
    if len(oriStr) == 0:
        return ''
    pieces = []
    run = ''
    in_run = False
    for char in oriStr:
        if char in num_str_start_symbol:
            in_run = True
            run += char
        elif in_run:
            if char in more_num_str_symbol:
                run += char
            else:
                # The current number ends here.
                pieces.append(str(chinese2digits(run)))
                run = ''
                in_run = False
    # Flush a number that reaches the end of the string.
    if len(run) > 0:
        pieces.append(str(chinese2digits(run)))
    return ''.join(pieces)
def ETLPayment(payment_ori):
    """Return the amount preceding the first "元" as a string of digits.

    If that text is already all digits it is returned as is; otherwise it is
    converted with ``ChineseNumToArab``.
    """
    amount_text = payment_ori.split("元")[0]
    return amount_text if amount_text.isdigit() else ChineseNumToArab(amount_text)
if __name__ == '__main__':
    # Print every raw payment next to its converted value, then the row total.
    payment_column = pd.read_csv("../ETL.csv")['payment']
    count = 0
    for raw_payment in payment_column:
        count += 1
        print(raw_payment, end="\t")
        print(ETLPayment(raw_payment))
    print(count)
被判年限处理
import pandas as pd
# Convert every sentence length to Arabic digits.
#
# The tables below support turning Chinese numerals (everyday and formal
# forms) found in a sentence into Arabic digits; "百分之" (percentages) is
# not recognised.

# Numeral character -> value.
common_used_numerals_tmp = {
    '零': 0, '一': 1, '二': 2, '两': 2, '三': 3, '四': 4, '五': 5,
    '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
    '〇': 0, '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5,
    '陆': 6, '柒': 7, '捌': 8, '玖': 9, '拾': 10,
    '百': 100, '千': 1000, '貮': 2, '俩': 2, '佰': 100, '仟': 1000,
    '萬': 10000, '万': 10000,
    '亿': 100000000, '億': 100000000,
    '兆': 1000000000000,
}

# Characters that may begin a number.
num_str_start_symbol = [
    '一', '二', '两', '三', '四', '五', '六', '七', '八', '九', '十',
    '壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾', '貮', '俩',
]

# Characters that may continue a number once it has started.
more_num_str_symbol = [
    '零', '一', '二', '两', '三', '四', '五', '六', '七', '八', '九', '十',
    '百', '千', '万', '亿',
    '〇', '壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾',
    '貮', '俩', '佰', '仟', '萬', '億', '兆',
]

# Working copy of the table, used by chinese2digits().
common_used_numerals = dict(common_used_numerals_tmp)
def chinese2digits(uchars_chinese):
    """Convert a string of Chinese numeral characters to an ``int``.

    Every character must be a key of ``common_used_numerals``. The string is
    split at the right-most occurrence of its largest unit of 万 or above;
    the part before it is the multiplier and the part after it is added.
    Parts below 万 are read as digit / 十 / 百 / 千 groups. Examples:
    "十三" -> 13, "二十" -> 20, "十万" -> 100000.

    Raises:
        ValueError: if a character is not a known numeral.
    """
    values = []
    for char in uchars_chinese:
        val = common_used_numerals.get(char)
        if val is None:
            raise ValueError("unsupported numeral character: %r" % char)
        values.append(val)

    def below_ten_thousand(vals):
        # Evaluate a group that contains only digits and 十/百/千.
        total = 0
        pending = 0  # digits read but not yet scaled by a unit
        for val in vals:
            if val < 10:
                # Consecutive digits are read positionally ("二〇" -> 20).
                pending = pending * 10 + val
            else:
                # A bare unit counts once ("十三" -> 10 + 3).
                total += (pending or 1) * val
                pending = 0
        return total + pending

    def evaluate(vals):
        if not vals:
            return 0
        biggest = max(vals)
        if biggest < 10000:
            return below_ten_thousand(vals)
        # Split at the right-most occurrence of the largest unit.
        split = len(vals) - 1 - vals[::-1].index(biggest)
        multiplier = evaluate(vals[:split]) or 1
        return multiplier * biggest + evaluate(vals[split + 1:])

    return evaluate(values)
def ChineseNumToArab(oriStr):
    """Return the Chinese numbers found in ``oriStr`` as one digit string.

    A number starts at a character from ``num_str_start_symbol`` and
    continues while characters belong to either symbol list. Each number is
    converted with ``chinese2digits`` and the results are concatenated; all
    other characters are dropped. Returns ``''`` for empty input.
    """
    if len(oriStr) == 0:
        return ''
    pieces = []
    run = ''
    in_run = False
    for char in oriStr:
        if char in num_str_start_symbol:
            in_run = True
            run += char
        elif in_run:
            if char in more_num_str_symbol:
                run += char
            else:
                # The current number ends here.
                pieces.append(str(chinese2digits(run)))
                run = ''
                in_run = False
    # Flush a number that reaches the end of the string.
    if len(run) > 0:
        pieces.append(str(chinese2digits(run)))
    return ''.join(pieces)
def ETLYear(year_ori):
    """Return the sentence length in ``year_ori`` as a number of months.

    The first "<number>年" is taken as the years and multiplied by 12; a
    "<number>(个)月" that follows it directly (optionally after "零" or "又")
    is added, so "三年六个月" gives 42. Without a year, the first
    "<number>(个)月" is returned. Numbers may be Arabic digits or Chinese
    numerals. Returns 0 when no duration is found.
    """
    import re  # local import: this module's import block does not provide re

    # One number: a run of Arabic digits or a run of Chinese numeral characters.
    numeral = r"(\d+|[" + re.escape("".join(more_num_str_symbol)) + r"]+)"

    def to_int(token):
        # Arabic digits are used as is; Chinese numerals go through
        # ChineseNumToArab, which returns '' when it finds no number.
        if token.isdigit():
            return int(token)
        converted = ChineseNumToArab(token)
        return int(converted) if converted else 0

    text = str(year_ori)
    year_match = re.search(numeral + r"\s*年", text)
    if year_match:
        months = to_int(year_match.group(1)) * 12
        # Only months written immediately after the year belong to it; a
        # later clause (e.g. a probation period) must not be added.
        tail = re.match(r"\s*[零又]?\s*" + numeral + r"\s*个?月",
                        text[year_match.end():])
        if tail:
            months += to_int(tail.group(1))
        return months
    month_match = re.search(numeral + r"\s*个?月", text)
    if month_match:
        return to_int(month_match.group(1))
    return 0
if __name__ == '__main__':
    # Print every raw sentence next to its length in months, then the row total.
    year_column = pd.read_csv("../ETL.csv")['year']
    count = 0
    for raw_sentence in year_column:
        count += 1
        print(raw_sentence, end="\t")
        print(ETLYear(raw_sentence))
    print(count)
处理结果如下: