Python数据分析一条龙(菜鸟上手项目记录--数据处理阶段)

最新推荐文章于 2024-07-30 18:14:20 发布

ChanZany

最新推荐文章于 2024-07-30 18:14:20 发布

阅读量651

点赞数 1

分类专栏：Python爬虫　文章标签：python、csv、数据分析、正则表达式

本文链接：https://blog.csdn.net/qq_41819729/article/details/106951684

版权

Python爬虫专栏收录该内容

12 篇文章 2 订阅

订阅专栏

上一节回顾

3. 数据处理阶段

分析

为方便后面进行回归分析，所以需要将所有数据转换为数字的形式，在这里我们需要完成的功能如下：

将被判年限转换为以月为单位的数字，如"四年"转换为 4*12 = 48
将赔偿金额如五万八千元转换为58000(以元为单位)
将法院地址、被告籍贯、被告的文化水平通过字典的方式映射为对应的数字，如：{'东阳市人民检察院': 0, '乐安县人民检察院': 1, ...}
将被告生日转为当前年龄
将性别转为0|1数字

代码实现

先说一下"将法院地址、被告籍贯、被告的文化水平通过字典的方式映射为对应的数字"这一功能的实现。因为我当时没有反应过来 numpy 有一个专门用于去重的方法（np.unique），所以自己把它实现了一遍：

import csv

import pandas as pd
'''
将所有的被告籍贯做归一化处理
'''
# ChineseAdminiDivisionsDict.py
# 中国行政区划编码

# Running total of rows written by save_data_csv.
count=0
# Province name fragment -> integer code; filled in by init_locations.
ProvinceCode = {}
# City name fragment -> integer code; filled in by init_locations.
CityCode = {}
# Full location string -> "<province code>.<city code>"; filled in by parseLocation.
LOCATIONDICT={}


def ETLHome(home):
    """Return the location id registered for ``home``.

    The id is read from ``LOCATIONDICT``; the string ``"NaN"`` is returned
    when ``home`` has no entry there.
    """
    return LOCATIONDICT.get(home, "NaN")
def getLocationCode(homes):
    """Build the location code tables for ``homes`` and return the id list.

    Duplicates are dropped, ``init_locations`` fills the province/city code
    tables, and ``parseLocation`` produces one
    ``{"location": ..., "location_id": ...}`` dict per distinct location.
    """
    # Collapse repeated locations before building the tables.
    distinct = set(homes)
    init_locations(distinct)
    location_list = []
    parseLocation(distinct, location_list)
    return location_list

def parseLocation(locations,location_list):
    """Assign a ``"<province code>.<city code>"`` id to every location.

    Each result is appended to ``location_list`` as a
    ``{"location": ..., "location_id": ...}`` dict and also stored in
    ``LOCATIONDICT`` under the location string.
    """
    for name in locations:
        code = "{}.{}".format(parseProvince(name), parseCity(name))
        location_list.append({"location": name, "location_id": code})
        LOCATIONDICT[name] = code
    # To dump the table for later reference, call save_title_csv() once and
    # save_data_csv(entry) for every entry of location_list.

def parseProvince(home):
    """Return the code of the first ``ProvinceCode`` key found in ``home``.

    Returns ``None`` when no key is a substring of ``home``.
    """
    # Locations containing "龙岩市" get the "福建省" prefix before matching.
    if "龙岩市" in home:
        home = "福建省" + home
    for fragment, code in ProvinceCode.items():
        if fragment in home:
            return code
    return None

def parseCity(home):
    """Return the code of the first ``CityCode`` key found in ``home``.

    Returns ``None`` when no key is a substring of ``home``.
    """
    matches = (code for fragment, code in CityCode.items() if fragment in home)
    return next(matches, None)

def init_locations(locations):
    """Fill ``ProvinceCode`` and ``CityCode`` from raw location strings.

    Each location is split into a province fragment and a city fragment:

    * ``"X省Y"``     -> province ``X``, city ``Y``
    * ``"X自治区Y"`` -> province ``X``, city ``Y``
    * contains ``"重庆市"`` -> split at the first ``"市"``
    * contains ``"龙岩市"`` -> province ``"福建"``, city ``"龙岩市"``
    * anything else  -> the whole string is used as the province

    Args:
        locations: Iterable of location strings.

    Returns:
        None. The two module-level dicts are updated in place.
    """
    province_set = set()
    city_set = set()
    for x in locations:
        # partition() splits at the first separator only, so a string that
        # contains the separator twice no longer breaks tuple unpacking.
        if "省" in x:
            province, _, city = x.partition("省")
        elif "自治区" in x:
            province, _, city = x.partition("自治区")
        elif "重庆市" in x:
            province, _, city = x.partition("市")
        elif "龙岩市" in x:
            # Use the same fragment the "省" branch yields for "福建省...",
            # so the province does not end up with two different codes.
            province, city = "福建", "龙岩市"
        else:
            province, city = x, ""
        # An empty fragment is a substring of every location and would make
        # parseProvince/parseCity match anything, so it is never registered.
        if province:
            province_set.add(province)
        if city:
            city_set.add(city)
    # Sort before numbering: set iteration order of strings is not stable
    # between interpreter runs, which made the codes irreproducible.
    for index, province in enumerate(sorted(province_set)):
        ProvinceCode[province] = index
    for index, city in enumerate(sorted(city_set)):
        CityCode[city] = index

def save_title_csv():
    """Append the header row ``location,location_id`` to ``location.csv``."""
    with open('location.csv', 'a', encoding='utf-8-sig', newline='') as out:
        csv.DictWriter(out, ['location', 'location_id']).writeheader()
def save_data_csv(data):
    """Append the values of ``data`` as one row to ``location.csv``.

    Increments the module-level ``count`` and prints a progress line.
    """
    global count
    with open('location.csv', 'a', encoding='utf-8-sig', newline='') as out:
        csv.writer(out).writerow(list(data.values()))

    count = count + 1
    banner = '=' * 20
    print(banner + '第{}条csv写入成功'.format(count) + banner)



if __name__ == '__main__':
    # Build the code tables from the "home" column, then print every
    # location next to its id.
    homes = pd.read_csv("../ETL.csv")['home']
    getLocationCode(homes)
    for home in homes:
        print(home, ETLHome(home))

而这么多代码用numpy的unique函数就可以实现，好气啊！

import csv
import pandas as pd
import numpy as np

def get_court_map(court_df):
    """Map every distinct value of ``court_df`` to its index in ``np.unique``.

    The result looks like {'东阳市人民检察院': 0, '乐安县人民检察院': 1, ...}.
    """
    return {name: code for code, name in enumerate(np.unique(court_df))}

def savecourtmap(court_df):
    """Write the court -> code table for ``court_df`` via the CSV helpers.

    The header is written first with ``save_title_csv``; each mapping from
    ``get_court_map`` is then passed to ``save_data_csv`` as one row.
    """
    save_title_csv()
    for name, code in get_court_map(court_df).items():
        save_data_csv({"courtName": name, "courtCode": code})

# Running total of rows written by save_data_csv.
count = 0
def save_title_csv():
    """Append the header row ``court,court_id`` to ``court.csv``."""
    with open('court.csv', 'a', encoding='utf-8-sig', newline='') as out:
        csv.DictWriter(out, ['court', 'court_id']).writeheader()
def save_data_csv(data):
    """Append the values of ``data`` as one row to ``court.csv``.

    Increments the module-level ``count`` and prints a progress line.
    """
    global count
    with open('court.csv', 'a', encoding='utf-8-sig', newline='') as out:
        csv.writer(out).writerow(list(data.values()))

    count = count + 1
    banner = '=' * 20
    print(banner + '第{}条csv写入成功'.format(count) + banner)



if __name__ == '__main__':
    # Dump the court -> code table built from the "court" column.
    savecourtmap(pd.read_csv("../ETL.csv")["court"])

所以利用这个方法，最终的代码如下：

import csv
import numpy as np
from 无讼爬虫.util.AgeTransfer import ageTransfer
from 无讼爬虫.util.MoneyTransfer import ETLPayment
from 无讼爬虫.util.YearTrasfer import ETLYear


class DataNormalize():
    """Convert the scraped case records of a CSV file into numeric columns.

    ``tranfer_data`` reads the source CSV row by row and appends one
    normalised row per record to the output CSV. ``genComparison`` writes
    the value -> code lookup tables (``home.csv``, ``degree.csv``,
    ``court.csv``) for the categorical columns.

    Args:
        source_path: CSV file to read. Defaults to ``'ETL.csv'``.
        output_path: CSV file the normalised rows are appended to.
            Defaults to ``'NormalizeData.csv'``.
        now_year: Reference year passed to ``ageTransfer``. Defaults to 2020.
    """

    # Columns whose distinct values are replaced by integer codes.
    _CATEGORY_MODES = ('home', 'degree', 'court')

    def __init__(self, source_path='ETL.csv', output_path='NormalizeData.csv', now_year=2020):
        self.information = dict()
        self.count = 0
        self.len = []  # not read anywhere in this class; kept for compatibility
        self.source_path = source_path
        self.output_path = output_path
        self.now_year = now_year

    @staticmethod
    def _strip_defendant_prefix(name):
        """Return the text after the first "被告人", or ``name`` if it is absent."""
        parts = name.split("被告人")
        # Without this guard a name lacking the prefix raised IndexError.
        return parts[1] if len(parts) > 1 else name

    def tranfer_data(self):
        """Normalise every source row and append it to the output CSV.

        Rows are converted and written one at a time, so rows already
        processed are on disk if a later row fails.
        """
        self.save_csv()
        home_map_list = self.map(mode='home')
        degree_map_list = self.map(mode='degree')
        court_map_list = self.map(mode='court')
        with open(self.source_path, encoding='utf-8-sig', newline='') as f:
            for row in csv.DictReader(f):
                # Key order matters: save_data writes the values positionally
                # under the header produced by save_csv.
                self.information = {
                    'name': self._strip_defendant_prefix(row['name']),
                    # '男' -> 1, every other value -> 0
                    'gender': 1 if row['gender'] == '男' else 0,
                    'age': ageTransfer(self.now_year, row['birthday']),
                    'home': home_map_list[row['home']],
                    'degree': degree_map_list[row['degree']],
                    'court': court_map_list[row['court']],
                    'payment': ETLPayment(row['payment']),
                    'judge_time': ETLYear(row['year']),
                }
                self.save_data(self.information)

    def map(self, mode):
        """Return ``{value: code}`` for the distinct values of column ``mode``.

        Codes are the positions of the values in ``np.unique`` of the column.
        """
        with open(self.source_path, encoding='utf-8-sig', newline='') as f:
            values = [row[mode] for row in csv.DictReader(f)]
        return {value: code for code, value in enumerate(np.unique(values))}

    def save_csv(self):
        """Append the header row to the output CSV."""
        # NOTE(review): the file is opened in append mode, so running the
        # script again on an existing file adds a second header in the middle
        # of the data. Confirm whether the file should be truncated instead.
        # The last column is titled 'prisonYear' while tranfer_data stores it
        # under 'judge_time'; rows are written positionally, so they line up.
        data_title = ['name', 'gender', 'age', 'home', 'degree', 'court', 'payment', 'prisonYear']
        with open(self.output_path, 'a', encoding='utf-8-sig', newline='') as f:
            csv.DictWriter(f, data_title).writeheader()

    def save_data(self, data):
        """Append the values of ``data`` as one row and report progress."""
        with open(self.output_path, 'a', encoding='utf-8-sig', newline='') as f:
            csv.writer(f).writerow(list(data.values()))
        self.count += 1
        print('=' * 20 + '第{}条csv写入成功'.format(self.count) + '=' * 20)

    def genComparison(self):
        """Write the value -> code lookup tables for the categorical columns.

        For each of home, degree and court this appends a header and one
        row per distinct value to ``<column>.csv``.
        """
        for mode in self._CATEGORY_MODES:
            mapping = self.map(mode=mode)
            csv_name = mode + ".csv"
            number_key = mode + 'Number'
            self.save_comparision_tile(csv_name, [mode, number_key])
            for value, code in mapping.items():
                self.save_comparision_data(csv_name, {mode: value, number_key: code})

    def save_comparision_tile(self, csv_name, data_title):
        """Append the header row ``data_title`` to ``csv_name``."""
        with open(csv_name, 'a', encoding='utf-8-sig', newline='') as f:
            csv.DictWriter(f, data_title).writeheader()

    def save_comparision_data(self, csv_name, data):
        """Append the values of ``data`` as one row to ``csv_name``."""
        with open(csv_name, 'a', encoding='utf-8-sig', newline='') as f:
            csv.writer(f).writerow(list(data.values()))


if __name__ == '__main__':
    # Normalise the data first, then write the lookup tables.
    normalizer = DataNormalize()
    normalizer.tranfer_data()
    normalizer.genComparison()

为避免太冗余，将年龄、罚款、被判年限处理提到外部的工具包中实现

年龄处理

import re
import pandas as pd

def ageTransfer(now_year, birthday):
    """Return ``now_year`` minus the birth year parsed from ``birthday``.

    "年" and "月" are replaced by "-" and "日" is removed before the text is
    parsed with ``pd.to_datetime``. The placeholder '暂无' is returned
    unchanged.
    """
    cleaned = re.sub(r'日', r'', re.sub(r'[年月]', r'-', birthday))
    if cleaned == '暂无':
        return cleaned
    return now_year - pd.to_datetime(cleaned).year


if __name__ == '__main__':
    # Print the converted age for every birthday in the data file.
    birthdays = pd.read_csv("D:\\Python_Code\\net_scrapy\\无讼爬虫\\ETL.csv")['birthday']
    for birthday in birthdays:
        print(ageTransfer(2020, birthday))

罚款处理

import pandas as pd
'''
将所有的罚款金额转为阿拉伯数字
'''
# 把汉语句子中的汉字（大小写）数字转为阿拉伯数字，不能识别“百分之”
# Value of every supported Chinese numeral character (everyday, financial
# and traditional forms).
common_used_numerals_tmp = {
    '零': 0, '一': 1, '二': 2, '两': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
    '十': 10,
    '〇': 0, '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9,
    '拾': 10,
    '百': 100, '千': 1000, '貮': 2, '俩': 2, '佰': 100, '仟': 1000, '萬': 10000, '万': 10000,
    '亿': 100000000,
    '億': 100000000, '兆': 1000000000000,
}

# Characters that may start a number inside running text.
num_str_start_symbol = ['一', '二', '两', '三', '四', '五', '六', '七', '八', '九', '十',
                        '壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾', '貮', '俩', ]
# Characters that may continue a number once it has started.
more_num_str_symbol = ['零', '一', '二', '两', '三', '四', '五', '六', '七', '八', '九', '十', '百', '千', '万', '亿',
                       '〇', '壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾', '貮', '俩', '佰', '仟', '萬', '億', '兆']

# Lookup table actually used by chinese2digits (a copy of the table above).
common_used_numerals = dict(common_used_numerals_tmp)


def chinese2digits(uchars_chinese):
    """Convert a string of Chinese numeral characters to an int.

    The string is read left to right in sections: digits are scaled by the
    small units 十/百/千 and summed, and each big unit 万/亿/兆 scales the
    section before it. This fixes the earlier right-to-left version, which
    returned 0 for "十万" and a wrong value for compound units such as
    "一千五百万".

    Args:
        uchars_chinese: Text made only of keys of ``common_used_numerals``.

    Returns:
        The integer value; 0 for an empty string.

    Raises:
        ValueError: If a character is not a known numeral.
    """
    total = 0          # value of the sections already closed by a big unit
    section = 0        # value accumulated since the last big unit
    digits = 0         # digit(s) waiting for a unit
    last_big_unit = 0  # largest big unit applied so far
    for char in uchars_chinese:
        val = common_used_numerals.get(char)
        if val is None:
            raise ValueError('not a Chinese numeral: {!r}'.format(char))
        if val < 10:
            # Consecutive digits are read positionally ("二〇二〇" -> 2020).
            digits = digits * 10 + val
        elif val < 10000:
            # A bare 十/百/千 counts as one of that unit ("十三" -> 13).
            section += (digits or 1) * val
            digits = 0
        else:
            section += digits
            digits = 0
            if val > last_big_unit:
                # A larger unit scales everything read so far
                # ("一万亿" -> 10**12); a bare leading unit counts as one.
                total = ((total + section) or 1) * val
                last_big_unit = val
            else:
                total += section * val
            section = 0
    return total + section + digits

def ChineseNumToArab(oriStr):
    """Return the numbers written in Chinese numerals in ``oriStr`` as digits.

    A run starts at a character from ``num_str_start_symbol`` and continues
    through characters from ``more_num_str_symbol``. Every run is converted
    with ``chinese2digits`` and the results are concatenated into one
    string; all other characters are dropped. Empty input gives ``''``.
    """
    converted = []
    pending = ''  # the numeral run currently being collected
    for ch in oriStr:
        if ch in num_str_start_symbol or (pending and ch in more_num_str_symbol):
            pending += ch
        elif pending:
            # A non-numeral character ends the current run.
            converted.append(str(chinese2digits(pending)))
            pending = ''
    if pending:
        converted.append(str(chinese2digits(pending)))
    return ''.join(converted)



def ETLPayment(payment_ori):
    """Return the amount written before the first "元" as a digit string.

    An amount already made of digits is returned unchanged; anything else
    is converted with ``ChineseNumToArab``.
    """
    amount = payment_ori.partition("元")[0]
    return amount if amount.isdigit() else ChineseNumToArab(amount)

if __name__ == '__main__':
    # Print every raw payment next to its converted value, then the row count.
    payments = pd.read_csv("../ETL.csv")['payment']
    count = 0
    for count, raw in enumerate(payments, start=1):
        print(raw, end="\t")
        print(ETLPayment(raw))
    print(count)

被判年限处理

import pandas as pd
'''
将所有的判刑年限转为阿拉伯数字
'''
# 把汉语句子中的汉字（大小写）数字转为阿拉伯数字，不能识别“百分之”
# Value of every supported Chinese numeral character (everyday, financial
# and traditional forms).
common_used_numerals_tmp = {
    '零': 0, '一': 1, '二': 2, '两': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
    '十': 10,
    '〇': 0, '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9,
    '拾': 10,
    '百': 100, '千': 1000, '貮': 2, '俩': 2, '佰': 100, '仟': 1000, '萬': 10000, '万': 10000,
    '亿': 100000000,
    '億': 100000000, '兆': 1000000000000,
}

# Characters that may start a number inside running text.
num_str_start_symbol = ['一', '二', '两', '三', '四', '五', '六', '七', '八', '九', '十',
                        '壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾', '貮', '俩', ]

# Characters that may continue a number once it has started.
more_num_str_symbol = ['零', '一', '二', '两', '三', '四', '五', '六', '七', '八', '九', '十', '百', '千', '万', '亿',
                       '〇', '壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖', '拾', '貮', '俩', '佰', '仟', '萬', '億', '兆']

# Lookup table actually used by chinese2digits (a copy of the table above).
common_used_numerals = dict(common_used_numerals_tmp)


def chinese2digits(uchars_chinese):
    """Convert a string of Chinese numeral characters to an int.

    The string is read left to right in sections: digits are scaled by the
    small units 十/百/千 and summed, and each big unit 万/亿/兆 scales the
    section before it. This fixes the earlier right-to-left version, which
    returned 0 for "十万" and a wrong value for compound units such as
    "一千五百万".

    Args:
        uchars_chinese: Text made only of keys of ``common_used_numerals``.

    Returns:
        The integer value; 0 for an empty string.

    Raises:
        ValueError: If a character is not a known numeral.
    """
    total = 0          # value of the sections already closed by a big unit
    section = 0        # value accumulated since the last big unit
    digits = 0         # digit(s) waiting for a unit
    last_big_unit = 0  # largest big unit applied so far
    for char in uchars_chinese:
        val = common_used_numerals.get(char)
        if val is None:
            raise ValueError('not a Chinese numeral: {!r}'.format(char))
        if val < 10:
            # Consecutive digits are read positionally ("二〇二〇" -> 2020).
            digits = digits * 10 + val
        elif val < 10000:
            # A bare 十/百/千 counts as one of that unit ("十三" -> 13).
            section += (digits or 1) * val
            digits = 0
        else:
            section += digits
            digits = 0
            if val > last_big_unit:
                # A larger unit scales everything read so far
                # ("一万亿" -> 10**12); a bare leading unit counts as one.
                total = ((total + section) or 1) * val
                last_big_unit = val
            else:
                total += section * val
            section = 0
    return total + section + digits

def ChineseNumToArab(oriStr):
    """Return the numbers written in Chinese numerals in ``oriStr`` as digits.

    A run starts at a character from ``num_str_start_symbol`` and continues
    through characters from ``more_num_str_symbol``. Every run is converted
    with ``chinese2digits`` and the results are concatenated into one
    string; all other characters are dropped. Empty input gives ``''``.
    """
    converted = []
    pending = ''  # the numeral run currently being collected
    for ch in oriStr:
        if ch in num_str_start_symbol or (pending and ch in more_num_str_symbol):
            pending += ch
        elif pending:
            # A non-numeral character ends the current run.
            converted.append(str(chinese2digits(pending)))
            pending = ''
    if pending:
        converted.append(str(chinese2digits(pending)))
    return ''.join(converted)



def _count_in(text):
    """Return the integer written in ``text``, or 0 when it contains none.

    Arabic digits take precedence; otherwise Chinese numerals are converted
    with ``ChineseNumToArab``.
    """
    arabic = "".join(ch for ch in text if ch in "0123456789")
    if arabic:
        return int(arabic)
    converted = ChineseNumToArab(text)
    # ChineseNumToArab returns '' when it finds no numeral; int('') would raise.
    return int(converted) if converted else 0


def ETLYear(year_ori):
    """Convert a sentence length such as "三年六个月" to a number of months.

    The number before the first "年" is counted as years (x12) and the
    number before the following "月" as months. Earlier versions returned
    an Arabic-digit year count without multiplying by 12 and ignored the
    months that follow the years.

    Args:
        year_ori: Sentence text, e.g. "四年", "3年", "六个月", "一年六个月".

    Returns:
        Total length in months; 0 when the text has neither "年" nor "月".
    """
    months = 0
    remainder = year_ori
    if "年" in year_ori:
        years_text, _, remainder = year_ori.partition("年")
        months += _count_in(years_text) * 12
    if "月" in remainder:
        months += _count_in(remainder.partition("月")[0])
    return months
if __name__ == '__main__':
    # Print every raw sentence next to its length in months, then the row count.
    years = pd.read_csv("../ETL.csv")['year']
    count = 0
    for count, raw in enumerate(years, start=1):
        print(raw, end="\t")
        print(ETLYear(raw))
    print(count)

处理结果如下：
在这里插入图片描述

ChanZany

关注

1
点赞
踩
6

收藏

觉得还不错? 一键收藏
3
评论
Python数据分析一条龙(菜鸟上手项目记录--数据处理阶段)

上一节回顾3. 数据处理阶段分析为方便后面进行回归分析，所以需要将所有数据转换为数字的形式，在这里我们需要完成的功能如下：将被判年限如四年转换为4*12（以月份为单位）将赔偿金额如五万八千元转换为58000(以元为单位)将法院地址、被告籍贯、被告的文化水平通过字典的方式映射为对应的数字，如：将被告生日转为当前年龄将性别转为0|1数字代码实现先说一下将法院地址、被告籍贯、被告的文化水平通过字典的方式映射为对应的数字的功能实现，因为我当时没有反应过来numpy有一个
复制链接

扫一扫

专栏目录