39.数据清洗

https://docs.python.org/zh-cn/3/library/re.html
re.compile(pattern, flags=0):将正则表达式的样式编译为一个正则表达式对象,之后可以调用该对象的 match()、search() 等方法进行匹配。例如:
prog = re.compile(pattern)
result = prog.match(string)
等价于
result = re.match(pattern, string)

\d 匹配任意数字,等价于 [0-9]。
re.findall(pattern, string, flags=0):返回 pattern 在 string 中所有不重叠的匹配,结果是一个列表(内容相同的匹配可以重复出现)。string 从左到右进行扫描,匹配按找到的顺序返回。如果样式里没有组,列表元素是整个匹配的字符串;如果恰好有一个组,列表元素是该组匹配的字符串;如果有多个组,列表元素是由各组匹配的字符串组成的元组。空匹配也会包含在结果里。

import xlrd
import xlwt
import re
# Matches an integer or decimal number such as "15", "1.5" or ".5".
# In the pattern, "?" makes the decimal point optional (zero or one time).
_SALARY_NUMBER_RE = re.compile(r'\d*\.?\d+')


def _to_thousand_per_month(amount, salary):
    """Convert one numeric string taken from *salary* to thousand yuan/month.

    The unit is detected from marker characters in the full salary text.
    Text without a recognised marker combination is only converted to float.
    """
    value = float(amount)
    if u'万' in salary and u'年' in salary:
        # 10k yuan per year -> thousand yuan per month.
        return value / 12 * 10
    if u'万' in salary and u'月' in salary:
        # 10k yuan per month -> thousand yuan per month.
        return value * 10
    if u'元' in salary and u'天' in salary:
        # Yuan per day -> thousand yuan per month, using 21 working days.
        return value / 1000 * 21
    return value


def get_salary(salary):
    """Extract the lowest and highest salary from a salary description.

    Both values are normalised to thousand yuan per month and returned as
    floats, whatever unit the text uses.

    Args:
        salary: Salary text such as "1-1.5万/月", "10-20万/年",
            "20万以上/年" or "100元/天".

    Returns:
        A ``(low_salary, high_salary)`` tuple. ``high_salary`` is ``""`` when
        the text is not a range (no "-" or only one number). Both items are
        ``""`` when the text contains no number at all.
    """
    amounts = _SALARY_NUMBER_RE.findall(salary)
    if not amounts:
        # Empty cell or non-numeric text: nothing to extract.
        return "", ""

    low_salary = _to_thousand_per_month(amounts[0], salary)
    # Only a "-" range carries an upper bound; "20万以上/年" and "100元/天"
    # style values have a minimum only.
    if '-' in salary and len(amounts) > 1:
        high_salary = _to_thousand_per_month(amounts[1], salary)
    else:
        high_salary = ""
    return low_salary, high_salary

def open_xlsx(file):
    """Open the workbook at *file* and return its 'Sheet1' worksheet.

    Returns:
        A ``(sheet, row_count)`` tuple, where ``row_count`` is the sheet's
        ``nrows`` value.
    """
    # NOTE(review): xlrd 2.0 and later read only .xls files; confirm the
    # installed xlrd version can open the file passed here.
    sheet = xlrd.open_workbook(file).sheet_by_name('Sheet1')
    return sheet, sheet.nrows

def main():
    """Clean the salary column of 'test.xlsx' and save it to 'result.xls'.

    For each row after the first, the value in column index 2 is parsed with
    get_salary(). The lowest and highest salary are written to columns 0 and
    1 of a new 'result' sheet, at the same row index as the input row.
    """
    sheet, row_count = open_xlsx('test.xlsx')
    book = xlwt.Workbook(encoding='utf-8')  # output workbook
    out_sheet = book.add_sheet('result')  # output worksheet
    print('一共有{}行数据,开始清洗数据'.format(row_count))

    # Row 0 is skipped (presumably a header row; confirm against the input
    # file), so row 0 of the output sheet is left unwritten.
    for row_idx in range(1, row_count):
        low_salary, high_salary = get_salary(sheet.row_values(row_idx)[2])
        print('正在写入第{}条,最低工资是{}k,最高工资是{}k'.format(row_idx, low_salary, high_salary))

        out_sheet.write(row_idx, 0, low_salary)
        out_sheet.write(row_idx, 1, high_salary)

    book.save("result.xls")


# Run the cleaning job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值