https://docs.python.org/zh-cn/3/library/re.html
re.compile(pattern, flags=0)将正则表达式的样式编译为一个 正则表达式对象
prog = re.compile(pattern)
result = prog.match(string)
等价于
result = re.match(pattern, string)
\d 匹配任意数字,等价于 [0-9]。
re.findall(pattern, string, flags=0) 对 string 返回一个不重叠的 pattern 的匹配列表。string 从左到右进行扫描,匹配按找到的顺序返回。如果样式里存在一个组,就返回该组匹配内容的列表;如果样式里有超过一个组,就返回一个元组的列表。空匹配也会包含在结果里。
import xlrd
import xlwt
import re
def _to_thousand_per_month(figure, salary, workdays_per_month):
    """Convert one numeric string to thousand yuan per month using the unit in *salary*."""
    value = float(figure)
    if '万' in salary and '年' in salary:
        # Ten-thousands per year -> thousands per month.
        return value / 12 * 10
    if '万' in salary and '月' in salary:
        # Ten-thousands per month -> thousands per month.
        return value * 10
    if '元' in salary and '天' in salary:
        # Yuan per day -> thousands per month.
        return value / 1000 * workdays_per_month
    # Any other unit (e.g. "千/月") is used unchanged.
    return value


def get_salary(salary, workdays_per_month=21):
    """Parse a salary description into ``(low, high)`` in thousand yuan per month.

    The unit is detected from the characters present in *salary*:

    * ``万`` together with ``年``: ``value / 12 * 10``
    * ``万`` together with ``月``: ``value * 10``
    * ``元`` together with ``天``: ``value / 1000 * workdays_per_month``
    * anything else: the figure is used unchanged

    Args:
        salary: Salary text such as ``"1-1.5万/月"``, ``"20万以上/年"`` or
            ``"100元/天"``.
        workdays_per_month: Working days used to scale a daily wage up to a
            monthly one.

    Returns:
        A ``(low, high)`` tuple. Both items are floats when the text contains
        ``-`` and at least two numbers. ``high`` is ``""`` when only a single
        figure is used. Both items are ``""`` when *salary* is empty or holds
        no number at all.
    """
    if not salary:
        return "", ""

    # Optional integer part, optional decimal point, then one or more digits.
    figures = re.findall(r'\d*\.?\d+', salary)
    if not figures:
        return "", ""

    low_salary = _to_thousand_per_month(figures[0], salary, workdays_per_month)

    # A range needs both the separator and a second figure; otherwise only
    # the first figure is reported and the upper bound is left blank.
    if '-' in salary and len(figures) >= 2:
        high_salary = _to_thousand_per_month(
            figures[1], salary, workdays_per_month
        )
    else:
        high_salary = ""
    return low_salary, high_salary
def open_xlsx(file, sheet_name='Sheet1'):
    """Open a workbook with xlrd and return one worksheet and its row count.

    Args:
        file: Path of the workbook passed to ``xlrd.open_workbook``.
        sheet_name: Name of the worksheet to load; defaults to ``'Sheet1'``.

    Returns:
        A ``(sheet, nrows)`` tuple: the sheet object returned by
        ``sheet_by_name`` and its ``nrows`` attribute.
    """
    # NOTE(review): xlrd 2.0 and later reportedly read only legacy .xls
    # files -- confirm the installed version can open the workbook passed in.
    workbook = xlrd.open_workbook(file)
    sheet = workbook.sheet_by_name(sheet_name)
    return sheet, sheet.nrows
def main():
    """Clean the salary column of ``test.xlsx`` and save it to ``result.xls``.

    For every row after the first, the value at column index 2 is passed to
    ``get_salary``; the two returned values are written to columns 0 and 1 of
    the same row index in a new sheet named ``result``.
    """
    sheet, row_count = open_xlsx('test.xlsx')

    # Output workbook and the single sheet that receives the cleaned values.
    out_book = xlwt.Workbook(encoding='utf-8')
    out_sheet = out_book.add_sheet('result')

    print('一共有{}行数据,开始清洗数据'.format(row_count))

    # Row 0 is skipped -- presumably a header row; confirm against the input.
    for row_idx in range(1, row_count):
        raw_salary = sheet.row_values(row_idx)[2]
        low, high = get_salary(raw_salary)
        print('正在写入第{}条,最低工资是{}k,最高工资是{}k'.format(row_idx, low, high))
        out_sheet.write(row_idx, 0, low)
        out_sheet.write(row_idx, 1, high)

    out_book.save("result.xls")
# Run the cleaning job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()