#!/usr/bin/python
# -*- coding: UTF-8 -*-
from __future__ import unicode_literals
import re
import sys
# Chinese capital (financial-style) numerals, indexed by digit value 0-9.
nums = ['零', '壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖']
# Labels for the first two decimal places. convert() truncates its input to
# an integer, so it never uses them; the name is kept for other code.
decimal_label = ['角', '分']
# Place labels inside one four-digit group: units, tens, hundreds, thousands.
small_int_label = ['', '拾', '佰', '仟']


def convert(n):
    """Spell the integer part of *n* with Chinese capital numerals.

    *n* may be anything ``int()`` accepts (an int, a float, or a string of
    digits); any fractional part is discarded.

    The number is processed in groups of four digits starting from the
    least-significant end.  Each non-zero group is followed by its magnitude
    marker: nothing for the lowest group, then 万, 亿, 万亿, 亿亿 and so on
    (alternating 万 and one more 亿 every second group).  Groups that are
    entirely zero are omitted.

    Returns the numeral string; zero yields ``'零'``.

    Raises:
        ValueError: if *n* is negative or cannot be converted by ``int()``.
    """
    n = int(n)
    if n < 0:
        raise ValueError('convert() does not support negative numbers: %r' % (n,))
    if n == 0:
        # Without this the loop below would produce an empty string and the
        # number would silently vanish from the text.
        return nums[0]

    digits = str(n)
    pieces = []  # spelled groups, least-significant group first
    group_index = 0
    while digits:
        group, digits = digits[-4:], digits[:-4]
        # The top group may be shorter than four digits, so take only as many
        # place labels as there are digits, most-significant first.
        labels = small_int_label[:len(group)][::-1]
        text = ''.join(
            nums[int(d)] + (label if d != '0' else '')
            for d, label in zip(group, labels)
        )
        # Drop trailing zeros and collapse inner runs of zeros to a single 零.
        text = text.rstrip('零').replace('零零零', '零').replace('零零', '零')
        if text:
            marker = '万' * (group_index % 2) + '亿' * (group_index // 2)
            pieces.append(text + marker)
        group_index += 1
    return ''.join(reversed(pieces))
# Convert every run of ASCII digits in a string to Chinese numerals.
def num_deal(s):
    """Return *s* with each run of ASCII digits replaced by Chinese numerals.

    A run of exactly four digits is spelled digit by digit using ``nums``
    (presumably so that years read naturally -- confirm with the author);
    every other run is passed to ``convert()``.

    The substitution is done in a single pass with a replacement callback, so
    each run is rewritten exactly once and a short number can never be
    replaced inside a longer one that merely contains the same digits.
    """
    def _to_hanzi(match):
        digits = match.group(0)
        if len(digits) == 4:
            return ''.join(nums[int(d)] for d in digits)
        return convert(digits)

    return re.sub(r'[0-9]+', _to_hanzi, s)
# Strip punctuation from a line, then spell out the numbers it contains.
def change2(nval):
    """Return *nval* as text with punctuation removed and numbers spelled out.

    *nval* may be UTF-8 encoded bytes or already-decoded text; bytes are
    decoded first, text is used as is.

    Every run of characters other than ASCII letters, ASCII digits, the
    ideographs U+4E00..U+9FA5 and the newline is replaced by a single space.
    The result is then passed through ``num_deal()``.

    Raises:
        UnicodeDecodeError: if *nval* is bytes that are not valid UTF-8.
    """
    # Decode only real byte strings: calling .decode() on text fails on
    # Python 3 and triggers an implicit ASCII encode on Python 2.
    text = nval.decode('utf-8') if isinstance(nval, bytes) else nval
    cleaned = re.sub(u'[^0-9a-zA-Z\u4E00-\u9FA5\u000A]+', ' ', text)
    return num_deal(cleaned)
# Decide whether a raw input line is worth keeping.
def change1(val):
    """Return 1 if the line should be kept, or None if it should be skipped.

    A line is skipped when it is None, when it is shorter than three items
    (bytes for a byte string, characters for text), or when its first run of
    ASCII letters/digits is longer than ten characters.

    *val* may be bytes or text; the pattern is chosen to match its type so
    the search works on either.
    """
    if val is None or len(val) < 3:
        return None
    pattern = b'[0-9a-zA-Z]+' if isinstance(val, bytes) else u'[0-9a-zA-Z]+'
    # NOTE(review): only the FIRST alphanumeric run is inspected; a long run
    # later in the line does not cause the line to be skipped.
    first_run = re.search(pattern, val)
    if first_run is not None and len(first_run.group()) > 10:
        return None
    return 1
def main():
    """Clean the file named by the first command-line argument into res.txt.

    Each input line is screened with ``change1()``; surviving lines are
    cleaned with ``change2()`` and split on single spaces.  Every fragment
    longer than three characters that has not been written before is encoded
    as UTF-8 and written to ``res.txt`` in the current directory.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: %s INPUT_FILE' % sys.argv[0])

    seen = set()  # fragments already written; set gives O(1) membership
    # Binary mode on both ends: lines reach change1()/change2() as raw bytes
    # and the encoded fragments can be written directly, on Python 2 and 3.
    with open(sys.argv[1], 'rb') as src, open('res.txt', 'wb') as dst:
        for line in src:
            if change1(line) is None:
                continue
            cleaned = change2(line)  # punctuation removed, numbers spelled out
            for piece in cleaned.split(' '):
                if len(piece) > 3 and piece not in seen:
                    seen.add(piece)
                    # NOTE(review): fragments are written exactly as they are,
                    # with no separator added here -- confirm that is intended.
                    dst.write(piece.encode('utf-8'))


if __name__ == '__main__':
    main()
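# 数据清洗----python  (page title: "Data cleaning -- Python")
# 最新推荐文章于 2024-09-04 17:02:10 发布  (blog footer: "latest recommended article published 2024-09-04 17:02:10")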