def extract_number(text):
    """Extract Chinese-numeral and Arabic-numeral expressions from a sentence.

    A match is a run of digits / Chinese numerals, optionally containing one
    fraction, slash or decimal separator, and may carry a trailing unit
    suffix (档, 挡, 度, 级, 格) or a percent sign.

    Args:
        text: The sentence to scan.

    Returns:
        A list of the matched substrings, in order of appearance. A match
        that is only a bare unit suffix is dropped, and so is a
        single-character match directly followed by one of the excluded
        words (e.g. the "一" in "一个" or "一下").
    """
    regex_patterns = [
        # Chinese numerals, fractions (e.g. 一分之二) and percentages.
        r'((?:\d|一|二|三|四|五|六|七|八|九|零|十|百|千|万|亿|档|挡|度|级|格)+(?:分之|/|\.)?(?:\d|一|二|三|四|五|六|七|八|九|零|十|百|千|万|亿|档|挡|度|级|格|%)*)',
        # A separate pattern for plain decimals, r'\b(?:\d+\.\d+|\d+)\b',
        # was disabled in the original and stays disabled.
    ]
    # Words that, when they directly follow a one-character number, mean the
    # number is part of a quantifier phrase rather than a value.
    exclude_follows = ('条', '点', '个', '下', '排', '些', '百', '千', '万', '亿')
    bare_units = ('档', '挡', '度', '级', '格')

    res = []
    for pattern in regex_patterns:
        # finditer gives the real position of every match, so a number that
        # occurs more than once is judged by its own following text.
        for found in re.finditer(pattern, text):
            number = found.group(0)
            if number in bare_units:
                continue
            if len(number) == 1 and text.startswith(exclude_follows, found.end()):
                continue
            res.append(number)
    return res
# 2. Convert the Chinese numerals in a string to Arabic numerals (将字符串中的中文数字转换成阿拉伯数字)
import re
# Value of each single Chinese digit character.
chinese_to_value = {'一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9, '零': 0}
# Multiplier of each Chinese unit character.
unit_dict = {'十': 10, '百': 100, '千': 1000, '万': 10000, '亿': 100000000}


def chinese_number_to_int(chinese_num):
    """Convert a Chinese numeral string to an integer.

    Strings without any unit character are read positionally, digit by
    digit ("二零二四" -> 2024). Strings with units are read left to right:
    十/百/千 scale the preceding digit, 万 scales the whole group built
    since the last large unit, and 亿 scales everything accumulated so far.
    A unit with no preceding digit counts as one of that unit ("十五" -> 15).

    Args:
        chinese_num: String of Chinese digit and unit characters. Characters
            found in neither mapping table are ignored.

    Returns:
        The integer value; 0 for an empty string.
    """
    if not any(char in unit_dict for char in chinese_num):
        digits = "".join(
            str(chinese_to_value[char])
            for char in chinese_num
            if char in chinese_to_value
        )
        return int(digits) if digits else 0

    total = 0       # value closed off by the large units 万 / 亿
    section = 0     # value of the current group below 万
    pending = None  # last digit read that has not yet been given a unit
    for char in chinese_num:
        if char in chinese_to_value:
            pending = chinese_to_value[char]
        elif char in unit_dict:
            unit = unit_dict[char]
            if unit < 10000:
                section += (1 if pending is None else pending) * unit
            else:
                if pending is None and section == 0 and total == 0:
                    group = 1  # bare leading 万 / 亿
                else:
                    group = section + (pending or 0)
                if char == '亿':
                    # 亿 also scales an earlier 万 group, e.g. 一万亿.
                    total = (total + group) * unit
                else:
                    total += group * unit
                section = 0
            pending = None
    return total + section + (pending or 0)
# Runs of Chinese digit/unit characters. 零 is part of the class so that a
# number such as 一百零五 is matched as one run instead of being split in two.
_CHINESE_NUM_PATTERN = re.compile(r'[零一二三四五六七八九十百千万亿]+')


def replace_chinese_numbers(text):
    """Replace the Chinese numerals in a sentence with Arabic numerals.

    Args:
        text: The sentence to rewrite.

    Returns:
        The sentence with every run of Chinese numeral characters replaced
        by its Arabic form. A run that is a single bare unit character
        (十, 百, 千, 万, 亿) is left unchanged. A run made only of digit
        characters is mapped digit by digit, so leading zeros survive
        ("零零七" -> "007").
    """
    def replacement(match):
        chinese_num = match.group(0)
        if chinese_num in unit_dict:
            return chinese_num
        if all(char in chinese_to_value for char in chinese_num):
            return "".join(str(chinese_to_value[char]) for char in chinese_num)
        return str(chinese_number_to_int(chinese_num))

    return _CHINESE_NUM_PATTERN.sub(replacement, text)