在财务票据中,中文大写金额(如“贰拾捌万壹仟柒佰伍拾伍元壹角玖分”)被广泛使用以防止篡改。但在数据处理时,我们需要将其转换为阿拉伯数字形式。本文将带你一步步解析如何用Python实现这一转换。
一、核心思路拆解
整个转换过程可分为三步:
- 中文数字解析:将“贰拾捌”等字符串转换为数字
- 单位分割:处理“亿”、“万”等大单位分隔
- 元角分处理:分离并转换金额中的整数与小数部分
二、中文数字解析实现
def chinese_to_number(chinese_str):
    """Convert a numeral made of uppercase digits and 拾/佰/仟 units to an int.

    The string is walked once.  Digit characters (keys of
    ``chinese_to_arabic``) are added to a pending value; a unit character
    (key of ``unit_map``) multiplies the pending value by the unit's weight
    and moves it into the running total.  Characters found in neither table
    are ignored.

    Args:
        chinese_str: Text such as "壹仟柒佰伍拾伍".

    Returns:
        The integer value, e.g. 1755 for the example above.
    """
    total = 0
    pending = 0
    for ch in chinese_str:
        digit = chinese_to_arabic.get(ch)
        if digit is not None:
            pending += digit
            continue
        weight = unit_map.get(ch)
        if weight is None:
            continue
        # A unit with no digit in front of it (e.g. a leading "拾") counts
        # as one of that unit.
        total += (pending or 1) * weight
        pending = 0
    # Whatever is still pending is the ones digit.
    return total + pending
解析逻辑 :
- 遍历每个字符,遇到数字字符(壹、贰等)时累加到临时变量
- 遇到单位字符(拾、佰等)时,将当前累加值乘以单位后加入结果
- 特殊处理单位前没有数字的情况(如"拾万"中的"拾"按"壹拾"处理,自动补1)
三、大单位分割策略
def extract_parts_yi_wan(s):
    """Split a Chinese numeral at its first "亿" and the following "万".

    Args:
        s: The integer part of an amount, e.g. "贰亿叁仟万肆仟伍佰".

    Returns:
        A dict with the keys "亿", "万" and "一" (in that order) holding the
        text before "亿", the text between "亿" and "万", and the remaining
        text.  A section that is absent is the empty string.
    """
    # partition() returns (s, "", "") when the separator is missing, so the
    # whole text has to be handed on to the next split in that case.
    yi_text, yi_sep, remainder = s.partition("亿")
    if not yi_sep:
        yi_text, remainder = "", s

    wan_text, wan_sep, ones_text = remainder.partition("万")
    if not wan_sep:
        wan_text, ones_text = "", remainder

    return {"亿": yi_text, "万": wan_text, "一": ones_text}
分割规则 :
- 优先分割"亿"级单位
- 在亿级后的部分继续分割"万"级
- 剩余部分作为个位部分处理
例如:"贰亿叁仟万肆仟伍佰"会被分割为:
亿:"贰"
万:"叁仟"
一:"肆仟伍佰"
四、元角分综合处理
def convert_amounts(amount_list):
    """Convert Chinese uppercase money amounts to decimal strings.

    Each input is searched for the first amount: an optional integer part
    (optionally followed by 元 or 圆), an optional 角 digit and an optional
    分 digit.  A filler "零" in front of the 角 or 分 digit is accepted, so
    the standard forms "…元零叁角贰分" and "…元零贰分" are parsed, and so are
    amounts that have no integer part at all ("壹角玖分").  Text around the
    amount (a prefix, or a trailing "整") is ignored.

    The value is accumulated as an integer number of 分 so that the result
    does not depend on floating-point rounding.

    Args:
        amount_list: Iterable of amounts such as "壹仟陆佰捌拾元零叁角贰分".

    Returns:
        A list with one string per input, in order: the amount with two
        decimals (e.g. "1680.32"), or "无法转换" when the item is not a
        string or contains no amount.
    """
    pattern = re.compile(
        # Integer part.  The lookahead stops it from swallowing a digit that
        # belongs to a following 角/分 when the amount has no 元 part.
        r'([零壹贰叁肆伍陆柒捌玖拾佰仟万亿]+(?![角分]))?[元圆]?'
        # Exactly one digit each for 角 and 分, with an optional filler 零.
        r'(?:零?([零壹贰叁肆伍陆柒捌玖])角)?'
        r'(?:零?([零壹贰叁肆伍陆柒捌玖])分)?'
    )
    section_weights = {"亿": 100000000, "万": 10000, "一": 1}

    converted = []
    for amount in amount_list:
        match = None
        if isinstance(amount, str):
            # Every part of the pattern is optional, so skip the empty
            # matches and take the first one that captured something.
            match = next(
                (m for m in pattern.finditer(amount) if any(m.groups())),
                None,
            )
        if match is None:
            converted.append("无法转换")
            continue

        yuan, jiao, fen = match.groups()
        cents = 0
        if yuan:
            for unit, text in extract_parts_yi_wan(yuan).items():
                if text:
                    cents += chinese_to_number(text) * section_weights[unit] * 100
        if jiao:
            cents += chinese_to_arabic[jiao] * 10
        if fen:
            cents += chinese_to_arabic[fen]
        converted.append(f"{cents // 100}.{cents % 100:02d}")
    return converted
处理流程 :
- 正则提取元、角、分三部分
- 分别解析各部分数值
- 按单位权重累加计算总金额
- 保留两位小数输出
五、测试验证
# Sample input: one amount with 角 and 分, one with a whole number of 元.
amounts = [
    '贰拾捌万壹仟柒佰伍拾伍元壹角玖分',
    '贰拾伍万捌仟肆佰玖拾壹元'
]
# Print the list of converted strings.
print(convert_amounts(amounts))
输出结果:
['281755.19', '258491.00']
六、全部代码:
import re
# Lookup tables used by the parsing functions below.
# Uppercase (financial) digit -> value; each character's position in the
# string is its numeric value.
chinese_to_arabic = {
    char: value for value, char in enumerate('零壹贰叁肆伍陆柒捌玖')
}
# Unit character -> multiplier.
unit_map = {
    '拾': 10,
    '佰': 100,
    '仟': 1000,
}
def chinese_to_number(chinese_str):
    """Convert a numeral made of uppercase digits and 拾/佰/仟 units to an int.

    Digit characters (keys of ``chinese_to_arabic``) are added to a pending
    value; a unit character (key of ``unit_map``) multiplies the pending
    value by the unit's weight and moves it into the running total.
    Parsing stops at the first "元", "角" or "分"; any other unknown
    character is ignored.

    Args:
        chinese_str: Text such as "壹仟柒佰伍拾伍".

    Returns:
        The integer value, e.g. 1755 for the example above.
    """
    stop_chars = ('元', '角', '分')
    total = 0
    pending = 0
    for ch in chinese_str:
        if ch in chinese_to_arabic:
            pending += chinese_to_arabic[ch]
        elif ch in unit_map:
            # A unit with no digit in front of it (e.g. a leading "拾")
            # counts as one of that unit.
            total += (pending if pending else 1) * unit_map[ch]
            pending = 0
        elif ch in stop_chars:
            break
    # Whatever is still pending is the ones digit.
    return total + pending
def extract_parts_yi_wan(s):
    """Split a Chinese numeral at its first "亿" and the following "万".

    Args:
        s: The integer part of an amount, e.g. "贰亿叁仟万肆仟伍佰".

    Returns:
        A dict with the keys "亿", "万" and "一" (in that order) holding the
        text before "亿", the text between "亿" and "万", and the remaining
        text.  A section that is absent is the empty string.
    """
    # partition() returns (s, "", "") when the separator is missing, so the
    # whole text has to be handed on to the next split in that case.
    yi_text, yi_sep, remainder = s.partition("亿")
    if not yi_sep:
        yi_text, remainder = "", s

    wan_text, wan_sep, ones_text = remainder.partition("万")
    if not wan_sep:
        wan_text, ones_text = "", remainder

    return {"亿": yi_text, "万": wan_text, "一": ones_text}
def convert_amounts(amount_list):
    """Convert Chinese uppercase money amounts to decimal strings.

    Each input is searched for the first amount: an optional integer part
    (optionally followed by 元 or 圆), an optional 角 digit and an optional
    分 digit.  A filler "零" in front of the 角 or 分 digit is accepted, so
    the standard forms "…元零叁角贰分" and "…元零贰分" are parsed, and so are
    amounts that have no integer part at all ("壹角玖分").  Text around the
    amount (a prefix, or a trailing "整") is ignored.

    The value is accumulated as an integer number of 分 so that the result
    does not depend on floating-point rounding.

    Args:
        amount_list: Iterable of amounts such as "壹仟陆佰捌拾元零叁角贰分".

    Returns:
        A list with one string per input, in order: the amount with two
        decimals (e.g. "1680.32"), or "无法转换" when the item is not a
        string or contains no amount.
    """
    pattern = re.compile(
        # Integer part.  The lookahead stops it from swallowing a digit that
        # belongs to a following 角/分 when the amount has no 元 part.
        r'([零壹贰叁肆伍陆柒捌玖拾佰仟万亿]+(?![角分]))?[元圆]?'
        # Exactly one digit each for 角 and 分, with an optional filler 零.
        r'(?:零?([零壹贰叁肆伍陆柒捌玖])角)?'
        r'(?:零?([零壹贰叁肆伍陆柒捌玖])分)?'
    )
    section_weights = {"亿": 100000000, "万": 10000, "一": 1}

    converted_list = []
    for amount in amount_list:
        match = None
        if isinstance(amount, str):
            # Every part of the pattern is optional, so skip the empty
            # matches and take the first one that captured something.
            match = next(
                (m for m in pattern.finditer(amount) if any(m.groups())),
                None,
            )
        if match is None:
            converted_list.append("无法转换")
            continue

        yuan, jiao, fen = match.groups()
        cents = 0
        if yuan:
            for unit, text in extract_parts_yi_wan(yuan).items():
                if text:
                    cents += chinese_to_number(text) * section_weights[unit] * 100
        if jiao:
            cents += chinese_to_arabic[jiao] * 10
        if fen:
            cents += chinese_to_arabic[fen]
        converted_list.append(f"{cents // 100}.{cents % 100:02d}")
    return converted_list
# Demo run: one amount with 角 and 分, one with a whole number of 元.
amounts = [
    '贰拾捌万壹仟柒佰伍拾伍元壹角玖分',
    '贰拾伍万捌仟肆佰玖拾壹元',
]
converted_amounts = convert_amounts(amounts)
# Print the list of converted strings.
print(converted_amounts)