根据UTF-8的编码规则,可以知道:
1. 全角空格为12288,半角空格为32
2. 其他字符半角(33-126)与全角(65281-65374)的对应关系是:均相差65248
但是UTF-8编码的字符不能直接转换为整型变量,因此需要一个函数来做转换:
--- Decode a UTF-8 encoded string into a list of integer code points.
--
-- The length of each sequence is taken from the number of leading 1 bits of
-- its first byte (see get_continuous_1_count_of_byte). Continuation bytes are
-- not validated; only their low six bits contribute to the result.
--
-- A multi-byte sequence that is cut off by the end of the string produces a
-- single U+FFFD (65533) instead of raising an error.
--
-- @param raw_string string  UTF-8 encoded bytes
-- @return table  array of code points, one entry per decoded sequence
function utf8_to_num(raw_string)
  local REPLACEMENT_CHARACTER = 0xFFFD

  local result = {}
  local len = #raw_string
  local pos = 1
  while pos <= len do
    local lead = string.byte(raw_string, pos)
    local count = get_continuous_1_count_of_byte(lead)
    local num
    if count < 1 then
      -- 0xxxxxxx: a single-byte character is its own code point.
      num = lead
      count = 1
    elseif pos + count - 1 > len then
      -- The lead byte announces more bytes than the string still has.
      -- Everything left belongs to this truncated sequence.
      num = REPLACEMENT_CHARACTER
    else
      -- The lead byte carries its payload after the run of 1 bits and the
      -- terminating 0 bit, i.e. in its low (7 - count) bits.
      num = lead & (0xFF >> (count + 1))
      -- Each following byte is 10xxxxxx and contributes six payload bits.
      for offset = 1, count - 1 do
        num = (num << 6) | (string.byte(raw_string, pos + offset) & 0x3F)
      end
    end
    result[#result + 1] = num
    pos = pos + count
  end
  return result
end
为了方便将一个utf8字符,转换成整型,还需要判断一个utf8字符占了多少个字节,为此需要一个函数来判断(具体参考:
http://blog.csdn.net/SKY453589103/article/details/76337557):
-- Count the consecutive 1 bits of a byte, scanning downwards from the most
-- significant bit (0x80). Returns -1 when no byte is given (nil).
function get_continuous_1_count_of_byte(num)
  if num == nil then
    return -1
  end
  local ones = 0
  local mask = 0x80
  -- Slide the probe bit down one position per iteration; once it has been
  -- shifted out entirely the mask is 0 and the test fails, ending the loop.
  while num & mask ~= 0 do
    ones = ones + 1
    mask = mask >> 1
  end
  return ones
end
接下来就是转换的函数:
--- Convert full-width characters in a UTF-8 string to their half-width
-- (ASCII) equivalents.
--
-- The ideographic space U+3000 (12288) becomes an ASCII space (32), and the
-- full-width forms U+FF01..U+FF5E (65281..65374) are shifted down by 65248
-- onto ASCII 33..126. Every other byte sequence, including three-byte
-- characters outside those ranges, is copied to the output unchanged.
--
-- @param raw_string string  UTF-8 encoded text
-- @return string  the text with full-width characters replaced
function full_width_to_half_width(raw_string)
  local FULL_WIDTH_SPACE = 12288
  local FULL_WIDTH_FIRST, FULL_WIDTH_LAST = 65281, 65374
  local FULL_TO_HALF_OFFSET = 65248

  local pieces = {}
  local len = #raw_string
  local pos = 1
  while pos <= len do
    local count = get_continuous_1_count_of_byte(string.byte(raw_string, pos))
    -- Every convertible character is a three-byte sequence. The length check
    -- keeps a lead byte that is cut off by the end of the string away from
    -- the decoder; such bytes are copied through one at a time below.
    if count == 3 and pos + 2 <= len then
      local char = string.sub(raw_string, pos, pos + 2)
      local code = utf8_to_num(char)[1]
      if code == FULL_WIDTH_SPACE then
        pieces[#pieces + 1] = " "
      elseif FULL_WIDTH_FIRST <= code and code <= FULL_WIDTH_LAST then
        pieces[#pieces + 1] = string.char(code - FULL_TO_HALF_OFFSET)
      else
        -- Not a full-width form (e.g. a CJK ideograph): keep the original
        -- bytes instead of dropping the character.
        pieces[#pieces + 1] = char
      end
      pos = pos + 3
    else
      pieces[#pieces + 1] = string.sub(raw_string, pos, pos)
      pos = pos + 1
    end
  end
  return table.concat(pieces)
end
比较简单,就不做解释了。