-- Original source: https://github.com/alexander-yakushev/awesomerc/blob/master/awesompd/utf8.lua
-- Provides UTF-8 aware string functions implemented in pure Lua:
-- * utf8.charbytes(s, i)
-- * utf8.len(s)
-- * utf8.sub(s, i, j)
-- * utf8.replace(s, mapping)
--
-- utf8.len and utf8.sub behave like string.len and string.sub, with the
-- exception that UTF-8 characters are used instead of bytes for all units.
--
-- Note: all validation has been removed due to awesome usage specifics.
--[[ |
Copyright (c) 2006-2007, Kyle Smith |
Modified by Alexander Yakushev, 2010-2013. |
All rights reserved. |
Redistribution and use in source and binary forms, with or without |
modification, are permitted provided that the following conditions are met: |
* Redistributions of source code must retain the above copyright notice, |
this list of conditions and the following disclaimer. |
* Redistributions in binary form must reproduce the above copyright |
notice, this list of conditions and the following disclaimer in the |
documentation and/or other materials provided with the distribution. |
* Neither the name of the author nor the names of its contributors may be |
used to endorse or promote products derived from this software without |
specific prior written permission. |
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE |
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
--]] |
-- ABNF from RFC 3629 |
-- |
-- UTF8-octets = *( UTF8-char ) |
-- UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4 |
-- UTF8-1 = %x00-7F |
-- UTF8-2 = %xC2-DF UTF8-tail |
-- UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / |
-- %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) |
-- UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / |
-- %xF4 %x80-8F 2( UTF8-tail ) |
-- UTF8-tail = %x80-BF |
-- |
-- Module table; every public function of this file is attached to it.
local utf8 = {}

-- Returns the number of bytes used by the UTF-8 character that starts at
-- byte i in s. The answer is derived from the lead byte alone, using the
-- lead-byte ranges of RFC 3629; continuation bytes are never inspected, so
-- this is NOT a validator.
--
-- Bytes that cannot start a character (stray continuation bytes 0x80-0xBF,
-- the overlong leads 0xC0/0xC1 and 0xF5-0xFF) are reported as 1-byte units,
-- so a caller that scans a string with this function always advances
-- instead of doing arithmetic on nil.
--
-- @param s  string to inspect
-- @param i  byte index of the lead byte (defaults to 1)
-- @return   1, 2, 3 or 4; nil when there is no byte at index i
function utf8.charbytes(s, i)
  i = i or 1
  local c = string.byte(s, i)

  -- string.byte yields nothing when i is outside the string
  if c == nil then
    return nil
  end

  if c >= 194 and c <= 223 then
    -- UTF8-2
    return 2
  elseif c >= 224 and c <= 239 then
    -- UTF8-3
    return 3
  elseif c >= 240 and c <= 244 then
    -- UTF8-4
    return 4
  end

  -- UTF8-1 (0x00-0x7F, NUL included) and every byte that is not a valid
  -- lead byte: consume exactly one byte
  return 1
end
-- Returns the number of characters in the UTF-8 string s.
--
-- @param s      string to measure
-- @param chars  optional character limit. When given, scanning stops after
--               that many characters and the number of BYTES they occupy is
--               returned instead of a character count. It is a parameter
--               (not a global lookup) so unrelated code cannot change the
--               result of a plain utf8.len(s) call.
-- @return       character count of s, or a byte count when chars is given
function utf8.len(s, chars)
  local pos = 1
  local bytes = string.len(s)
  local len = 0

  -- with chars == nil the second condition is always true and the whole
  -- string is scanned
  while pos <= bytes and len ~= chars do
    len = len + 1
    -- step over one character; fall back to a single byte when charbytes
    -- reports nothing for this lead byte, so the loop always advances
    pos = pos + (utf8.charbytes(s, pos) or 1)
  end

  if chars ~= nil then
    -- pos points at the first byte after the characters consumed
    return pos - 1
  end
  return len
end
-- Works like string.sub, except that i and j count UTF-8 characters
-- instead of bytes. Negative indices count from the end of the string and
-- out-of-range indices are clamped, as string.sub does.
--
-- @param s  source string
-- @param i  index of the first character; a nil i yields ""
-- @param j  index of the last character (defaults to -1, the last one)
-- @return   the selected substring, "" when the range is empty
function utf8.sub(s, i, j)
  if i == nil then
    return ""
  end
  j = j or -1

  local first, last = i, j
  -- the character count is only needed to resolve negative indices
  if first < 0 or last < 0 then
    local total = utf8.len(s)
    if first < 0 then
      first = total + first + 1
    end
    if last < 0 then
      last = total + last + 1
    end
  end

  -- string.sub treats any start below 1 as 1
  if first < 1 then
    first = 1
  end
  -- can't have start after end
  if first > last then
    return ""
  end

  local bytes = string.len(s)
  -- Byte offsets to pass to string.sub. start_byte begins past the end so
  -- that a start index beyond the last character selects nothing; end_byte
  -- begins at the last byte so that an oversized end index is clamped.
  local start_byte, end_byte = bytes + 1, bytes
  local pos, count = 1, 0

  while pos <= bytes do
    count = count + 1
    if count == first then
      start_byte = pos
    end
    -- fall back to one byte when charbytes reports nothing, so the scan
    -- always advances
    pos = pos + (utf8.charbytes(s, pos) or 1)
    if count == last then
      end_byte = pos - 1
      break
    end
  end

  return string.sub(s, start_byte, end_byte)
end
-- Replaces UTF-8 characters based on a mapping table.
-- Every character of s is looked up in mapping; when the lookup yields nil
-- or false the character is copied unchanged.
--
-- @param s        source string
-- @param mapping  table indexed by single UTF-8 characters (as strings)
-- @return         the rebuilt string
function utf8.replace(s, mapping)
  local pos = 1
  local bytes = string.len(s)
  -- collect the pieces and join once; appending with .. inside the loop
  -- would copy the growing result on every iteration
  local parts = {}
  local n = 0

  while pos <= bytes do
    -- fall back to one byte when charbytes reports nothing, so the scan
    -- always advances
    local charbytes = utf8.charbytes(s, pos) or 1
    local c = string.sub(s, pos, pos + charbytes - 1)
    n = n + 1
    parts[n] = mapping[c] or c
    pos = pos + charbytes
  end

  return table.concat(parts)
end
-- Return the module table to the code that loads this file.
return utf8