[lua] string trim实现

From: http://lua-users.org/wiki/StringTrim

There are many ways to implement the "trim" function [1] in Lua:


-- trim implementations

function trim1(s)
  return (s:gsub("^%s*(.-)%s*$", "%1"))
end
-- from PiL2 20.4

function trim2(s)
  return s:match "^%s*(.-)%s*$"
end
-- variant of trim1 (match)

function trim3(s)
  return s:gsub("^%s+", ""):gsub("%s+$", "")
end
-- two gsub's

function trim4(s)
  return s:match"^%s*(.*)":match"(.-)%s*$"
end
-- variant of trim3 (match)

function trim5(s)
  return s:match'^%s*(.*%S)' or ''
end
-- warning: has bad performance when s:match'^%s*$' and #s is large

function trim6(s)
  return s:match'^()%s*$' and '' or s:match'^%s*(.*%S)'
end
-- fixes performance problem in trim5.
-- note: the '()' avoids the overhead of default string capture.
-- This overhead is small, ~ 10% for successful whitespace match call
-- alone, and may not be noticeable in the overall benchmarks here,
-- but there's little harm either.  Instead replacing the first `match`
-- with a `find` has a similar effect, but that requires localizing
-- two functions in the trim7 variant below.

local match = string.match
function trim7(s)
  return match(s,'^()%s*$') and '' or match(s,'^%s*(.*%S)')
end
-- variant of trim6 (localize functions)

local find = string.find
local sub = string.sub
function trim8(s)
  local i1,i2 = find(s,'^%s*')
  if i2 >= i1 then s = sub(s,i2+1) end
  local i1,i2 = find(s,'%s*$')
  if i2 >= i1 then s = sub(s,1,i1-1) end
  return s
end
-- based on penlight 0.7.2

function trim9(s)
  local _,i1 = find(s,'^%s*')
  local i2 = find(s,'%s*$')
  return sub(s,i1+1,i2-1)
end
-- simplification of trim8

function trim10(s)
  local a = s:match('^%s*()')
  local b = s:match('()%s*$', a)
  return s:sub(a,b-1)
end
-- variant of trim9 (match)

function trim11(s)
 local n = s:find"%S"
 return n and s:match(".*%S", n) or ""
end
-- variant of trim6 (use n position)
-- http://lua-users.org/lists/lua-l/2009-12/msg00904.html

function trim12(s)
 local from = s:match"^%s*()"
 return from > #s and "" or s:match(".*%S", from)
end
-- variant of trim11 (performs better for all
-- whitespace string). See Roberto's comments
-- on ^%s*$" v.s. "%S" performance:
-- http://lua-users.org/lists/lua-l/2009-12/msg00921.html

do
 require 'lpeg'
 local space = lpeg.S' \t\n\v\f\r'
 local nospace = 1 - space
 local ptrim = space^0 * lpeg.C((space^0 * nospace^1)^0)
 local match = lpeg.match
 function trim13(s)
   return match(ptrim, s)
 end
end
-- lpeg.  based on http://lua-users.org/lists/lua-l/2009-12/msg00921.html

do
 require 'lpeg'
 require 're'
 local ptrim = re.compile"%s* {(%s* %S+)*}"
 local match = lpeg.match
 function trim14(s)
   return match(ptrim, s)
 end
end
-- variant with re module.

require 'trim'
local trim15 = trim
-- C implementation (see separate trim.c file)


-- test utilities

local function trimtest(trim)
  assert(trim'' == '')
  assert(trim' ' == '')
  assert(trim'  ' == '')
  assert(trim'a' == 'a')
  assert(trim' a' == 'a')
  assert(trim'a ' == 'a')
  assert(trim' a ' == 'a')
  assert(trim'  a  ' == 'a')
  assert(trim'  ab cd  ' == 'ab cd')
  assert(trim' \t\r\n\f\va\000b \r\t\n\f\v' == 'a\000b')
end

local function perftest(f, s)
  local time = os.clock  -- os.time or os.clock
  local t1 = time()
  for i=1,100000 do
    f(s)f(s)f(s)f(s)f(s)f(s)f(s)f(s)f(s)f(s)
  end
  local dt = time() - t1
  io.stdout:write(string.format("%4.1f",dt) .. ' ')
end

local trims = {trim1, trim2, trim3, trim4, trim5, trim6, trim7,
               trim8, trim9, trim10, trim11, trim12, trim13, trim14, trim15}

-- correctness tests
for _,trim in ipairs(trims) do
  trimtest(trim)
end

-- performance tests
for j=1,3 do
  for i,trim in ipairs(trims) do
    io.stdout:write(string.format("%2d",i) .. ": ")
    perftest(trim,  "")
    perftest(trim,  "abcdef")
    perftest(trim,  "   abcdef   ")
    perftest(trim,  "abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdef")
    perftest(trim,  "  a b c d e f g h i j k l m n o p q r s t u v w x y z A B C ")
    perftest(trim,  "                               a                            ")
    perftest(trim,  "                                                            ")
    print()
  end
end

Optional C module for test15 based on LuaList:2009-12/msg00951.html:


/* trim.c - based on http://lua-users.org/lists/lua-l/2009-12/msg00951.html
            from Sean Conner */
#include <stddef.h>
#include <ctype.h>
#include <lua.h>
#include <lauxlib.h>

int trim(lua_State *L)
{
 const char *front;
 const char *end;
 size_t      size;

 front = luaL_checklstring(L,1,&size);
 end   = &front[size - 1];

 for ( ; size && isspace(*front) ; size-- , front++)
   ;
 for ( ; size && isspace(*end) ; size-- , end--)
   ;

 lua_pushlstring(L,front,(size_t)(end - front) + 1);
 return 1;
}

int luaopen_trim(lua_State *L)
{
 lua_register(L,"trim",trim);
 return 0;
}

Test results (lower numbers = faster):


Lua 5.1.4/Cygwin1.7
 1:  0.9  1.9  2.1  9.6 10.3  2.2  2.0
 2:  0.7  1.6  1.7  8.9 10.0  2.0  1.8
 3:  1.0  1.3  1.7  3.8  6.4  2.6  2.1
 4:  1.2  2.2  2.2 10.1 11.2  2.7  2.2
 5:  0.6  0.9  1.1  1.2  1.3  2.8 77.6
 6:  0.6  1.2  1.6  1.6  1.8  4.1  1.7
 7:  0.6  1.1  1.5  1.5  1.6  3.9  1.6
 8:  1.0  1.7  2.5  7.5  9.7  3.0  2.4
 9:  1.2  2.0  2.7  8.0  9.3 21.2  3.4
10:  1.5  2.4  2.6  9.8 10.8  2.8  2.6
11:  0.5  1.2  1.5  1.6  1.7  3.5  2.5
12:  0.7  1.3  1.6  1.7  1.8  3.0  1.8
13:  0.8  0.9  1.0  1.3  2.5  1.1  1.0
14:  0.8  0.9  1.0  1.3  2.5  1.1  1.0
15:  0.2  0.2  0.2  0.4  0.4  0.3  0.3
 1:  0.9  1.9  2.0  9.6 10.3  2.2  1.9
 2:  0.7  1.6  1.8  8.9 10.0  2.0  1.8
 3:  1.0  1.3  1.7  3.8  6.4  2.6  2.1
 4:  1.1  2.2  2.2 10.1 11.4  2.7  2.2
 5:  0.6  0.9  1.2  1.2  1.2  2.8 78.2
 6:  0.6  1.2  1.7  1.6  1.8  4.2  1.7
 7:  0.6  1.1  1.5  1.5  1.7  3.9  1.6
 8:  1.0  1.7  2.5  7.5  9.6  3.1  2.3
 9:  1.2  2.0  2.7  8.0  9.3 21.1  3.3
10:  1.5  2.4  2.5  9.8 10.8  2.8  2.5
11:  0.5  1.2  1.5  1.6  1.7  3.5  2.5
12:  0.7  1.3  1.6  1.7  1.8  3.0  1.8
13:  0.8  0.9  1.0  1.3  2.4  1.1  1.0
14:  0.8  0.9  1.0  1.3  2.5  1.1  1.0
15:  0.2  0.2  0.2  0.4  0.4  0.3  0.3
 1:  0.9  1.9  2.0  9.6 10.3  2.2  2.0
 2:  0.7  1.6  1.7  8.9 10.0  2.0  1.8
 3:  1.0  1.3  1.7  3.8  6.4  2.6  2.1
 4:  1.1  2.2  2.2 10.1 11.2  2.7  2.2
 5:  0.6  0.9  1.2  1.2  1.3  2.8 77.3
 6:  0.6  1.2  1.7  1.6  1.8  4.2  1.7
 7:  0.6  1.1  1.5  1.5  1.7  3.9  1.6
 8:  1.0  1.7  2.6  7.4  9.6  3.1  2.3
 9:  1.2  2.0  2.7  8.0  9.3 21.1  3.3
10:  1.5  2.4  2.6  9.8 10.8  2.8  2.6
11:  0.5  1.2  1.5  1.6  1.7  3.5  2.5
12:  0.7  1.3  1.6  1.6  1.8  3.0  1.8
13:  0.8  0.9  1.0  1.3  2.5  1.1  1.0
14:  0.8  0.9  1.0  1.3  2.5  1.1  1.0
15:  0.2  0.2  0.2  0.4  0.4  0.3  0.3

LuaJIT 2.0.0-beta2/Cygwin1.7
 1:  0.6  1.5  1.5  7.7  8.3  1.3  1.2
 2:  0.4  1.2  1.2  7.1  7.8  1.1  1.0
 3:  0.6  1.0  1.2  3.1  4.9  1.7  1.3
 4:  0.8  1.6  1.8  8.4  9.0  1.9  1.3
 5:  0.4  0.6  0.8  1.2  1.2  2.3 99.2
 6:  0.4  0.9  1.1  1.5  1.5  3.2  0.9
 7:  0.3  0.8  1.1  1.4  1.5  3.0  0.8
 8:  0.6  1.2  1.6  5.3  6.8  1.7  1.3
 9:  0.7  1.2  1.8  5.6  6.7 14.4  1.7
10:  0.9  1.6  1.7  7.6  8.3  1.5  1.5
11:  0.3  0.8  1.1  1.4  1.5  2.9  2.1
12:  0.4  0.9  1.1  1.5  1.5  2.7  0.9
13:  0.6  0.7  0.7  1.0  1.4  0.8  0.7
14:  0.6  0.7  0.7  1.0  1.4  0.8  0.8
15:  0.1  0.1  0.1  0.3  0.3  0.2  0.2
...

The speed of trim5 is relatively favorable or competitive over the data set except it is quite poor in the case of a long string with only whitespace. The use of ".*" (rather than a non-greedy ".-") quickly plows to the end of a long string, and the "%S" then triggers backtracking to avoid the trailing whitespace, but the juxtaposition of "%s*" and ".*" imposes substantial backtracking (O(N^2)) if no "%S" match is found. trim6handles the worst cast condition specially and behaves well over the entire data set. trim7 is a variant that localizes functions, which gives a bit larger code size and does not give a substantial speed improvement. (On further testing, each localization gives maybe 10% in best case involving empty string data and inlining the function call, but here there are two calls to match). trim11 is also a variant of trim6with similar performance characteristics, perhaps slightly better, except speed is about half in the case of a long string with only whitespace (this is fixed in trim12). trim13 (or its variant trim14) is an lpeg (LuaPeg) implementation, which generally meets or exceeds the performance of the best pattern matching implementations, particularly exceeding in the case of lots of whitespace, but it is a bit slower in its worst case condition of alternating space and whitespace. The C implementation (trim15) is by far the fastest over the entire data set.

Reusing the same strings across all test iterations, as done above, might not properly gauge the effect of temporary string creation due to Lua's string interning. A quick test with local ss = {}; for i=1,80000 do ss[i] = " " .. (" "..i):rep(10) .. " " end as a string cache doesn't seem to affect the basic conclusions above.

--DavidManura

See Also



评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值