【lua学习】3.字符串
Lua字符串的概况
- Lua虚拟机中存在一个散列桶结构的全局字符串表来存放所有字符串
- 关于比较字符串:先比较hash,再比较长度,再逐字符比较。节省时间
- 同一个字符串在Lua虚拟机中仅有一个副本。节省空间
- 一旦创建则无法变更
- 变量存放的仅是字符串的引用
字符串实现
字符串结构TString
(lobject.h) TString
typedef union TString {
L_Umaxalign dummy;//保证最大对其//见下文
struct {
CommonHeader;
lu_byte reserved;//当>0时,其值-1表示保留字列表中的索引//见下文
unsigned int hash;//字符串散列值,根据字符串长度和部分字符计算而来的值,见下文
size_t len;//字符串长度
} tsv;
} TString;
(llimits.h) L_Umaxalign
typedef LUAI_USER_ALIGNMENT_T L_Umaxalign;//LUAI_USER_ALIGNMENT_T见下文
(luaconf.h) LUAI_USER_ALIGNMENT_T
//看此定义,为8字节对齐
#define LUAI_USER_ALIGNMENT_T union { double u; void* s; long l; }
全局字符串表stringtable
(lstate.h) stringtable
typedef struct stringtable {
GCObject** hash;//开散列结构(和lua table的闭散列结构是有区别的),指向一个数组,每个元素是桶(GCObject*类型),桶管理GCObject链表
lu_int32 nuse;//存储的字符串数量
int size;//全局字符串表的最大容量(hash桶的最大数量)
} stringtable;
新建字符串luaS_newlstr (先查表,再决定创建与否)
(lstring.c) luaS_newlstr
TString* luaS_newlstr(lua_State* L, const char *str, size_t len)
{
//初始h值就是字符串的长度
unsigned int h = cast(unsigned int, len);//cast就是强制转型,见下文
//获得计算hash值的跨度,如果字符串很长,若逐位计算肯定非常消耗性能
size_t step = (len>>5) + 1;
//从最后一个字符开始,计算h值,跟后续计算的值执行异或,进而得到最终的h值
for (size_t l1 = len; l1 >= step; l1 -= step)
{
h ^= ((h<<5)+(h>>2)+cast(unsigned char, str[l1-1]));
}
//h值对全局字符串表的最大桶数量求余,得到桶的索引
unsigned int bucket_index = lmod(h, G(L)->strt.size);//lmod见下文,G见下文
//遍历该桶管理的链表,查找有没有相等的字符串
for (GCObject* gco = G(L)->strt.hash[bucket_index];
gco != NULL;
gco = gco->gch.next)
{
TString* ts = rawgco2ts(gco);//rawgco2ts见下文
if (ts->tsv.len == len//先比较长度
&& (memcmp(str, getstr(ts), len) == 0))//再逐位比较。getstr见下文
{
if (isdead(G(L), gco))//若要被GC,则把标记标为另一种白色,防止被GC。isdead见下文
{
changewhite(gco);
}
//找到了,就不需要新建了,直接返回即可
return ts;
}
}
//新建字符串
return newlstr(L, str, len, h);
}
(llimits.h) cast宏
#define cast(t, exp) ((t)(exp))
(lobject.h) lmod宏
//针对size为2次幂的 优化的 取模算法
#define lmod(s,size) \
(check_exp((size&(size-1))==0, (cast(int, (s) & ((size)-1)))))
(lstate.h) G宏
#define G(L) (L->l_G)
(lstate.h) rawgco2ts宏
//根据GCObject*获取TString*
#define rawgco2ts(gco) check_exp((gco)->gch.tt == LUA_TSTRING, &((gco)->ts))
(lobject.h) getstr宏
//根据TString* 获取 字符串的首地址,注意:字符串首地址并不在TString内部,而在TString对象最后一个字节的下一个字节,这也解释了为何TString一定要对齐,就是为了提高CPU读取性能
#define getstr(ts) cast(const char *, (ts) + 1)
(lgc.h) isdead宏
//判断是否在当前GC阶段被判定为需要回收,todo以后讨论
#define isdead(g, gco) ((gco)->gch.marked & otherwhite(g) & WHITEBITS)
(lgc.h) changewhite宏
//改变GCObject的当前白色标记,todo以后讨论
#define changewhite(gco) ((gco)->gch.marked ^= WHITEBITS)
新建字符串 newlstr
(lstring.c) newlstr
static TString* newlstr(lua_State* L, const char* str, size_t len, unsigned int h)
{
//若字符串太长,则luaM_toobig。luaM_toobig见下文
if (len + 1 > (MAX_SIZE - sizeof(TString)/sizeof(char))
{
luaM_toobig(L);
}
//为TString对象分配连续的空间,这个空间首部是TString结构,后面紧接着是字符串实际内容
TString* ts = cast(TString*, luaM_malloc(L, (len + 1)*sizeof(char) + sizeof(TString)));
ts.tsv.len = len;
ts.tsv.hash = h;
ts.tsv.marked = luaC_white(G(L));
ts.tsv.reserved = 0;
//获取字符串内容的首地址pstr
char* pstr = (char*)(ts + 1);
//拷贝str到pstr
memcpy(pstr , str, len * sizeof(char));
//字符串最后一个字符当然是'\0'
pstr[len] = '\0';
//获取全局字符串表
stringtable* tb = &G(L)->strt;
//计算桶索引
h = lmod(h, tb->size);
//用头插法将字符串插入桶中
ts->tsv.next = tb->hash[h];
tb->hash[h] = obj2gco(ts);//obj2gco见下文
//全局字符串表的字符串数量+1
tb->nuse++;
//若字符串总数 超过了 全局字符串表的最大桶数 且 最大桶数 <= MAX_INT/2,则对全局字符串表扩容
if (tb->nuse > cast(lu_int32, tb->size) && tb->size <= MAX_INT/2)
{
luaS_resize(L, tb->size*2);//luaS_resize,重新分配全局字符串表的大小,见下文
}
}
(llimits.h) MAX_SIZET宏
#define MAX_SIZET ((size_t)(~(size_t)0)-2)
(lmem.c) luaM_toobig报告要分配的内存过大
void* luaM_toobig(lua_State* L)
{
//luaG_runerror, todo后面讨论
luaG_runerror(L, "memory allocation error: blobk too big");
return NULL;
}
(lmem.h) luaM_malloc宏
//请求分配needbytes字节内存,luaM_realloc_见下文
#define luaM_malloc(L, bytes_to_allocate) luaM_realloc_(L, NULL, 0, (bytes_to_allocate))
(lmem.c) luaM_realloc_分配内存
void* luaM_relloc_(lua_State* L, void* address_to_free, size_t bytes_to_free, size_t bytes_to_allocate)
{
lua_assert((bytes_to_free==0)==(address_to_free==NULL));//#llimits.h中define lua_assert(c) ((void)0) 什么也不做,所以忽略
global_State* g = G(L);
//调用全局表的内存分配函数
void* address_to_allocate = (*g->frealloc)(g->ud, address_to_free, bytes_to_free, bytes_to_allocate);
//若分配失败且需要的字节数>0,抛出内存分配错误
if (address_to_allocate == NULL && bytes_to_allocate> 0)
{
//luaD_throw,todo后面讨论
luaD_throw(L, LUA_ERRMEM);//lua.h中#define LUA_ERRMEM 4
}
lua_assert((bytes_to_allocate==0)==(address_to_allocate==NULL));
//分配的内存总字节数 发生变化
g->totalbytes += bytes_to_allocate - bytes_to_free;
return address_to_allocate;
}
(lgc.h) luaC_white获取当前GC白色
//获取当前gc的白色,todo后面讨论
#define luaC_white(g) cast(lu_byte, (g)->currentwhite & WHITEBITS)
(lstate.h) obj2gco宏
//将对象指针强制转为GCObject*
#define obj2gco(v) (cast(GCObject *, (v)))
(llimits.h) MAX_INT宏
#define MAX_INT (INT_MAX-2)
重新设置全局字符串的大小 luaS_resize
(lstring.c) luaS_resize
void luaS_resize(lua_State* L, int newsize)
{
//若GC正处于扫描字符串阶段,则不处理。GCSweepingstring见下文
if (G(L)->gcstate == GCSweepingstring)
{
return;
}
//新分配hash结构
GCObject** newhash = luaM_newvector(L, newsize, GCObject*);
//初始化每个桶为空指针
for (int i = 0; i < newsize; i++)
{
newhash[i] = NULL;
}
//获取全局字符串表指针
stringtable* tb = &G(L)->strt;
//遍历每个桶,遍历每个桶管理的链表,全部夺舍到新的hash结构中
for (int i = 0; i < tb.size; i+)
{
GCObject* p = tb->hash[i];
//遍历桶管理的链表
while(p)
{
//以next指向下一个元素
GCObject* next = p->gch.next;
//根据的hash计算新的 hash
unsigned int oldh = gco2ts(p)->hash;
int newh = lmod(oldh, newsize);
lua_assert(cast_int(oldh%newsize)==lmod(oldh,newsize))
//用头插法将元素加入桶管理的链表
p->gch.next = newhash[newh];
newhash[newh] = p;
//p设为next,以便循环的下一轮
p = next;
}
}
//释放旧的hash结构
luaM_freearray(L, tb->hash, tb_size, TString*);
//更新全局字符串表
tb->size = newsize;
tb->hash = newhash;
}
(lgc.h) GCSsweepstring宏
//gc的几个阶段,todo后面再说
#define GCSpause 0
#define GCSpropagate 1
#define GCSsweepstring 2
#define GCSsweep 3
#define GCSfinalize 4
(lmem.h) luaM_newvector宏
//分配count个类型为datatype的连续内存空间,获得的数据强制转型为datatype*
#define luaM_newvector(L,count_to_allocate,datatype) \
cast(datatype*, luaM_reallocv(L, NULL, 0, count_to_allocate, sizeof(datatype)))
(lmem.h) luaM_reallocv宏
//分配count个singlebytes大小的连续内存空间,若空间足够则分配,否则报错
#define luaM_reallocv(L,address_to_free,count_to_free,count_to_allocate,singlebytes) \
((cast(size_t, (count)+1) <= MAX_SIZET/(singlebytes)) ?
luaM_realloc_(L, (address_to_free), (count_to_free)*(singlebytes), (count_to_allocate)*(singlebytes)) : \
luaM_toobig(L))
(lmem.h) luaM_freearray宏
//释放address_to_free处的count_to_free个datatype类型的连续内存空间
#define luaM_freearray(L, address_to_free, count_to_free, datatype) luaM_reallocv(L, (address_to_free), count_to_free, 0, sizeof(datatype))
全局字符串表的缩容
- 缩容的时机:垃圾回收的GCSweep阶段
- 缩容的原则:全局字符串表的字符串总数<桶的最大数量 且 桶的最大数量>MINSTRTABSIZE*2 (llimits.h中#define MINSTRTABSIZE 32)
(lgc.c) 看checkSize
static void checkSize(lua_State* L)
{
global_State* g = G(L);
//当全局字符串表的字符串总数小于桶最大数量的四分之一 且 桶的最大数量大于MINSTRTABSIZE*2, 则缩容
if (g-<strt.nuse < cast(lu_int32, g->strt.size/4)
&& g->strt.size > MINSTRTABSIZE*2)
{
luaS_resize(L, g->strt.size/2);
}
//...无关内容省略
}
(lgc.c) 看singlestep
//一次单步GC,todo后面再说
static l_mem singlestep(lua_State* L)
{
global_State* g = G(L);
switch(g->gcstate)
{
//...无关内容省略
case GCSweep:
{
lu_mem old = g->totalbytes;
g->sweepgc = sweeplist(L, g->sweepgc, GCSWEEPMAX);
if (*g->sweepgc == NULL)
{
//包含有全局字符串表缩容的操作
checkSizes(L);
g->gcstate = GCSfinalize;
}
lua_assert(old >= g->totalbytes);
g->estimate -= old - g->totalbytes;
return GCSWEEPMAX*GCSWEEPCOST;
}
//...无关内容省略
}
//...无关内容省略
}
保留字是如何不被回收的
- 不被回收的原则:被luaS_fix操作,其tsv.marked被改成了FIXEDBIT
(llex.c) luaX_init
void luaX_init(lua_State* L)
{
for (int i=0; i<NUM_RESERVED; i++)//NUM_RESERVED见下文
{
//尝试新建每个保留字字符串
TString* ts = luaS_new(L, luaX_tokens[i]);//luaS_new luaX_tokens 见下文
//标记不会被GC,修改ts->tsv.marked为FIXEDBIT
luaS_fix(ts);//luaS_fix见下文
lua_assert(strlen(luaX_tokens[i]) + 1 <= TOKEN_LEN);//TOKEN_LEN见下文
//记录在保留字数组的索引+1值
ts->tsv.reserved = cast_byte(i + 1);
}
}
(llex.h) NUM_RESERVED宏
//表示多少个保留字
//TK_WHILE FIRST_RESERVED 见下文
#define NUM_RESERVED (cast(int, TK_WHILE-FIRST_RESERVED+1))
(llex.h) RESERVED 枚举
enum RESERVED {
/* terminal symbols denoted by reserved words */
TK_AND = FIRST_RESERVED, TK_BREAK,
TK_DO, TK_ELSE, TK_ELSEIF, TK_END, TK_FALSE, TK_FOR, TK_FUNCTION,
TK_IF, TK_IN, TK_LOCAL, TK_NIL, TK_NOT, TK_OR, TK_REPEAT,
TK_RETURN, TK_THEN, TK_TRUE, TK_UNTIL, TK_WHILE,
/* other terminal symbols */
TK_CONCAT, TK_DOTS, TK_EQ, TK_GE, TK_LE, TK_NE, TK_NUMBER,
TK_NAME, TK_STRING, TK_EOS
};
(lstring.h) luaS_new宏
//尝试新建一个字符串
#define luaS_new(L, s) (luaS_newlstr(L, s, strlen(s)))
(llex.c) luaX_tokens
//终结符数组
const char *const luaX_tokens [] = {
"and", "break", "do", "else", "elseif",
"end", "false", "for", "function", "if",
"in", "local", "nil", "not", "or", "repeat",
"return", "then", "true", "until", "while",
"..", "...", "==", ">=", "<=", "~=",
"<number>", "<name>", "<string>", "<eof>",
NULL
};
(lstring.h) luaS_fix宏
//设置字符串不会被GC
//l_setbit FIXEDBIT见下文
#define luaS_fix(s) l_setbit((s)->tsv.marked, FIXEDBIT)
(lgc.h) l_setbit宏
//x与m求或
#define setbits(x,m) ((x) |= (m))
//求2的b-1次方,也就是第b位为1,其余为0
#define bitmask(b) (1<<(b))
//将b1和b2位设为1,其余为0
#define bit2mask(b1,b2) (bitmask(b1) | bitmask(b2))
//将x的第b位置为1
#define l_setbit(x,b) setbits(x, bitmask(b))
(lgc.h) FIXEDBIT宏
//todo后面再说
#define WHITE0BIT 0
#define WHITE1BIT 1
#define BLACKBIT 2
#define FINALIZEDBIT 3
#define KEYWEAKBIT 3
#define VALUEWEAKBIT 4
#define FIXEDBIT 5
#define SFIXEDBIT 6
#define WHITEBITS bit2mask(WHITE0BIT, WHITE1BIT)
(llex.h) TOKEN_LEN宏
//保留字选function为最长
#define TOKEN_LEN (sizeof("function")/sizeof(char))