Python3源码分析
本文环境python3.5.2。
参考书籍<<Python源码剖析>>
python官网
Python3的List对象
list对象是一个变长对象,在运行时动态调整其所维护的内存和元素,并且支持插入删除等操作,list的定义如下;
#define PyObject_VAR_HEAD PyVarObject ob_base;
#define Py_INVALID_SIZE (Py_ssize_t)-1
/* Nothing is actually declared to be a PyObject, but every pointer to
* a Python object can be cast to a PyObject*. This is inheritance built
* by hand. Similarly every pointer to a variable-size Python object can,
* in addition, be cast to PyVarObject*.
*/
typedef struct _object {
_PyObject_HEAD_EXTRA
Py_ssize_t ob_refcnt; // 引用计数值
struct _typeobject *ob_type; // 基本的类型 type
} PyObject;
typedef struct {
PyObject ob_base; // 对象基本信息
Py_ssize_t ob_size; /* Number of items in variable part */ // 变长对象的元素个数
} PyVarObject;
...
typedef struct {
PyObject_VAR_HEAD // 变量头部信息
/* Vector of pointers to list elements. list[0] is ob_item[0], etc. */
PyObject **ob_item; // 元素指向具体指
/* ob_item contains space for 'allocated' elements. The number
* currently in use is ob_size.
* Invariants:
* 0 <= ob_size <= allocated
* len(list) == ob_size
* ob_item == NULL implies ob_size == allocated == 0
* list.sort() temporarily sets allocated to -1 to detect mutations.
*
* Items must normally not be NULL, except during construction when
* the list is not yet visible outside the function that builds it.
*/
Py_ssize_t allocated; // 当前空间
} PyListObject;
其中,ob_size就是list实际拥有的元素数量,allocated指的是实际申请的内存空间,比如列表A实际申请了5元素的空间,但是此时A只包含了2个元素,则ob_size为2,allocated为5。
List的创建和初始化
在新建List的时候会调用BUILD_LIST指令新建List,BUILD_LIST对应的字节码指令是,
TARGET(BUILD_LIST) {
PyObject *list = PyList_New(oparg); // 生成新的列表调用,传入初始化时传入的参数值
if (list == NULL)
goto error;
while (--oparg >= 0) {
PyObject *item = POP();
PyList_SET_ITEM(list, oparg, item); // 将初始化的列表值设置到列表中
}
PUSH(list); // 压入list
DISPATCH(); // 执行下一条指令
}
此时查看PyList_New函数,
PyObject *
PyList_New(Py_ssize_t size)
{
PyListObject *op;
size_t nbytes;
#ifdef SHOW_ALLOC_COUNT
static int initialized = 0;
if (!initialized) {
Py_AtExit(show_alloc);
initialized = 1;
}
#endif
if (size < 0) { // 如果传入的大小小于0则报错
PyErr_BadInternalCall();
return NULL;
}
/* Check for overflow without an actual overflow,
* which can cause compiler to optimise out */
if ((size_t)size > PY_SIZE_MAX / sizeof(PyObject *)) // 计算申请的大小是否超过数组最大的索引值,PY_SIZE_MAX如果是64位则是最大的有符号int64值
return PyErr_NoMemory();
nbytes = size * sizeof(PyObject *); // 需要申请的内存字节大小
if (numfree) { // 缓冲池可用
numfree--;
op = free_list[numfree]; // 获取缓冲的对象
_Py_NewReference((PyObject *)op);
#ifdef SHOW_ALLOC_COUNT
count_reuse++;
#endif
} else {
op = PyObject_GC_New(PyListObject, &PyList_Type); // 生成新的对象
if (op == NULL)
return NULL;
#ifdef SHOW_ALLOC_COUNT
count_alloc++;
#endif
}
if (size <= 0) // 传入小于等于0
op->ob_item = NULL; // 则list实例的ob_item为NULL
else {
op->ob_item = (PyObject **) PyMem_MALLOC(nbytes); // 申请nbytes字节大小的内存,并将申请内存的首地址设置到ob_item
if (op->ob_item == NULL) { // 如果为空则报错
Py_DECREF(op);
return PyErr_NoMemory();
}
memset(op->ob_item, 0, nbytes); // 将对应nbytes字节的数据设置为0
}
Py_SIZE(op) = size; // 设置ob->size 为size
op->allocated = size; // 设置allocated值
_PyObject_GC_TRACK(op); // 加入GC列表
return (PyObject *) op;
}
在生成新的list的时候会先检查缓冲池是否拥有,默认会维护80个PyListObject对象;
#ifndef PyList_MAXFREELIST
#define PyList_MAXFREELIST 80
#endif
static PyListObject *free_list[PyList_MAXFREELIST];
static int numfree = 0;
在调用list_dealloc函数释放列表对象时,会检查缓冲池是否已经满了,如果没有满则添加到缓冲池中,
static void
list_dealloc(PyListObject *op)
{
Py_ssize_t i;
PyObject_GC_UnTrack(op);
Py_TRASHCAN_SAFE_BEGIN(op)
if (op->ob_item != NULL) {
/* Do it backwards, for Christian Tismer.
There's a simple test case where somehow this reduces
thrashing when a *very* large list is created and
immediately deleted. */
i = Py_SIZE(op);
while (--i >= 0) {
Py_XDECREF(op->ob_item[i]); // 减少列表对象的引用计数
}
PyMem_FREE(op->ob_item); // 释放ob_item申请的内存空间
}
if (numfree < PyList_MAXFREELIST && PyList_CheckExact(op)) // 如果缓冲池未满
free_list[numfree++] = op; // 添加到缓冲池列表中
else
Py_TYPE(op)->tp_free((PyObject *)op); // 大于缓冲池大小则直接释放内存
Py_TRASHCAN_SAFE_END(op)
}
当缓冲池没有可用list的时候,则新生成一个list,调用PyObject_GC_New(PyListObject, &PyList_Type)生成一个;
#define PyObject_GC_New(type, typeobj) \
( (type *) _PyObject_GC_New(typeobj) ) // 调用_PyObject_GC_New生成
...
PyObject *
_PyObject_GC_New(PyTypeObject *tp)
{
PyObject *op = _PyObject_GC_Malloc(_PyObject_SIZE(tp)); // 申请内存空间
if (op != NULL)
op = PyObject_INIT(op, tp); // 初始化op
return op; // 返回
}
...
#define PyObject_INIT(op, typeobj) \
( Py_TYPE(op) = (typeobj), _Py_NewReference((PyObject *)(op)), (op) ) // 设置ob_type,增加引用计数
...
#define Py_TYPE(ob) (((PyObject*)(ob))->ob_type) // 设置ob_type类型
此时新生成的list对象就基本初始化完成。
当调用给list设值时,会尝试如下字节码;
14 0 LOAD_CONST 0 (1)
3 LOAD_CONST 1 (2)
6 BUILD_LIST 2
9 STORE_NAME 0 (a)
15 12 LOAD_CONST 2 (10)
15 LOAD_NAME 0 (a)
18 LOAD_CONST 0 (1)
21 STORE_SUBSCR
其中,a初始化为包含1,2两个元素的列表,然后设置a[1]=10,此时调用STORE_SUBSCR来设置,
TARGET(STORE_SUBSCR) {
PyObject *sub = TOP(); // 获取需要设置的key
PyObject *container = SECOND(); // 获取list实例
PyObject *v = THIRD(); // 获取待设置的值
int err;
STACKADJ(-3); // 栈移动三个值
/* v[w] = u */
err = PyObject_SetItem(container, sub, v); // 设置值
Py_DECREF(v);
Py_DECREF(container);
Py_DECREF(sub);
if (err != 0)
goto error;
DISPATCH();
}
此时调用了PyObject_SetItem函数来将值设置到列表中,
int
PyObject_SetItem(PyObject *o, PyObject *key, PyObject *value)
{
PyMappingMethods *m;
if (o == NULL || key == NULL || value == NULL) { // 检查传入参数
null_error();
return -1;
}
m = o->ob_type->tp_as_mapping; // 获取类型的tp_as_mapping方法
if (m && m->mp_ass_subscript)
return m->mp_ass_subscript(o, key, value); // 调用字典的设置方法
if (o->ob_type->tp_as_sequence) { // 调用tp_as_sequence方法
if (PyIndex_Check(key)) { // 检查key
Py_ssize_t key_value;
key_value = PyNumber_AsSsize_t(key, PyExc_IndexError);
if (key_value == -1 && PyErr_Occurred())
return -1;
return PySequence_SetItem(o, key_value, value); // 调用设置值
}
else if (o->ob_type->tp_as_sequence->sq_ass_item) {
type_error("sequence index must be "
"integer, not '%.200s'", key);
return -1;
}
}
type_error("'%.200s' object does not support item assignment", o);
return -1;
}
由于此时,传入的list对象,此时的PyList_Type对应的tp_as_mapping为list_as_mapping;
static PyMappingMethods list_as_mapping = {
(lenfunc)list_length,
(binaryfunc)list_subscript,
(objobjargproc)list_ass_subscript
};
此时就是调用了list_ass_subscript方法来处理设值,
static int
list_ass_subscript(PyListObject* self, PyObject* item, PyObject* value)
{
if (PyIndex_Check(item)) { // item索引值检查
Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); // 检查值是否合法,不合法返回PyExc_IndexError
if (i == -1 && PyErr_Occurred()) // 如果检查不合法则返回错误
return -1;
if (i < 0) // 如果小于0, 则转换成正值索引
i += PyList_GET_SIZE(self);
return list_ass_item(self, i, value); // 设值值
}
...
}
此时先检查索引值是否合法,然后将负值索引转换成正值索引,然后调用list_ass_item处理;
static int
list_ass_item(PyListObject *a, Py_ssize_t i, PyObject *v)
{
PyObject *old_value;
if (i < 0 || i >= Py_SIZE(a)) { // 检查索引值是否超出范围
PyErr_SetString(PyExc_IndexError,
"list assignment index out of range");
return -1;
}
if (v == NULL)
return list_ass_slice(a, i, i+1, v); // 如果v为NULL则删除对应i索引处的值
Py_INCREF(v);
old_value = a->ob_item[i]; // 获取旧值
a->ob_item[i] = v; // 设值新值
Py_DECREF(old_value);
return 0;
}
至此一个List的设置过程就完成了。
List的添加和删除元素
a本来是[1,2]的列表,调用a.insert(5,105)时,对应的字节码如下;
18 45 LOAD_NAME 0 (a)
48 LOAD_ATTR 3 (insert)
51 LOAD_CONST 3 (5)
54 LOAD_CONST 4 (105)
57 CALL_FUNCTION 2 (2 positional, 0 keyword pair)
此时就会调用listinsert函数,
static PyObject *
listinsert(PyListObject *self, PyObject *args)
{
Py_ssize_t i;
PyObject *v;
if (!PyArg_ParseTuple(args, "nO:insert", &i, &v)) // 解析参数和列表
return NULL;
if (ins1(self, i, v) == 0) // 插入数据
Py_RETURN_NONE;
return NULL;
}
此时解析了相关参数为tuple后调用ins1插入数据,
static int
ins1(PyListObject *self, Py_ssize_t where, PyObject *v)
{
Py_ssize_t i, n = Py_SIZE(self); // 获取当前的列表大小
PyObject **items;
if (v == NULL) { // 检查是否为空
PyErr_BadInternalCall();
return -1;
}
if (n == PY_SSIZE_T_MAX) { // 检查是否已经是列表允许的最大值了
PyErr_SetString(PyExc_OverflowError,
"cannot add more objects to list");
return -1;
}
if (list_resize(self, n+1) == -1) // 重新扩招self大小
return -1;
if (where < 0) { // 如果索引为负值,则转换成正值
where += n; // 计算出正值索引
if (where < 0)
where = 0; // 如果转换后还是负值则置0
}
if (where > n)
where = n; // 如果where大于最大值则置为最大值n
items = self->ob_item;
for (i = n; --i >= where; ) // 将大于where索引值的元素移动后一位
items[i+1] = items[i];
Py_INCREF(v);
items[where] = v; // 设置where为传入的v
return 0;
}
首先会调整列表的大小,调用list_resize函数,
static int
list_resize(PyListObject *self, Py_ssize_t newsize)
{
PyObject **items;
size_t new_allocated;
Py_ssize_t allocated = self->allocated; // 获取当前的申请内存的大小
/* Bypass realloc() when a previous overallocation is large enough
to accommodate the newsize. If the newsize falls lower than half
the allocated size, then proceed with the realloc() to shrink the list.
*/
if (allocated >= newsize && newsize >= (allocated >> 1)) { // 如果已经申请的大小大于newsize并且newsize>=已有空间的一半
assert(self->ob_item != NULL || newsize == 0);
Py_SIZE(self) = newsize; // 设置ob_size为newsize
return 0;
}
/* This over-allocates proportional to the list size, making room
* for additional growth. The over-allocation is mild, but is
* enough to give linear-time amortized behavior over a long
* sequence of appends() in the presence of a poorly-performing
* system realloc().
* The growth pattern is: 0, 4, 8, 16, 25, 35, 46, 58, 72, 88, ...
*/
new_allocated = (newsize >> 3) + (newsize < 9 ? 3 : 6); // 否则就调整申请空间大小
/* check for integer overflow */
if (new_allocated > PY_SIZE_MAX - newsize) { // 如果新申请的大小最大的大小
PyErr_NoMemory(); // 则报错
return -1;
} else {
new_allocated += newsize; // 新增newsize
}
if (newsize == 0)
new_allocated = 0;
items = self->ob_item;
if (new_allocated <= (PY_SIZE_MAX / sizeof(PyObject *))) // 检查新申请的大小是否在允许范围内
PyMem_RESIZE(items, PyObject *, new_allocated); // 重新调整ob_item的申请空间
else
items = NULL;
if (items == NULL) {
PyErr_NoMemory();
return -1;
}
self->ob_item = items; // 重置item
Py_SIZE(self) = newsize; // 重置size 为newsize
self->allocated = new_allocated; // 重置申请的空间大小
return 0;
}
在调整过列表大小后,列表会判断如果传入的负值索引比列表元素还大则设置在0位,如果转换的正值索引比列表长度还大则设置为最后一位,并将大于索引值的元素,往后移动一位。此时就是list的插入过程。
当列表a=[1,2,10,105],调用a.remove(10)时,对应的字节码如下;
20 71 LOAD_NAME 0 (a)
74 LOAD_ATTR 4 (remove)
77 LOAD_CONST 2 (10)
80 CALL_FUNCTION 1 (1 positional, 0 keyword pair)
83 POP_TOP
此时调用了list的remove属性,对应调用了listremove方法,
static PyObject *
listremove(PyListObject *self, PyObject *v)
{
Py_ssize_t i;
for (i = 0; i < Py_SIZE(self); i++) { // 循环遍历lise
int cmp = PyObject_RichCompareBool(self->ob_item[i], v, Py_EQ); // 比较列表中的值与传入值v进行比较
if (cmp > 0) { // 如果找到
if (list_ass_slice(self, i, i+1,
(PyObject *)NULL) == 0) // 设置该位为NULL
Py_RETURN_NONE;
return NULL;
}
else if (cmp < 0)
return NULL;
}
PyErr_SetString(PyExc_ValueError, "list.remove(x): x not in list"); // 如果没有找到则返回该错误
return NULL;
}
其中调用了PyObject_RichCompareBool函数,进行比较;
int
PyObject_RichCompareBool(PyObject *v, PyObject *w, int op)
{
PyObject *res;
int ok;
/* Quick result when objects are the same.
Guarantees that identity implies equality. */
if (v == w) { // 快速检查,如果两个相同则返回
if (op == Py_EQ)
return 1;
else if (op == Py_NE)
return 0;
}
res = PyObject_RichCompare(v, w, op); // 比较v,w
if (res == NULL)
return -1;
if (PyBool_Check(res))
ok = (res == Py_True);
else
ok = PyObject_IsTrue(res);
Py_DECREF(res);
return ok; // 返回结果
}
继续调用了PyObject_RichCompare函数进行比较,
PyObject *
PyObject_RichCompare(PyObject *v, PyObject *w, int op)
{
...
res = do_richcompare(v, w, op); // 进行比较
...
}
查看do_richcompare函数比较可知,
/* Perform a rich comparison, raising TypeError when the requested comparison
operator is not supported. */
static PyObject *
do_richcompare(PyObject *v, PyObject *w, int op)
{
richcmpfunc f;
PyObject *res;
int checked_reverse_op = 0;
if (v->ob_type != w->ob_type &&
PyType_IsSubtype(w->ob_type, v->ob_type) &&
(f = w->ob_type->tp_richcompare) != NULL) { // 检查v,w的两个类型是否相同,检查是否w,v是否是子类型,检查w对应的tp_richcompare是否为空
checked_reverse_op = 1;
res = (*f)(w, v, _Py_SwappedOp[op]); // 直接调用w的tp_richcompare方法进行比较
if (res != Py_NotImplemented)
return res; // 返回结果
Py_DECREF(res);
}
if ((f = v->ob_type->tp_richcompare) != NULL) { // 检查v是否有tp_richcompare方法
res = (*f)(v, w, op); // 调用v的tp_richcompare进行比较
if (res != Py_NotImplemented)
return res; // 返回结果
Py_DECREF(res);
}
if (!checked_reverse_op && (f = w->ob_type->tp_richcompare) != NULL) { // 如果checked_reverse_op为0,w的tp_richcompare不为空
res = (*f)(w, v, _Py_SwappedOp[op]); // 调用w的tp_richcompare方法进行比较
if (res != Py_NotImplemented)
return res; // 返回结果
Py_DECREF(res);
}
/* If neither object implements it, provide a sensible default
for == and !=, but raise an exception for ordering. */
switch (op) { // 如果都没有实现该方法则使用简易的默认方法
case Py_EQ:
res = (v == w) ? Py_True : Py_False; // 判断是否相等
break;
case Py_NE:
res = (v != w) ? Py_True : Py_False; // 判断是否相等
break;
default:
/* XXX Special-case None so it doesn't show as NoneType() */
PyErr_Format(PyExc_TypeError,
"unorderable types: %.100s() %s %.100s()",
v->ob_type->tp_name,
opstrings[op],
w->ob_type->tp_name);
return NULL;
}
Py_INCREF(res);
return res; // 返回结果
}
由于此时列表a中的元素是long型的元素,则调用了long_richcompare方法进行两个值之间的比较,
static PyObject *
long_richcompare(PyObject *self, PyObject *other, int op)
{
int result;
PyObject *v;
CHECK_BINOP(self, other);
if (self == other) // 如果两者相等
result = 0; // 直接返回
else
result = long_compare((PyLongObject*)self, (PyLongObject*)other); // 比较两个long型
/* Convert the return value to a Boolean */
switch (op) { // 根据op 返回不同的结果
case Py_EQ:
v = TEST_COND(result == 0);
break;
case Py_NE:
v = TEST_COND(result != 0);
break;
case Py_LE:
v = TEST_COND(result <= 0);
break;
case Py_GE:
v = TEST_COND(result >= 0);
break;
case Py_LT:
v = TEST_COND(result == -1);
break;
case Py_GT:
v = TEST_COND(result == 1);
break;
default:
PyErr_BadArgument();
return NULL;
}
Py_INCREF(v);
return v;
}
其中又调用了long_compare函数进行比较,
static int
long_compare(PyLongObject *a, PyLongObject *b)
{
Py_ssize_t sign;
if (Py_SIZE(a) != Py_SIZE(b)) { // 比较大小是否相同
sign = Py_SIZE(a) - Py_SIZE(b); // 获取两个值的差值
}
else {
Py_ssize_t i = Py_ABS(Py_SIZE(a)); // 获取a值大小
while (--i >= 0 && a->ob_digit[i] == b->ob_digit[i])
;
if (i < 0)
sign = 0;
else {
sign = (sdigit)a->ob_digit[i] - (sdigit)b->ob_digit[i];
if (Py_SIZE(a) < 0)
sign = -sign;
}
}
return sign < 0 ? -1 : sign > 0 ? 1 : 0; // 根据sign的处理结果返回
}
至此就比较出来了两个值的大小,如果相同则会继续执行listremove
中的list_ass_slice函数,将对应索引位置的值置空,该函数具体的执行流程大家可自行阅读,至此一个列表中的元素的删除过程已经分析完成。
总结
列表的初始化和插入数据和删除数据基本上的流程分析完成,其中还有很多其他的细节在流程中没有涉及到,列表在元素增长或者减少的过程中,都会涉及到都申请得到内存进行相关操作,此时的大量细节还需要大家自行查看。