CPython3.7.9源码学习一：C语言基础、整数对象

骇客567

已于 2024-05-19 20:51:25 修改

阅读量802

点赞数 24

分类专栏： CPython解释器文章标签： python c语言数据结构

于 2024-05-19 20:50:51 首次发布

本文链接：https://blog.csdn.net/u010442378/article/details/139048284

版权

CPython解释器专栏收录该内容

3 篇文章 0 订阅

订阅专栏

C 语言基础

结构体

// 
struct(关键字) 名称 {结构体成员};

// Define a struct type with three members
struct Student {  
    char name[50];  // fixed-size character buffer for the name
    int age;  
    float score;  
};

// Initialize a struct variable member by member
struct Student stu1;
strcpy(stu1.name, "张三");  // name is a char array, so the text is copied in rather than assigned
stu1.age = 20;  
stu1.score = 90.5;

// Initialize an array of structs
struct Student students[2];  
  
strcpy(students[0].name, "张三");  
students[0].age = 20;  
students[0].score = 90.5;  
  
strcpy(students[1].name, "李四");  
students[1].age = 21;  
students[1].score = 85.0;  

// Struct pointer: members are reached through the pointer with ->
struct Student *ptr = &stu1;  
printf("学生名字: %s\n", ptr->name);  
printf("学生年龄: %d\n", ptr->age);  
printf("学生分数: %.1f\n", ptr->score);


// A struct taken from CPython: the struct behind the long integer object
struct _longobject {
    PyObject_VAR_HEAD
    digit ob_digit[1];
};

typedef

// typedef gives an existing data type an alias; it is commonly used to alias structs

// Make `integer` an alias for int
typedef int integer;  
// Make `pinteger` an alias for int* (pointer to int)
typedef int* pinteger;  

// Define variables through the aliases
integer a = 100;
pinteger pa = &a;

struct Student {  
    char name[50];  
    int age;  
    float score;  
};

// 为struct Student 设置一个别名为stu 
typedef struct Student stu; 
// 为struct Student* 设置一个别名为pstu 
typedef struct Student* pstu;  
stu stu1;
stu1.name = "张三";
stu1.age = 20;
stu1.score = 89.5;

pstu stu2 = (pstu)malloc(sizeof(Student)); // 申请内存
stu2->name = "张三";
stu2->age = 30;
stu2->score = 95.6;

if (stu2 != NULL)
{
    free(stu2); // 释放内存
}

// How CPython uses typedef: PyLongObject is an alias for struct _longobject
typedef struct _longobject PyLongObject;

宏定义

// Macros define constants and expressions; the preprocessor replaces each use of the macro name with its defined value

// Define the constant PyLong_SHIFT as 30
#define PyLong_SHIFT    30

// In the source
(digit)1 << PyLong_SHIFT

// After preprocessing
(digit)1 << 30

预编译指令

#if 		// 基础判断
#ifdef 		// 判断是否有宏定义
#ifndef		// 判断是否没有宏定义（与 #ifdef 相反）
#else
#elif
#endif
#define		// 宏定义
#undef 		// 取消之前定义的宏
defined		// 运算符（不是独立的预编译指令），用在 #if / #elif 中判断宏是否已定义，例如 #if defined(FOO)

// From CPython: the digit type and the shift width are chosen from PYLONG_BITS_IN_DIGIT
#if PYLONG_BITS_IN_DIGIT == 30
typedef uint32_t digit;
#define PyLong_SHIFT    30
#elif PYLONG_BITS_IN_DIGIT == 15
typedef unsigned short digit; 
#define PyLong_SHIFT    15
#else
// Any other value aborts compilation with this message
#error "PYLONG_BITS_IN_DIGIT should be 15 or 30"
#endif

assert断言

// assert is a mechanism for catching program errors while debugging
#include <assert.h>  
int x = 5;  
assert(x != 0); // assert that x is not equal to 0

goto 语法

// goto jumps unconditionally to the named label in the program

int i = 0;  
for (i = 0; i < 10; i++) {  
    if (i == 5) {  
        goto end_loop; // when i equals 5, jump to the end_loop label  
    }  
    printf("%d\n", i);  
}  
end_loop: // this is the end_loop label  
printf("Loop ended early.\n");

一切皆对象

整数对象

有无符号

整数分为无符号整数和有符号整数，无符号整数只表示正数和零，而有符号整数则通过特定的编码方式（如补码）来表示正数、负数和零，在补码表示法中，最高位（符号位）为0表示正数，为1表示负数。其余位则用于表示数值的大小。

整数和操作系统

位数：

在32位操作系统中，整数通常使用32位来表示，即4个字节（32个比特）。
在64位操作系统中，整数通常使用64位来表示，即8个字节（64个比特）。

范围：

在32位操作系统中，有符号整数的范围通常是从 -2^31 到 2^31-1，即从 -2147483648 到 2147483647；无符号整数的范围通常是从 0 到 2^32-1，即从 0 到 4294967295。
在64位操作系统中，有符号整数的范围通常是从 -2^63 到 2^63-1，即从 -9223372036854775808 到 9223372036854775807；无符号整数的范围通常是从 0 到 2^64-1，即从 0 到 18446744073709551615。

整数结构体


// Include/object.h 

// Base object, fixed size
typedef struct _object {
    _PyObject_HEAD_EXTRA
    /* Reference count, used for garbage collection */
    Py_ssize_t ob_refcnt;           
    /*
    Pointer to the object's type. It identifies what kind of object this is and
    is used for run-time type checks and type-specific operations. Every object
    has a type object that defines its attributes, behaviour, methods and so on.
    What a PyObject really is gets decided only at call time by looking at
    ob_type, which is how polymorphism is achieved.
    */
    struct _typeobject *ob_type;    
} PyObject;

// Variable-length object: the PyObject header followed by an item count
typedef struct {
    PyObject ob_base;
    Py_ssize_t ob_size; /* Number of items in the variable part */
} PyVarObject; 

// Defines the initial segment of every variable-size container object:
// expands to a PyVarObject member named ob_base.
#define PyObject_VAR_HEAD      PyVarObject ob_base;

// Include/longintrepr.h
typedef struct _longobject PyLongObject; /* Revealed in longintrepr.h */

struct _longobject {
    PyObject_VAR_HEAD
    // Declares an array ob_digit of element type digit (uint32_t when
    // PYLONG_BITS_IN_DIGIT == 30); the declared length is one element.
    digit ob_digit[1];
};

// The structs above flattened into one view (illustrative only, not the literal source).
// The byte sizes below are the article's figures, presumably for a 64-bit build.
typedef struct {
    _PyObject_HEAD_EXTRA
    Py_ssize_t ob_refcnt; // reference count, 8 bytes
    struct _typeobject *ob_type; // type pointer, 8 bytes
    Py_ssize_t ob_size; // number of items, 8 bytes
    digit ob_digit[1]; // array of digit, declared length 1
} PyLongObject;

PyLongObject 对象中数组ob_digit 是 digit 类型的，默认长度是 1，python 中的整数就是存在这个数组中的，看下 digit 的类型


// Include/longintrepr.h
// Per the article: 30 is the value used on 64-bit systems and 15 on 32-bit systems
#if PYLONG_BITS_IN_DIGIT == 30
// uint32_t is an unsigned 32-bit integer type
typedef uint32_t digit;
#define PyLong_SHIFT    30
......

#elif PYLONG_BITS_IN_DIGIT == 15
// unsigned short: an unsigned integer type, 16 bits wide on the platforms discussed here
typedef unsigned short digit; 
#define PyLong_SHIFT    15

当操作系统 64 位时，digit 的类型是无符号的 32 位整数类型，并且ob_digit 数组中每一位存储的最大数字为 (2^30)-1 即1073741823，此处 30 是 PyLong_SHIFT 的值。如果一个数值大于1073741823，则数组长度通过PyLong_SHIFT 进行计算。
操作系统是 32 位，digit 的类型是一个16位的无符号整数类型，PyLong_SHIFT 值为 15。
看下longintrepr.h 中一段注释

/* Long integer representation.
   The absolute value of a number is equal to
        SUM(for i=0 through abs(ob_size)-1) ob_digit[i] * 2**(SHIFT*i)
   Negative numbers are represented with ob_size < 0;
   zero is represented by ob_size == 0.
   In a normalized number, ob_digit[abs(ob_size)-1] (the most significant
   digit) is never zero.  Also, in all cases, for all valid i,
        0 <= ob_digit[i] <= MASK.
   The allocation function takes care of allocating extra memory
   so that ob_digit[0] ... ob_digit[abs(ob_size)-1] are actually available.

   CAUTION:  Generic code manipulating subtypes of PyVarObject has to
   aware that ints abuse  ob_size's sign bit.
*/

PyLongObject 对象中 ob_size 既表示数组 ob_digit 的长度（取绝对值），又表示整数的符号。
ob_size 如果小于零，则表示一个负数；ob_size 如果等于零，表示 0。而整个整数的绝对值则通过表达式来计算：
`SUM(for i=0 through abs(ob_size)-1) ob_digit[i] * 2**(SHIFT*i)`
比如数字：1234567890987654321 在ob_digit 中的存储：
此时 ob_size => 3; ob_digit => {829168817, 76039122, 1}
根据公式反推一下ob_digit 数组的值：
第一步：

temp = 1234567890987654321
ob_digit[0] = 829168817 => temp % (2^30)
temp = 1149780946 =>  temp // (2^30)
ob_size++

第二步：

temp = 1149780946
ob_digit[1] = 76039122 => temp % (2^30)
temp = 1 =>  temp // (2^30)
ob_size++

第三步：

temp = 1
ob_digit[2] = 1 => temp % (2^30)
temp = 0 =>  temp // (2^30)
ob_size++

根据公式反算一下 829168817*2**(30*0) + 76039122*2**(30*1) + 1*2**(30*2)
用 python 来模拟查看下PyLongObject 对象

# cpython长整数底层存储算法
import math
import ctypes


class PyLong:
    """Pure-Python model of how CPython stores an int as an array of 30-bit digits.

    The absolute value of the integer is
    ``sum(ob_digit[i] * 2 ** (SHIFT * i))``, and every digit satisfies
    ``0 <= digit <= MASK``.
    """
    SHIFT = 30           # bits per digit
    BASE = 1 << SHIFT    # radix of the representation (2**30)
    MASK = BASE - 1      # largest value a single digit can hold

    def parse_ob_size(self, longint):
        """
        Return the number of digits needed to store ``abs(longint)``.

        Computed exactly from the bit length, so the result is 0 for 0, 1 for
        every value up to 2**30 - 1, 2 for 2**30, and so on; the sign of
        ``longint`` is ignored. The value is also printed.

        :param longint: the Python int to inspect
        :return: non-negative digit count
        """
        ob_size = (abs(longint).bit_length() + self.SHIFT - 1) // self.SHIFT
        print(ob_size)
        return ob_size

    def parse_ob_digit(self, longint):
        """
        Split ``abs(longint)`` into digits, least significant first.

        The list is printed and returned; 0 yields an empty list.

        :param longint: the Python int to inspect
        :return: list of digits, each in ``range(0, BASE)``
        """
        n = abs(longint)
        ob_digit = []
        while n != 0:
            # The low SHIFT bits are the next digit; the rest is carried on.
            ob_digit.append(n & self.MASK)
            n >>= self.SHIFT

        print(ob_digit) # [829168817, 76039122, 1]
        return ob_digit

    def parse_ob_digit_by_struct(self, longint):
        """
        Read the ob_digit array straight from the object's memory.

        NOTE(review): this overlays a struct of (ob_refcnt, ob_type, ob_size,
        ob_digit[]) on ``id(longint)``; it assumes a CPython build whose int
        layout matches the 3.7 sources discussed in this article, with 30-bit
        digits and no debug header. Confirm before running it on another version.

        :param longint: the Python int to inspect
        :return: tuple ``(ob_digit, ob_size)`` with ``ob_size`` as an absolute value
        """
        _ob_size = self.parse_ob_size(longint)

        class _PyLongObject(ctypes.Structure):
            # c_ssize_t wraps C's ssize_t, a signed integer type (i.e. Py_ssize_t)
            # c_void_p wraps the generic pointer type void*
            # c_uint32 wraps the unsigned 32-bit integer type uint32_t
            _fields_ = [("ob_refcnt", ctypes.c_ssize_t),
                        ("ob_type", ctypes.c_void_p),
                        ("ob_size", ctypes.c_ssize_t),
                        ("ob_digit", ctypes.c_uint32 * _ob_size)]

        long_object = _PyLongObject.from_address(id(longint))
        # The stored ob_size carries the sign; its magnitude is the digit count.
        ob_size = abs(long_object.ob_size)
        ob_digit = long_object.ob_digit[:ob_size]
        print(ob_digit, ob_size) # [829168817, 76039122, 1], 3
        return ob_digit, ob_size


if __name__ == '__main__':
    # Demo driver: run the three inspections on one sample integer.
    pylong = PyLong()
    data = 1234567890987654321
    pylong.parse_ob_size(data)
    pylong.parse_ob_digit(data)

    # Reads the digits from the live object's memory instead of computing them.
    pylong.parse_ob_digit_by_struct(data)

来看下几个特殊的数是怎么存的：
0 ob_size 如果等于零，表示 0，ob_size => 0
1 ob_size => 1; ob_digit=>{1}
-1 ob_size => -1; ob_digit=>{1}
(2 ^ 30) -1 ob_size => 1; ob_digit=>{1073741823}
-((2 ^ 30) - 1) ob_size => -1; ob_digit=>{1073741823}
(2 ^ 30) ob_size => 2; ob_digit=>{0, 1}
-(2 ^ 30) ob_size => -2; ob_digit=>{0, 1}
整数占内存大小
ob_refcnt 是 8 字节，ob_type 指针类型占 8 字节，ob_size 占 8 字节，ob_digit 每个元素是 4 字节。所以整数的大小是 8×3 + ob_size 绝对值×4 字节，即 24 + 4×|ob_size|

import sys

# 1 has ob_size 1, so it occupies 24 + 4*1 = 28 bytes
sys.getsizeof(1) # 28

# 0 has ob_size 0, i.e. ob_digit has length 0: 24 + 4*0 = 24 bytes
sys.getsizeof(0) # 24

# (2**30)-1 has ob_size 1, so it occupies 24 + 4*1 = 28 bytes
sys.getsizeof((2**30)-1) # 28

# 2**30 has ob_size 2, so it occupies 24 + 4*2 = 32 bytes
sys.getsizeof(2**30) # 32

创建整数的方法
PyLong_FromLong 使用 C 的 long 类型创建 python 整数
PyLong_FromUnsignedLong 使用 C 的无符号 long 类型创建
PyLong_FromDouble 使用 C 的 double 类型创建（取其整数部分）
PyLong_FromVoidPtr 使用 C 的指针类型创建
PyLong_FromLongLong 使用 C 的 longlong 类型创建
PyLong_FromUnsignedLongLong 使用 C 的无符号 longlong 类型创建
PyLong_FromSsize_t 使用 C 的Py_ssize_t 类型创建
PyLong_FromSize_t 使用 C 的size_t 类型创建
创建整数对象
_PyLong_New

/* Allocate a new int object with room for `size` digits.
   Returns NULL with OverflowError set when size exceeds MAX_LONG_DIGITS,
   and NULL after calling PyErr_NoMemory() when the allocation fails.
   The digits themselves are not filled in here. */
PyLongObject *
_PyLong_New(Py_ssize_t size)
{
    PyLongObject *result; // result is a pointer to PyLongObject
    /* Number of bytes needed is: offsetof(PyLongObject, ob_digit) +
       sizeof(digit)*size.  Previous incarnations of this code used
       sizeof(PyVarObject) instead of the offsetof, but this risks being
       incorrect in the presence of padding between the PyVarObject header
       and the digits. 
       */
    if (size > (Py_ssize_t)MAX_LONG_DIGITS) {
        PyErr_SetString(PyExc_OverflowError,
                        "too many digits in integer");
        return NULL;
    }
    /* PyObject_MALLOC is (per the article) normally used for small allocations.
    offsetof(PyLongObject, ob_digit) is the byte offset of the ob_digit member
    from the start of the PyLongObject struct.
    The request therefore covers the object header plus an ob_digit array of
    `size` elements.
    */
    result = PyObject_MALLOC(offsetof(PyLongObject, ob_digit) +
                             size*sizeof(digit));
    if (!result) {
        PyErr_NoMemory();
        return NULL;
    }
    // Initialise the header fields (ob_type, ob_size, ob_refcnt, per the article)
    return (PyLongObject*)PyObject_INIT_VAR(result, &PyLong_Type, size);
}

整数类型

// The type object of int objects
PyTypeObject PyLong_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "int",                                      /* tp_name */
    offsetof(PyLongObject, ob_digit),           /* tp_basicsize: header size, up to the first digit */
    sizeof(digit),                              /* tp_itemsize: size of one digit */
    long_dealloc,                               /* tp_dealloc: destructor, releases the object once its reference count reaches 0 */
    0,                                          /* tp_print */
    0,                                          /* tp_getattr */
    0,                                          /* tp_setattr */
    0,                                          /* tp_reserved */
    long_to_decimal_string,                     /* tp_repr */
    &long_as_number,                            /* tp_as_number: the numeric operations */
    0,                                          /* tp_as_sequence */
    0,                                          /* tp_as_mapping */
    (hashfunc)long_hash,                        /* tp_hash: hash function, so ints are hashable */
    0,                                          /* tp_call */
    long_to_decimal_string,                     /* tp_str */
    PyObject_GenericGetAttr,                    /* tp_getattro */
    0,                                          /* tp_setattro */
    0,                                          /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
        Py_TPFLAGS_LONG_SUBCLASS,               /* tp_flags */
    long_doc,                                   /* tp_doc */
    0,                                          /* tp_traverse */
    0,                                          /* tp_clear */
    long_richcompare,                           /* tp_richcompare: comparison operations */
    0,                                          /* tp_weaklistoffset */
    0,                                          /* tp_iter */
    0,                                          /* tp_iternext */
    long_methods,                               /* tp_methods: the methods of int */
    0,                                          /* tp_members */
    long_getset,                                /* tp_getset */
    0,                                          /* tp_base */
    0,                                          /* tp_dict */
    0,                                          /* tp_descr_get */
    0,                                          /* tp_descr_set */
    0,                                          /* tp_dictoffset */
    0,                                          /* tp_init */
    0,                                          /* tp_alloc */
    long_new,                                   /* tp_new */
    PyObject_Del,                               /* tp_free */
};

比较操作

/* Rich comparison
self    the object itself
other   the object being compared against
op      the comparison operation
*/
static PyObject *
long_richcompare(PyObject *self, PyObject *other, int op)
{
    int result;
    CHECK_BINOP(self, other); // check that self and other are both ints (macro defined elsewhere)
    // Identical addresses mean the very same object, so no comparison is needed
    if (self == other)
        result = 0;
    else
        result = long_compare((PyLongObject*)self, (PyLongObject*)other);
    // result is -1/0/1; it is compared against 0 under op to produce the return value
    Py_RETURN_RICHCOMPARE(result, 0, op);
}

// Three-way comparison of two ints: returns -1 if a < b, 0 if a == b, 1 if a > b
static int
long_compare(PyLongObject *a, PyLongObject *b)
{
    Py_ssize_t sign;
    /*
    For an int object Py_SIZE gives ob_size, which is signed: its absolute value
    is the number of digits stored in ob_digit (not a bit count), and its sign
    is the sign of the integer (0 for the value zero). A number and its negation
    therefore have ob_size values of opposite sign, not the same size.
    For lists, tuples and other sequence objects Py_SIZE is normally the number
    of elements.
    */ 
    if (Py_SIZE(a) != Py_SIZE(b)) {
        // ob_size differs: subtract them and let the sign of the difference decide
        sign = Py_SIZE(a) - Py_SIZE(b);
    }
    else {
        // Same ob_size: the digits in ob_digit have to be compared one by one
        Py_ssize_t i = Py_ABS(Py_SIZE(a));
        // Walk from the end towards the front (the most significant digit is stored last)
        while (--i >= 0 && a->ob_digit[i] == b->ob_digit[i])
            ;
        // Every digit matched, so the final --i left i below zero
        if (i < 0)
            sign = 0;
        else {
            // The first differing digit alone decides the order of the magnitudes
            // (sdigit is presumably the signed counterpart of digit, so the difference can go negative)
            sign = (sdigit)a->ob_digit[i] - (sdigit)b->ob_digit[i];
            // For negative numbers the larger magnitude is the smaller value, so flip the result
            if (Py_SIZE(a) < 0)
                sign = -sign;
        }
    }
    // Finally reduce sign to -1, 0 or 1
    // sign < 0, a < b
    // sign > 0, a > b
    // sign = 0, a = b
    return sign < 0 ? -1 : sign > 0 ? 1 : 0;
}

整数类型的函数集

/* Numeric method table of int, referenced from PyLong_Type.tp_as_number.
   A 0 entry means the slot is not provided by this table. */
static PyNumberMethods long_as_number = {
    (binaryfunc)long_add,       /*nb_add: addition*/	
    (binaryfunc)long_sub,       /*nb_subtract: subtraction*/
    (binaryfunc)long_mul,       /*nb_multiply: multiplication*/
    long_mod,                   /*nb_remainder: remainder (modulo), not division*/
    long_divmod,                /*nb_divmod: quotient and remainder together*/
    long_pow,                   /*nb_power: exponentiation*/
    (unaryfunc)long_neg,        /*nb_negative*/
    (unaryfunc)long_long,       /*tp_positive*/
    (unaryfunc)long_abs,        /*tp_absolute*/
    (inquiry)long_bool,         /*tp_bool*/
    (unaryfunc)long_invert,     /*nb_invert*/
    long_lshift,                /*nb_lshift*/
    (binaryfunc)long_rshift,    /*nb_rshift*/
    long_and,                   /*nb_and*/
    long_xor,                   /*nb_xor*/
    long_or,                    /*nb_or*/
    long_long,                  /*nb_int*/
    0,                          /*nb_reserved*/
    long_float,                 /*nb_float*/
    0,                          /* nb_inplace_add */
    0,                          /* nb_inplace_subtract */
    0,                          /* nb_inplace_multiply */
    0,                          /* nb_inplace_remainder */
    0,                          /* nb_inplace_power */
    0,                          /* nb_inplace_lshift */
    0,                          /* nb_inplace_rshift */
    0,                          /* nb_inplace_and */
    0,                          /* nb_inplace_xor */
    0,                          /* nb_inplace_or */
    long_div,                   /* nb_floor_divide */
    long_true_divide,           /* nb_true_divide */
    0,                          /* nb_inplace_floor_divide */
    0,                          /* nb_inplace_true_divide */
    long_long,                  /* nb_index */
};