CPython源码阅读笔记(3)

2017-08-28 Mithrilwoodrat 更多博文 » 博客 » GitHub »

原文链接 http://woodrat.xyz/2017/08/28/CPython%E6%BA%90%E7%A0%81%E9%98%85%E8%AF%BB%E7%AC%94%E8%AE%B0%283%29/
注:以下为加速网络访问所做的原文缓存,经过重新格式化,可能存在格式方面的问题,或偶有遗漏信息,请以原文为准。


PyLongObject

PyLongObject 定义在 include/longobject.h 中,实际的 longobject 对象定义在 include/longintrepr.h 中。

// include/longobject.h
typedef struct _longobject PyLongObject; /* Revealed in longintrepr.h */

longobject 继承了 CPython 的变长对象 PyVarObject, 复用了 ob_size 字段来表示正负以及零。同时表示 digit 数组的大小。

// include/longintrepr.h

/* Storage type for a single digit of a long integer; every stored digit
   satisfies 0 <= digit <= MASK. */
typedef PY_UINT32_T digit;

/* Long integer representation.
   The absolute value of a number is equal to
    SUM(for i=0 through abs(ob_size)-1) ob_digit[i] * 2**(SHIFT*i)
   Negative numbers are represented with ob_size < 0;
   zero is represented by ob_size == 0.
   In a normalized number, ob_digit[abs(ob_size)-1] (the most significant
   digit) is never zero.  Also, in all cases, for all valid i,
    0 <= ob_digit[i] <= MASK.
   The allocation function takes care of allocating extra memory
   so that ob_digit[0] ... ob_digit[abs(ob_size)-1] are actually available.

   CAUTION:  Generic code manipulating subtypes of PyVarObject has to
   aware that longs abuse  ob_size's sign bit.
*/

/* Concrete layout of a Python long: the variable-size object header
   followed by the digit array (least significant digit first). */
struct _longobject {
    /* Header fields; ob_size carries both the sign of the number and the
       number of digits in use (abs(ob_size)). */
    PyObject_VAR_HEAD
    /* Declared with one element; the allocation function reserves room so
       that ob_digit[0] ... ob_digit[abs(ob_size)-1] are available. */
    digit ob_digit[1];
};

PyLongObject 的创建

// Objects/longobject.c
/* Create a new long int object from a C long int.

   The magnitude of ival is split into PyLong_SHIFT-bit digits that are
   stored least significant first in ob_digit; the sign is recorded in
   the sign of ob_size (zero yields ob_size == 0 and no digits).
   Returns NULL when _PyLong_New fails. */

PyObject *
PyLong_FromLong(long ival)
{
    PyLongObject *v;
    unsigned long abs_ival;
    unsigned long t;  /* unsigned so >> doesn't propagate sign bit */
    int ndigits = 0;
    int negative = 0;

    if (ival < 0) {
        /* if LONG_MIN == -LONG_MAX-1 (true on most platforms) then
           ANSI C says that the result of -ival is undefined when ival
           == LONG_MIN.  Hence the following workaround. */
        abs_ival = (unsigned long)(-1-ival) + 1;
        negative = 1;
    }
    else {
        abs_ival = (unsigned long)ival;
    }

    /* Count the number of Python digits.
       We used to pick 5 ("big enough for anything"), but that's a
       waste of time and space given that 5*15 = 75 bits are rarely
       needed. */
    t = abs_ival;
    while (t) {
        ++ndigits;
        t >>= PyLong_SHIFT;
    }
    v = _PyLong_New(ndigits);
    if (v != NULL) {
        digit *p = v->ob_digit;
        /* The sign of the value is encoded as the sign of ob_size. */
        v->ob_size = negative ? -ndigits : ndigits;
        /* Second pass over the magnitude: emit the low PyLong_SHIFT bits
           as the next digit, then shift them out. */
        t = abs_ival;
        while (t) {
            *p++ = (digit)(t & PyLong_MASK);
            t >>= PyLong_SHIFT;
        }
    }
    return (PyObject *)v;
}

PyLong_SHIFT 默认为 30,即每个 digit 存放 30 个有效位;单个 digit 能表示的最大值为 PyLong_MASK,默认为 2**30 - 1

/* Number of value bits stored in each digit. */
#define PyLong_SHIFT    30
....
/* 2**PyLong_SHIFT: the radix in which the digits are expressed. */
#define PyLong_BASE ((digit)1 << PyLong_SHIFT)
/* PyLong_BASE - 1: the low PyLong_SHIFT bits set; also the largest value
   a single digit may hold. */
#define PyLong_MASK ((digit)(PyLong_BASE - 1))

将传入值的绝对值 abs_ival 与 PyLong_MASK 相与,得到的低 PyLong_SHIFT 位依次放入 longobj->ob_digit。 然后每次右移 PyLong_SHIFT 位,直至该值为 0。

调试观察该创建过程。

>>> long(-2147483648)
(gdb)
p v->ob_size
-2
p v->ob_digit[0]
0
p v->ob_digit[1]
2

PyVarObject 的创建

PyLong_FromLong 中可以看到,调用了 _PyLong_New 申请内存新建 PyLongObject 对象。

/* Allocate a new long object sized for `size` digits.

   Sets OverflowError and returns NULL when size exceeds MAX_LONG_DIGITS;
   otherwise returns whatever PyObject_NEW_VAR yields for PyLong_Type
   with `size` items. */
PyLongObject *
_PyLong_New(Py_ssize_t size)
{
    if (size > (Py_ssize_t)MAX_LONG_DIGITS) {
        PyErr_SetString(PyExc_OverflowError,
                        "too many digits in integer");
        return NULL;
    }
    /* coverity[ampersand_in_size] */
    /* XXX(nnorwitz): PyObject_NEW_VAR / _PyObject_VAR_SIZE need to detect
       overflow */
    return PyObject_NEW_VAR(PyLongObject, &PyLong_Type, size);
}

_PyLong_New 对申请的 digit 个数做检查之后,调用了 PyObject_NEW_VAR 宏(该宏内部再调用 PyObject_MALLOC 申请内存),size 即新建的 PyVarObject 的 ob_size 的值。

// include/objimpl.h
/* Allocate _PyObject_VAR_SIZE(typeobj, n) bytes with PyObject_MALLOC and
   hand the block to PyObject_InitVar together with typeobj and n; the
   result is cast to (type *).
   NOTE(review): the PyObject_MALLOC result is not checked here, so
   PyObject_InitVar presumably tolerates NULL -- confirm. */
#define PyObject_NEW_VAR(type, typeobj, n) \
( (type *) PyObject_InitVar( \
      (PyVarObject *) PyObject_MALLOC(_PyObject_VAR_SIZE((typeobj),(n)) ),\
      (typeobj), (n)) )

PyVarObject 的大小由宏 _PyObject_VAR_SIZE 计算,由 PyObject_VAR_HEAD 宏展开的几个定长的字段,加上变长的以 tp_itemsize 为单位,长度为 nitems 即 (_PyLong_New 的 size 参数) 的数组组成。

/* Size in bytes of a variable-size object holding nitems items:
   tp_basicsize + nitems * tp_itemsize, rounded up to a multiple of
   SIZEOF_VOID_P (the add-then-mask rounding relies on SIZEOF_VOID_P
   being a power of two). */
#define _PyObject_VAR_SIZE(typeobj, nitems)     \
    (size_t)                                    \
    ( ( (typeobj)->tp_basicsize +               \
        (nitems)*(typeobj)->tp_itemsize +       \
        (SIZEOF_VOID_P - 1)                     \
      ) & ~(SIZEOF_VOID_P - 1)                  \
    )

PyLong_Type 结构体中定义了 tp_basicsize 和 tp_itemsize,继承 PyVarObject 需要指定 tp_itemsize,即变长对象中可变数组的单位大小。

/* Type object for "long" (excerpt).  tp_basicsize covers only the fields
   that precede ob_digit; every digit adds tp_itemsize bytes on top. */
PyTypeObject PyLong_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                          /* ob_size */
    "long",                                     /* tp_name */
    offsetof(PyLongObject, ob_digit),           /* tp_basicsize */
    sizeof(digit),                              /* tp_itemsize */
    ....
};

继承 PyObject 则只需要将该项留空,如 PyInt_Type

/* Type object for "int" (excerpt).  A fixed-size object: the whole
   struct is the basic size and the item size is left at 0. */
PyTypeObject PyInt_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "int",                                      /* tp_name */
    sizeof(PyIntObject),                        /* tp_basicsize */
    0,                                          /* tp_itemsize */
    ....
}

参考 https://docs.python.org/2/c-api/structures.html

PyStringObject

PyStringObject 定义在 include/stringobject.h 中,实际的字符串内容存放在 ob_sval 数组中,长度为 ob_size + 1,ob_sval 结尾为固定的 \0。ob_shash 存储了该对象的 hash 值,尚未计算时为 -1。ob_sstate 用来表示该对象是否已被 intern(即是否在 interned 缓存字典里),初始化为 SSTATE_NOT_INTERNED,即不在缓存中。

/* In-memory layout of a Python 2 str object: variable-size header, cached
   hash, intern state, then the character data. */
typedef struct {
    PyObject_VAR_HEAD
    /* Cached hash of the string, or -1 if not computed yet. */
    long ob_shash;
    /* Intern state; one of the SSTATE_* values. */
    int ob_sstate;
    /* Character data; declared with one element, actually ob_size+1
       elements including the trailing NUL. */
    char ob_sval[1];

    /* Invariants:
     *     ob_sval contains space for 'ob_size+1' elements.
     *     ob_sval[ob_size] == 0.
     *     ob_shash is the hash of the string or -1 if not computed yet.
     *     ob_sstate != 0 iff the string object is in stringobject.c's
     *       'interned' dictionary; in this case the two references
     *       from 'interned' to this object are *not counted* in ob_refcnt.
     */
} PyStringObject;

ob_sstate 的值有下面几种,分别为:未被 intern(不在缓存中)、已被 intern 且在不再被引用时可以被回收(mortal)、已被 intern 且永不回收(immortal)。

/* Values of PyStringObject.ob_sstate.  Zero means the string is not in the
   'interned' dictionary; any non-zero value means it is.
   NOTE(review): the mortal/immortal distinction is not shown in this
   excerpt -- presumably a mortal interned string may be dropped from the
   dictionary and freed once unreferenced, while an immortal one is kept
   for the life of the process; confirm against stringobject.c. */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2

PyStringObject 的创建

PyStringObject 最常用的创建函数为 PyString_FromStringAndSize,位于 Objects/stringobject.c

在该函数设置一个断点,然后在 REPL 中输入 a='123' ,来观察 Python 中字符串对象的创建过程。可以得到调用栈如下:

#0  PyString_FromStringAndSize (str=0x7fffffffced0 "123", size=3) at Objects/stringobject.c:60
#1  0x00000000004a773b in PyUnicodeUCS2_EncodeUTF8 (s=0x7ffff6436860, size=3, errors=0x0) at Objects/unicodeobject.c:2232
#2  0x000000000054971c in utf_8_encode (self=0x0, args=0x7ffff7e68a00) at ./Modules/_codecsmodule.c:698
#3  0x0000000000573222 in PyCFunction_Call (func=0x7ffff6ebb678, arg=0x7ffff7e68a00, kw=0x0) at Objects/methodobject.c:81
#4  0x000000000042174b in PyObject_Call (func=0x7ffff6ebb678, arg=0x7ffff7e68a00, kw=0x0) at Objects/abstract.c:2547
#5  0x00000000004dedba in PyEval_CallObjectWithKeywords (func=0x7ffff6ebb678, arg=0x7ffff7e68a00, kw=0x0) at Python/ceval.c:4226
#6  0x00000000004ee79e in _PyCodec_EncodeInternal (object=0x7ffff6ed93a0, encoder=0x7ffff6ebb678, encoding=0x7ffff6436928 "UTF-8", errors=0x0) at Python/codecs.c:355
#7  0x00000000004ef2cd in _PyCodec_EncodeText (object=0x7ffff6ed93a0, encoding=0x7ffff6436928 "UTF-8", errors=0x0) at Python/codecs.c:541
#8  0x00000000004a5714 in PyUnicodeUCS2_AsEncodedString (unicode=0x7ffff6ed93a0, encoding=0x7ffff6436928 "UTF-8", errors=0x0) at Objects/unicodeobject.c:1374
#9  0x000000000059b774 in parsestr (c=0x7fffffffd9b0, n=0x7ffff7f95b80, s=0x7ffff64368d9 "123'") at Python/ast.c:3535
#10 0x000000000059b896 in parsestrplus (c=0x7fffffffd9b0, n=0x7ffff7f95b80) at Python/ast.c:3558
#11 0x0000000000594c7f in ast_for_atom (c=0x7fffffffd9b0, n=0x7ffff7f95b80) at Python/ast.c:1377
#12 0x0000000000596442 in ast_for_power (c=0x7fffffffd9b0, n=0x7ffff7f95b38) at Python/ast.c:1790
#13 0x0000000000596bbd in ast_for_expr (c=0x7fffffffd9b0, n=0x7ffff7f95b38) at Python/ast.c:1968
#14 0x00000000005972ed in ast_for_testlist (c=0x7fffffffd9b0, n=0x7ffff7eeed50) at Python/ast.c:2131
#15 0x00000000005978ba in ast_for_expr_stmt (c=0x7fffffffd9b0, n=0x7ffff7f95358) at Python/ast.c:2261
#16 0x000000000059aae6 in ast_for_stmt (c=0x7fffffffd9b0, n=0x7ffff7f95358) at Python/ast.c:3267
#17 0x0000000000591f16 in PyAST_FromNode (n=0x7ffff7f952c8, flags=0x7fffffffdbc0, filename=0x59e66a "<stdin>", arena=0x87a500) at Python/ast.c:298

可以看到字符串常量 "123" 是在创建 AST 的时候传给 PyString_FromStringAndSize 用于创建 PyStringObject 的。

其创建过程如下:

有一全局静态变量 nullstring 用于表示所有空字符串,如果传入的字符串为空则直接返回指向 nullstring 的指针。

/* Shared empty-string object; stays NULL until PyString_FromStringAndSize
   creates the first empty string and stores it here. */
static PyStringObject *nullstring;

单字符的 str,有一个类似 IntObject 的小整数缓存(small_ints)这样的全局缓存,根据字符内容可以直接获取到对应的指针。

/* Cache of one-character strings, indexed by the character's value masked
   with UCHAR_MAX; an entry is NULL until that string is first created. */
static PyStringObject *characters[UCHAR_MAX + 1];

正常情况下的字符串对象创建时,如传入 "123",会调用 PyObject_MALLOC来申请内存。上面的 PyLongObject 创建时,是调用 PyObject_NEW_VAR 宏,计算好内存大小,间接调用 PyObject_MALLOC。 而 PyString_FromStringAndSize 中因为 ob_sval 中每个元素长度固定(char),通过 PyStringObject_SIZE 计算出结构体中固定的大小后加上变化的 size 参数,直接调用 PyObject_MALLOC。 至于 PyObject_MALLOC 里 CPython 内存管理的细节,在下面再慢慢展开。

PyStringObject_SIZE 得到 PyStringObject 中 ob_sval 字段之前的字段的大小之和,加上 ob_sval 末尾固定的 \0 的元素大小(1)。

/* Bytes needed for a PyStringObject with no characters: everything that
   precedes ob_sval plus one byte for the trailing NUL. */
#define PyStringObject_SIZE (offsetof(PyStringObject, ob_sval) + 1)

申请好内存之后,将按上面叙述过的过程进行初始化。然后将 char * 的内容拷贝到 ob_sval 中,并在末尾填入 \0。剩下的部分是在 nullstring 和 characters 中的对应元素尚未初始化时对其进行初始化。

/* Create a string object of length `size`.

   When str is non-NULL, `size` bytes are copied from it; when str is NULL
   the contents are left unset apart from the terminating '\0'.
   The empty string, and one-character strings built from a non-NULL str,
   are shared: once created they are stored in nullstring / characters[]
   and later calls return the stored object with its reference count
   incremented.

   Returns NULL with SystemError set for a negative size, NULL with
   OverflowError set when size is too large, or the result of
   PyErr_NoMemory() when the allocation fails. */
PyObject *
PyString_FromStringAndSize(const char *str, Py_ssize_t size)
{
    register PyStringObject *op;
    if (size < 0) {
        PyErr_SetString(PyExc_SystemError,
            "Negative size passed to PyString_FromStringAndSize");
        return NULL;
    }
    /* Fast path: reuse the shared empty string if it already exists. */
    if (size == 0 && (op = nullstring) != NULL) {
#ifdef COUNT_ALLOCS
        null_strings++;
#endif
        Py_INCREF(op);
        return (PyObject *)op;
    }
    /* Fast path: reuse the cached one-character string.  Masking with
       UCHAR_MAX keeps the index within 0..UCHAR_MAX even where char is
       signed. */
    if (size == 1 && str != NULL &&
        (op = characters[*str & UCHAR_MAX]) != NULL)
    {
#ifdef COUNT_ALLOCS
        one_strings++;
#endif
        Py_INCREF(op);
        return (PyObject *)op;
    }

    /* Reject sizes for which PyStringObject_SIZE + size would overflow. */
    if (size > PY_SSIZE_T_MAX - PyStringObject_SIZE) {
        PyErr_SetString(PyExc_OverflowError, "string is too large");
        return NULL;
    }

    /* Inline PyObject_NewVar */
    op = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + size);
    if (op == NULL)
        return PyErr_NoMemory();
    (void)PyObject_INIT_VAR(op, &PyString_Type, size);
    /* Hash not computed yet; string starts out not interned. */
    op->ob_shash = -1;
    op->ob_sstate = SSTATE_NOT_INTERNED;
    if (str != NULL)
        Py_MEMCPY(op->ob_sval, str, size);
    op->ob_sval[size] = '\0';
    /* share short strings */
    if (size == 0) {
        /* op is re-read from t after interning, since
           PyString_InternInPlace takes the pointer by address and may
           have substituted another object. */
        PyObject *t = (PyObject *)op;
        PyString_InternInPlace(&t);
        op = (PyStringObject *)t;
        nullstring = op;
        /* Extra reference for the pointer now held in nullstring. */
        Py_INCREF(op);
    } else if (size == 1 && str != NULL) {
        PyObject *t = (PyObject *)op;
        PyString_InternInPlace(&t);
        op = (PyStringObject *)t;
        characters[*str & UCHAR_MAX] = op;
        /* Extra reference for the pointer now held in characters[]. */
        Py_INCREF(op);
    }
    return (PyObject *) op;
}