CPython源码阅读笔记(3)
原文链接 http://woodrat.xyz/2017/08/28/CPython%E6%BA%90%E7%A0%81%E9%98%85%E8%AF%BB%E7%AC%94%E8%AE%B0%283%29/
注:以下为加速网络访问所做的原文缓存,经过重新格式化,可能存在格式方面的问题,或偶有遗漏信息,请以原文为准。
PyLongObject
PyLongObject 定义在 include/longobject.h
中,实际的 longobject
对象定义在 include/longintrepr.h
中。
// include/longobject.h
typedef struct _longobject PyLongObject; /* Revealed in longintrepr.h */
longobject
继承了 CPython 的变长对象 PyVarObject
, 复用了 ob_size 字段来表示正负以及零。同时表示 digit 数组的大小。
// include/longintrepr.h
typedef PY_UINT32_T digit;
/* Long integer representation.
The absolute value of a number is equal to
SUM(for i=0 through abs(ob_size)-1) ob_digit[i] * 2**(SHIFT*i)
Negative numbers are represented with ob_size < 0;
zero is represented by ob_size == 0.
In a normalized number, ob_digit[abs(ob_size)-1] (the most significant
digit) is never zero. Also, in all cases, for all valid i,
0 <= ob_digit[i] <= MASK.
The allocation function takes care of allocating extra memory
so that ob_digit[0] ... ob_digit[abs(ob_size)-1] are actually available.
CAUTION: Generic code manipulating subtypes of PyVarObject has to
aware that longs abuse ob_size's sign bit.
*/
struct _longobject {
PyObject_VAR_HEAD
digit ob_digit[1];
};
PyLongObject 的创建
// Object/longobject.c
/* Create a new long int object from a C long int */
PyObject *
PyLong_FromLong(long ival)
{
PyLongObject *v;
unsigned long abs_ival;
unsigned long t; /* unsigned so >> doesn't propagate sign bit */
int ndigits = 0;
int negative = 0;
if (ival < 0) {
/* if LONG_MIN == -LONG_MAX-1 (true on most platforms) then
ANSI C says that the result of -ival is undefined when ival
== LONG_MIN. Hence the following workaround. */
abs_ival = (unsigned long)(-1-ival) + 1;
negative = 1;
}
else {
abs_ival = (unsigned long)ival;
}
/* Count the number of Python digits.
We used to pick 5 ("big enough for anything"), but that's a
waste of time and space given that 5*15 = 75 bits are rarely
needed. */
t = abs_ival;
while (t) {
++ndigits;
t >>= PyLong_SHIFT;
}
v = _PyLong_New(ndigits);
if (v != NULL) {
digit *p = v->ob_digit;
v->ob_size = negative ? -ndigits : ndigits;
t = abs_ival;
while (t) {
*p++ = (digit)(t & PyLong_MASK);
t >>= PyLong_SHIFT;
}
}
return (PyObject *)v;
}
PyLong_SHIFT
默认为30, 每一个 digit 存储的大小为 PyLong_MASK
, 默认为 2**30 -1
。
#define PyLong_SHIFT 30
....
#define PyLong_BASE ((digit)1 << PyLong_SHIFT)
#define PyLong_MASK ((digit)(PyLong_BASE - 1))
将传入的 val 与 PyLong_MASK
相与,得到的结果放入 longobj->ob_digit。 然后每次右移 PyLong_SHIFT
直至 val == 0。
调试观察该创建过程。
>>> long(2147483648)
(gdb)
p v->ob_size
-2
p v->ob_digit[0]
0
p v->ob_digit[1]
2
PyVarObject 的创建
PyLong_FromLong
中可以看到,调用了 _PyLong_New
申请内存新建 PyLongObject
对象。
PyLongObject *
_PyLong_New(Py_ssize_t size)
{
if (size > (Py_ssize_t)MAX_LONG_DIGITS) {
PyErr_SetString(PyExc_OverflowError,
"too many digits in integer");
return NULL;
}
/* coverity[ampersand_in_size] */
/* XXX(nnorwitz): PyObject_NEW_VAR / _PyObject_VAR_SIZE need to detect
overflow */
return PyObject_NEW_VAR(PyLongObject, &PyLong_Type, size);
}
_PyLong_New
对申请的内存长度做检查之后,调用了 PyObject_MALLOC
,size 即新建的 PyVarObject
中 ob_size
的值。
// include/objimpl.h
#define PyObject_NEW_VAR(type, typeobj, n) \
( (type *) PyObject_InitVar( \
(PyVarObject *) PyObject_MALLOC(_PyObject_VAR_SIZE((typeobj),(n)) ),\
(typeobj), (n)) )
PyVarObject 的大小由宏 _PyObject_VAR_SIZE
计算,由 PyObject_VAR_HEAD
宏展开的几个定长的字段,加上变长的以 tp_itemsize
为单位,长度为 nitems
即 (_PyLong_New 的 size 参数) 的数组组成。
#define _PyObject_VAR_SIZE(typeobj, nitems) \
(size_t) \
( ( (typeobj)->tp_basicsize + \
(nitems)*(typeobj)->tp_itemsize + \
(SIZEOF_VOID_P - 1) \
) & ~(SIZEOF_VOID_P - 1) \
)
PyLong_Type
结构体中定义了 tp_basicsize
与 tp_itemsize
,继承 PyVarObject
需要指定 tp_itemsize
,即变长对象中可变数组的单位大小。
PyTypeObject PyLong_Type = {
PyObject_HEAD_INIT(&PyType_Type)
0, /* ob_size */
"long", /* tp_name */
offsetof(PyLongObject, ob_digit), /* tp_basicsize */
sizeof(digit), /* tp_itemsize */
....
};
继承 PyObject
则只需要将该项留空,如 PyInt_Type
。
PyTypeObject PyInt_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
"int",
sizeof(PyIntObject),
0,
....
}
参考 https://docs.python.org/2/c-api/structures.html
PyStringObject
PyStringObject
定义在 include/stringobject.h
中,实际的字符串内容存放在 ob_sval 数组中,长度为 ob_size + 1
,ob_sval结尾为固定的 \0
。ob_shash
存储了该对象的 hash 值,默认为 -1, ob_sstate
用来表示该对象是否在缓存队列里,初始化为 SSTATE_NOT_INTERNED
即不在缓存中。
typedef struct {
PyObject_VAR_HEAD
long ob_shash;
int ob_sstate;
char ob_sval[1];
/* Invariants:
* ob_sval contains space for 'ob_size+1' elements.
* ob_sval[ob_size] == 0.
* ob_shash is the hash of the string or -1 if not computed yet.
* ob_sstate != 0 iff the string object is in stringobject.c's
* 'interned' dictionary; in this case the two references
* from 'interned' to this object are *not counted* in ob_refcnt.
*/
} PyStringObject;
ob_sstate
的值有下面几种,分别为:不在缓存中、在缓存中且可修改、在缓存中且不可修改。
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
PyStringObject 的创建
PyStringObject
最常用的创建函数为 PyString_FromStringAndSize
位于 Objcts/stringobjec.c
。
在该函数设置一个断点,然后在 REPL 中输入 a=“123”
, 来观察 Python 中字符串对象的创建过程。
可以得到调用栈如下
#0 PyString_FromStringAndSize (str=0x7fffffffced0 "123", size=3) at Objects/stringobject.c:60
#1 0x00000000004a773b in PyUnicodeUCS2_EncodeUTF8 (s=0x7ffff6436860, size=3, errors=0x0) at Objects/unicodeobject.c:2232
#2 0x000000000054971c in utf_8_encode (self=0x0, args=0x7ffff7e68a00) at ./Modules/_codecsmodule.c:698
#3 0x0000000000573222 in PyCFunction_Call (func=0x7ffff6ebb678, arg=0x7ffff7e68a00, kw=0x0) at Objects/methodobject.c:81
#4 0x000000000042174b in PyObject_Call (func=0x7ffff6ebb678, arg=0x7ffff7e68a00, kw=0x0) at Objects/abstract.c:2547
#5 0x00000000004dedba in PyEval_CallObjectWithKeywords (func=0x7ffff6ebb678, arg=0x7ffff7e68a00, kw=0x0) at Python/ceval.c:4226
#6 0x00000000004ee79e in _PyCodec_EncodeInternal (object=0x7ffff6ed93a0, encoder=0x7ffff6ebb678, encoding=0x7ffff6436928 "UTF-8", errors=0x0) at Python/codecs.c:355
#7 0x00000000004ef2cd in _PyCodec_EncodeText (object=0x7ffff6ed93a0, encoding=0x7ffff6436928 "UTF-8", errors=0x0) at Python/codecs.c:541
#8 0x00000000004a5714 in PyUnicodeUCS2_AsEncodedString (unicode=0x7ffff6ed93a0, encoding=0x7ffff6436928 "UTF-8", errors=0x0) at Objects/unicodeobject.c:1374
#9 0x000000000059b774 in parsestr (c=0x7fffffffd9b0, n=0x7ffff7f95b80, s=0x7ffff64368d9 "123'") at Python/ast.c:3535
#10 0x000000000059b896 in parsestrplus (c=0x7fffffffd9b0, n=0x7ffff7f95b80) at Python/ast.c:3558
#11 0x0000000000594c7f in ast_for_atom (c=0x7fffffffd9b0, n=0x7ffff7f95b80) at Python/ast.c:1377
#12 0x0000000000596442 in ast_for_power (c=0x7fffffffd9b0, n=0x7ffff7f95b38) at Python/ast.c:1790
#13 0x0000000000596bbd in ast_for_expr (c=0x7fffffffd9b0, n=0x7ffff7f95b38) at Python/ast.c:1968
#14 0x00000000005972ed in ast_for_testlist (c=0x7fffffffd9b0, n=0x7ffff7eeed50) at Python/ast.c:2131
#15 0x00000000005978ba in ast_for_expr_stmt (c=0x7fffffffd9b0, n=0x7ffff7f95358) at Python/ast.c:2261
#16 0x000000000059aae6 in ast_for_stmt (c=0x7fffffffd9b0, n=0x7ffff7f95358) at Python/ast.c:3267
#17 0x0000000000591f16 in PyAST_FromNode (n=0x7ffff7f952c8, flags=0x7fffffffdbc0, filename=0x59e66a "<stdin>", arena=0x87a500) at Python/ast.c:298
可以看到字符串常量 "123"
是在创建 AST 的时候传给 PyString_FromStringAndSize
用于创建 PyStringObject
的。
其创建过程如下:
有一全局静态变量 nullstring 用于表示所有空字符串,如果传入的字符串为空则直接返回指向 nullstring 的指针。
static PyStringObject *nullstring;
单字符的 str,有一个类似 IntObject 的 freelist 这样的全局缓存,根据字符内容可以直接获取到对应的指针。
static PyStringObject *characters[UCHAR_MAX + 1];
正常情况下的字符串对象创建时,如传入 "123"
,会调用 PyObject_MALLOC
来申请内存。上面的 PyLongObject
创建时,是调用 PyObject_NEW_VAR
宏,计算好内存大小,间接调用 PyObject_MALLOC
。
而 PyString_FromStringAndSize
中因为 ob_sval
中每个元素长度固定(char),通过 PyStringObject_SIZE
计算出结构体中固定的大小后加上变化的 size 参数,直接调用 PyObject_MALLOC
。
至于 PyObject_MALLOC
里 CPython 内存管理的细节,在下面再慢慢展开。
PyStringObject_SIZE
得到 PyStringObject
中 ob_sval
字段之前的字段的大小之和,加上 ob_sval
末尾固定的 \0
的元素大小(1)。
#define PyStringObject_SIZE (offsetof(PyStringObject, ob_sval) + 1)
申请好内存之后,将按上面叙述过的过程进行初始化。然后将 char *
的内容拷贝到 ob_sval
中,并在末尾填入 \0
。剩下为在 nullstring
和 characters
中的元素未初始化时对其进行初始化。
PyObject *
PyString_FromStringAndSize(const char *str, Py_ssize_t size)
{
register PyStringObject *op;
if (size < 0) {
PyErr_SetString(PyExc_SystemError,
"Negative size passed to PyString_FromStringAndSize");
return NULL;
}
if (size == 0 && (op = nullstring) != NULL) {
#ifdef COUNT_ALLOCS
null_strings++;
#endif
Py_INCREF(op);
return (PyObject *)op;
}
if (size == 1 && str != NULL &&
(op = characters[*str & UCHAR_MAX]) != NULL)
{
#ifdef COUNT_ALLOCS
one_strings++;
#endif
Py_INCREF(op);
return (PyObject *)op;
}
if (size > PY_SSIZE_T_MAX - PyStringObject_SIZE) {
PyErr_SetString(PyExc_OverflowError, "string is too large");
return NULL;
}
/* Inline PyObject_NewVar */
op = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + size);
if (op == NULL)
return PyErr_NoMemory();
(void)PyObject_INIT_VAR(op, &PyString_Type, size);
op->ob_shash = -1;
op->ob_sstate = SSTATE_NOT_INTERNED;
if (str != NULL)
Py_MEMCPY(op->ob_sval, str, size);
op->ob_sval[size] = '\0';
/* share short strings */
if (size == 0) {
PyObject *t = (PyObject *)op;
PyString_InternInPlace(&t);
op = (PyStringObject *)t;
nullstring = op;
Py_INCREF(op);
} else if (size == 1 && str != NULL) {
PyObject *t = (PyObject *)op;
PyString_InternInPlace(&t);
op = (PyStringObject *)t;
characters[*str & UCHAR_MAX] = op;
Py_INCREF(op);
}
return (PyObject *) op;
}