CPython源码阅读笔记(1)
原文链接 http://woodrat.xyz/2017/06/21/CPython%E6%BA%90%E7%A0%81%E9%98%85%E8%AF%BB%E7%AC%94%E8%AE%B0%281%29
注:以下为加速网络访问所做的原文缓存,经过重新格式化,可能存在格式方面的问题,或偶有遗漏信息,请以原文为准。
准备调试环境
目前 CPython 的开发已经迁移到了 Github 上,可以直接去 Github clone 对应的分支。 我们将基于 Python 2.7.13 版本, Linux x86_64 环境进行接下来的工作。 下载好代码以后以
./configure --with-pydebug
make -j2
编译。
调试可以直接使用 GDB, 然后使用 Emacs + Ctags 看代码。(喜欢使用 IDE 的话可以考虑 eclipse-cdt)
寻找参考资料
官方的 Python Developer’s Guide 是开始时必备的资料。 readthedocs 上另外一个版本的 Python Developer’s Guide 更详细一点。 Exploring CPython’s Internals 一节列出了 CPython 的目录结构, 以及推荐了几篇很有参考价值的文章。
- Internals of CPython (这篇比较长,写的比较仔细)
- Yet another guided tour of CPython (这篇的作者是 Guido)
可以参考 devguide 的 compiler 一节来调试 Python 解释器, 跟着执行流程一步步看代码。
另外 Philip Guo 的博客 上关于 Python 的博文也十分有价值。
中文资料中 《Python 源码剖析》 一定不能错过。
从 main 开始
在 gdb 中直接 b main
,输入 list
可以看到对应 main 函数的代码为
// Modules/python.c
#include "Python.h"
int
main(int argc, char **argv)
{
return Py_Main(argc, argv);
}
在编辑器中打开对应文件,在 Python 2.7 中位于 Modules/python.c
,
该文件只是一个入口,直接调用了 Py_Main
。Py_Main
位于 Modules/main.c
中,
该函数的主要作用如下:
- 初始化环境变量和命令行参数
- 如果参数里有 -R 则调用 _PyRandom_Init 初始化 Hash 算法的随机数生成,使得 dict 对象中 key 的顺序每次启动时随机。
- 调用 Py_Initialize 初始化 Python 解释器
- 根据命令行选项决定是运行模块(-m,对应 RunModule),还是运行一条语句(-c,对应 PyRun_SimpleStringFlags),或是运行一个文件(对应 PyRun_AnyFileExFlags)
我们先跟一下 PyRun_SimpleStringFlags
, 该函数添加了一个默认的 __main__
模块后直接调用了 PyRun_StringFlags
。
截取 PyRun_StringFlags
函数部分代码如下
PyParser_ASTFromString
对输入的源码进行词法语法分析生成 AST,run_mod
生成字节码并运行。
// Python/pythonrun.c PyRun_StringFlags
PyObject *
PyRun_StringFlags(const char *str, int start, PyObject *globals,
PyObject *locals, PyCompilerFlags *flags)
{
PyObject *ret = NULL;
mod_ty mod;
PyArena *arena = PyArena_New(); /* 为编译阶段申请一块内存池 */
if (arena == NULL)
return NULL;
mod = PyParser_ASTFromString(str, "<string>", start, flags, arena); /* 将语句解析为 AST */
if (mod != NULL)
ret = run_mod(mod, "<string>", globals, locals, flags, arena); /* 调用 run_mod */
PyArena_Free(arena);
return ret;
}
词法语法分析
PyParser_ASTFromString
先将源码解析为 Parser Tree,
再调用 PyAST_FromNode
,将 Parser Tree 转换为 AST。
// Python/pythonrun.c 54 行 引用了 graminit 中的 grammar 结构,该结构将在下面几个函数中以 `g` 变量名传递
extern grammar _PyParser_Grammar; /* From graminit.c */
// Python/pythonrun.c PyParser_ASTFromString
mod_ty
PyParser_ASTFromString(const char *s, const char *filename, int start,
PyCompilerFlags *flags, PyArena *arena)
{
mod_ty mod;
PyCompilerFlags localflags;
perrdetail err;
int iflags = PARSER_FLAGS(flags);
// 将源码解析为了 Parser Tree
node *n = PyParser_ParseStringFlagsFilenameEx(s, filename,
&_PyParser_Grammar, start, &err,
&iflags);
if (flags == NULL) {
localflags.cf_flags = 0;
flags = &localflags;
}
if (n) {
flags->cf_flags |= iflags & PyCF_MASK;
// 将 Parser Tree 转换为 AST
mod = PyAST_FromNode(n, flags, filename, arena);
PyNode_Free(n);
return mod; // 返回 AST
}
else {
err_input(&err);
return NULL;
}
}
生成 TOKEN
// Parser/parsetok.c PyParser_ParseStringFlagsFilenameEx
node *
PyParser_ParseStringFlagsFilenameEx(const char *s, const char *filename,
grammar *g, int start,
perrdetail *err_ret, int *flags)
{
struct tok_state *tok;
initerr(err_ret, filename);
// 初始化词法分析模块
if ((tok = PyTokenizer_FromString(s, start == file_input)) == NULL) {
err_ret->error = PyErr_Occurred() ? E_DECODE : E_NOMEM;
return NULL;
}
tok->filename = filename ? filename : "<string>";
if (Py_TabcheckFlag || Py_VerboseFlag) {
tok->altwarning = (tok->filename != NULL);
if (Py_TabcheckFlag >= 2)
tok->alterror++;
}
// 调用 parsetok ,将源码转换为 token 序列
return parsetok(tok, g, start, err_ret, flags);
parsetok 循环直到读完整个字符串,将分出来的 token 添加到 parser 中, 最后返回 parser tree
token 的 type 定义在 Include/token.h
中
// Parser/parsetok.c parsetok
static node *
parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
int *flags)
{
// 初始化 parser
if ((ps = PyParser_New(g, start)) == NULL) {
....
// 获取 token type
type = PyTokenizer_Get(tok, &a, &b);
....
// 将 token 赋值给 str
len = b - a; /* XXX this may compute NULL - NULL */
str = (char *) PyObject_MALLOC(len + 1);
if (len > 0)
strncpy(str, a, len);
str[len] = '\0';
....
// 将 token 添加到 parser tree 中
if ((err_ret->error =
PyParser_AddToken(ps, (int)type, str, tok->lineno, col_offset,
&(err_ret->expected))) != E_OK) {
if (err_ret->error != E_DONE) {
PyObject_FREE(str);
err_ret->token = type;
}
break;
}
....
if (err_ret->error == E_DONE) {
n = ps->p_tree;
ps->p_tree = NULL;
}
else
n = NULL;
....
done:
PyTokenizer_Free(tok); // 释放词法分析器
return n; // 返回 parser->p_tree
}
以 a = 1
为例,循环5次得到的 str 和 type 依次为:
- str: a, type: 1(NAME)
- str: =, type: 22(EQUAL)
- str: 1, type: 2(NUMBER)
- str: \n, type: 4(NEWLINE)
- str: 未初始化, type: 0(ENDMARKER)
node 的结构定义在 Include/node.h
中
typedef struct _node {
short n_type;
char *n_str;
int n_lineno;
int n_col_offset;
int n_nchildren;
struct _node *n_child;
} node;
Add TOKEN to Parser Tree
使用 export PYTHONDEBUG=1
开启 python 的调试模式,可以看到 python 将 TOKEN 解析为 Parser Tree 的过程。
./Python-2.7.12/python -d -c 'a = 1'
Token NAME/'a' ... It's a token we know
....
DFA 'atom', state 0: Shift.
DFA 'atom', state 5: Direct pop.
Token EQUAL/'=' ... It's a token we know
....
DFA 'expr_stmt', state 1: Shift.
Token NUMBER/'1' ... It's a token we know
....
DFA 'atom', state 5: Direct pop.
Token NEWLINE/'' ... It's a token we know
....
DFA 'simple_stmt', state 3: Direct pop.
DFA 'stmt', state 1: Direct pop.
Token NEWLINE/'' ... It's a token we know
DFA 'file_input', state 0: Shift.
Token ENDMARKER/'' ... It's a token we know
DFA 'file_input', state 0: Shift.
DFA 'file_input', state 1: Direct pop.
ACCEPT.
[35150 refs]
可以看到只有正确解析的 TOKEN 才会显示 "Direct pop"。
生成 TOKEN 后,按照语法规则生成 Parser Tree
// Parser/parser.c PyParser_AddToken
int
PyParser_AddToken(register parser_state *ps, register int type, char *str,
int lineno, int col_offset, int *expected_ret)
{
...
/* Loop until the token is shifted or an error occurred */
for (;;) {
...
/* Pop while we are in an accept-only state */
while (s = &d->d_state
[ps->p_stack.s_top->s_state],
s->s_accept && s->s_narcs == 1) {
D(printf(" DFA '%s', state %d: "
"Direct pop.\n",
d->d_name,
ps->p_stack.s_top->s_state));
...
}
}
所以上面 Tree 的构建顺序为 atom
expr_stmt
atom
simple_stmt
stmt
file_input
Parser Tree to AST
在上面的 PyParser_ASTFromString
函数中可以看到,在调用 PyParser_ParseStringFlagsFilenameEx
生成 Parser Tree ( concrete syntax tree ) 后,调用 PyAST_FromNode
生成 AST。
// ast.c PyAST_FromNode
mod_ty
PyAST_FromNode(const node *n, PyCompilerFlags *flags, const char *filename,
PyArena *arena)
{
...
stmts = asdl_seq_new(num_stmts(n), arena); /* 为 AST 申请内存 */
if (!stmts)
return NULL;
for (i = 0; i < NCH(n) - 1; i++) {
ch = CHILD(n, i);
if (TYPE(ch) == NEWLINE)
continue;
REQ(ch, stmt);
num = num_stmts(ch);
if (num == 1) {
s = ast_for_stmt(&c, ch); /* 生成对应的 AST Node */
if (!s)
goto error;
asdl_seq_SET(stmts, k++, s); /* 将对应的 AST Node 设置到 asdl_seq 中对应的位置 */
}
...
return Module(stmts, arena); /* 转换 asdl_seq 为 mod_ty, mod_ty 事实上只是一个对
asdl_seq 的简单封装,给 asdl_seq 加上了类型而已 */
}
...
}
asdl_seq
结构定义在 Include/asdl.h
中,其实就是一个放 AST Node 的数组,根据注释可以看出,这个文件是由 Parser/asdl_c.py
生成的。
/* It would be nice if the code generated by asdl_c.py was completely
independent of Python, but it is a goal the requires too much work
at this stage. So, for example, I'll represent identifiers as
interned Python strings.
*/
/* XXX A sequence should be typed so that its use can be typechecked. */
typedef struct {
int size;
void *elements[1];
} asdl_seq;
asdl_c.py
将 Parser/Python.asdl
中定义的 asdl 生成 C 代码,对应的 asdl 的实现在 Parser/asdl.py
中。
ast_for_xxx
是生成对应 xxx AST Node 的函数, ast_for_stmt
就是生成 stmt node (stmt_ty 由 asdl_c.py 生成)。
ast_for_stmt
其实就是一个巨大的 switch ,根据语法规则调用不同的 ast_for_xxx
, 例如 a = 1
,就会调用到 ast_for_expr
a = 1
生成 AST 的 callstack 如下
- ast_for_stmt
- ast_for_expr_stmt
- ast_for_testlist
- ast_for_expr
最后返回的 AST 可以由下面的 Python 代码查看
import ast
t = ast.parse('a = 1')
ast.dump(t)
>>> "Module(body=[Assign(targets=[Name(id='a', ctx=Store())], value=Num(n=1))])"
AST to CodeObject
上面分析到 PyParser_ASTFromString
将源码解析为 AST (mod_ty),调用 run_mod
函数。
run_mod
中有两个主要的函数:
- PyAST_Compile,
- PyEval_EvalCode
PyAST_Compile
将 AST 编译成 CodeObject,PyEval_EvalCode
运行编译后的 CodeObject。
// Python/pythonrun.c run_mod
static PyObject *
run_mod(mod_ty mod, const char *filename, PyObject *globals, PyObject *locals,
PyCompilerFlags *flags, PyArena *arena)
{
PyCodeObject *co;
PyObject *v;
co = PyAST_Compile(mod, filename, flags, arena);
if (co == NULL)
return NULL;
v = PyEval_EvalCode(co, globals, locals);
Py_DECREF(co);
return v;
}
我们先看一下编译的过程
Build Symbol Table
PyAST_Compile
里主要使用 PySymtable_Build
, compiler_mod
两个函数
// Python/compile.c PyAST_Compile
PyCodeObject *
PyAST_Compile(mod_ty mod, const char *filename, PyCompilerFlags *flags,
PyArena *arena)
{
struct compiler c; // 初始化 compiler 结构
PyCodeObject *co = NULL;
PyCompilerFlags local_flags;
...
if (!compiler_init(&c))
return NULL;
c.c_filename = filename;
c.c_arena = arena;
c.c_future = PyFuture_FromAST(mod, filename);
...
c.c_st = PySymtable_Build(mod, filename, c.c_future);
co = compiler_mod(&c, mod);
finally:
compiler_free(&c);
assert(co || PyErr_Occurred());
return co;
}
PySymtable_Build
遍历 AST 构建 Symbol table,存储一些类名、变量名之类的信息。
struct symtable {
const char *st_filename; /* name of file being compiled */
struct _symtable_entry *st_cur; /* current symbol table entry */
struct _symtable_entry *st_top; /* module entry */
PyObject *st_symbols; /* dictionary of symbol table entries */
PyObject *st_stack; /* stack of namespace info */
PyObject *st_global; /* borrowed ref to MODULE in st_symbols */
int st_nblocks; /* number of blocks */
PyObject *st_private; /* name of current class or NULL */
PyFutureFeatures *st_future; /* module's future features */
};
typedef struct _symtable_entry {
PyObject_HEAD
PyObject *ste_id; /* int: key in st_symbols */
PyObject *ste_symbols; /* dict: name to flags */
PyObject *ste_name; /* string: name of block */
PyObject *ste_varnames; /* list of variable names */
PyObject *ste_children; /* list of child ids */
_Py_block_ty ste_type; /* module, class, or function */
int ste_unoptimized; /* false if namespace is optimized */
int ste_nested; /* true if block is nested */
unsigned ste_free : 1; /* true if block has free variables */
unsigned ste_child_free : 1; /* true if a child block has free vars,
including free refs to globals */
unsigned ste_generator : 1; /* true if namespace is a generator */
unsigned ste_varargs : 1; /* true if block has varargs */
unsigned ste_varkeywords : 1; /* true if block has varkeywords */
unsigned ste_returns_value : 1; /* true if namespace uses return with
an argument */
int ste_lineno; /* first line of block */
int ste_opt_lineno; /* lineno of last exec or import * */
int ste_tmpname; /* counter for listcomp temp vars */
struct symtable *ste_table;
} PySTEntryObject;
Build CFG
AST 传入 compiler_mod
后会调用 compiler_XXX
生成 CFG。
然后 assemble
会从 CFG 生成最后的 bytecode。
// Python/compile.c compiler_mod
static PyCodeObject *
compiler_mod(struct compiler *c, mod_ty mod)
{
PyCodeObject *co;
int addNone = 1;
...
if (!compiler_enter_scope(c, module, mod, 0))
return NULL;
switch (mod->kind) {
case Module_kind:
if (!compiler_body(c, mod->v.Module.body)) {
compiler_exit_scope(c);
return 0;
}
break;
...
}
co = assemble(c, addNone);
compiler_exit_scope(c);
return co;
}
根据上面 AST 生成一节, a = 1
的 AST 为 Module(body=[Assign(targets=[Name(id='a', ctx=Store())], value=Num(n=1))])
,所以将会执行 compiler_body
。
// Python/compile.c compiler_body
static int
compiler_body(struct compiler *c, asdl_seq *stmts)
{
...
for (; i < asdl_seq_LEN(stmts); i++)
VISIT(c, stmt, (stmt_ty)asdl_seq_GET(stmts, i));
...
}
VISIT
是定义在 compile.c 中的一个宏,根据 AST Node 的 Type,调用不同的 compiler_visit_xxx
函数。
#define VISIT(C, TYPE, V) {\
if (!compiler_visit_ ## TYPE((C), (V))) \
return 0; \
}
这里 VISIT
宏展开是 compiler_visit_stmt
。
// Python/compile.c compiler_visit_stmt
static int
compiler_visit_stmt(struct compiler *c, stmt_ty s)
{
...
switch (s->kind) {
case Assign_kind:
n = asdl_seq_LEN(s->v.Assign.targets);
VISIT(c, expr, s->v.Assign.value);
for (i = 0; i < n; i++) {
if (i < n - 1)
ADDOP(c, DUP_TOP);
VISIT(c, expr,
(expr_ty)asdl_seq_GET(s->v.Assign.targets, i));
}
break;
}
...
}
首先调用 VISIT(c, expr, s->v.Assign.value);
也就是 compiler_visit_expr
函数。
传入的参数为 Assign.value
, 根据 AST value=Num(n=1)
。可以在 gdb 中验证。
>> p e.kind
$2 = Num_kind
// Python/compile.c compiler_visit_expr
static int
compiler_visit_expr(struct compiler *c, expr_ty e)
{
switch (e->kind) {
case Num_kind:
ADDOP_O(c, LOAD_CONST, e->v.Num.n, consts);
break;
}
}
ADDOP_O
是一个宏,用于将 opcode 添加到 compiler 结构中。添加 opcode 有以下几个宏:
// 添加一个 opcode
#define ADDOP(C, OP) { \
if (!compiler_addop((C), (OP))) \
return 0; \
}
#define ADDOP_IN_SCOPE(C, OP) { \
if (!compiler_addop((C), (OP))) { \
compiler_exit_scope(c); \
return 0; \
} \
}
// 添加一个 opcode ,带一个没有具体名字的参数,`a=1` 中的 value(Num(1))
#define ADDOP_O(C, OP, O, TYPE) { \
if (!compiler_addop_o((C), (OP), (C)->u->u_ ## TYPE, (O))) \
return 0; \
}
// 添加一个 opcode, 带一个有名字的参数,如变量名
#define ADDOP_NAME(C, OP, O, TYPE) { \
if (!compiler_addop_name((C), (OP), (C)->u->u_ ## TYPE, (O))) \
return 0; \
}
#define ADDOP_I(C, OP, O) { \
if (!compiler_addop_i((C), (OP), (O))) \
return 0; \
}
#define ADDOP_JABS(C, OP, O) { \
if (!compiler_addop_j((C), (OP), (O), 1)) \
return 0; \
}
#define ADDOP_JREL(C, OP, O) { \
if (!compiler_addop_j((C), (OP), (O), 0)) \
return 0; \
}
所有带参数的 opcode 最后都会调用到 compiler_addop_i
,AST 中的参数是数字或者变量名,到了 opcode 中都是一个 integer,为对应值在 compiler 结构中的下标。
对应的 opcode 定义在 Include/opcode.h
中,如 opcode =100
的定义为 #define LOAD_CONST 100 /* Index in const list */
/* Add an opcode with an integer argument.
Returns 0 on failure, 1 on success.
*/
static int
compiler_addop_i(struct compiler *c, int opcode, int oparg)
{
struct instr *i;
int off;
off = compiler_next_instr(c, c->u->u_curblock);
if (off < 0)
return 0;
i = &c->u->u_curblock->b_instr[off];
i->i_opcode = opcode;
i->i_oparg = oparg;
i->i_hasarg = 1;
compiler_set_lineno(c, off);
return 1;
}
assemble
compiler_mod
中,compiler_xxx
构建出了 CFG 调用 assemble
函数构建出最后的 CodeObject。
static PyCodeObject *
assemble(struct compiler *c, int addNone)
{
basicblock *b, *entryblock;
struct assembler a;
int i, j, nblocks;
PyCodeObject *co = NULL;
/* Make sure every block that falls off the end returns None.
XXX NEXT_BLOCK() isn't quite right, because if the last
block ends with a jump or return b_next shouldn't set.
*/
if (!c->u->u_curblock->b_return) {
NEXT_BLOCK(c);
if (addNone)
ADDOP_O(c, LOAD_CONST, Py_None, consts);
ADDOP(c, RETURN_VALUE);
}
// 找到 entryblock
nblocks = 0;
entryblock = NULL;
for (b = c->u->u_blocks; b != NULL; b = b->b_list) {
nblocks++;
entryblock = b;
}
...
if (!assemble_init(&a, nblocks, c->u->u_firstlineno))
goto error;
dfs(c, entryblock, &a);
/* Can't modify the bytecode after computing jump offsets. */
assemble_jump_offsets(&a, c);
/* Emit code in reverse postorder from dfs. */
for (i = a.a_nblocks - 1; i >= 0; i--) {
b = a.a_postorder[i];
for (j = 0; j < b->b_iused; j++)
if (!assemble_emit(&a, &b->b_instr[j]))
goto error;
}
if (_PyString_Resize(&a.a_lnotab, a.a_lnotab_off) < 0)
goto error;
if (_PyString_Resize(&a.a_bytecode, a.a_offset) < 0)
goto error;
co = makecode(c, &a);
error:
assemble_free(&a);
return co;
}
assemble
中先调用 dfs
以后序深度优先遍历 struct compiler(CFG),将 CFG 平坦化,然后调用 assemble_jump_offsets
生成跳转地址。最后调用 makecode
生成 CodeObject。
调试的过程中,可以使用 dis
模块来查看源码对应的字节码。将源码写入 test.py
,
然后调用 python -m dis test.py
即可。
echo 'a=1' > test.py
python2 -m dis test.py
1 0 LOAD_CONST 0 (1)
3 STORE_NAME 0 (a)
6 LOAD_CONST 1 (None)
9 RETURN_VALUE
CodeObject 的结构定义在 Include/code.h
中
/* Bytecode object */
typedef struct {
PyObject_HEAD
int co_argcount; /* #arguments, except *args */
int co_nlocals; /* #local variables */
int co_stacksize; /* #entries needed for evaluation stack */
int co_flags; /* CO_..., see below */
PyObject *co_code; /* instruction opcodes */
PyObject *co_consts; /* list (constants used) */
PyObject *co_names; /* list of strings (names used) */
PyObject *co_varnames; /* tuple of strings (local variable names) */
PyObject *co_freevars; /* tuple of strings (free variable names) */
PyObject *co_cellvars; /* tuple of strings (cell variable names) */
/* The rest doesn't count for hash/cmp */
PyObject *co_filename; /* string (where it was loaded from) */
PyObject *co_name; /* string (name, for reference) */
int co_firstlineno; /* first source line number */
PyObject *co_lnotab; /* string (encoding addr<->lineno mapping) See
Objects/lnotab_notes.txt for details. */
void *co_zombieframe; /* for optimization only (see frameobject.c) */
PyObject *co_weakreflist; /* to support weakrefs to code objects */
} PyCodeObject;
Eval CodeObject
run_mod
中构建出 CodeObject 后调用 PyEval_EvalCode
运行 CodeObject。
可以看到, PyEval_EvalCode
只是简单调用了 PyEval_EvalCodeEx
。
PyObject *
PyEval_EvalCode(PyCodeObject *co, PyObject *globals, PyObject *locals)
{
return PyEval_EvalCodeEx(co,
globals, locals,
(PyObject **)NULL, 0,
(PyObject **)NULL, 0,
(PyObject **)NULL, 0,
NULL);
}
PyEval_EvalCodeEx
构建出 PyFrameObject
然后执行 PyEval_EvalFrameEx
// Python/ceval.c PyEval_EvalCodeEx
PyObject *
PyEval_EvalCodeEx(PyCodeObject *co, PyObject *globals, PyObject *locals,
PyObject **args, int argcount, PyObject **kws, int kwcount,
PyObject **defs, int defcount, PyObject *closure)
{
register PyFrameObject *f;
...
retval = PyEval_EvalFrameEx(f,0);
...
return retval;
}
FrameObject
PyFrameObject
定义在 Include/frameobject.h
中
typedef struct _frame {
PyObject_VAR_HEAD
struct _frame *f_back; /* previous frame, or NULL */
PyCodeObject *f_code; /* code segment */
PyObject *f_builtins; /* builtin symbol table (PyDictObject) */
PyObject *f_globals; /* global symbol table (PyDictObject) */
PyObject *f_locals; /* local symbol table (any mapping) */
PyObject **f_valuestack; /* points after the last local */
/* Next free slot in f_valuestack. Frame creation sets to f_valuestack.
Frame evaluation usually NULLs it, but a frame that yields sets it
to the current stack top. */
PyObject **f_stacktop;
PyObject *f_trace; /* Trace function */
/* If an exception is raised in this frame, the next three are used to
* record the exception info (if any) originally in the thread state. See
* comments before set_exc_info() -- it's not obvious.
* Invariant: if _type is NULL, then so are _value and _traceback.
* Desired invariant: all three are NULL, or all three are non-NULL. That
* one isn't currently true, but "should be".
*/
PyObject *f_exc_type, *f_exc_value, *f_exc_traceback;
PyThreadState *f_tstate;
int f_lasti; /* Last instruction if called */
/* Call PyFrame_GetLineNumber() instead of reading this field
directly. As of 2.3 f_lineno is only valid when tracing is
active (i.e. when f_trace is set). At other times we use
PyCode_Addr2Line to calculate the line from the current
bytecode index. */
int f_lineno; /* Current line number */
int f_iblock; /* index in f_blockstack */
PyTryBlock f_blockstack[CO_MAXBLOCKS]; /* for try and loop blocks */
PyObject *f_localsplus[1]; /* locals+stack, dynamically sized */
} PyFrameObject;
可以看到 FrameObject
中存储了对应的字节码,local、global、builtin 三种变量,以及数据栈等运行时必须的信息。
PyTryBlock
用来处理 Try 和 Loop(循环) 语句。在 break
continue
return
等时候可以跳转到 b_handler
记录的位置继续执行。
typedef struct {
int b_type; /* what kind of block this is */
int b_handler; /* where to jump to find handler */
int b_level; /* value stack level to pop to */
} PyTryBlock;
PyEval_EvalFrameEx
PyEval_EvalFrameEx
是 CPython 解释器最主要的求值函数,核心是一个循环里的巨大的 switch case,对不同的 opcode 执行不同的操作。
下面摘抄部分 PyEval_EvalFrameEx
代码,忽略 profile
和 debug
相关功能的代码。
可以看到在 PyEval_EvalFrameEx
的 for 循环中,先判断了锁的状态,确保同一时间只有一个线程访问解释器,然后通过 NEXTOP
等宏操作 next_instr
指针,以执行不同的字节码。
// Python/ceval.c PyEval_EvalFrameEx
PyObject *
PyEval_EvalFrameEx(PyFrameObject *f, int throwflag)
{
...
register PyObject **stack_pointer; /* Next free slot in value stack */
register unsigned char *next_instr;
register int opcode; /* Current opcode */
register int oparg; /* Current opcode argument, if any */
register enum why_code why; /* Reason for block stack unwind */
register int err; /* Error status -- nonzero if error */
register PyObject *x; /* Result object -- NULL if error */
register PyObject *v; /* Temporary objects popped off stack */
register PyObject *w;
register PyObject *u;
register PyObject *t;
register PyObject *stream = NULL; /* for PRINT opcodes */
register PyObject **fastlocals, **freevars;
PyObject *retval = NULL; /* Return value */
PyThreadState *tstate = PyThreadState_GET();
PyCodeObject *co;
unsigned char *first_instr;
PyObject *names;
PyObject *consts;
...
/* Start of code */
if (f == NULL)
return NULL;
/* push frame */
if (Py_EnterRecursiveCall(""))
return NULL;
tstate->frame = f;
...
co = f->f_code;
names = co->co_names;
consts = co->co_consts;
fastlocals = f->f_localsplus;
freevars = f->f_localsplus + co->co_nlocals;
first_instr = (unsigned char*) PyString_AS_STRING(co->co_code);
next_instr = first_instr + f->f_lasti + 1;
stack_pointer = f->f_stacktop;
assert(stack_pointer != NULL);
f->f_stacktop = NULL; /* remains NULL unless yield suspends frame */
...
why = WHY_NOT;
err = 0;
x = Py_None; /* Not a reference, just anything non-NULL */
w = NULL;
for (;;) {
...
if (interpreter_lock) {
/* Give another thread a chance */
if (PyThreadState_Swap(NULL) != tstate)
Py_FatalError("ceval: tstate mix-up");
PyThread_release_lock(interpreter_lock);
/* Other threads may run now */
PyThread_acquire_lock(interpreter_lock, 1);
if (PyThreadState_Swap(tstate) != NULL)
Py_FatalError("ceval: orphan tstate");
/* Check for thread interrupts */
if (tstate->async_exc != NULL) {
x = tstate->async_exc;
tstate->async_exc = NULL;
PyErr_SetNone(x);
Py_DECREF(x);
why = WHY_EXCEPTION;
goto on_error;
}
}
...
/* Extract opcode and argument */
opcode = NEXTOP();
oparg = 0; /* allows oparg to be stored in a register because
it doesn't have to be remembered across a full loop */
if (HAS_ARG(opcode))
oparg = NEXTARG();
dispatch_opcode:
/* Main switch on opcode */
switch (opcode) {
/* BEWARE!
It is essential that any operation that fails sets either
x to NULL, err to nonzero, or why to anything but WHY_NOT,
and that no operation that succeeds does this! */
/* case STOP_CODE: this is an error! */
TARGET_NOARG(NOP)
{
FAST_DISPATCH();
}
TARGET(LOAD_FAST)
{
x = GETLOCAL(oparg);
if (x != NULL) {
Py_INCREF(x);
PUSH(x);
FAST_DISPATCH();
}
format_exc_check_arg(PyExc_UnboundLocalError,
UNBOUNDLOCAL_ERROR_MSG,
PyTuple_GetItem(co->co_varnames, oparg));
break;
}
... // 省略剩余 opcode 对应的 case
}/* switch */
/* End the loop if we still have an error (or return) */
if (why != WHY_NOT)
break;
} /* main loop */
assert(why != WHY_YIELD);
/* Pop remaining stack entries. */
while (!EMPTY()) {
v = POP();
Py_XDECREF(v);
}
if (why != WHY_RETURN)
retval = NULL;
return retval;
}
其中 why
变量是一个表示 main loop 状态的枚举值。
/* Status code for main loop (reason for stack unwind) */
enum why_code {
WHY_NOT = 0x0001, /* No error */
WHY_EXCEPTION = 0x0002, /* Exception occurred */
WHY_RERAISE = 0x0004, /* Exception re-raised by 'finally' */
WHY_RETURN = 0x0008, /* 'return' statement */
WHY_BREAK = 0x0010, /* 'break' statement */
WHY_CONTINUE = 0x0020, /* 'continue' statement */
WHY_YIELD = 0x0040 /* 'yield' operator */
};
其中 PUSH
POP
都是对 stack_pointer 操作的宏,stack_pointer 指向 FrameObject 中的运行时栈,初始化时指向栈顶。
stack_pointer = f->f_stacktop;
NEXTOP
以及 JUMPTO
都是对 next_instr
进行操作的宏,如
#define NEXTOP() (*next_instr++)
控制下一个执行的 opcode
而 #define NEXTARG() (next_instr += 2, (next_instr[-1]<<8) + next_instr[-2])
是从字节码中取出紧跟在 opcode 之后的两字节参数(低字节在前,高字节在后)。
可以看出,CPython 虚拟机是基于栈的虚拟机,支持多线程和协程(yield),并且支持异常处理等许多语言特性。