CPython 源码阅读笔记(4)
原文链接 http://woodrat.xyz/2018/11/02/CPython+%e6%ba%90%e7%a0%81%e9%98%85%e8%af%bb%e7%ac%94%e8%ae%b0(4)/
注:以下为加速网络访问所做的原文缓存,经过重新格式化,可能存在格式方面的问题,或偶有遗漏信息,请以原文为准。
之前看了 a-python-interpreter-written-in-python 和 byterun,就想试试用 JAVA 解析 Python 生成的 pyc 文件,读取 bytecode 后在 JAVA 中实现解释执行。
要解析 pyc 文件,就需要知道其来龙去脉,以及是如何生成的。
pyc
根据平时编写 Python 代码的经验,pyc 文件是在我们 import 一个模块后生成的。
imp module
而官方文档中提到了 imp 模块是用来和 import 语句的具体实现机制交互的。其中:
- find_module 函数负责到 sys.path 中寻找对应的 module
- 若存在需要的 module,则调用 load_module 加载对应 module
根据之前分析 CPython 源码的经验, 标准库模块中和运行逻辑相关的函数一般对应着一个 CPython 解释器中的 C 代码实现。
import.c
如 load_module 就位于 https://github.com/python/cpython/blob/2.7/Python/import.c#L1929
/* Load an external module using the default search path and return
its module object WITH INCREMENTED REFERENCE COUNT.
Dispatches on `type` to the matching loader. Returns NULL after setting
an exception when a required file or loader is missing, when a
builtin/frozen module cannot be found, or when `type` is unknown. */
static PyObject *
load_module(char *name, FILE *fp, char *pathname, int type, PyObject *loader)
{
PyObject *modules;
PyObject *m;
int err;
/* First check that there's an open file (if we need one) */
switch (type) {
case PY_SOURCE:
case PY_COMPILED:
if (fp == NULL) {
PyErr_Format(PyExc_ValueError,
"file object required for import (type code %d)",
type);
return NULL;
}
}
/* Dispatch to the loader for this module type */
switch (type) {
case PY_SOURCE:
m = load_source_module(name, pathname, fp);
break;
case PY_COMPILED:
m = load_compiled_module(name, pathname, fp);
break;
#ifdef HAVE_DYNAMIC_LOADING
case C_EXTENSION:
m = _PyImport_LoadDynamicModule(name, pathname, fp);
break;
#endif
case PKG_DIRECTORY:
m = load_package(name, pathname);
break;
case C_BUILTIN:
case PY_FROZEN:
/* A non-empty pathname replaces the module name for these two types */
if (pathname != NULL && pathname[0] != '\0')
name = pathname;
if (type == C_BUILTIN)
err = init_builtin(name);
else
err = PyImport_ImportFrozenModule(name);
/* err < 0: fail without raising anything here;
err == 0: report the module as not found */
if (err < 0)
return NULL;
if (err == 0) {
PyErr_Format(PyExc_ImportError,
"Purported %s module %.200s not found",
type == C_BUILTIN ?
"builtin" : "frozen",
name);
return NULL;
}
/* After a successful init the module is expected to be present in the
module dict; a missing entry is reported as "not properly initialized" */
modules = PyImport_GetModuleDict();
m = PyDict_GetItemString(modules, name);
if (m == NULL) {
PyErr_Format(
PyExc_ImportError,
"%s module %.200s not properly initialized",
type == C_BUILTIN ?
"builtin" : "frozen",
name);
return NULL;
}
/* The result is returned with an incremented reference count */
Py_INCREF(m);
break;
case IMP_HOOK: {
if (loader == NULL) {
PyErr_SetString(PyExc_ImportError,
"import hook without loader");
return NULL;
}
/* Delegate to the loader object's load_module(name) method */
m = PyObject_CallMethod(loader, "load_module", "s", name);
break;
}
default:
PyErr_Format(PyExc_ImportError,
"Don't know how to import %.200s (type code %d)",
name, type);
m = NULL;
}
return m;
}
可以看到 load_module 会检查找到的 module 是PY_SOURCE
还是 PY_COMPILED
,而这两个宏分别对应着 .py
和 .pyc
文件。
/* Table of recognised module file suffixes, ended by a {0, 0} entry.
Each entry pairs a suffix with a mode string (presumably the mode the
file is opened with) and a module type constant such as PY_SOURCE or
PY_COMPILED. The RISCOS variant uses "/" instead of "." in the suffix. */
#ifdef RISCOS
static const struct filedescr _PyImport_StandardFiletab[] = {
{"/py", "U", PY_SOURCE},
{"/pyc", "rb", PY_COMPILED},
{0, 0}
};
#else
static const struct filedescr _PyImport_StandardFiletab[] = {
{".py", "U", PY_SOURCE},
#ifdef MS_WINDOWS
/* Windows additionally treats ".pyw" as Python source */
{".pyw", "U", PY_SOURCE},
#endif
{".pyc", "rb", PY_COMPILED},
{0, 0}
};
#endif
我们跟入在没有 .pyc
文件时加载 .py
源文件的 load_source_module
函数(只摘录了一部分)。
https://github.com/python/cpython/blob/2.7/Python/import.c#L1076
/* Load a source module from a given file and return its module
object WITH INCREMENTED REFERENCE COUNT. If there's a matching
byte-compiled file, use that instead.
Returns NULL on failure (via error_exit).
NOTE: this is an excerpt; local declarations and setup are elided at
the "....." line. */
static PyObject *
load_source_module(char *name, char *pathname, FILE *fp)
{
.....
cpathname = make_compiled_pathname(pathname, buf,
(size_t)MAXPATHLEN + 1);
if (cpathname != NULL &&
(fpc = check_compiled_module(pathname, mtime, cpathname))) {
/* Body elided in this excerpt; presumably obtains `co` from the
existing byte-compiled file. */
}
else {
/* No usable compiled file: compile the source file instead */
co = parse_source_module(pathname, fp);
if (co == NULL)
goto error_exit;
if (Py_VerboseFlag)
PySys_WriteStderr("import %s # from %s\n",
name, pathname);
if (cpathname) {
/* Write the .pyc unless sys.dont_write_bytecode is true;
an unset attribute counts as false */
PyObject *ro = PySys_GetObject("dont_write_bytecode");
int b = (ro == NULL) ? 0 : PyObject_IsTrue(ro);
if (b < 0)
goto error_exit;
if (!b)
write_compiled_module(co, cpathname, &st, mtime);
}
}
/* Execute the code object to create the module */
m = PyImport_ExecCodeModuleEx(name, (PyObject *)co, pathname);
Py_DECREF(co);
PyMem_FREE(buf);
return m;
error_exit:
/* co may still be unset on this path, hence XDECREF */
Py_XDECREF(co);
PyMem_FREE(buf);
return NULL;
}
可以看到 load_source_module
同样会去找一次 .pyc
文件,在找不到的情况下,会先解析源文件,
得到 codeobject 后调用 write_compiled_module
生成 .pyc
文件,再执行 import 逻辑。
write_compiled_module
所以,write_compiled_module
函数中应该就对应着我们的 pyc 文件生成逻辑了。
https://github.com/python/cpython/blob/2.7/Python/import.c#L951
/* Write the code object `co` to the compiled file `cpathname`.
The file receives, in order: the magic number, a 4-byte mtime field and
the marshalled code object. The mtime is first written as 0 and only
filled in once the rest has been flushed successfully.
Failures are not reported to the caller: they are logged to stderr when
Py_VerboseFlag is set, and a partially written file is unlinked. */
static void
write_compiled_module(PyCodeObject *co, char *cpathname, struct stat *srcstat, time_t mtime)
{
FILE *fp;
/* Derive the file mode from the source file's mode, minus execute bits */
#ifdef MS_WINDOWS /* since Windows uses different permissions */
mode_t mode = srcstat->st_mode & ~S_IEXEC;
/* Issue #6074: We ensure user write access, so we can delete it later
* when the source file changes. (On POSIX, this only requires write
* access to the directory, on Windows, we need write access to the file
* as well)
*/
mode |= _S_IWRITE;
#else
mode_t mode = srcstat->st_mode & ~S_IXUSR & ~S_IXGRP & ~S_IXOTH;
#endif
fp = open_exclusive(cpathname, mode);
if (fp == NULL) {
if (Py_VerboseFlag)
PySys_WriteStderr(
"# can't create %s\n", cpathname);
return;
}
/* Header field 1: magic number */
PyMarshal_WriteLongToFile(pyc_magic, fp, Py_MARSHAL_VERSION);
/* First write a 0 for mtime */
PyMarshal_WriteLongToFile(0L, fp, Py_MARSHAL_VERSION);
/* Payload: the marshalled code object */
PyMarshal_WriteObjectToFile((PyObject *)co, fp, Py_MARSHAL_VERSION);
if (fflush(fp) != 0 || ferror(fp)) {
if (Py_VerboseFlag)
PySys_WriteStderr("# can't write %s\n", cpathname);
/* Don't keep partial file */
fclose(fp);
(void) unlink(cpathname);
return;
}
/* Now write the true mtime (as a 32-bit field) */
/* Offset 4 is where the placeholder was written, right after the magic */
fseek(fp, 4L, 0);
assert(mtime <= 0xFFFFFFFF);
PyMarshal_WriteLongToFile((long)mtime, fp, Py_MARSHAL_VERSION);
fflush(fp);
fclose(fp);
if (Py_VerboseFlag)
PySys_WriteStderr("# wrote %s\n", cpathname);
}
可以看到,pyc
文件的生成大致分下面几步:
- 1.创建目标 pyc 文件
- 2.首先调用 PyMarshal_WriteLongToFile 序列化 magic number 到文件中
- 3.然后序列化一个空的时间戳到文件中
- 4.调用 PyMarshal_WriteObjectToFile 将 PyCodeObject 序列化到文件
- 5.写完 CodeObject 后,fseek 到时间戳的位置,填充真实的时间戳
其中,magic number 定义于 import.c 头部
/*
Python 2.7a0: 62171 (optimize list comprehensions/change LIST_APPEND)
Python 2.7a0: 62181 (optimize conditional branches:
introduce POP_JUMP_IF_FALSE and POP_JUMP_IF_TRUE)
Python 2.7a0 62191 (introduce SETUP_WITH)
Python 2.7a0 62201 (introduce BUILD_SET)
Python 2.7a0 62211 (introduce MAP_ADD and SET_ADD)
.
*/
/* MAGIC combines the version number from the list above (low 16 bits)
with '\r' shifted into bits 16-23 and '\n' shifted into bits 24-31. */
#define MAGIC (62211 | ((long)'\r'<<16) | ((long)'\n'<<24))
/* Magic word as global; note that _PyImport_Init() can change the
value of this global to accommodate for alterations of how the
compiler works which are enabled by command line switches. */
static long pyc_magic = MAGIC;
而 PyMarshal_WriteLongToFile
和 PyMarshal_WriteObjectToFile
定义于 marshal.c 中。
marshal.c
PyMarshal_WriteLongToFile
我们先来看 PyMarshal_WriteLongToFile
https://github.com/python/cpython/blob/2.7/Python/marshal.c#L462
/* Write the long `x` to the open stream `fp`.
Builds a temporary WFILE that targets `fp` (the in-memory buffer fields
are set to NULL) and hands it to w_long().
version currently has no effect for writing longs. */
void
PyMarshal_WriteLongToFile(long x, FILE *fp, int version)
{
WFILE wf;
wf.fp = fp;
wf.str = NULL;
wf.ptr = NULL;
wf.end = NULL;
wf.error = WFERR_OK;
wf.depth = 0;
wf.strings = NULL;
wf.version = version;
w_long(x, &wf);
}
PyMarshal_WriteLongToFile
创建了 WFILE,将打开的文件指针(FILE *)赋值给 WFILE 的 fp 字段,并调用 w_long。
用于表示写入的 pyc 文件的 WFILE 结构如下。
/* State of one marshal write (or read) operation. */
typedef struct {
FILE *fp; /* target stream; NULL means the buffer fields below are used */
int error; /* see WFERR_* values */
int depth; /* nesting depth counter */
/* If fp == NULL, the following are valid: */
PyObject *str;
char *ptr; /* next position to write in the buffer */
char *end; /* buffer is full when ptr reaches end */
PyObject *strings; /* dict on marshal, list on unmarshal */
int version; /* marshal format version */
} WFILE;
跟入 w_long
/* Write the low 32 bits of `x` to `p` as four bytes,
least-significant byte first. */
static void
w_long(long x, WFILE *p)
{
w_byte((char)( x & 0xff), p);
w_byte((char)((x>> 8) & 0xff), p);
w_byte((char)((x>>16) & 0xff), p);
w_byte((char)((x>>24) & 0xff), p);
}
可以看到 w_long
只是调用了四次 w_byte
将一个 type 为 long 的数的低 4 个字节按小端序(低位字节在前)写入到文件中。
/* Write one byte `c` to `p`: to p->fp when a stream is set; otherwise
into the memory buffer at p->ptr, or via w_more() once ptr has reached
end. NOTE: expands to a bare if/else chain (not wrapped in
do { } while (0)) and evaluates `p` more than once. */
#define w_byte(c, p) if (((p)->fp)) putc((c), (p)->fp); \
else if ((p)->ptr != (p)->end) *(p)->ptr++ = (c); \
else w_more(c, p)
w_byte
宏在 fp 不为空时,简单地将传入的一字节内容写入到 WFILE->fp ,即对应的 pyc 文件中(fp 为空时则写入内存缓冲区)。marshal
中的序列化写入操作都是基于 w_byte
封装的。
PyMarshal_WriteObjectToFile
PyMarshal_WriteObjectToFile
相对之前的 PyMarshal_WriteLongToFile
更加的复杂了,用于将 Python 对象序列化到文件中。
/* Marshal the object `x` to the open stream `fp`.
Builds a temporary WFILE that targets `fp` and hands it to w_object().
For version > 0 a fresh dict is allocated for wf.strings and released
afterwards; wf.error is not inspected or returned here. */
void
PyMarshal_WriteObjectToFile(PyObject *x, FILE *fp, int version)
{
WFILE wf;
wf.fp = fp;
wf.str = NULL;
wf.ptr = NULL;
wf.end = NULL;
wf.error = WFERR_OK;
wf.depth = 0;
/* PyDict_New() may yield NULL; XDECREF below tolerates that */
wf.strings = (version > 0) ? PyDict_New() : NULL;
wf.version = version;
w_object(x, &wf);
Py_XDECREF(wf.strings);
}
可以看到 PyMarshal_WriteObjectToFile
调用的是 w_object
,是 marshal 最复杂的一个函数。
https://github.com/python/cpython/blob/2.7/Python/marshal.c#L212
/* Serialize the object `v` to `p`: a one-byte type tag, followed by a
type-specific payload where there is one.
Failures are recorded in p->error rather than returned. p->depth is
incremented on entry and decremented on exit.
NOTE: excerpt; branches for other types are elided at the "...." line. */
static void
w_object(PyObject *v, WFILE *p)
{
Py_ssize_t i, n;
p->depth++;
if (p->depth > MAX_MARSHAL_STACK_DEPTH) {
p->error = WFERR_NESTEDTOODEEP;
}
/* Singletons and NULL are encoded by the tag alone */
else if (v == NULL) {
w_byte(TYPE_NULL, p);
}
else if (v == Py_None) {
w_byte(TYPE_NONE, p);
}
else if (v == PyExc_StopIteration) {
w_byte(TYPE_STOPITER, p);
}
else if (v == Py_Ellipsis) {
w_byte(TYPE_ELLIPSIS, p);
}
else if (v == Py_False) {
w_byte(TYPE_FALSE, p);
}
else if (v == Py_True) {
w_byte(TYPE_TRUE, p);
}
else if (PyInt_CheckExact(v)) {
/* int: tag followed by the value written with w_long() */
long x = PyInt_AS_LONG((PyIntObject *)v);
w_byte(TYPE_INT, p);
w_long(x, p);
}
else if (PyLong_CheckExact(v)) {
PyLongObject *ob = (PyLongObject *)v;
w_PyLong(ob, p);
}
....
else if (PyCode_Check(v)) {
/* code object: tag, then the fields in this fixed order; the int
fields go through w_long(), the object fields recurse into w_object() */
PyCodeObject *co = (PyCodeObject *)v;
w_byte(TYPE_CODE, p);
w_long(co->co_argcount, p);
w_long(co->co_nlocals, p);
w_long(co->co_stacksize, p);
w_long(co->co_flags, p);
w_object(co->co_code, p);
w_object(co->co_consts, p);
w_object(co->co_names, p);
w_object(co->co_varnames, p);
w_object(co->co_freevars, p);
w_object(co->co_cellvars, p);
w_object(co->co_filename, p);
w_object(co->co_name, p);
w_long(co->co_firstlineno, p);
w_object(co->co_lnotab, p);
}
else {
/* Unsupported type: emit the "unknown" tag and flag the error */
w_byte(TYPE_UNKNOWN, p);
p->error = WFERR_UNMARSHALLABLE;
}
/* Label has no visible goto in this excerpt; presumably targeted from
the elided branches */
exit:
p->depth--;
}
w_object
的主要逻辑为读取传入的 PyObject *v
的具体类型,调用 w_byte
写入一个字节的类型数据,然后调用不同的 w_
系列函数序列化对应类型的数据。
这里我们省略其他类型的代码,重点看下 PyCodeObject
类型的处理。可以看到,w_object
只是简单地将 PyCodeObject
中每个成员变量依次序列化到文件中,我们只需要按照 type
object
的顺序去反序列化即可得到对应的内容。
TYPE 相关的宏定义于marshal.c#L27
/* One-character type tags. w_object() writes the tag as a single byte
(via w_byte) to identify the kind of value being marshalled. */
#define TYPE_NULL '0'
#define TYPE_NONE 'N'
#define TYPE_FALSE 'F'
#define TYPE_TRUE 'T'
#define TYPE_STOPITER 'S'
#define TYPE_ELLIPSIS '.'
#define TYPE_INT 'i'
#define TYPE_INT64 'I'
#define TYPE_FLOAT 'f'
#define TYPE_BINARY_FLOAT 'g'
#define TYPE_COMPLEX 'x'
#define TYPE_BINARY_COMPLEX 'y'
#define TYPE_LONG 'l'
#define TYPE_STRING 's'
#define TYPE_INTERNED 't'
#define TYPE_STRINGREF 'R'
#define TYPE_TUPLE '('
#define TYPE_LIST '['
#define TYPE_DICT '{'
#define TYPE_CODE 'c'
#define TYPE_UNICODE 'u'
#define TYPE_UNKNOWN '?'
#define TYPE_SET '<'
#define TYPE_FROZENSET '>'
使用 JAVA 反序列化 pyc
文件参考 PycFile.java 。
pyc 文件结构(Struct of pyc)
根据上面的分析,我们可以得出 pyc 文件的格式如下,其中 PyCodeObject
部分为变长,需要参考 w_object
进行反序列化。
----------
magic number 4 bytes
----------
timestamp 4 bytes
----------
PyCodeObject
PyCodeObject
根据上面的分析,我们知道了 pyc 文件中最主要的内容为序列化的 PyCodeObject
,接下来我们就分析一下 PyCodeObject
的结构,以及如何生成及如何被解释执行。
PyCodeObject
定义于 Include/code.h#L10
/* Bytecode object.
The fields from co_argcount through co_lnotab are the ones w_object()
marshals, in this same order; co_zombieframe and co_weakreflist are not
written by it. */
typedef struct {
PyObject_HEAD
int co_argcount; /* #arguments, except *args */
int co_nlocals; /* #local variables */
int co_stacksize; /* #entries needed for evaluation stack */
int co_flags; /* CO_..., see below */
PyObject *co_code; /* instruction opcodes */
PyObject *co_consts; /* list (constants used) */
PyObject *co_names; /* list of strings (names used) */
PyObject *co_varnames; /* tuple of strings (local variable names) */
PyObject *co_freevars; /* tuple of strings (free variable names) */
PyObject *co_cellvars; /* tuple of strings (cell variable names) */
/* The rest doesn't count for hash/cmp */
PyObject *co_filename; /* string (where it was loaded from) */
PyObject *co_name; /* string (name, for reference) */
int co_firstlineno; /* first source line number */
PyObject *co_lnotab; /* string (encoding addr<->lineno mapping) See
Objects/lnotab_notes.txt for details. */
void *co_zombieframe; /* for optimization only (see frameobject.c) */
PyObject *co_weakreflist; /* to support weakrefs to code objects */
} PyCodeObject;
上面 load_source_module
中可以看到 pyc
文件的 PyCodeObject
是调用 parse_source_module
生成的。
/* Parse the source file `fp` into an AST and compile it to a code object.
Returns NULL when the arena cannot be created, when parsing yields no
AST, or when compilation yields NULL. */
static PyCodeObject *
parse_source_module(const char *pathname, FILE *fp)
{
PyCodeObject *co = NULL;
mod_ty mod;
PyCompilerFlags flags;
PyArena *arena = PyArena_New();
if (arena == NULL)
return NULL;
flags.cf_flags = 0;
/* Step 1: source file -> AST */
mod = PyParser_ASTFromFile(fp, pathname, Py_file_input, 0, 0, &flags,
NULL, arena);
if (mod) {
/* Step 2: AST -> code object */
co = PyAST_Compile(mod, pathname, NULL, arena);
}
/* The arena is freed on both success and failure; co is returned after that */
PyArena_Free(arena);
return co;
}
我们在第一篇CPython源码阅读笔记(1) 中曾经分析过从 PyParser_ASTFromString
开始的代码生成流程,这里的逻辑和之前一致。
即在 compiler_mod
阶段划分好了 CFG ,然后按照 CFG 遍历生成 PyCodeObject
。其中最外层为一个入口的 Block,嵌套的生成多个 code object。
代码生成测试
创建 test.py 如下
def test1(a, b):
return a+b
c = test1(1,2)
在同级目录启动一个 Python 终端。
>>> f = open('test.py').read()
>>> co = compile(f, 'test.py', 'exec')
>>> import dis
>>> dis.dis(co)
1 0 LOAD_CONST 0 (<code object test1 at 0x1028d15b0, file "test.py", line 1>)
3 MAKE_FUNCTION 0
6 STORE_NAME 0 (test1)
4 9 LOAD_NAME 0 (test1)
12 LOAD_CONST 1 (1)
15 LOAD_CONST 2 (2)
18 CALL_FUNCTION 2
21 STORE_NAME 1 (c)
24 LOAD_CONST 3 (None)
27 RETURN_VALUE
可以看到 test1 函数生成了单独的一个 code object。
查看最外层 code object 的 co_consts
后找到了对应的 test1 函数的 code object 。
>>> co.co_consts
(<code object test1 at 0x1028d15b0, file "test.py", line 1>, 1, 2, None)
接着我们可以根据 PyCodeObject
的各个属性的名字猜测并查看其内容。
>>> co_test1 = co.co_consts[0]
>>> co_test1.co_argcount
2
>>> co_test1.co_varnames
('a', 'b')
>>> co_test1.co_code
'|\x00\x00|\x01\x00\x17S'
>>> dis.dis(co_test1)
2 0 LOAD_FAST 0 (a)
3 LOAD_FAST 1 (b)
6 BINARY_ADD
7 RETURN_VALUE
调试
按照第一篇文章中的方法,我们可以试着调试一下 test.py 的编译过程。
compiler_mod
在编译的入口函数 compiler_mod
处下断点,运行 test.py
。
> gdb python
(gdb) b compiler_mod
Breakpoint 1 at 0xe4602: file Python/compile.c, line 1219.
(gdb) r test.py
Starting program: /mnt/e/codes/Python-2.7.10/python.exe test.py
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
compiler_body
单步跟入 compiler_mod
函数,可以看到传入的 mod 为 Module_kind
,所以接下来跟入
compiler_body
。
(gdb) s
compiler_mod (mod=0x8695e08, c=0x7ffffffee620) at Python/compile.c:1219
1219 if (!module) {
(gdb) l
1214 compiler_mod(struct compiler *c, mod_ty mod)
1215 {
1216 PyCodeObject *co;
1217 int addNone = 1;
1218 static PyObject *module;
1219 if (!module) {
1220 module = PyString_InternFromString("<module>");
1221 if (!module)
1222 return NULL;
1223 }
(gdb) p c
$1 = (struct compiler *) 0x7ffffffee620
(gdb) p *c
$2 = {c_filename = 0x7ffffffeebbd "test.py", c_st = 0x863b190, c_future = 0x7fffff7c94a0, c_flags = 0x7ffffffee7dc, c_interactive = 0,
c_nestlevel = 0, u = 0x0, c_stack = 0x7fffff6a6f80, c_arena = 0x864e520}
(gdb) p mod
$3 = (mod_ty) 0x8695e08
(gdb) p *mod
$4 = {kind = Module_kind, v = {Module = {body = 0x8695ab0}, Interactive = {body = 0x8695ab0}, Expression = {body = 0x8695ab0}, Suite = {
body = 0x8695ab0}}}
compiler_mod
/* Compile a top-level AST node `mod` into a code object.
Enters a scope named "<module>", generates code according to mod->kind,
then assembles the result. Returns NULL/0 on failure. */
static PyCodeObject *
compiler_mod(struct compiler *c, mod_ty mod)
{
PyCodeObject *co;
int addNone = 1; /* passed to assemble(); cleared for Expression_kind */
/* Interned "<module>" string, created on the first call and reused */
static PyObject *module;
if (!module) {
module = PyString_InternFromString("<module>");
if (!module)
return NULL;
}
/* Use 0 for firstlineno initially, will fixup in assemble(). */
if (!compiler_enter_scope(c, module, mod, 0))
return NULL;
switch (mod->kind) {
case Module_kind:
if (!compiler_body(c, mod->v.Module.body)) {
compiler_exit_scope(c);
return 0;
}
break;
case Interactive_kind:
c->c_interactive = 1;
VISIT_SEQ_IN_SCOPE(c, stmt,
mod->v.Interactive.body);
break;
case Expression_kind:
VISIT_IN_SCOPE(c, expr, mod->v.Expression.body);
addNone = 0;
break;
/* NOTE(review): the two error branches below return without calling
compiler_exit_scope(), unlike the Module_kind failure path - confirm
this is intended. */
case Suite_kind:
PyErr_SetString(PyExc_SystemError,
"suite should not be possible");
return 0;
default:
PyErr_Format(PyExc_SystemError,
"module kind %d should not be possible",
mod->kind);
return 0;
}
co = assemble(c, addNone);
compiler_exit_scope(c);
return co;
}
在 compiler_body
处下断点,然后跟入该函数。
(gdb) b compiler_body
Breakpoint 2 at 0x80e392a: compiler_body. (2 locations)
(gdb) c
Continuing.
Breakpoint 2, compiler_mod (mod=0x8695e08, c=0x7ffffffee620) at Python/compile.c:1229
1229 if (!compiler_body(c, mod->v.Module.body)) {
(gdb) s
compiler_body (stmts=0x8695ab0, c=0x7ffffffee620) at Python/compile.c:1198
1198 if (!asdl_seq_LEN(stmts))
可以看到,compiler_body
只是简单地将 stmts 中的元素取出,通过 VISIT
宏进行代码生成。
/* Generate code for a sequence of statements.
Returns 1 on success (including an empty sequence) and 0 on failure;
the VISIT macro returns 0 from this function when a visit fails. */
static int
compiler_body(struct compiler *c, asdl_seq *stmts)
{
int i = 0;
stmt_ty st;
if (!asdl_seq_LEN(stmts))
return 1;
st = (stmt_ty)asdl_seq_GET(stmts, 0);
/* A leading docstring is compiled as an expression and stored to
__doc__; the statement loop then starts at index 1 */
if (compiler_isdocstring(st) && Py_OptimizeFlag < 2) {
/* don't generate docstrings if -OO */
i = 1;
VISIT(c, expr, st->v.Expr.value);
if (!compiler_nameop(c, __doc__, Store))
return 0;
}
for (; i < asdl_seq_LEN(stmts); i++)
VISIT(c, stmt, (stmt_ty)asdl_seq_GET(stmts, i));
return 1;
}
/* Call compiler_visit_<TYPE>(C, V) and make the ENCLOSING function
return 0 if the visit fails. The callee name is built by token pasting,
so TYPE must be a bare identifier such as stmt or expr. */
#define VISIT(C, TYPE, V) {\
if (!compiler_visit_ ## TYPE((C), (V))) \
return 0; \
}
VISIT(c, stmt, (stmt_ty)asdl_seq_GET(stmts, i));
展开其实就是compiler_visit_stmt(c, (stmt_ty)asdl_seq_GET(stmts, i))
。
单步跟入循环中的 VISIT 调用,查看传入的 stmt 参数,为 Include/Python-ast.h
中定义的 struct _stmt
,即 stmt AST Node(stmt 的语法树节点)。
(gdb) s
compiler_visit_stmt (c=0x7ffffffee620, s=0x8695c58) at Python/compile.c:2117
2117 c->u->u_lineno = s->lineno;
(gdb) p *s
$19 = {kind = FunctionDef_kind, v = {FunctionDef = {name = 0x7fffff67a480, args = 0x8695b50, body = 0x8695b70, decorator_list = 0x0},
....
Python-ast.h
中定义了 Python 中 stmt 的类型。
https://github.com/python/cpython/blob/2.7/Include/Python-ast.h#L62
/* Discriminator for statement AST nodes: one value per statement kind,
numbered from 1. compiler_visit_stmt() switches on it via s->kind. */
enum _stmt_kind {FunctionDef_kind=1, ClassDef_kind=2, Return_kind=3,
Delete_kind=4, Assign_kind=5, AugAssign_kind=6, Print_kind=7,
For_kind=8, While_kind=9, If_kind=10, With_kind=11,
Raise_kind=12, TryExcept_kind=13, TryFinally_kind=14,
Assert_kind=15, Import_kind=16, ImportFrom_kind=17,
Exec_kind=18, Global_kind=19, Expr_kind=20, Pass_kind=21,
Break_kind=22, Continue_kind=23};
compiler_visit_stmt
https://github.com/python/cpython/blob/2.7/Python/compile.c#L2074
/* Generate code for one statement node, dispatching on s->kind.
NOTE: excerpt; every case except FunctionDef_kind is elided at "...". */
static int
compiler_visit_stmt(struct compiler *c, stmt_ty s)
{
int i, n;
/* Always assign a lineno to the next instruction for a stmt. */
c->u->u_lineno = s->lineno;
c->u->u_lineno_set = false;
switch (s->kind) {
case FunctionDef_kind:
return compiler_function(c, s);
...
}
return 1;
}
因为这里第一次传入的 stmt 的类型为 FunctionDef_kind
,这里会调用 compiler_function
。
(gdb) n
2120 switch (s->kind) {
(gdb) n
2122 return compiler_function(c, s);
compiler_function
跟入 compiler_function
, 这里即是真正的代码生成逻辑。
https://github.com/python/cpython/blob/2.7/Python/compile.c#L1351
/* Generate code for a function definition statement.
Emits the decorators and default values in the current scope, compiles
the body into its own code object in a new scope, then emits the code
that builds the function and stores it under its name.
Returns 0 on failure.
NOTE: excerpt; the body-compilation part is elided at the "...." line. */
static int
compiler_function(struct compiler *c, stmt_ty s)
{
PyCodeObject *co;
PyObject *first_const = Py_None;
arguments_ty args = s->v.FunctionDef.args;
asdl_seq* decos = s->v.FunctionDef.decorator_list;
stmt_ty st;
int i, n, docstring;
assert(s->kind == FunctionDef_kind);
/* Decorators and defaults are handled before entering the new scope */
if (!compiler_decorators(c, decos))
return 0;
if (args->defaults)
VISIT_SEQ(c, expr, args->defaults);
if (!compiler_enter_scope(c, s->v.FunctionDef.name, (void *)s,
s->lineno))
return 0;
st = (stmt_ty)asdl_seq_GET(s->v.FunctionDef.body, 0);
/* unpack nested arguments */
compiler_arguments(c, args);
c->u->u_argcount = asdl_seq_LEN(args->args);
n = asdl_seq_LEN(s->v.FunctionDef.body);
....
/* Assemble the function body into its own code object, then leave the
function's scope */
co = assemble(c, 1);
compiler_exit_scope(c);
if (co == NULL)
return 0;
compiler_make_closure(c, co, asdl_seq_LEN(args->defaults));
Py_DECREF(co);
/* One CALL_FUNCTION with a single argument per decorator */
for (i = 0; i < asdl_seq_LEN(decos); i++) {
ADDOP_I(c, CALL_FUNCTION, 1);
}
/* Finally bind the result to the function's name */
return compiler_nameop(c, s->v.FunctionDef.name, Store);
}
这里逻辑比较复杂,就不贴调试的过程了。大致的流程为
- 将 FunctionDef AST Node 中的一些 metadata 存储到 compiler 对象中。
- 调用 assemble 将函数体生成单独的 code object。
- 调用
compiler_make_closure
生成LOAD_CONST
和MAKE_FUNCTION
两个opcode。
/* Emit the instructions that turn the code object `co` into a function.
When `co` has no free variables this is LOAD_CONST (the code object)
followed by MAKE_FUNCTION with `args` as its operand.
NOTE: excerpt; the path for code objects with free variables is elided
at "...". */
static int
compiler_make_closure(struct compiler *c, PyCodeObject *co, int args)
{
int i, free = PyCode_GetNumFree(co);
if (free == 0) {
ADDOP_O(c, LOAD_CONST, (PyObject*)co, consts);
ADDOP_I(c, MAKE_FUNCTION, args);
return 1;
}
...
}
- 调用
compiler_nameop
生成STORE_NAME
opcode。
/* Emit the load/store/delete instruction for `name` in context `ctx`.
Returns 0 on failure.
NOTE: excerpt; several parts are elided at the "...." lines, including
the declarations of `dict` and `mangled` and every optype case other
than OP_NAME. */
static int
compiler_nameop(struct compiler *c, identifier name, expr_context_ty ctx)
{
int op, scope, arg;
enum { OP_FAST, OP_GLOBAL, OP_DEREF, OP_NAME } optype;
....
op = 0;
optype = OP_NAME;
....
switch (optype) {
case OP_NAME:
/* Map the expression context to the matching *_NAME opcode */
switch (ctx) {
case Load: op = LOAD_NAME; break;
case Store: op = STORE_NAME; break;
case Del: op = DELETE_NAME; break;
case AugLoad:
case AugStore:
break;
case Param:
default:
PyErr_SetString(PyExc_SystemError,
"param invalid for name variable");
return 0;
}
break;
}
/* op is still 0 here if no opcode was selected above */
assert(op);
/* The operand is the index returned by compiler_add_o() for the name */
arg = compiler_add_o(c, dict, mangled);
Py_DECREF(mangled);
if (arg < 0)
return 0;
return compiler_addop_i(c, op, arg);
}
至此生成了下面的字节码
LOAD_CONST 0
MAKE_FUNCTION 0
STORE_NAME 0
对应源码中的
def test1(a, b):
return a+b
Py_OPCODE
字节码对应的数字定义于 opcode.h 。
其中 HAS_ARG
宏定义了字节码是否带有参数(通过判断字节码对应的数字是否大于等于指定的值)。
#define HAS_ARG(op) ((op) >= HAVE_ARGUMENT)
在 Python2.7 中这个值为 90
#define HAVE_ARGUMENT 90 /* Opcodes from here have an argument: */