Python Behind the Scenes Series

Comprehensive deep dive into CPython internals across multiple articles:

Part 1: CPython Virtual Machine Architecture

High-Level Overview:

P y t h P C V O o a o i u n r m r t s p t p S โ†“ e โ†“ i โ†“ u โ†“ u o r l a t u e l / r ( r E c A M f e S ( a f T B c e C y h c o g t i t d e e n s e n c e e o r d ( . a e B p t y y i g t ) o e e n n c ) e o r d a e t i e o x n e ) c u t i o n )

Core Components:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
// Simplified CPython VM structure
typedef struct {
    PyObject *f_code;        // Code object containing bytecode
    PyObject *f_globals;     // Global namespace dict
    PyObject *f_locals;      // Local namespace dict
    PyObject **f_valuestack; // Value stack for operations
    PyObject *f_trace;       // Trace function for debugging
    int f_stacksize;         // Current stack size
} PyFrameObject;

// Main evaluation loop
PyObject *
PyEval_EvalFrameEx(PyFrameObject *f, int throwflag) {
    // The heart of Python execution
    // Interprets bytecode instructions one by one
    for (;;) {
        opcode = NEXTOP();
        switch (opcode) {
            case LOAD_FAST:
                // Load local variable onto stack
                break;
            case BINARY_ADD:
                // Pop two values, add them, push result
                break;
            // ... hundreds of other opcodes
        }
    }
}

Part 2: CPython Compiler Pipeline

Compilation Phases:

  1. Lexical Analysis: Source code โ†’ tokens
  2. Parsing: Tokens โ†’ Abstract Syntax Tree (AST)
  3. AST Optimization: Tree transformations
  4. Code Generation: AST โ†’ bytecode

AST Structure Example:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# Python code
def factorial(n):
    if n <= 1:
        return 1
    return n * factorial(n - 1)

# Corresponding AST (simplified)
FunctionDef(
    name='factorial',
    args=arguments(args=[arg('n')]),
    body=[
        If(
            test=Compare(
                left=Name('n'),
                ops=[LtE()],
                comparators=[Num(1)]
            ),
            body=[Return(Num(1))],
            orelse=[
                Return(
                    BinOp(
                        left=Name('n'),
                        op=Mult(),
                        right=Call(
                            func=Name('factorial'),
                            args=[BinOp(Name('n'), Sub(), Num(1))]
                        )
                    )
                )
            ]
        )
    ]
)

Bytecode Generation:

1
2
3
4
5
6
7
8
import dis

def factorial(n):
    if n <= 1:
        return 1
    return n * factorial(n - 1)

dis.dis(factorial)

Output:

2 3 4 1 1 1 1 1 2 2 2 2 0 2 4 6 8 0 2 4 6 8 0 2 4 6 L L C P L R L L L L B C B R O O O O O E O O O O I A I E A A M P A T A A A A N L N T D D P _ D U D D D D A L A U _ _ A J _ R _ _ _ _ R _ R R F C R U C N F G F C Y F Y N A O E M O _ A L A O _ U _ _ S N _ P N V S O S N S N M V T S O _ S A T B T S U C U A T P I T L A T B T L L F U L T I T U _ E R O I E F A N P A C L L T Y S E 1 0 1 1 2 1 0 0 0 1 1 ( ( ( ( ( ( ( n 1 < 1 n f n 1 ) ) = ) ) a ) ) ) c t o r i a l )

Part 3: Bytecode Execution Engine

Stack-Based Virtual Machine:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
// Bytecode execution example: BINARY_ADD
case BINARY_ADD: {
    PyObject *right = POP();   // Pop right operand
    PyObject *left = TOP();    // Get left operand (don't pop yet)
    PyObject *sum;
    
    // Try fast path for integers
    if (PyLong_CheckExact(left) && PyLong_CheckExact(right)) {
        sum = long_add(left, right);
    } else {
        // General case: call __add__ method
        sum = PyNumber_Add(left, right);
    }
    
    SET_TOP(sum);              // Replace left operand with result
    Py_DECREF(right);          // Clean up
    if (sum == NULL) goto error;
    DISPATCH();                // Continue to next instruction
}

Execution Stack Management:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
# Python code demonstrating stack operations
x = 10
y = 20
result = x + y * 2

# Corresponding stack operations:
# LOAD_CONST 10        -> Stack: [10]
# STORE_FAST x         -> Stack: [], x = 10
# LOAD_CONST 20        -> Stack: [20]  
# STORE_FAST y         -> Stack: [], y = 20
# LOAD_FAST x          -> Stack: [10]
# LOAD_FAST y          -> Stack: [10, 20]
# LOAD_CONST 2         -> Stack: [10, 20, 2]
# BINARY_MULTIPLY      -> Stack: [10, 40]
# BINARY_ADD           -> Stack: [50]
# STORE_FAST result    -> Stack: [], result = 50

Part 4: Variable Implementation

Local Variables (LEGB Rule):

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
// Local variable access in CPython
typedef struct {
    PyObject **localsplus;     // Array of local variables
    int nlocals;               // Number of local variables
    int nfree;                 // Number of free variables (closures)
    // ... other fields
} PyCodeObject;

// Fast local variable access
#define GETLOCAL(i)     (fastlocals[i])
#define SETLOCAL(i, v)  (fastlocals[i] = v)

// LOAD_FAST implementation
case LOAD_FAST: {
    PyObject *value = GETLOCAL(oparg);
    if (value == NULL) {
        // UnboundLocalError
        goto error;
    }
    Py_INCREF(value);
    PUSH(value);
    DISPATCH();
}

Global and Built-in Lookup:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
// Global variable lookup (LOAD_GLOBAL)
case LOAD_GLOBAL: {
    PyObject *name = GETITEM(names, oparg);
    PyObject *v;
    
    // Try global namespace first
    v = PyDict_GetItem(f->f_globals, name);
    if (v == NULL) {
        // Try built-ins namespace
        v = PyDict_GetItem(f->f_builtins, name);
        if (v == NULL) {
            // NameError
            goto error;
        }
    }
    
    Py_INCREF(v);
    PUSH(v);
    DISPATCH();
}

Closure and Cell Variables:

1
2
3
4
5
6
7
def outer(x):
    def inner():
        return x  # 'x' is a free variable in inner()
    return inner

# CPython creates a 'cell' object to store 'x'
# Both outer and inner reference the same cell

Part 5: Python Object System

PyObject Structure:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
// Base object structure
typedef struct _object {
    Py_ssize_t ob_refcnt;    // Reference count
    struct _typeobject *ob_type;  // Type information
} PyObject;

// Variable-size objects
typedef struct {
    PyObject ob_base;
    Py_ssize_t ob_size;      // Number of items
} PyVarObject;

// Example: Integer object
typedef struct {
    PyObject_HEAD
    digit ob_digit[1];       // Actual integer data
} PyLongObject;

Type System:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
// Type object structure (simplified)
typedef struct _typeobject {
    PyVarObject_HEAD
    const char *tp_name;          // Type name
    Py_ssize_t tp_basicsize;      // Size of instances
    
    // Method slots
    destructor tp_dealloc;        // Destructor
    printfunc tp_print;           // Print function
    getattrfunc tp_getattr;       // Attribute access
    setattrfunc tp_setattr;       // Attribute setting
    
    // Number methods
    PyNumberMethods *tp_as_number;
    
    // Sequence methods  
    PySequenceMethods *tp_as_sequence;
    
    // Mapping methods
    PyMappingMethods *tp_as_mapping;
    
    // ... many more slots
} PyTypeObject;

Method Resolution and Dispatch:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
class A:
    def method(self):
        return "A"

class B(A):
    def method(self):
        return "B"

class C(A):
    def method(self):
        return "C"

class D(B, C):  # Multiple inheritance
    pass

# Method Resolution Order (MRO): D -> B -> C -> A -> object
print(D.__mro__)
# (<class 'D'>, <class 'B'>, <class 'C'>, <class 'A'>, <class 'object'>)

# CPython uses C3 linearization algorithm for MRO

Performance Implications:

Optimization Strategies:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# Understanding performance through internals

# 1. Local variable access is fastest (LOAD_FAST)
def fast_locals():
    x = 10
    for i in range(1000000):
        y = x + i  # x is local, very fast

# 2. Global lookups are slower (LOAD_GLOBAL)  
global_var = 10
def slower_globals():
    for i in range(1000000):
        y = global_var + i  # global_var requires dict lookup

# 3. Attribute access triggers method calls
class Point:
    def __init__(self, x, y):
        self.x = x
        self.y = y

def attribute_access():
    p = Point(1, 2)
    for i in range(1000000):
        # Each p.x triggers __getattribute__ or __getattr__
        y = p.x + i  

# 4. Built-in functions are optimized
def builtin_vs_custom():
    # Fast: built-in sum() is implemented in C
    result1 = sum(range(1000000))
    
    # Slower: Python loop with bytecode overhead
    result2 = 0
    for i in range(1000000):
        result2 += i

Memory Management Insights:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import sys

# Understanding reference counting
x = [1, 2, 3]
print(sys.getrefcount(x))  # Should be 2 (x + getrefcount parameter)

y = x  # Increment reference count
print(sys.getrefcount(x))  # Should be 3

del y  # Decrement reference count  
print(sys.getrefcount(x))  # Back to 2

# Circular references require garbage collector
class Node:
    def __init__(self, value):
        self.value = value
        self.parent = None
        self.children = []

# Create circular reference
parent = Node("parent")
child = Node("child")
parent.children.append(child)
child.parent = parent  # Circular reference

# Without GC, this would be a memory leak
# CPython's cycle detector will find and clean this up

Debugging and Introspection:

Useful Tools:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import dis
import ast
import inspect

# Disassemble bytecode
def debug_bytecode():
    def sample(x, y):
        return x + y * 2
    
    print("Bytecode:")
    dis.dis(sample)
    
    print("\nCode object attributes:")
    code = sample.__code__
    print(f"co_names: {code.co_names}")        # Global names
    print(f"co_varnames: {code.co_varnames}")  # Local names
    print(f"co_consts: {code.co_consts}")      # Constants
    print(f"co_code: {code.co_code}")          # Raw bytecode

# Examine AST
def debug_ast():
    source = "x = 1 + 2 * 3"
    tree = ast.parse(source)
    print(ast.dump(tree, indent=2))

# Frame introspection
def debug_frames():
    frame = inspect.currentframe()
    print(f"Function: {frame.f_code.co_name}")
    print(f"Locals: {frame.f_locals}")
    print(f"Globals count: {len(frame.f_globals)}")

debug_bytecode()
debug_ast()  
debug_frames()

Understanding CPython internals provides valuable insights for:

  • Performance optimization: Knowing what operations are expensive
  • Memory management: Understanding reference counting and GC
  • Debugging: Better tools for investigating issues
  • Language design: Appreciating the complexity of dynamic languages
  • Extension development: Writing efficient C extensions

This knowledge bridges the gap between high-level Python programming and low-level system understanding, making you a more effective Python developer.