Set up Development Environment
Clone, Build
git clone https://github.com/postgres/postgres
cd postgres
./configure \
--prefix=$(pwd)/.install \
--enable-debug \
--enable-cassert \
--without-icu
bear - make
make install
export PATH=$(pwd)/.install/bin:$PATH
Initial database cluster
initdb -D .data
Configure Clangd
compile_commands.json
: make with bear
.clangd
:
If:
PathMatch: .*\.h$
CompileFlags:
Add:
- -include postgres.h
Debug
Configure .vscode/launch.json
{
"version": "0.2.0",
"configurations": [
{
"name": "Debug PostgreSQL Backend",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceFolder}/.install/bin/postgres",
"args": [
"-D", "${workspaceFolder}/.data",
"-p", "5432",
"-c", "log_statement=all",
"-c", "log_min_messages=debug1"
],
"cwd": "${workspaceFolder}",
"environment": [
{
"name": "PGDATA",
"value": "${workspaceFolder}/.data"
},
{
"name": "PATH",
"value": "${workspaceFolder}/.install/bin:${env:PATH}"
}
],
"MIMode": "lldb"
},
{
"name": "Attach to PostgreSQL Backend",
"type": "cppdbg",
"request": "attach",
"program": "${workspaceFolder}/.install/bin/postgres",
"processId": "${command:pickProcess}",
"MIMode": "lldb"
}
]
}
Debug PostMaster
Debug Worker
pg_ctl -D .data -l .data/logfile start
psql postgres
Then check which process is serving this session.
postgres=# select pg_backend_pid();
pg_backend_pid
----------------
59407
(1 row)
Attach to this pid.
Code Structure
The following is the main directories of Postgres codebase.
src/backend
├── access
│ ├── heap
│ ├── index
│ └── transam
├── catalog
├── executor
├── main
├── optimizer
├── parser
├── postmaster
└── storage
├── buffer
├── file
├── ipc
├── lmgr
└── smgr
We will first look at storage
and access
.
The former is responsible for managing pages on disk and their buffers in shared memory.
The latter concerns the formats within a page, whether it is a heap table, B-tree index, or WAL log.
The separation of page management and page format is convenient for extending access methods or storage.
The following are examples:
pgvector
is an extension that provides support for storing vector indexes inside Postgres.neon
is a distributed database built on top of Postgres. It replaces local disk management with S3-based page servers.
Heap Access Method
Prepare:
CREATE TABLE heap_test (
id SERIAL PRIMARY KEY,
a INT NOT NULL,
b VARCHAR(255) NOT NULL,
c BOOLEAN NOT NULL
);
INSERT INTO heap_test (a, b, c)
SELECT
generate_series(1, 100),
md5(random()::text),
random() < 0.5;
Page Layout
First we use extension pageinspect
to explore the physical page layout.
Build and install pageinspect
pushd contri/pageinspect
make
make install
popd
psql -c 'CREATE EXTENSION pageinspect;'
As described in docs, a page consists of a header and multiple tuples (also refered as lines or items).
+----------------+---------------------------------+
| PageHeaderData | linp1 linp2 linp3 ... |
+-----------+----+---------------------------------+
| ... linpN | |
+-----------+--------------------------------------+
| ^ pd_lower |
| |
| v pd_upper |
+-------------+------------------------------------+
| | tupleN ... |
+-------------+------------------+-----------------+
| ... tuple3 tuple2 tuple1 | "special space" |
+--------------------------------+-----------------+
^ pd_special
PageHeaderData
is space management information generic to any page.
Fields in PageHeader
typedef struct PageHeaderData
{
/* XXX LSN is member of *any* block, not only page-organized ones */
PageXLogRecPtr pd_lsn; /* LSN: next byte after last byte of xlog
* record for last change to this page */
uint16 pd_checksum; /* checksum */
uint16 pd_flags; /* flag bits, see below */
LocationIndex pd_lower; /* offset to start of free space */
LocationIndex pd_upper; /* offset to end of free space */
LocationIndex pd_special; /* offset to start of special space */
uint16 pd_pagesize_version;
TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */
ItemIdData pd_linp[FLEXIBLE_ARRAY_MEMBER]; /* line pointer array */
} PageHeaderData;
typedef PageHeaderData *PageHeader;
pd_lsn
: identifies xlog record for last change to this page.pd_checksum
: page checksum, if set.pd_flags
: flag bits.pd_lower
: offset to start of free space.pd_upper
: offset to end of free space.pd_special
: offset to start of special space.pd_pagesize_version
: size in bytes and page layout version number.pd_prune_xid
: oldest XID among potentially prunable tuples on page.
To inspect the page in table we created above:
postgres=# select * from page_header(get_raw_page('heap_test', 0));
lsn | checksum | flags | lower | upper | special | pagesize | version | prune_xid
-----------+----------+-------+-------+-------+---------+----------+---------+-----------
0/14E84B0 | 0 | 0 | 428 | 952 | 8192 | 8192 | 4 | 0
(1 row)
Line pointer (linp
) is a fixed-size fat pointer to real tuple.
typedef struct ItemIdData
{
unsigned lp_off:15, /* offset to tuple (from start of page) */
lp_flags:2, /* state of line pointer, see below */
lp_len:15; /* byte length of tuple */
} ItemIdData;
typedef ItemIdData *ItemId;
Each heap tuple consists of HeapTupleHeader and actual data.
typedef struct HeapTupleFields
{
TransactionId t_xmin; /* inserting xact ID */
TransactionId t_xmax; /* deleting or locking xact ID */
union
{
CommandId t_cid; /* inserting or deleting command ID, or both */
TransactionId t_xvac; /* old-style VACUUM FULL xact ID */
} t_field3;
} HeapTupleFields;
struct HeapTupleHeaderData
{
union
{
HeapTupleFields t_heap;
DatumTupleFields t_datum;
} t_choice;
ItemPointerData t_ctid; /* current TID of this or newer tuple (or a
* speculative insertion token) */
/* Fields below here must match MinimalTupleData! */
#define FIELDNO_HEAPTUPLEHEADERDATA_INFOMASK2 2
uint16 t_infomask2; /* number of attributes + various flags */
#define FIELDNO_HEAPTUPLEHEADERDATA_INFOMASK 3
uint16 t_infomask; /* various flag bits, see below */
#define FIELDNO_HEAPTUPLEHEADERDATA_HOFF 4
uint8 t_hoff; /* sizeof header incl. bitmap, padding */
/* ^ - 23 bytes - ^ */
#define FIELDNO_HEAPTUPLEHEADERDATA_BITS 5
bits8 t_bits[FLEXIBLE_ARRAY_MEMBER]; /* bitmap of NULLs */
/* MORE DATA FOLLOWS AT END OF STRUCT */
};
To inspect the first 5 tuples in the table:
postgres=# select * from heap_page_items(get_raw_page('heap_test', 0)) limit 5;
lp | lp_off | lp_flags | lp_len | t_xmin | t_xmax | t_field3 | t_ctid | t_infomask2 | t_infomask | t_hoff | t_bits | t_oid | t_data
----+--------+----------+--------+--------+--------+----------+--------+-------------+------------+--------+--------+-------+----------------------------------------------------------------------------------------
1 | 8120 | 1 | 66 | 746 | 0 | 0 | (0,1) | 4 | 2306 | 24 | | | \x010000000100000043633736316331636439653330383035373531623465323266666437613432633901
2 | 8048 | 1 | 66 | 746 | 0 | 0 | (0,2) | 4 | 2306 | 24 | | | \x020000000200000043633736353635636135393064663437373330373534643664616631326236326101
3 | 7976 | 1 | 66 | 746 | 0 | 0 | (0,3) | 4 | 2306 | 24 | | | \x030000000300000043376132346439643734333731643563356163333966616230636566373034353700
4 | 7904 | 1 | 66 | 746 | 0 | 0 | (0,4) | 4 | 2306 | 24 | | | \x040000000400000043663562626233633534376435643162373465623034633362333063316637333100
5 | 7832 | 1 | 66 | 746 | 0 | 0 | (0,5) | 4 | 2306 | 24 | | | \x050000000500000043373035653238393330366232383664306434366137633534373035633361633800
(5 rows)
Heap Insert
Start a new session and then attach to the backend process. Breakpoint at heapam.c:heap_insert
. Then insert a new tuple.
INSERT INTO heap_test (a, b, c) VALUES (1, 'test', true);
At breakpoint, the call stack is:
heap_insert (src/backend/access/heap/heapam.c:2008)
heapam_tuple_insert (src/backend/access/heap/heapam_handler.c:253)
table_tuple_insert (src/include/access/tableam.h:1406)
ExecInsert (src/backend/executor/nodeModifyTable.c:1159)
ExecModifyTable (src/backend/executor/nodeModifyTable.c:4150)
ExecProcNode (src/include/executor/executor.h:274)
ExecutePlan (src/backend/executor/execMain.c:1649)
standard_ExecutorRun (src/backend/executor/execMain.c:361)
ExecutorRun (src/backend/executor/execMain.c:307)
ProcessQuery (src/backend/tcop/pquery.c:160)
PortalRunMulti (Unknown Source:0)
PortalRun (src/backend/tcop/pquery.c:789)
exec_simple_query (src/backend/tcop/postgres.c:1278)
PostgresMain (Unknown Source:0)
BackendMain (src/backend/tcop/backend_startup.c:105)
postmaster_child_launch (src/backend/postmaster/launch_backend.c:277)
BackendStartup (src/backend/postmaster/postmaster.c:3594)
ServerLoop (src/backend/postmaster/postmaster.c:1676)
PostmasterMain (src/backend/postmaster/postmaster.c:1374)
main (src/backend/main/main.c:199)
In this chapter, we focus on storage layer, which is the top 3 frames.
table_tuple_insert
wraps call to relation's table access method (tableam
). By this way, PostgreSQL can support different table access methods, such as heap, hash, and B-tree. See Table Access Method Interface Definition.
static inline void
table_tuple_insert(Relation rel, TupleTableSlot *slot, CommandId cid,
int options, struct BulkInsertStateData *bistate)
{
rel->rd_tableam->tuple_insert(rel, slot, cid, options,
bistate);
}
static const TableAmRoutine heapam_methods = {
/* ... */
.tuple_insert = heapam_tuple_insert,
/* ... */
};
heapam_tuple_insert
do the preparation and cleanup around the insertion.
static void
heapam_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid,
int options, BulkInsertState bistate)
{
bool shouldFree = true;
HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
/* Update the tuple with table oid */
slot->tts_tableOid = RelationGetRelid(relation);
tuple->t_tableOid = slot->tts_tableOid;
/* Perform the insertion, and copy the resulting ItemPointer */
heap_insert(relation, tuple, cid, options, bistate);
ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
if (shouldFree)
pfree(tuple);
}
heap_insert
is the dirty work of the insertion. Since it has ~200 LoC (src/backend/access/heap/heapam.c:2004-2185
), the whole function is not shown here for saving network traffic :). We pick the most important part of the function here.
-
prepares the tuple for insertion. We would go back to TOAST later.
/* * Fill in tuple header fields and toast the tuple if necessary. * * Note: below this point, heaptup is the data we actually intend to store * into the relation; tup is the caller's original untoasted data. */ heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
-
finds the buffer to insert the tuple into.
/* * Find buffer to insert this tuple into. If the page is all visible, * this will also pin the requisite visibility map page. */ buffer = RelationGetBufferForTuple(relation, heaptup->t_len, InvalidBuffer, options, bistate, &vmbuffer, NULL, 0);
-
puts the tuple into the buffer.
RelationPutHeapTuple(relation, buffer, heaptup, (options & HEAP_INSERT_SPECULATIVE) != 0);
-
marks the buffer as modified so that the buffer would be flushed to disk by background worker.
MarkBufferDirty(buffer);
-
WAL part, skip for now.
/* XLOG stuff */ if (RelationNeedsWAL(relation))
-
release the buffer.
UnlockReleaseBuffer(buffer); if (vmbuffer != InvalidBuffer) ReleaseBuffer(vmbuffer);
[[TODO]]
Catalog
Catalogs in PostgreSQL is also stored in (heap) tables. These tables have fixed Oid and attributes so that there is no recursive lookup.
Example: pg_class
CATALOG(pg_class,1259,RelationRelationId) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83,RelationRelation_Rowtype_Id) BKI_SCHEMA_MACRO
{
/* oid */
Oid oid;
/* class name */
NameData relname;
/* OID of namespace containing this class */
Oid relnamespace BKI_DEFAULT(pg_catalog) BKI_LOOKUP(pg_namespace);
/* OID of entry in pg_type for relation's implicit row type, if any */
Oid reltype BKI_LOOKUP_OPT(pg_type);
/* OID of entry in pg_type for underlying composite type, if any */
Oid reloftype BKI_DEFAULT(0) BKI_LOOKUP_OPT(pg_type);
/* class owner */
Oid relowner BKI_DEFAULT(POSTGRES) BKI_LOOKUP(pg_authid);
/* access method; 0 if not a table / index */
Oid relam BKI_DEFAULT(heap) BKI_LOOKUP_OPT(pg_am);
/* identifier of physical storage file */
/* relfilenode == 0 means it is a "mapped" relation, see relmapper.c */
Oid relfilenode BKI_DEFAULT(0);
/* identifier of table space for relation (0 means default for database) */
Oid reltablespace BKI_DEFAULT(0) BKI_LOOKUP_OPT(pg_tablespace);
/* # of blocks (not always up-to-date) */
int32 relpages BKI_DEFAULT(0);
/* # of tuples (not always up-to-date; -1 means "unknown") */
float4 reltuples BKI_DEFAULT(-1);
/* # of all-visible blocks (not always up-to-date) */
int32 relallvisible BKI_DEFAULT(0);
/* OID of toast table; 0 if none */
Oid reltoastrelid BKI_DEFAULT(0) BKI_LOOKUP_OPT(pg_class);
/* T if has (or has had) any indexes */
bool relhasindex BKI_DEFAULT(f);
/* T if shared across databases */
bool relisshared BKI_DEFAULT(f);
/* see RELPERSISTENCE_xxx constants below */
char relpersistence BKI_DEFAULT(p);
/* see RELKIND_xxx constants below */
char relkind BKI_DEFAULT(r);
/* number of user attributes */
int16 relnatts BKI_DEFAULT(0); /* genbki.pl will fill this in */
/*
* Class pg_attribute must contain exactly "relnatts" user attributes
* (with attnums ranging from 1 to relnatts) for this class. It may also
* contain entries with negative attnums for system attributes.
*/
/* # of CHECK constraints for class */
int16 relchecks BKI_DEFAULT(0);
/* has (or has had) any rules */
bool relhasrules BKI_DEFAULT(f);
/* has (or has had) any TRIGGERs */
bool relhastriggers BKI_DEFAULT(f);
/* has (or has had) child tables or indexes */
bool relhassubclass BKI_DEFAULT(f);
/* row security is enabled or not */
bool relrowsecurity BKI_DEFAULT(f);
/* row security forced for owners or not */
bool relforcerowsecurity BKI_DEFAULT(f);
/* matview currently holds query results */
bool relispopulated BKI_DEFAULT(t);
/* see REPLICA_IDENTITY_xxx constants */
char relreplident BKI_DEFAULT(n);
/* is relation a partition? */
bool relispartition BKI_DEFAULT(f);
/* link to original rel during table rewrite; otherwise 0 */
Oid relrewrite BKI_DEFAULT(0) BKI_LOOKUP_OPT(pg_class);
/* all Xids < this are frozen in this rel */
TransactionId relfrozenxid BKI_DEFAULT(3); /* FirstNormalTransactionId */
/* all multixacts in this rel are >= this; it is really a MultiXactId */
TransactionId relminmxid BKI_DEFAULT(1); /* FirstMultiXactId */
#ifdef CATALOG_VARLEN /* variable-length fields start here */
/* NOTE: These fields are not present in a relcache entry's rd_rel field. */
/* access permissions */
aclitem relacl[1] BKI_DEFAULT(_null_);
/* access-method-specific options */
text reloptions[1] BKI_DEFAULT(_null_);
/* partition bound node tree */
pg_node_tree relpartbound BKI_DEFAULT(_null_);
#endif
} FormData_pg_class;
The process of discovering a table:
- Scanning the
pg_class
catalog table. - Scanning the
pg_attribute
table to get column definitions.
[[TODO]]