Keyboard shortcuts

Press or to navigate between chapters

Press S or / to search in the book

Press ? to show this help

Press Esc to hide this help

Heap Access Method

Prepare:

CREATE TABLE heap_test (
  id SERIAL PRIMARY KEY,
  a INT NOT NULL,
  b VARCHAR(255) NOT NULL,
  c BOOLEAN NOT NULL
);
INSERT INTO heap_test (a, b, c)
SELECT
  generate_series(1, 100),
  md5(random()::text),
  random() < 0.5;

Page Layout

First we use extension pageinspect to explore the physical page layout.

Build and install pageinspect
pushd contri/pageinspect
make
make install
popd
psql -c 'CREATE EXTENSION pageinspect;'

As described in docs, a page consists of a header and multiple tuples (also refered as lines or items).

+----------------+---------------------------------+
| PageHeaderData | linp1 linp2 linp3 ...           |
+-----------+----+---------------------------------+
| ... linpN |                                      |
+-----------+--------------------------------------+
|           ^ pd_lower                             |
|                                                  |
|             v pd_upper                           |
+-------------+------------------------------------+
|             | tupleN ...                         |
+-------------+------------------+-----------------+
|       ... tuple3 tuple2 tuple1 | "special space" |
+--------------------------------+-----------------+
                                 ^ pd_special

PageHeaderData is space management information generic to any page.

Fields in PageHeader
typedef struct PageHeaderData
{
  /* XXX LSN is member of *any* block, not only page-organized ones */
  PageXLogRecPtr pd_lsn;                     /* LSN: next byte after last byte of xlog
                                              * record for last change to this page */
  uint16 pd_checksum;                        /* checksum */
  uint16 pd_flags;                           /* flag bits, see below */
  LocationIndex pd_lower;                    /* offset to start of free space */
  LocationIndex pd_upper;                    /* offset to end of free space */
  LocationIndex pd_special;                  /* offset to start of special space */
  uint16 pd_pagesize_version;
  TransactionId pd_prune_xid;                /* oldest prunable XID, or zero if none */
  ItemIdData pd_linp[FLEXIBLE_ARRAY_MEMBER]; /* line pointer array */
} PageHeaderData;

typedef PageHeaderData *PageHeader;
  • pd_lsn: identifies xlog record for last change to this page.
  • pd_checksum: page checksum, if set.
  • pd_flags: flag bits.
  • pd_lower: offset to start of free space.
  • pd_upper: offset to end of free space.
  • pd_special: offset to start of special space.
  • pd_pagesize_version: size in bytes and page layout version number.
  • pd_prune_xid: oldest XID among potentially prunable tuples on page.

To inspect the page in table we created above:

postgres=# select * from page_header(get_raw_page('heap_test', 0));
    lsn    | checksum | flags | lower | upper | special | pagesize | version | prune_xid 
-----------+----------+-------+-------+-------+---------+----------+---------+-----------
 0/14E84B0 |        0 |     0 |   428 |   952 |    8192 |     8192 |       4 |         0
(1 row)

Line pointer (linp) is a fixed-size fat pointer to real tuple.

typedef struct ItemIdData
{
 unsigned lp_off:15,  /* offset to tuple (from start of page) */
          lp_flags:2, /* state of line pointer, see below */
          lp_len:15;  /* byte length of tuple */
} ItemIdData;

typedef ItemIdData *ItemId;

Each heap tuple consists of HeapTupleHeader and actual data.

typedef struct HeapTupleFields
{
  TransactionId t_xmin;   /* inserting xact ID */
  TransactionId t_xmax;   /* deleting or locking xact ID */

  union
  {
    CommandId t_cid;      /* inserting or deleting command ID, or both */
    TransactionId t_xvac; /* old-style VACUUM FULL xact ID */
  } t_field3;
} HeapTupleFields;

struct HeapTupleHeaderData
{
  union
  {
    HeapTupleFields t_heap;
   DatumTupleFields t_datum;
  } t_choice;

  ItemPointerData t_ctid;  /* current TID of this or newer tuple (or a
                            * speculative insertion token) */

  /* Fields below here must match MinimalTupleData! */

#define FIELDNO_HEAPTUPLEHEADERDATA_INFOMASK2 2
  uint16 t_infomask2;      /* number of attributes + various flags */

#define FIELDNO_HEAPTUPLEHEADERDATA_INFOMASK 3
  uint16 t_infomask;       /* various flag bits, see below */

#define FIELDNO_HEAPTUPLEHEADERDATA_HOFF 4
  uint8 t_hoff;            /* sizeof header incl. bitmap, padding */

  /* ^ - 23 bytes - ^ */

#define FIELDNO_HEAPTUPLEHEADERDATA_BITS 5
  bits8 t_bits[FLEXIBLE_ARRAY_MEMBER]; /* bitmap of NULLs */

  /* MORE DATA FOLLOWS AT END OF STRUCT */
};

To inspect the first 5 tuples in the table:

postgres=# select * from heap_page_items(get_raw_page('heap_test', 0)) limit 5;
 lp | lp_off | lp_flags | lp_len | t_xmin | t_xmax | t_field3 | t_ctid | t_infomask2 | t_infomask | t_hoff | t_bits | t_oid |                                         t_data                                         
----+--------+----------+--------+--------+--------+----------+--------+-------------+------------+--------+--------+-------+----------------------------------------------------------------------------------------
  1 |   8120 |        1 |     66 |    746 |      0 |        0 | (0,1)  |           4 |       2306 |     24 |        |       | \x010000000100000043633736316331636439653330383035373531623465323266666437613432633901
  2 |   8048 |        1 |     66 |    746 |      0 |        0 | (0,2)  |           4 |       2306 |     24 |        |       | \x020000000200000043633736353635636135393064663437373330373534643664616631326236326101
  3 |   7976 |        1 |     66 |    746 |      0 |        0 | (0,3)  |           4 |       2306 |     24 |        |       | \x030000000300000043376132346439643734333731643563356163333966616230636566373034353700
  4 |   7904 |        1 |     66 |    746 |      0 |        0 | (0,4)  |           4 |       2306 |     24 |        |       | \x040000000400000043663562626233633534376435643162373465623034633362333063316637333100
  5 |   7832 |        1 |     66 |    746 |      0 |        0 | (0,5)  |           4 |       2306 |     24 |        |       | \x050000000500000043373035653238393330366232383664306434366137633534373035633361633800
(5 rows)

Heap Insert

Start a new session and then attach to the backend process. Breakpoint at heapam.c:heap_insert. Then insert a new tuple.

INSERT INTO heap_test (a, b, c) VALUES (1, 'test', true);

At breakpoint, the call stack is:

heap_insert (src/backend/access/heap/heapam.c:2008)
heapam_tuple_insert (src/backend/access/heap/heapam_handler.c:253)
table_tuple_insert (src/include/access/tableam.h:1406)
ExecInsert (src/backend/executor/nodeModifyTable.c:1159)
ExecModifyTable (src/backend/executor/nodeModifyTable.c:4150)
ExecProcNode (src/include/executor/executor.h:274)
ExecutePlan (src/backend/executor/execMain.c:1649)
standard_ExecutorRun (src/backend/executor/execMain.c:361)
ExecutorRun (src/backend/executor/execMain.c:307)
ProcessQuery (src/backend/tcop/pquery.c:160)
PortalRunMulti (Unknown Source:0)
PortalRun (src/backend/tcop/pquery.c:789)
exec_simple_query (src/backend/tcop/postgres.c:1278)
PostgresMain (Unknown Source:0)
BackendMain (src/backend/tcop/backend_startup.c:105)
postmaster_child_launch (src/backend/postmaster/launch_backend.c:277)
BackendStartup (src/backend/postmaster/postmaster.c:3594)
ServerLoop (src/backend/postmaster/postmaster.c:1676)
PostmasterMain (src/backend/postmaster/postmaster.c:1374)
main (src/backend/main/main.c:199)

In this chapter, we focus on storage layer, which is the top 3 frames.

table_tuple_insert wraps call to relation's table access method (tableam). By this way, PostgreSQL can support different table access methods, such as heap, hash, and B-tree. See Table Access Method Interface Definition.

static inline void
table_tuple_insert(Relation rel, TupleTableSlot *slot, CommandId cid,
				   int options, struct BulkInsertStateData *bistate)
{
	rel->rd_tableam->tuple_insert(rel, slot, cid, options,
								  bistate);
}
static const TableAmRoutine heapam_methods = {
    /* ... */
	.tuple_insert = heapam_tuple_insert,
    /* ... */
};

heapam_tuple_insert do the preparation and cleanup around the insertion.

static void
heapam_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid,
					int options, BulkInsertState bistate)
{
	bool		shouldFree = true;
	HeapTuple	tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);

	/* Update the tuple with table oid */
	slot->tts_tableOid = RelationGetRelid(relation);
	tuple->t_tableOid = slot->tts_tableOid;

	/* Perform the insertion, and copy the resulting ItemPointer */
	heap_insert(relation, tuple, cid, options, bistate);
	ItemPointerCopy(&tuple->t_self, &slot->tts_tid);

	if (shouldFree)
		pfree(tuple);
}

heap_insert is the dirty work of the insertion. Since it has ~200 LoC (src/backend/access/heap/heapam.c:2004-2185), the whole function is not shown here for saving network traffic :). We pick the most important part of the function here.

  1. prepares the tuple for insertion. We would go back to TOAST later.

    /*
     * Fill in tuple header fields and toast the tuple if necessary.
     *
     * Note: below this point, heaptup is the data we actually intend to store
     * into the relation; tup is the caller's original untoasted data.
     */
    heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
    
  2. finds the buffer to insert the tuple into.

    /*
     * Find buffer to insert this tuple into.  If the page is all visible,
     * this will also pin the requisite visibility map page.
     */
    buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
    								   InvalidBuffer, options, bistate,
    								   &vmbuffer, NULL,
    								   0);
    
  3. puts the tuple into the buffer.

    RelationPutHeapTuple(relation, buffer, heaptup,
    					 (options & HEAP_INSERT_SPECULATIVE) != 0);
    
  4. marks the buffer as modified so that the buffer would be flushed to disk by background worker.

    MarkBufferDirty(buffer);
    
  5. WAL part, skip for now.

    /* XLOG stuff */
    if (RelationNeedsWAL(relation))
    
  6. release the buffer.

    UnlockReleaseBuffer(buffer);
    if (vmbuffer != InvalidBuffer)
    	ReleaseBuffer(vmbuffer);
    

[[TODO]]