本文简单介绍了PG插入数据部分的源码,主要内容包括ExecInsert函数的实现逻辑,该函数位于nodeModifyTable.c文件中。
一、基础信息
按惯例,首先看看ExecInsert函数使用的数据结构、宏定义以及依赖的函数等。
数据结构/宏定义
1、ModifyTableState
/* ----------------
* PlanState node
*
* We never actually instantiate any PlanState nodes; this is just the common
* abstract superclass for all PlanState-type nodes.
* ----------------
*/
typedef struct PlanState
{
NodeTag type;
Plan *plan; /* associated Plan node */
EState *state; /* at execution time, states of individual
* nodes point to one EState for the whole
* top-level plan */
ExecProcNodeMtd ExecProcNode; /* function to return next tuple */
ExecProcNodeMtd ExecProcNodeReal; /* actual function, if above is a
* wrapper */
Instrumentation *instrument; /* Optional runtime stats for this node */
WorkerInstrumentation *worker_instrument; /* per-worker instrumentation */
/*
* Common structural data for all Plan types. These links to subsidiary
* state trees parallel links in the associated plan tree (except for the
* subPlan list, which does not exist in the plan tree).
*/
ExprState *qual; /* boolean qual condition */
struct PlanState *lefttree; /* input plan tree(s) */
struct PlanState *righttree;
List *initPlan; /* Init SubPlanState nodes (un-correlated expr
* subselects) */
List *subPlan; /* SubPlanState nodes in my expressions */
/*
* State for management of parameter-change-driven rescanning
*/
Bitmapset *chgParam; /* set of IDs of changed Params */
/*
* Other run-time state needed by most if not all node types.
*/
TupleTableSlot *ps_ResultTupleSlot; /* slot for my result tuples */
ExprContext *ps_ExprContext; /* node's expression-evaluation context */
ProjectionInfo *ps_ProjInfo; /* info for doing tuple projection */
/*
* Scanslot's descriptor if known. This is a bit of a hack, but otherwise
* it's hard for expression compilation to optimize based on the
* descriptor, without encoding knowledge about all executor nodes.
*/
TupleDesc scandesc;
} PlanState;
/* ----------------
* ModifyTableState information
* ----------------
*/
typedef struct ModifyTableState
{
PlanState ps; /* its first field is NodeTag */
CmdType operation; /* INSERT, UPDATE, or DELETE */
bool canSetTag; /* do we set the command tag/es_processed? */
bool mt_done; /* are we done? */
PlanState **mt_plans; /* subplans (one per target rel) */
int mt_nplans; /* number of plans in the array */
int mt_whichplan; /* which one is being executed (0..n-1) */
ResultRelInfo *resultRelInfo; /* per-subplan target relations */
ResultRelInfo *rootResultRelInfo; /* root target relation (partitioned
* table root) */
List **mt_arowmarks; /* per-subplan ExecAuxRowMark lists */
EPQState mt_epqstate; /* for evaluating EvalPlanQual rechecks */
bool fireBSTriggers; /* do we need to fire stmt triggers? */
TupleTableSlot *mt_existing; /* slot to store existing target tuple in */
List *mt_excludedtlist; /* the excluded pseudo relation's tlist */
TupleTableSlot *mt_conflproj; /* CONFLICT ... SET ... projection target */
/* Tuple-routing support info */
struct PartitionTupleRouting *mt_partition_tuple_routing;
/* controls transition table population for specified operation */
struct TransitionCaptureState *mt_transition_capture;
/* controls transition table population for INSERT...ON CONFLICT UPDATE */
struct TransitionCaptureState *mt_oc_transition_capture;
/* Per plan map for tuple conversion from child to root */
TupleConversionMap **mt_per_subplan_tupconv_maps;
} ModifyTableState;
2、TupleTableSlot
/*----------
* The executor stores tuples in a "tuple table" which is a List of
* independent TupleTableSlots. There are several cases we need to handle:
* 1. physical tuple in a disk buffer page
* 2. physical tuple constructed in palloc'ed memory
* 3. "minimal" physical tuple constructed in palloc'ed memory
* 4. "virtual" tuple consisting of Datum/isnull arrays
*
* The first two cases are similar in that they both deal with "materialized"
* tuples, but resource management is different. For a tuple in a disk page
* we need to hold a pin on the buffer until the TupleTableSlot's reference
* to the tuple is dropped; while for a palloc'd tuple we usually want the
* tuple pfree'd when the TupleTableSlot's reference is dropped.
*
* A "minimal" tuple is handled similarly to a palloc'd regular tuple.
* At present, minimal tuples never are stored in buffers, so there is no
* parallel to case 1. Note that a minimal tuple has no "system columns".
* (Actually, it could have an OID, but we have no need to access the OID.)
*
* A "virtual" tuple is an optimization used to minimize physical data
* copying in a nest of plan nodes. Any pass-by-reference Datums in the
* tuple point to storage that is not directly associated with the
* TupleTableSlot; generally they will point to part of a tuple stored in
* a lower plan node's output TupleTableSlot, or to a function result
* constructed in a plan node's per-tuple econtext. It is the responsibility
* of the generating plan node to be sure these resources are not released
* for as long as the virtual tuple needs to be valid. We only use virtual
* tuples in the result slots of plan nodes --- tuples to be copied anywhere
* else need to be "materialized" into physical tuples. Note also that a
* virtual tuple does not have any "system columns".
*
* It is also possible for a TupleTableSlot to hold both physical and minimal
* copies of a tuple. This is done when the slot is requested to provide
* the format other than the one it currently holds. (Originally we attempted
* to handle such requests by replacing one format with the other, but that
* had the fatal defect of invalidating any pass-by-reference Datums pointing
* into the existing slot contents.) Both copies must contain identical data
* payloads when this is the case.
*
* The Datum/isnull arrays of a TupleTableSlot serve double duty. When the
* slot contains a virtual tuple, they are the authoritative data. When the
* slot contains a physical tuple, the arrays contain data extracted from
* the tuple. (In this state, any pass-by-reference Datums point into
* the physical tuple.) The extracted information is built "lazily",
* ie, only as needed. This serves to avoid repeated extraction of data
* from the physical tuple.
*
* A TupleTableSlot can also be "empty", holding no valid data. This is
* the only valid state for a freshly-created slot that has not yet had a
* tuple descriptor assigned to it. In this state, tts_isempty must be
* true, tts_shouldFree false, tts_tuple NULL, tts_buffer InvalidBuffer,
* and tts_nvalid zero.
*
* The tupleDescriptor is simply referenced, not copied, by the TupleTableSlot
* code. The caller of ExecSetSlotDescriptor() is responsible for providing
* a descriptor that will live as long as the slot does. (Typically, both
* slots and descriptors are in per-query memory and are freed by memory
* context deallocation at query end; so it's not worth providing any extra
* mechanism to do more. However, the slot will increment the tupdesc
* reference count if a reference-counted tupdesc is supplied.)
*
* When tts_shouldFree is true, the physical tuple is "owned" by the slot
* and should be freed when the slot's reference to the tuple is dropped.
*
* If tts_buffer is not InvalidBuffer, then the slot is holding a pin
* on the indicated buffer page; drop the pin when we release the
* slot's reference to that buffer. (tts_shouldFree should always be
* false in such a case, since presumably tts_tuple is pointing at the
* buffer page.)
*
* tts_nvalid indicates the number of valid columns in the tts_values/isnull
* arrays. When the slot is holding a "virtual" tuple this must be equal
* to the descriptor's natts. When the slot is holding a physical tuple
* this is equal to the number of columns we have extracted (we always
* extract columns from left to right, so there are no holes).
*
* tts_values/tts_isnull are allocated when a descriptor is assigned to the
* slot; they are of length equal to the descriptor's natts.
*
* tts_mintuple must always be NULL if the slot does not hold a "minimal"
* tuple. When it does, tts_mintuple points to the actual MinimalTupleData
* object (the thing to be pfree'd if tts_shouldFreeMin is true). If the slot
* has only a minimal and not also a regular physical tuple, then tts_tuple
* points at tts_minhdr and the fields of that struct are set correctly
* for access to the minimal tuple; in particular, tts_minhdr.t_data points
* MINIMAL_TUPLE_OFFSET bytes before tts_mintuple. This allows column
* extraction to treat the case identically to regular physical tuples.
*
* tts_slow/tts_off are saved state for slot_deform_tuple, and should not
* be touched by any other code.
*----------
*/
typedef struct TupleTableSlot
{
NodeTag type;
bool tts_isempty; /* true = slot is empty */
bool tts_shouldFree; /* should pfree tts_tuple? */
bool tts_shouldFreeMin; /* should pfree tts_mintuple? */
#define FIELDNO_TUPLETABLESLOT_SLOW 4
bool tts_slow; /* saved state for slot_deform_tuple */
#define FIELDNO_TUPLETABLESLOT_TUPLE 5
HeapTuple tts_tuple; /* physical tuple, or NULL if virtual */
#define FIELDNO_TUPLETABLESLOT_TUPLEDESCRIPTOR 6
TupleDesc tts_tupleDescriptor; /* slot's tuple descriptor */
MemoryContext tts_mcxt; /* slot itself is in this context */
Buffer tts_buffer; /* tuple's buffer, or InvalidBuffer */
#define FIELDNO_TUPLETABLESLOT_NVALID 9
int tts_nvalid; /* # of valid values in tts_values */
#define FIELDNO_TUPLETABLESLOT_VALUES 10
Datum *tts_values; /* current per-attribute values */
#define FIELDNO_TUPLETABLESLOT_ISNULL 11
bool *tts_isnull; /* current per-attribute isnull flags */
MinimalTuple tts_mintuple; /* minimal tuple, or NULL if none */
HeapTupleData tts_minhdr; /* workspace for minimal-tuple-only case */
#define FIELDNO_TUPLETABLESLOT_OFF 14
uint32 tts_off; /* saved state for slot_deform_tuple */
bool tts_fixedTupleDescriptor; /* descriptor can't be changed */
} TupleTableSlot;
3、EState
/* ----------------
* EState information
*
* Master working state for an Executor invocation
* ----------------
*/
typedef struct EState
{
NodeTag type;
/* Basic state for all query types: */
ScanDirection es_direction; /* current scan direction */
Snapshot es_snapshot; /* time qual to use */
Snapshot es_crosscheck_snapshot; /* crosscheck time qual for RI */
List *es_range_table; /* List of RangeTblEntry */
PlannedStmt *es_plannedstmt; /* link to top of plan tree */
const char *es_sourceText; /* Source text from QueryDesc */
JunkFilter *es_junkFilter; /* top-level junk filter, if any */
/* If query can insert/delete tuples, the command ID to mark them with */
CommandId es_output_cid;
/* Info about target table(s) for insert/update/delete queries: */
ResultRelInfo *es_result_relations; /* array of ResultRelInfos */
int es_num_result_relations; /* length of array */
ResultRelInfo *es_result_relation_info; /* currently active array elt */
/*
* Info about the target partitioned target table root(s) for
* update/delete queries. They required only to fire any per-statement
* triggers defined on the table. It exists separately from
* es_result_relations, because partitioned tables don't appear in the
* plan tree for the update/delete cases.
*/
ResultRelInfo *es_root_result_relations; /* array of ResultRelInfos */
int es_num_root_result_relations; /* length of the array */
/*
* The following list contains ResultRelInfos created by the tuple routing
* code for partitions that don't already have one.
*/
List *es_tuple_routing_result_relations;
/* Stuff used for firing triggers: */
List *es_trig_target_relations; /* trigger-only ResultRelInfos */
TupleTableSlot *es_trig_tuple_slot; /* for trigger output tuples */
TupleTableSlot *es_trig_oldtup_slot; /* for TriggerEnabled */
TupleTableSlot *es_trig_newtup_slot; /* for TriggerEnabled */
/* Parameter info: */
ParamListInfo es_param_list_info; /* values of external params */
ParamExecData *es_param_exec_vals; /* values of internal params */
QueryEnvironment *es_queryEnv; /* query environment */
/* Other working state: */
MemoryContext es_query_cxt; /* per-query context in which EState lives */
List *es_tupleTable; /* List of TupleTableSlots */
List *es_rowMarks; /* List of ExecRowMarks */
uint64 es_processed; /* # of tuples processed */
Oid es_lastoid; /* last oid processed (by INSERT) */
int es_top_eflags; /* eflags passed to ExecutorStart */
int es_instrument; /* OR of InstrumentOption flags */
bool es_finished; /* true when ExecutorFinish is done */
List *es_exprcontexts; /* List of ExprContexts within EState */
List *es_subplanstates; /* List of PlanState for SubPlans */
List *es_auxmodifytables; /* List of secondary ModifyTableStates */
/*
* this ExprContext is for per-output-tuple operations, such as constraint
* checks and index-value computations. It will be reset for each output
* tuple. Note that it will be created only if needed.
*/
ExprContext *es_per_tuple_exprcontext;
/*
* These fields are for re-evaluating plan quals when an updated tuple is
* substituted in READ COMMITTED mode. es_epqTuple[] contains tuples that
* scan plan nodes should return instead of whatever they'd normally
* return, or NULL if nothing to return; es_epqTupleSet[] is true if a
* particular array entry is valid; and es_epqScanDone[] is state to
* remember if the tuple has been returned already. Arrays are of size
* list_length(es_range_table) and are indexed by scan node scanrelid - 1.
*/
HeapTuple *es_epqTuple; /* array of EPQ substitute tuples */
bool *es_epqTupleSet; /* true if EPQ tuple is provided */
bool *es_epqScanDone; /* true if EPQ tuple has been fetched */
bool es_use_parallel_mode; /* can we use parallel workers? */
/* The per-query shared memory area to use for parallel execution. */
struct dsa_area *es_query_dsa;
/*
* JIT information. es_jit_flags indicates whether JIT should be performed
* and with which options. es_jit is created on-demand when JITing is
* performed.
*/
int es_jit_flags;
struct JitContext *es_jit;
} EState;
4、ResultRelInfo
/*
* ResultRelInfo
*
* Whenever we update an existing relation, we have to update indexes on the
* relation, and perhaps also fire triggers. ResultRelInfo holds all the
* information needed about a result relation, including indexes.
*/
typedef struct ResultRelInfo
{
NodeTag type;
/* result relation's range table index */
Index ri_RangeTableIndex;
/* relation descriptor for result relation */
Relation ri_RelationDesc;
/* # of indices existing on result relation */
int ri_NumIndices;
/* array of relation descriptors for indices */
RelationPtr ri_IndexRelationDescs;
/* array of key/attr info for indices */
IndexInfo **ri_IndexRelationInfo;
/* triggers to be fired, if any */
TriggerDesc *ri_TrigDesc;
/* cached lookup info for trigger functions */
FmgrInfo *ri_TrigFunctions;
/* array of trigger WHEN expr states */
ExprState **ri_TrigWhenExprs;
/* optional runtime measurements for triggers */
Instrumentation *ri_TrigInstrument;
/* FDW callback functions, if foreign table */
struct FdwRoutine *ri_FdwRoutine;
/* available to save private state of FDW */
void *ri_FdwState;
/* true when modifying foreign table directly */
bool ri_usesFdwDirectModify;
/* list of WithCheckOption's to be checked */
List *ri_WithCheckOptions;
/* list of WithCheckOption expr states */
List *ri_WithCheckOptionExprs;
/* array of constraint-checking expr states */
ExprState **ri_ConstraintExprs;
/* for removing junk attributes from tuples */
JunkFilter *ri_junkFilter;
/* list of RETURNING expressions */
List *ri_returningList;
/* for computing a RETURNING list */
ProjectionInfo *ri_projectReturning;
/* list of arbiter indexes to use to check conflicts */
List *ri_onConflictArbiterIndexes;
/* ON CONFLICT evaluation state */
OnConflictSetState *ri_onConflict;
/* partition check expression */
List *ri_PartitionCheck;
/* partition check expression state */
ExprState *ri_PartitionCheckExpr;
/* relation descriptor for root partitioned table */
Relation ri_PartitionRoot;
/* true if ready for tuple routing */
bool ri_PartitionReadyForRouting;
} ResultRelInfo;
5、List
typedef struct ListCell ListCell;
typedef struct List
{
NodeTag type; /* T_List, T_IntList, or T_OidList */
int length;
ListCell *head;
ListCell *tail;
} List;
struct ListCell
{
union
{
void *ptr_value;
int int_value;
Oid oid_value;
} data;
ListCell *next;
};
6、TransitionCaptureState
typedef struct TransitionCaptureState
{
/*
* Is there at least one trigger specifying each transition relation on
* the relation explicitly named in the DML statement or COPY command?
* Note: in current usage, these flags could be part of the private state,
* but it seems possibly useful to let callers see them.
*/
bool tcs_delete_old_table;
bool tcs_update_old_table;
bool tcs_update_new_table;
bool tcs_insert_new_table;
/*
* For UPDATE and DELETE, AfterTriggerSaveEvent may need to convert the
* new and old tuples from a child table's format to the format of the
* relation named in a query so that it is compatible with the transition
* tuplestores. The caller must store the conversion map here if so.
*/
TupleConversionMap *tcs_map;
/*
* For INSERT and COPY, it would be wasteful to convert tuples from child
* format to parent format after they have already been converted in the
* opposite direction during routing. In that case we bypass conversion
* and allow the inserting code (copy.c and nodeModifyTable.c) to provide
* the original tuple directly.
*/
HeapTuple tcs_original_insert_tuple;
/*
* Private data including the tuplestore(s) into which to insert tuples.
*/
struct AfterTriggersTableData *tcs_private;
} TransitionCaptureState;
*7、ModifyTable *
/* ----------------
* ModifyTable node -
* Apply rows produced by subplan(s) to result table(s),
* by inserting, updating, or deleting.
*
* Note that rowMarks and epqParam are presumed to be valid for all the
* subplan(s); they can't contain any info that varies across subplans.
* ----------------
*/
typedef struct ModifyTable
{
Plan plan;
CmdType operation; /* INSERT, UPDATE, or DELETE */
bool canSetTag; /* do we set the command tag/es_processed? */
Index nominalRelation; /* Parent RT index for use of EXPLAIN */
/* RT indexes of non-leaf tables in a partition tree */
List *partitioned_rels;
bool partColsUpdated; /* some part key in hierarchy updated */
List *resultRelations; /* integer list of RT indexes */
int resultRelIndex; /* index of first resultRel in plan's list */
int rootResultRelIndex; /* index of the partitioned table root */
List *plans; /* plan(s) producing source data */
List *withCheckOptionLists; /* per-target-table WCO lists */
List *returningLists; /* per-target-table RETURNING tlists */
List *fdwPrivLists; /* per-target-table FDW private data lists */
Bitmapset *fdwDirectModifyPlans; /* indices of FDW DM plans */
List *rowMarks; /* PlanRowMarks (non-locking only) */
int epqParam; /* ID of Param for EvalPlanQual re-eval */
OnConflictAction onConflictAction; /* ON CONFLICT action */
List *arbiterIndexes; /* List of ON CONFLICT arbiter index OIDs */
List *onConflictSet; /* SET for INSERT ON CONFLICT DO UPDATE */
Node *onConflictWhere; /* WHERE for ON CONFLICT UPDATE */
Index exclRelRTI; /* RTI of the EXCLUDED pseudo relation */
List *exclRelTlist; /* tlist of the EXCLUDED pseudo relation */
} ModifyTable;
8、OnConflictAction
/*
* OnConflictAction -
* "ON CONFLICT" clause type of query
*
* This is needed in both parsenodes.h and plannodes.h, so put it here...
*/
typedef enum OnConflictAction
{
ONCONFLICT_NONE, /* No "ON CONFLICT" clause */
ONCONFLICT_NOTHING, /* ON CONFLICT ... DO NOTHING */
ONCONFLICT_UPDATE /* ON CONFLICT ... DO UPDATE */
} OnConflictAction;
8、MemoryContext
typedef struct MemoryContextData
{
NodeTag type; /* identifies exact kind of context */
/* these two fields are placed here to minimize alignment wastage: */
bool isReset; /* T = no space alloced since last reset */
bool allowInCritSection; /* allow palloc in critical section */
const MemoryContextMethods *methods; /* virtual function table */
MemoryContext parent; /* NULL if no parent (toplevel context) */
MemoryContext firstchild; /* head of linked list of children */
MemoryContext prevchild; /* previous child of same parent */
MemoryContext nextchild; /* next child of same parent */
const char *name; /* context name (just for debugging) */
const char *ident; /* context ID if any (just for debugging) */
MemoryContextCallback *reset_cbs; /* list of reset/delete callbacks */
} MemoryContextData;
/* utils/palloc.h contains typedef struct MemoryContextData *MemoryContext */
/*
* Type MemoryCon