Insert
Prerequisites
Overview
A database insert is normally performed by a single INSERT statement. The full execution path of an INSERT is fairly long, spanning SQL parsing, tuple assembly, tuple insertion and more; this chapter covers only tuple assembly and tuple insertion. Both are implemented in the ExecInsert function, shown below:
static TupleTableSlot *
ExecInsert(ModifyTableState *mtstate,
TupleTableSlot *slot,
TupleTableSlot *planSlot,
List *arbiterIndexes,
OnConflictAction onconflict,
EState *estate,
bool canSetTag)
{
HeapTuple tuple;
ResultRelInfo *resultRelInfo;
Relation resultRelationDesc;
Oid newId;
List *recheckIndexes = NIL;
/*
* get the heap tuple out of the tuple table slot, making sure we have a
* writable copy
* tuple assembly happens here
*/
tuple = ExecMaterializeSlot(slot);
/*
* get information on the (current) result relation
*/
resultRelInfo = estate->es_result_relation_info;
resultRelationDesc = resultRelInfo->ri_RelationDesc;
/*
* If the result relation has OIDs, force the tuple's OID to zero so that
* heap_insert will assign a fresh OID. Usually the OID already will be
* zero at this point, but there are corner cases where the plan tree can
* return a tuple extracted literally from some table with the same
* rowtype.
*
* XXX if we ever wanted to allow users to assign their own OIDs to new
* rows, this'd be the place to do it. For the moment, we make a point of
* doing this before calling triggers, so that a user-supplied trigger
* could hack the OID if desired.
*/
if (resultRelationDesc->rd_rel->relhasoids)
HeapTupleSetOid(tuple, InvalidOid);
/*
* BEFORE ROW INSERT Triggers.
*
* Note: We fire BEFORE ROW TRIGGERS for every attempted insertion in an
* INSERT ... ON CONFLICT statement. We cannot check for constraint
* violations before firing these triggers, because they can change the
* values to insert. Also, they can run arbitrary user-defined code with
* side-effects that we can't cancel by just not inserting the tuple.
*/
if (resultRelInfo->ri_TrigDesc &&
resultRelInfo->ri_TrigDesc->trig_insert_before_row)
{
slot = ExecBRInsertTriggers(estate, resultRelInfo, slot);
if (slot == NULL) /* "do nothing" */
return NULL;
/* trigger might have changed tuple */
tuple = ExecMaterializeSlot(slot);
}
/* INSTEAD OF ROW INSERT Triggers */
if (resultRelInfo->ri_TrigDesc &&
resultRelInfo->ri_TrigDesc->trig_insert_instead_row)
{
slot = ExecIRInsertTriggers(estate, resultRelInfo, slot);
if (slot == NULL) /* "do nothing" */
return NULL;
/* trigger might have changed tuple */
tuple = ExecMaterializeSlot(slot);
newId = InvalidOid;
}
else if (resultRelInfo->ri_FdwRoutine)
{
/*
* insert into foreign table: let the FDW do it
*/
slot = resultRelInfo->ri_FdwRoutine->ExecForeignInsert(estate,
resultRelInfo,
slot,
planSlot);
if (slot == NULL) /* "do nothing" */
return NULL;
/* FDW might have changed tuple */
tuple = ExecMaterializeSlot(slot);
/*
* AFTER ROW Triggers or RETURNING expressions might reference the
* tableoid column, so initialize t_tableOid before evaluating them.
*/
tuple->t_tableOid = RelationGetRelid(resultRelationDesc);
newId = InvalidOid;
}
else
{
/*
* Constraints might reference the tableoid column, so initialize
* t_tableOid before evaluating them.
*/
tuple->t_tableOid = RelationGetRelid(resultRelationDesc);
/*
* Check any RLS INSERT WITH CHECK policies
*
* ExecWithCheckOptions() will skip any WCOs which are not of the kind
* we are looking for at this point.
*/
if (resultRelInfo->ri_WithCheckOptions != NIL)
ExecWithCheckOptions(WCO_RLS_INSERT_CHECK,
resultRelInfo, slot, estate);
/*
* Check the constraints of the tuple
*/
if (resultRelationDesc->rd_att->constr)
ExecConstraints(resultRelInfo, slot, estate);
if (onconflict != ONCONFLICT_NONE && resultRelInfo->ri_NumIndices > 0)
{
/* Perform a speculative insertion. */
uint32 specToken;
ItemPointerData conflictTid;
bool specConflict;
/*
* Do a non-conclusive check for conflicts first.
*
* We're not holding any locks yet, so this doesn't guarantee that
* the later insert won't conflict. But it avoids leaving behind
* a lot of canceled speculative insertions, if you run a lot of
* INSERT ON CONFLICT statements that do conflict.
*
* We loop back here if we find a conflict below, either during
* the pre-check, or when we re-check after inserting the tuple
* speculatively.
*/
vlock:
specConflict = false;
if (!ExecCheckIndexConstraints(slot, estate, &conflictTid,
arbiterIndexes))
{
/* committed conflict tuple found */
if (onconflict == ONCONFLICT_UPDATE)
{
/*
* In case of ON CONFLICT DO UPDATE, execute the UPDATE
* part. Be prepared to retry if the UPDATE fails because
* of another concurrent UPDATE/DELETE to the conflict
* tuple.
*/
TupleTableSlot *returning = NULL;
if (ExecOnConflictUpdate(mtstate, resultRelInfo,
&conflictTid, planSlot, slot,
estate, canSetTag, &returning))
{
InstrCountFiltered2(&mtstate->ps, 1);
return returning;
}
else
goto vlock;
}
else
{
/*
* In case of ON CONFLICT DO NOTHING, do nothing. However,
* verify that the tuple is visible to the executor's MVCC
* snapshot at higher isolation levels.
*/
Assert(onconflict == ONCONFLICT_NOTHING);
ExecCheckTIDVisible(estate, resultRelInfo, &conflictTid);
InstrCountFiltered2(&mtstate->ps, 1);
return NULL;
}
}
/*
* Before we start insertion proper, acquire our "speculative
* insertion lock". Others can use that to wait for us to decide
* if we're going to go ahead with the insertion, instead of
* waiting for the whole transaction to complete.
*/
specToken = SpeculativeInsertionLockAcquire(GetCurrentTransactionId());
HeapTupleHeaderSetSpeculativeToken(tuple->t_data, specToken);
/*
* insert the tuple, with the speculative token
* tuple insertion happens here (speculative case)
*/
newId = heap_insert(resultRelationDesc, tuple,
estate->es_output_cid,
HEAP_INSERT_SPECULATIVE,
NULL);
/* insert index entries for tuple */
recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self),
estate, true, &specConflict,
arbiterIndexes);
/* adjust the tuple's state accordingly */
if (!specConflict)
heap_finish_speculative(resultRelationDesc, tuple);
else
heap_abort_speculative(resultRelationDesc, tuple);
/*
* Wake up anyone waiting for our decision. They will re-check
* the tuple, see that it's no longer speculative, and wait on our
* XID as if this was a regularly inserted tuple all along. Or if
* we killed the tuple, they will see it's dead, and proceed as if
* the tuple never existed.
*/
SpeculativeInsertionLockRelease(GetCurrentTransactionId());
/*
* If there was a conflict, start from the beginning. We'll do
* the pre-check again, which will now find the conflicting tuple
* (unless it aborts before we get there).
*/
if (specConflict)
{
list_free(recheckIndexes);
goto vlock;
}
/* Since there was no insertion conflict, we're done */
}
else
{
/*
* insert the tuple normally.
*
* Note: heap_insert returns the tid (location) of the new tuple
* in the t_self field.
* tuple insertion happens here
*/
newId = heap_insert(resultRelationDesc, tuple,
estate->es_output_cid,
0, NULL);
/* insert index entries for tuple */
if (resultRelInfo->ri_NumIndices > 0)
recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self),
estate, false, NULL,
arbiterIndexes);
}
}
if (canSetTag)
{
(estate->es_processed)++;
estate->es_lastoid = newId;
setLastTid(&(tuple->t_self));
}
/* AFTER ROW INSERT Triggers */
ExecARInsertTriggers(estate, resultRelInfo, tuple, recheckIndexes);
list_free(recheckIndexes);
/*
* Check any WITH CHECK OPTION constraints from parent views. We are
* required to do this after testing all constraints and uniqueness
* violations per the SQL spec, so we do it after actually inserting the
* record into the heap and all indexes.
*
* ExecWithCheckOptions will elog(ERROR) if a violation is found, so the
* tuple will never be seen, if it violates the WITH CHECK OPTION.
*
* ExecWithCheckOptions() will skip any WCOs which are not of the kind we
* are looking for at this point.
*/
if (resultRelInfo->ri_WithCheckOptions != NIL)
ExecWithCheckOptions(WCO_VIEW_CHECK, resultRelInfo, slot, estate);
/* Process RETURNING if present */
if (resultRelInfo->ri_projectReturning)
return ExecProcessReturning(resultRelInfo, slot, planSlot);
return NULL;
}
Tuple Assembly
Let us first look at how a tuple is assembled. As the code above shows, the assembly is done by ExecMaterializeSlot, which returns a HeapTuple (a pointer to HeapTupleData). The type is defined as follows:
typedef struct HeapTupleData
{
uint32 t_len; /* length of *t_data, including the HeapTupleHeaderData part */
ItemPointerData t_self; /* SelfItemPointer: the tuple's own ctid */
Oid t_tableOid; /* OID of the table the tuple came from */
HeapTupleHeader t_data; /* -> tuple header and data: the actual tuple contents */
} HeapTupleData;
typedef HeapTupleData *HeapTuple;
The field to pay special attention to is t_data. HeapTupleData is best thought of as a carrier for t_data; t_data holds the actual contents of the tuple and is what eventually gets written to the physical file, as we will see later. The write simply copies the whole of t_data into the file, roughly equivalent to the following pseudocode:
memcpy(insert_pos, tuple->t_data, tuple->t_len);
Now let us see how a HeapTuple is assembled. The call chain is ExecMaterializeSlot > ExecCopySlotTuple > heap_form_tuple, and the core of the work is done by heap_form_tuple, which proceeds in three steps:
- Compute the total length of the tuple.
- Allocate that much memory from the current memory context.
- Fill the memory allocated in step 2 with the assembled tuple.
The code is as follows:
HeapTuple
heap_form_tuple(TupleDesc tupleDescriptor,
Datum *values,
bool *isnull)
{
HeapTuple tuple; /* return tuple */
HeapTupleHeader td; /* tuple data */
Size len,
data_len;
int hoff;
bool hasnull = false;
int numberOfAttributes = tupleDescriptor->natts;
int i;
if (numberOfAttributes > MaxTupleAttributeNumber)
ereport(ERROR,
(errcode(ERRCODE_TOO_MANY_COLUMNS),
errmsg("number of columns (%d) exceeds limit (%d)",
numberOfAttributes, MaxTupleAttributeNumber)));
/*
* Check for nulls
*/
for (i = 0; i < numberOfAttributes; i++)
{
if (isnull[i])
{
hasnull = true;
break;
}
}
/*
* Determine total space needed
* Step 1: compute the tuple length, i.e. the HeapTupleHeaderData part plus the data part
*/
len = offsetof(HeapTupleHeaderData, t_bits);
if (hasnull)
len += BITMAPLEN(numberOfAttributes);
if (tupleDescriptor->tdhasoid)
len += sizeof(Oid);
hoff = len = MAXALIGN(len); /* align user data safely */
/*
* length of the tuple data itself
*/
data_len = heap_compute_data_size(tupleDescriptor, values, isnull);
len += data_len;
/*
* Allocate and zero the space needed. Note that the tuple body and
* HeapTupleData management structure are allocated in one chunk.
* Step 2: allocate the required memory from the current memory context
*/
tuple = (HeapTuple) palloc0(HEAPTUPLESIZE + len);
tuple->t_data = td = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
/*
* And fill in the information. Note we fill the Datum fields even though
* this tuple may never become a Datum. This lets HeapTupleHeaderGetDatum
* identify the tuple type if needed.
*/
tuple->t_len = len;
ItemPointerSetInvalid(&(tuple->t_self));
tuple->t_tableOid = InvalidOid;
HeapTupleHeaderSetDatumLength(td, len);
HeapTupleHeaderSetTypeId(td, tupleDescriptor->tdtypeid);
HeapTupleHeaderSetTypMod(td, tupleDescriptor->tdtypmod);
/* We also make sure that t_ctid is invalid unless explicitly set */
ItemPointerSetInvalid(&(td->t_ctid));
HeapTupleHeaderSetNatts(td, numberOfAttributes);
td->t_hoff = hoff; /* key line; see the discussion below */
if (tupleDescriptor->tdhasoid) /* else leave infomask = 0 */
td->t_infomask = HEAP_HASOID;
/*
* Step 3: fill the memory allocated in step 2 with the tuple data
*/
heap_fill_tuple(tupleDescriptor,
values,
isnull,
(char *) td + hoff,
data_len,
&td->t_infomask,
(hasnull ? td->t_bits : NULL));
return tuple;
}
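To see heap_form_tuple from the caller's side, here is a minimal backend-side sketch (not taken from this chapter's code): it builds a tuple for a hypothetical single-int-column row, assuming the PostgreSQL 10/11-era API used in the listing above, where CreateTemplateTupleDesc still takes a hasoid flag.
/* Minimal usage sketch; illustration only, not from this chapter's code. */
#include "postgres.h"
#include "access/htup_details.h"
#include "access/tupdesc.h"
#include "catalog/pg_type.h"

static HeapTuple
build_single_int_tuple(int32 val)
{
	TupleDesc	tupdesc;
	Datum		values[1];
	bool		isnull[1];

	tupdesc = CreateTemplateTupleDesc(1, false);	/* one attribute, no OIDs */
	TupleDescInitEntry(tupdesc, (AttrNumber) 1, "a", INT4OID, -1, 0);

	values[0] = Int32GetDatum(val);
	isnull[0] = false;

	/* ExecMaterializeSlot reaches the same function via ExecCopySlotTuple */
	return heap_form_tuple(tupdesc, values, isnull);
}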
As noted above, t_data (a HeapTupleHeader) represents the tuple itself and consists of two parts:
- The HeapTupleHeaderData structure itself.
- The actual tuple data.
HeapTupleHeaderData has one especially important member, t_hoff: the offset of the tuple data from the start of the header. This offset covers the size of HeapTupleHeaderData (plus the null bitmap and OID, if present) and any padding required for alignment. It is computed by the block in heap_form_tuple above that derives len and hoff, starting from offsetof(HeapTupleHeaderData, t_bits) and ending with hoff = len = MAXALIGN(len). For a tuple with no NULLs and no OID, for example, t_hoff = MAXALIGN(offsetof(HeapTupleHeaderData, t_bits)) = MAXALIGN(23) = 24 bytes.
Step 1 calls heap_compute_data_size to work out how much space the tuple data needs. The sizing rules follow directly from how the attributes are laid out, which is exactly what step 3 does, so let us look at how heap_fill_tuple lays out a tuple. Assume the following table and insert statement:
create table test(a int, b char(10), c int);
insert into test values(99,'obvious',1);
The resulting on-disk layout of this row is shown in the figure below:
As the figure shows, PostgreSQL packs a row very tightly. Each int column (a and c) occupies 4 bytes. Column b occupies 12 bytes: its first byte (at [4]) stores the field length; the string 'obvious' takes 7 bytes and, being shorter than the 10 characters of char(10), is padded with 3 spaces; finally a 0 byte is stored (at [14]).
Two questions about this layout remain open for now:
- Why is the length value stored at [4] 23, rather than 12? (A worked calculation follows this list.)
- Why does a 0 byte need to be stored at [14]?
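A short calculation hints at the first question, assuming the column is stored with PostgreSQL's 1-byte ("short") varlena header on a little-endian machine: that header byte holds the total datum length (data plus the header byte itself) shifted left by one bit with the low bit set (see SET_VARSIZE_1B in postgres.h).
/* 1-byte varlena header for the char(10) value above; illustration only */
int		datum_len = 1 + 10;			/* header byte + 10 characters */
unsigned char header = (unsigned char) ((datum_len << 1) | 0x01);	/* = 23 */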
The tuple's actual contents are also written into the xlog (WAL).
Tuple Insertion
Once a tuple has been assembled it needs to be inserted. So what steps does inserting a tuple involve?
- Find a physical block with enough free space for the tuple.
  The tuple is ultimately written to a physical block ("ultimately" because it is first written to a buffer page and only persisted to disk later), so a block with enough free space has to be found. There are many ways to do this; the simplest imaginable is to scan every block of the table, check its free space, and settle on one that is large enough. That clearly performs poorly: the sequential scan has high time complexity, and checking a block's free space means loading it into a buffer page, which adds I/O. PostgreSQL therefore uses the FSM (Free Space Map) to track each block's free space. The FSM stores the per-block free-space values in a binary tree in which every inner node holds the maximum of its children, so a block with the required amount of space can be located quickly. The FSM is not updated in real time, however, so the block it returns is not guaranteed to have enough space and must be checked again (still far more reliable than probing an arbitrary block), which is why steps 2 and 3 below are needed.
- Load the physical block into a buffer page.
  The block found through the FSM in step 1 may not be in the buffer pool yet, so it has to be loaded into a buffer page.
- Check whether the block really has enough free space.
  If it does, go to step 4; otherwise go back to step 1.
- Insert the record.
These four steps can be reduced to two broader steps:
- Get a buffer page to insert into.
  This corresponds to steps 1-3 above.
- Write the record into that buffer page.
  This corresponds to step 4 above.
Both of these steps are carried out in heap_insert: the first by calling RelationGetBufferForTuple, the second by calling RelationPutHeapTuple. We look at these two functions in detail below.
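To show how the two calls fit together, here is a heavily abridged sketch of heap_insert; WAL logging, visibility-map handling, TOAST and several other details are elided, so read it as an outline rather than the full function body.
Oid
heap_insert(Relation relation, HeapTuple tup, CommandId cid,
			int options, BulkInsertState bistate)
{
	HeapTuple	heaptup;
	Buffer		buffer;
	Buffer		vmbuffer = InvalidBuffer;

	/* set xmin and infomask bits, and TOAST the tuple if necessary */
	heaptup = heap_prepare_insert(relation, tup, GetCurrentTransactionId(),
								  cid, options);

	/* step 1: get a pinned and locked buffer page with enough free space */
	buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
									   InvalidBuffer, options, bistate,
									   &vmbuffer, NULL);

	START_CRIT_SECTION();

	/* step 2: copy the tuple into the page */
	RelationPutHeapTuple(relation, buffer, heaptup,
						 (options & HEAP_INSERT_SPECULATIVE) != 0);

	MarkBufferDirty(buffer);

	/* ... emit the corresponding WAL record here ... */

	END_CRIT_SECTION();

	UnlockReleaseBuffer(buffer);

	/* ... cache invalidation, statistics, free heaptup if it was toasted ... */

	return HeapTupleGetOid(tup);
}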
RelationGetBufferForTuple
RelationGetBufferForTuple returns a buffer page that the tuple can be inserted into.
The function proceeds as follows:
- Get the current insertion target block and use it as targetBlock.
  Each relation caches its current insertion target block, i.e. the block the previous insert went to. Trying that block first keeps inserts clustered and thus reduces I/O.
- Load targetBlock into a buffer page; call the resulting buffer page buffer.
- Check whether buffer has enough free space for the tuple. If it does, return buffer; otherwise go to step 4.
- Call RecordAndGetPageWithFreeSpace to get a block from the FSM and use it as targetBlock. If such a block exists, go back to step 2; otherwise go to step 5.
- If no targetBlock exists, allocate a new physical block and use it as targetBlock.
- Load the newly allocated block into a buffer page.
Let us now look at the code and map the steps above onto it.
Buffer
RelationGetBufferForTuple(Relation relation, Size len,
Buffer otherBuffer, int options,
BulkInsertState bistate,
Buffer *vmbuffer, Buffer *vmbuffer_other)
{
bool use_fsm = !(options & HEAP_INSERT_SKIP_FSM);
Buffer buffer = InvalidBuffer;
Page page;
Size pageFreeSpace = 0,
saveFreeSpace = 0;
BlockNumber targetBlock,
otherBlock;
bool needLock;
len = MAXALIGN(len); /* be conservative */
/* Bulk insert is not supported for updates, only inserts. */
Assert(otherBuffer == InvalidBuffer || !bistate);
/*
* If we're gonna fail for oversize tuple, do it right away
*/
if (len > MaxHeapTupleSize)
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("row is too big: size %zu, maximum size %zu",
len, MaxHeapTupleSize)));
/* Compute desired extra freespace due to fillfactor option */
saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
HEAP_DEFAULT_FILLFACTOR);
if (otherBuffer != InvalidBuffer)
otherBlock = BufferGetBlockNumber(otherBuffer);
else
otherBlock = InvalidBlockNumber; /* just to keep compiler quiet */
/*
* We first try to put the tuple on the same page we last inserted a tuple
* on, as cached in the BulkInsertState or relcache entry. If that
* doesn't work, we ask the Free Space Map to locate a suitable page.
* Since the FSM's info might be out of date, we have to be prepared to
* loop around and retry multiple times. (To insure this isn't an infinite
* loop, we must update the FSM with the correct amount of free space on
* each page that proves not to be suitable.) If the FSM has no record of
* a page with enough free space, we give up and extend the relation.
*
* When use_fsm is false, we either put the tuple onto the existing target
* page or extend the relation.
*/
if (len + saveFreeSpace > MaxHeapTupleSize)
{
/* can't fit, don't bother asking FSM */
targetBlock = InvalidBlockNumber;
use_fsm = false;
}
else if (bistate && bistate->current_buf != InvalidBuffer)
targetBlock = BufferGetBlockNumber(bistate->current_buf);
else
/* Step 1: get the current insertion target block */
targetBlock = RelationGetTargetBlock(relation);
if (targetBlock == InvalidBlockNumber && use_fsm)
{
/*
* We have no cached target page, so ask the FSM for an initial
* target.
*/
targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace);
/*
* If the FSM knows nothing of the rel, try the last page before we
* give up and extend. This avoids one-tuple-per-page syndrome during
* bootstrapping or in a recently-started system.
*/
if (targetBlock == InvalidBlockNumber)
{
BlockNumber nblocks = RelationGetNumberOfBlocks(relation);
if (nblocks > 0)
targetBlock = nblocks - 1;
}
}
loop:
/*
* Main loop: keep looking, via the FSM, for a targetBlock with enough free space
*/
while (targetBlock != InvalidBlockNumber)
{
/*
* Read and exclusive-lock the target block, as well as the other
* block if one was given, taking suitable care with lock ordering and
* the possibility they are the same block.
*
* If the page-level all-visible flag is set, caller will need to
* clear both that and the corresponding visibility map bit. However,
* by the time we return, we'll have x-locked the buffer, and we don't
* want to do any I/O while in that state. So we check the bit here
* before taking the lock, and pin the page if it appears necessary.
* Checking without the lock creates a risk of getting the wrong
* answer, so we'll have to recheck after acquiring the lock.
*
* Step 2: load targetBlock into a buffer page (see ReadBufferBI)
*/
if (otherBuffer == InvalidBuffer)
{
/* easy case */
buffer = ReadBufferBI(relation, targetBlock, bistate);
if (PageIsAllVisible(BufferGetPage(buffer)))
visibilitymap_pin(relation, targetBlock, vmbuffer);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
}
else if (otherBlock == targetBlock)
{
/* also easy case */
buffer = otherBuffer;
if (PageIsAllVisible(BufferGetPage(buffer)))
visibilitymap_pin(relation, targetBlock, vmbuffer);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
}
else if (otherBlock < targetBlock)
{
/* lock other buffer first */
buffer = ReadBuffer(relation, targetBlock);
if (PageIsAllVisible(BufferGetPage(buffer)))
visibilitymap_pin(relation, targetBlock, vmbuffer);
LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
}
else
{
/* lock target buffer first */
buffer = ReadBuffer(relation, targetBlock);
if (PageIsAllVisible(BufferGetPage(buffer)))
visibilitymap_pin(relation, targetBlock, vmbuffer);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
}
/*
* We now have the target page (and the other buffer, if any) pinned
* and locked. However, since our initial PageIsAllVisible checks
* were performed before acquiring the lock, the results might now be
* out of date, either for the selected victim buffer, or for the
* other buffer passed by the caller. In that case, we'll need to
* give up our locks, go get the pin(s) we failed to get earlier, and
* re-lock. That's pretty painful, but hopefully shouldn't happen
* often.
*
* Note that there's a small possibility that we didn't pin the page
* above but still have the correct page pinned anyway, either because
* we've already made a previous pass through this loop, or because
* caller passed us the right page anyway.
*
* Note also that it's possible that by the time we get the pin and
* retake the buffer locks, the visibility map bit will have been
* cleared by some other backend anyway. In that case, we'll have
* done a bit of extra work for no gain, but there's no real harm
* done.
*/
if (otherBuffer == InvalidBuffer || buffer <= otherBuffer)
GetVisibilityMapPins(relation, buffer, otherBuffer,
targetBlock, otherBlock, vmbuffer,
vmbuffer_other);
else
GetVisibilityMapPins(relation, otherBuffer, buffer,
otherBlock, targetBlock, vmbuffer_other,
vmbuffer);
/*
* Now we can check to see if there's enough free space here. If so,
* we're done.
* Step 3: check whether buffer has enough free space for the tuple; if so, return buffer
*/
page = BufferGetPage(buffer);
pageFreeSpace = PageGetHeapFreeSpace(page);
if (len + saveFreeSpace <= pageFreeSpace)
{
/*
* use this page as future insert target, too
* remember this block as the current insertion target block
*/
RelationSetTargetBlock(relation, targetBlock);
return buffer;
}
/*
* Not enough space, so we must give up our page locks and pin (if
* any) and prepare to look elsewhere. We don't care which order we
* unlock the two buffers in, so this can be slightly simpler than the
* code above.
*/
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
if (otherBuffer == InvalidBuffer)
ReleaseBuffer(buffer);
else if (otherBlock != targetBlock)
{
LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
ReleaseBuffer(buffer);
}
/* Without FSM, always fall out of the loop and extend */
if (!use_fsm)
break;
/*
* Update FSM as to condition of this page, and ask for another page
* to try.
* Step 4: ask the FSM for another block via RecordAndGetPageWithFreeSpace
*/
targetBlock = RecordAndGetPageWithFreeSpace(relation,
targetBlock,
pageFreeSpace,
len + saveFreeSpace);
}
/*
* Have to extend the relation.
*
* We have to use a lock to ensure no one else is extending the rel at the
* same time, else we will both try to initialize the same new page. We
* can skip locking for new or temp relations, however, since no one else
* could be accessing them.
*/
needLock = !RELATION_IS_LOCAL(relation);
/*
* If we need the lock but are not able to acquire it immediately, we'll
* consider extending the relation by multiple blocks at a time to manage
* contention on the relation extension lock. However, this only makes
* sense if we're using the FSM; otherwise, there's no point.
*/
if (needLock)
{
if (!use_fsm)
LockRelationForExtension(relation, ExclusiveLock);
else if (!ConditionalLockRelationForExtension(relation, ExclusiveLock))
{
/* Couldn't get the lock immediately; wait for it. */
LockRelationForExtension(relation, ExclusiveLock);
/*
* Check if some other backend has extended a block for us while
* we were waiting on the lock.
*/
targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace);
/*
* If some other waiter has already extended the relation, we
* don't need to do so; just use the existing freespace.
*/
if (targetBlock != InvalidBlockNumber)
{
UnlockRelationForExtension(relation, ExclusiveLock);
goto loop;
}
/* Time to bulk-extend. */
RelationAddExtraBlocks(relation, bistate);
}
}
/*
* In addition to whatever extension we performed above, we always add at
* least one block to satisfy our own request.
*
* XXX This does an lseek - rather expensive - but at the moment it is the
* only way to accurately determine how many blocks are in a relation. Is
* it worth keeping an accurate file length in shared memory someplace,
* rather than relying on the kernel to do it for us?
* Steps 5 and 6: allocating the new physical block and loading it into a buffer both happen in this call; P_NEW means "allocate a new block"
*/
buffer = ReadBufferBI(relation, P_NEW, bistate);
/*
* We can be certain that locking the otherBuffer first is OK, since it
* must have a lower page number.
*/
if (otherBuffer != InvalidBuffer)
LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
/*
* Now acquire lock on the new page.
*/
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
/*
* Release the file-extension lock; it's now OK for someone else to extend
* the relation some more. Note that we cannot release this lock before
* we have buffer lock on the new page, or we risk a race condition
* against vacuumlazy.c --- see comments therein.
*/
if (needLock)
UnlockRelationForExtension(relation, ExclusiveLock);
/*
* We need to initialize the empty new page. Double-check that it really
* is empty (this should never happen, but if it does we don't want to
* risk wiping out valid data).
*/
page = BufferGetPage(buffer);
if (!PageIsNew(page))
elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
BufferGetBlockNumber(buffer),
RelationGetRelationName(relation));
PageInit(page, BufferGetPageSize(buffer), 0);
if (len > PageGetHeapFreeSpace(page))
{
/* We should not get here given the test at the top */
elog(PANIC, "tuple is too big: size %zu", len);
}
/*
* Remember the new page as our target for future insertions.
*
* XXX should we enter the new page into the free space map immediately,
* or just keep it for this backend's exclusive use in the short run
* (until VACUUM sees it)? Seems to depend on whether you expect the
* current backend to make more insertions or not, which is probably a
* good bet most of the time. So for now, don't add it to FSM yet.
*
* remember this block as the current insertion target block
*/
RelationSetTargetBlock(relation, BufferGetBlockNumber(buffer));
return buffer;
}
Adding a Physical Block
Steps 5 and 6 above cover the case where no suitable block exists: a new physical block is allocated and loaded into a buffer. To debug this case we use the following test:
- Create a table; at this point the table is empty.
  create table t2(a int);
- Insert one row; because the table is empty, a physical block has to be allocated.
  insert into t2 values(1);
Debugging with this test case, the breakpoint lands, as expected, at steps 5 and 6. Let us now look at how ReadBufferBI is implemented. ReadBufferBI eventually reaches ReadBuffer_common (call chain: ReadBufferBI > ReadBuffer > ReadBufferExtended > ReadBuffer_common), so steps 5 and 6 are ultimately carried out in ReadBuffer_common. They can be broken down into the following sub-steps:
- Call mdnblocks to get the new block's number; the block number is needed when allocating a buffer page for it.
- Allocate a buffer page for the block.
- Call mdextend to create the physical block (see the sketch below).
  The standard way to create the block is to write one block's worth of zero bytes (8 kB by default) to the relation file.
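As a rough, self-contained illustration of what that means at the file level (this is not the actual md.c code, and the function name below is made up for the example):
#include <unistd.h>

#define BLCKSZ 8192				/* default PostgreSQL block size */

/* Append one zero-filled block at position blocknum of an open relation file. */
static ssize_t
extend_by_one_block(int fd, unsigned int blocknum)
{
	char		zeropage[BLCKSZ] = {0};

	/* the new block starts right at the current end of the file */
	if (lseek(fd, (off_t) blocknum * BLCKSZ, SEEK_SET) < 0)
		return -1;
	return write(fd, zeropage, BLCKSZ);
}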
The code of ReadBuffer_common is as follows:
static Buffer
ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
BlockNumber blockNum, ReadBufferMode mode,
BufferAccessStrategy strategy, bool *hit)
{
BufferDesc *bufHdr;
Block bufBlock;
bool found;
bool isExtend;
bool isLocalBuf = SmgrIsTemp(smgr);
*hit = false;
/* Make sure we will have room to remember the buffer pin */
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
isExtend = (blockNum == P_NEW);
TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
smgr->smgr_rnode.node.spcNode,
smgr->smgr_rnode.node.dbNode,
smgr->smgr_rnode.node.relNode,
smgr->smgr_rnode.backend,
isExtend);
/*
* Substitute proper block number if caller asked for P_NEW
* Step 1: get the new block number; smgrnblocks ends up calling mdnblocks
*/
if (isExtend)
blockNum = smgrnblocks(smgr, forkNum);
if (isLocalBuf)
{
bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
if (found)
pgBufferUsage.local_blks_hit++;
else
pgBufferUsage.local_blks_read++;
}
else
{
/*
* lookup the buffer. IO_IN_PROGRESS is set if the requested block is
* not currently in memory.
* Step 2: allocate a buffer page for the block
*/
bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
strategy, &found);
if (found)
pgBufferUsage.shared_blks_hit++;
else
pgBufferUsage.shared_blks_read++;
}
/* At this point we do NOT hold any locks. */
/* if it was already in the buffer pool, we're done */
if (found)
{
if (!isExtend)
{
/* Just need to update stats before we exit */
*hit = true;
VacuumPageHit++;
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageHit;
TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
smgr->smgr_rnode.node.spcNode,
smgr->smgr_rnode.node.dbNode,
smgr->smgr_rnode.node.relNode,
smgr->smgr_rnode.backend,
isExtend,
found);
/*
* In RBM_ZERO_AND_LOCK mode the caller expects the page to be
* locked on return.
*/
if (!isLocalBuf)
{
if (mode == RBM_ZERO_AND_LOCK)
LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
LW_EXCLUSIVE);
else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
}
return BufferDescriptorGetBuffer(bufHdr);
}
/*
* We get here only in the corner case where we are trying to extend
* the relation but we found a pre-existing buffer marked BM_VALID.
* This can happen because mdread doesn't complain about reads beyond
* EOF (when zero_damaged_pages is ON) and so a previous attempt to
* read a block beyond EOF could have left a "valid" zero-filled
* buffer. Unfortunately, we have also seen this case occurring
* because of buggy Linux kernels that sometimes return an
* lseek(SEEK_END) result that doesn't account for a recent write. In
* that situation, the pre-existing buffer would contain valid data
* that we don't want to overwrite. Since the legitimate case should
* always have left a zero-filled buffer, complain if not PageIsNew.
*/
bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
if (!PageIsNew((Page) bufBlock))
ereport(ERROR,
(errmsg("unexpected data beyond EOF in block %u of relation %s",
blockNum, relpath(smgr->smgr_rnode, forkNum)),
errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
/*
* We *must* do smgrextend before succeeding, else the page will not
* be reserved by the kernel, and the next P_NEW call will decide to
* return the same page. Clear the BM_VALID bit, do the StartBufferIO
* call that BufferAlloc didn't, and proceed.
*/
if (isLocalBuf)
{
/* Only need to adjust flags */
uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
Assert(buf_state & BM_VALID);
buf_state &= ~BM_VALID;
pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
}
else
{
/*
* Loop to handle the very small possibility that someone re-sets
* BM_VALID between our clearing it and StartBufferIO inspecting
* it.
*/
do
{
uint32 buf_state = LockBufHdr(bufHdr);
Assert(buf_state & BM_VALID);
buf_state &= ~BM_VALID;
UnlockBufHdr(bufHdr, buf_state);
} while (!StartBufferIO(bufHdr, true));
}
}
/*
* if we have gotten to this point, we have allocated a buffer for the
* page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
* if it's a shared buffer.
*
* Note: if smgrextend fails, we will end up with a buffer that is
* allocated but not marked BM_VALID. P_NEW will still select the same
* block number (because the relation didn't get any longer on disk) and
* so future attempts to extend the relation will find the same buffer (if
* it's not been recycled) but come right back here to try smgrextend
* again.
*/
Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
if (isExtend)
{
/* new buffers are zero-filled */
MemSet((char *) bufBlock, 0, BLCKSZ);
/*
* don't set checksum for all-zero page
* Step 3: create the physical block; smgrextend ends up calling mdextend
*/
smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
/*
* NB: we're *not* doing a ScheduleBufferTagForWriteback here;
* although we're essentially performing a write. At least on linux
* doing so defeats the 'delayed allocation' mechanism, leading to
* increased file fragmentation.
*/
}
else
{
/*
* Read in the page, unless the caller intends to overwrite it and
* just wants us to allocate a buffer.
*/
if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
MemSet((char *) bufBlock, 0, BLCKSZ);
else
{
instr_time io_start,
io_time;
if (track_io_timing)
INSTR_TIME_SET_CURRENT(io_start);
smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
if (track_io_timing)
{
INSTR_TIME_SET_CURRENT(io_time);
INSTR_TIME_SUBTRACT(io_time, io_start);
pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
}
/* check for garbage data */
if (!PageIsVerified((Page) bufBlock, blockNum))
{
if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
{
ereport(WARNING,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg("invalid page in block %u of relation %s; zeroing out page",
blockNum,
relpath(smgr->smgr_rnode, forkNum))));
MemSet((char *) bufBlock, 0, BLCKSZ);
}
else
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg("invalid page in block %u of relation %s",
blockNum,
relpath(smgr->smgr_rnode, forkNum))));
}
}
}
/*
* In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
* the page as valid, to make sure that no other backend sees the zeroed
* page before the caller has had a chance to initialize it.
*
* Since no-one else can be looking at the page contents yet, there is no
* difference between an exclusive lock and a cleanup-strength lock. (Note
* that we cannot use LockBuffer() or LockBufferForCleanup() here, because
* they assert that the buffer is already valid.)
*/
if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
!isLocalBuf)
{
LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
}
if (isLocalBuf)
{
/* Only need to adjust flags */
uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
buf_state |= BM_VALID;
pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
}
else
{
/* Set BM_VALID, terminate IO, and wake up any waiters */
TerminateBufferIO(bufHdr, false, BM_VALID);
}
VacuumPageMiss++;
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageMiss;
TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
smgr->smgr_rnode.node.spcNode,
smgr->smgr_rnode.node.dbNode,
smgr->smgr_rnode.node.relNode,
smgr->smgr_rnode.backend,
isExtend,
found);
return BufferDescriptorGetBuffer(bufHdr);
}
RelationPutHeapTuple
RelationPutHeapTuple eventually calls PageAddItemExtended (call chain: RelationPutHeapTuple > PageAddItem > PageAddItemExtended). PageAddItemExtended consists of three main steps:
- Locate the ItemIdData (line pointer) that will point to the tuple.
- Fill in the ItemIdData; this is the tuple's fixed-size part, recording where the tuple actually lives on the page.
- Copy the tuple data into the page.
The code is as follows:
OffsetNumber
PageAddItemExtended(Page page,
Item item,
Size size,
OffsetNumber offsetNumber,
int flags)
{
PageHeader phdr = (PageHeader) page;
Size alignedSize;
int lower;
int upper;
ItemId itemId;
OffsetNumber limit;
bool needshuffle = false;
/*
* Be wary about corrupted page pointers
*/
if (phdr->pd_lower < SizeOfPageHeaderData ||
phdr->pd_lower > phdr->pd_upper ||
phdr->pd_upper > phdr->pd_special ||
phdr->pd_special > BLCKSZ)
ereport(PANIC,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
phdr->pd_lower, phdr->pd_upper, phdr->pd_special)));
/*
* Select offsetNumber to place the new item at
* Determine the index of this tuple's ItemIdData; it is assigned to offsetNumber and used by the PageGetItemId call in step 1 below
*/
limit = OffsetNumberNext(PageGetMaxOffsetNumber(page));
/* was offsetNumber passed in? */
if (OffsetNumberIsValid(offsetNumber))
{
/* yes, check it */
if ((flags & PAI_OVERWRITE) != 0)
{
if (offsetNumber < limit)
{
itemId = PageGetItemId(phdr, offsetNumber);
if (ItemIdIsUsed(itemId) || ItemIdHasStorage(itemId))
{
elog(WARNING, "will not overwrite a used ItemId");
return InvalidOffsetNumber;
}
}
}
else
{
if (offsetNumber < limit)
needshuffle = true; /* need to move existing linp's */
}
}
else
{
/* offsetNumber was not passed in, so find a free slot */
/* if no free slot, we'll put it at limit (1st open slot) */
if (PageHasFreeLinePointers(phdr))
{
/*
* Look for "recyclable" (unused) ItemId. We check for no storage
* as well, just to be paranoid --- unused items should never have
* storage.
*/
for (offsetNumber = 1; offsetNumber < limit; offsetNumber++)
{
itemId = PageGetItemId(phdr, offsetNumber);
if (!ItemIdIsUsed(itemId) && !ItemIdHasStorage(itemId))
break;
}
if (offsetNumber >= limit)
{
/* the hint is wrong, so reset it */
PageClearHasFreeLinePointers(phdr);
}
}
else
{
/* don't bother searching if hint says there's no free slot */
offsetNumber = limit;
}
}
/*
* Reject placing items beyond the first unused line pointer, unless
* caller asked for that behavior specifically.
*/
if ((flags & PAI_ALLOW_FAR_OFFSET) == 0 && offsetNumber > limit)
{
elog(WARNING, "specified item offset is too large");
return InvalidOffsetNumber;
}
/* Reject placing items beyond heap boundary, if heap */
if ((flags & PAI_IS_HEAP) != 0 && offsetNumber > MaxHeapTuplesPerPage)
{
elog(WARNING, "can't put more than MaxHeapTuplesPerPage items in a heap page");
return InvalidOffsetNumber;
}
/*
* Compute new lower and upper pointers for page, see if it'll fit.
*
* Note: do arithmetic as signed ints, to avoid mistakes if, say,
* alignedSize > pd_upper.
*/
if ((flags & PAI_ALLOW_FAR_OFFSET) != 0)
lower = Max(phdr->pd_lower,
SizeOfPageHeaderData + sizeof(ItemIdData) * offsetNumber);
else if (offsetNumber == limit || needshuffle)
lower = phdr->pd_lower + sizeof(ItemIdData);
else
lower = phdr->pd_lower;
alignedSize = MAXALIGN(size);
upper = (int) phdr->pd_upper - (int) alignedSize;
if (lower > upper)
return InvalidOffsetNumber;
/*
* OK to insert the item. First, shuffle the existing pointers if needed.
* Step 1: locate the ItemIdData for the tuple
*/
itemId = PageGetItemId(phdr, offsetNumber);
if (needshuffle)
memmove(itemId + 1, itemId,
(limit - offsetNumber) * sizeof(ItemIdData));
/*
* set the item pointer
* Step 2: fill in the ItemIdData
*/
ItemIdSetNormal(itemId, upper, size);
/*
* Items normally contain no uninitialized bytes. Core bufpage consumers
* conform, but this is not a necessary coding rule; a new index AM could
* opt to depart from it. However, data type input functions and other
* C-language functions that synthesize datums should initialize all
* bytes; datumIsEqual() relies on this. Testing here, along with the
* similar check in printtup(), helps to catch such mistakes.
*
* Values of the "name" type retrieved via index-only scans may contain
* uninitialized bytes; see comment in btrescan(). Valgrind will report
* this as an error, but it is safe to ignore.
*/
VALGRIND_CHECK_MEM_IS_DEFINED(item, size);
/*
* copy the item's data onto the page
* Step 3: copy the tuple data into the page
*/
memcpy((char *) page + upper, item, size);
/* adjust page header */
phdr->pd_lower = (LocationIndex) lower;
phdr->pd_upper = (LocationIndex) upper;
return offsetNumber;
}
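To make the lower/upper arithmetic concrete, consider a worked example under default settings (BLCKSZ = 8192, a heap page with no special space, freshly initialized by PageInit) and a first tuple of 32 bytes, which is already a MAXALIGN multiple:
before the insert:  pd_lower = 24                        (SizeOfPageHeaderData)
                    pd_upper = 8192
after the insert:   pd_lower = 24 + sizeof(ItemIdData) = 28
                    pd_upper = 8192 - MAXALIGN(32)     = 8160
The new ItemIdData written at offset 24 records lp_off = 8160 and lp_len = 32 (ItemIdSetNormal stores the unaligned size), and the tuple body is memcpy'd to offset 8160. Line pointers therefore grow downward from the page header while tuple data grows upward from the end of the page, and the page is full once lower would pass upper.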