From 8a45b33d2c0ea5e683f989afeda3e8754365c804 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sun, 12 Dec 2021 05:22:25 +0300 Subject: [PATCH 01/56] Custom user shared invalidation message --- src/backend/utils/cache/inval.c | 48 +++++++++++++++++++++++++++++++++ src/include/storage/sinval.h | 11 ++++++++ src/include/utils/inval.h | 4 +++ 3 files changed, 63 insertions(+) diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c index 603aa4157be..e50bb2b681e 100644 --- a/src/backend/utils/cache/inval.c +++ b/src/backend/utils/cache/inval.c @@ -251,6 +251,7 @@ int debug_discard_caches = 0; #define MAX_SYSCACHE_CALLBACKS 64 #define MAX_RELCACHE_CALLBACKS 10 +#define MAX_USERCACHE_CALLBACKS 10 static struct SYSCACHECALLBACK { @@ -272,6 +273,14 @@ static struct RELCACHECALLBACK static int relcache_callback_count = 0; +static struct USERCACHECALLBACK +{ + UsercacheCallbackFunction function; + Datum arg; +} usercache_callback_list[MAX_USERCACHE_CALLBACKS]; + +static int usercache_callback_count = 0; + /* ---------------------------------------------------------------- * Invalidation subgroup support functions * ---------------------------------------------------------------- @@ -692,6 +701,16 @@ InvalidateSystemCachesExtended(bool debug_discard) ccitem->function(ccitem->arg, InvalidOid); } + + for (i = 0; i < usercache_callback_count; i++) + { + struct USERCACHECALLBACK *ccitem = usercache_callback_list + i; + + ccitem->function(ccitem->arg, + InvalidOid, + InvalidOid, + InvalidOid); + } } /* @@ -773,6 +792,19 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg) else if (msg->sn.dbId == MyDatabaseId) InvalidateCatalogSnapshot(); } + else if (msg->id == SHAREDINVALUSERCACHE_ID) + { + int i; + for (i = 0; i < usercache_callback_count; i++) + { + struct USERCACHECALLBACK *ccitem = usercache_callback_list + i; + + ccitem->function(ccitem->arg, + msg->usr.arg1, + msg->usr.arg2, + msg->usr.arg3); + } + } else elog(FATAL, "unrecognized 
SI message ID: %d", msg->id); } @@ -1567,6 +1599,22 @@ CacheRegisterRelcacheCallback(RelcacheCallbackFunction func, ++relcache_callback_count; } +/* + * CacheRegisterUsercacheCallback + */ +void +CacheRegisterUsercacheCallback(UsercacheCallbackFunction func, + Datum arg) +{ + if (usercache_callback_count >= MAX_USERCACHE_CALLBACKS) + elog(FATAL, "out of usercache_callback_list slots"); + + usercache_callback_list[usercache_callback_count].function = func; + usercache_callback_list[usercache_callback_count].arg = arg; + + ++usercache_callback_count; +} + /* * CallSyscacheCallbacks * diff --git a/src/include/storage/sinval.h b/src/include/storage/sinval.h index 8f5744b21bc..6d262ed080c 100644 --- a/src/include/storage/sinval.h +++ b/src/include/storage/sinval.h @@ -110,6 +110,16 @@ typedef struct Oid relId; /* relation ID */ } SharedInvalSnapshotMsg; +#define SHAREDINVALUSERCACHE_ID (-6) + +typedef struct +{ + int8 id; /* type field --- must be first */ + Oid arg1; /* user-specific values */ + Oid arg2; + Oid arg3; +} SharedInvalUserMsg; + typedef union { int8 id; /* type field --- must be first */ @@ -119,6 +129,7 @@ typedef union SharedInvalSmgrMsg sm; SharedInvalRelmapMsg rm; SharedInvalSnapshotMsg sn; + SharedInvalUserMsg usr; } SharedInvalidationMessage; diff --git a/src/include/utils/inval.h b/src/include/utils/inval.h index 24695facf22..225b8e4ddaa 100644 --- a/src/include/utils/inval.h +++ b/src/include/utils/inval.h @@ -22,6 +22,7 @@ extern PGDLLIMPORT int debug_discard_caches; typedef void (*SyscacheCallbackFunction) (Datum arg, int cacheid, uint32 hashvalue); typedef void (*RelcacheCallbackFunction) (Datum arg, Oid relid); +typedef void (*UsercacheCallbackFunction) (Datum arg, Oid arg1, Oid arg2, Oid arg3); extern void AcceptInvalidationMessages(void); @@ -59,6 +60,9 @@ extern void CacheRegisterSyscacheCallback(int cacheid, extern void CacheRegisterRelcacheCallback(RelcacheCallbackFunction func, Datum arg); +extern void 
CacheRegisterUsercacheCallback(UsercacheCallbackFunction func, + Datum arg); + extern void CallSyscacheCallbacks(int cacheid, uint32 hashvalue); extern void InvalidateSystemCaches(void); From 4ca02ea1c2fecc56e52eaf180c6961beb5113ab9 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sun, 12 Dec 2021 05:24:57 +0300 Subject: [PATCH 02/56] CacheInvalidateRelcacheByDbidRelid() --- src/backend/utils/cache/inval.c | 19 +++++++++++++++++++ src/include/utils/inval.h | 2 ++ 2 files changed, 21 insertions(+) diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c index e50bb2b681e..4b779ccd951 100644 --- a/src/backend/utils/cache/inval.c +++ b/src/backend/utils/cache/inval.c @@ -1461,6 +1461,25 @@ CacheInvalidateRelcacheByRelid(Oid relid) ReleaseSysCache(tup); } +/* + * CacheInvalidateRelcacheByDbidRelid + */ +void +CacheInvalidateRelcacheByDbidRelid(Oid dbid, Oid relid) +{ + SharedInvalidationMessage msg; + + PrepareInvalidationState(); + + msg.rc.id = SHAREDINVALRELCACHE_ID; + msg.rc.dbId = dbid; + msg.rc.relId = relid; + /* check AddCatcacheInvalidationMessage() for an explanation */ + VALGRIND_MAKE_MEM_DEFINED(&msg, sizeof(msg)); + + SendSharedInvalidMessages(&msg, 1); +} + /* * CacheInvalidateSmgr diff --git a/src/include/utils/inval.h b/src/include/utils/inval.h index 225b8e4ddaa..69498b9f77f 100644 --- a/src/include/utils/inval.h +++ b/src/include/utils/inval.h @@ -49,6 +49,8 @@ extern void CacheInvalidateRelcacheByTuple(HeapTuple classTuple); extern void CacheInvalidateRelcacheByRelid(Oid relid); +extern void CacheInvalidateRelcacheByDbidRelid(Oid dbid, Oid relid); + extern void CacheInvalidateSmgr(RelFileLocatorBackend rlocator); extern void CacheInvalidateRelmap(Oid databaseId); From 0b30fdcdce663ad8a1dcdf375e3a4d140c7f2866 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sun, 12 Dec 2021 20:26:34 +0300 Subject: [PATCH 03/56] CommitSeqNo data type --- src/include/access/transam.h | 16 ++++++++++++++++ src/include/c.h | 4 +++- 2 
files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/include/access/transam.h b/src/include/access/transam.h index 28a2d287fd5..2ce2fe4dc3f 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -196,6 +196,22 @@ FullTransactionIdAdvance(FullTransactionId *dest) #define FirstUnpinnedObjectId 12000 #define FirstNormalObjectId 16384 +#define COMMITSEQNO_INPROGRESS UINT64CONST(0x0) +#define COMMITSEQNO_NON_DELETED UINT64CONST(0x1) +#define COMMITSEQNO_ABORTED UINT64CONST(0x2) +#define COMMITSEQNO_FROZEN UINT64CONST(0x3) +#define COMMITSEQNO_COMMITTING UINT64CONST(0x4) +#define COMMITSEQNO_FIRST_NORMAL UINT64CONST(0x5) +#define COMMITSEQNO_MAX_NORMAL UINT64CONST(0x7FFFFFFFFFFFFFFF) + +#define COMMITSEQNO_IS_INPROGRESS(csn) ((csn) == COMMITSEQNO_INPROGRESS || (csn) == COMMITSEQNO_NON_DELETED) +#define COMMITSEQNO_IS_NON_DELETED(csn) ((csn) == COMMITSEQNO_NON_DELETED) +#define COMMITSEQNO_IS_ABORTED(csn) ((csn) == COMMITSEQNO_ABORTED) +#define COMMITSEQNO_IS_FROZEN(csn) ((csn) == COMMITSEQNO_FROZEN) +#define COMMITSEQNO_IS_NORMAL(csn) ((csn) >= COMMITSEQNO_FIRST_NORMAL) +#define COMMITSEQNO_IS_COMMITTING(csn) ((csn) == COMMITSEQNO_COMMITTING) +#define COMMITSEQNO_IS_COMMITTED(csn) ((csn) >= COMMITSEQNO_FROZEN) + /* * TransamVariables is a data structure in shared memory that is used to track * OID and XID assignment state. For largely historical reasons, there is diff --git a/src/include/c.h b/src/include/c.h index dc1841346cd..b8f75ac8329 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -637,7 +637,7 @@ typedef double float8; /* * Oid, RegProcedure, TransactionId, SubTransactionId, MultiXactId, - * CommandId + * CommandId, CommitSeqNo */ /* typedef Oid is in postgres_ext.h */ @@ -668,6 +668,8 @@ typedef uint32 CommandId; #define FirstCommandId ((CommandId) 0) #define InvalidCommandId (~(CommandId)0) +typedef uint64 CommitSeqNo; + /* ---------------- * Variable-length datatypes all share the 'struct varlena' header. 
From 7ed5668cee0852c6f718e3795761ffd056ca5ad4 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sun, 12 Dec 2021 20:36:18 +0300 Subject: [PATCH 04/56] Custom TOAST --- contrib/pageinspect/heapfuncs.c | 1 + contrib/test_decoding/test_decoding.c | 2 +- src/backend/access/common/detoast.c | 42 ++++++++++++++++--- src/backend/access/common/toast_compression.c | 7 +++- src/backend/access/common/toast_internals.c | 4 +- src/backend/access/table/toast_helper.c | 6 +-- src/backend/replication/logical/proto.c | 2 +- src/backend/replication/pgoutput/pgoutput.c | 4 +- src/backend/utils/adt/varlena.c | 2 +- src/include/access/detoast.h | 14 +++++++ src/include/varatt.h | 33 ++++++++++++++- src/test/regress/regress.c | 2 +- 12 files changed, 99 insertions(+), 20 deletions(-) diff --git a/contrib/pageinspect/heapfuncs.c b/contrib/pageinspect/heapfuncs.c index 38a539dad1b..cff8b945297 100644 --- a/contrib/pageinspect/heapfuncs.c +++ b/contrib/pageinspect/heapfuncs.c @@ -368,6 +368,7 @@ tuple_data_split_internal(Oid relid, char *tupdata, */ if (VARATT_IS_EXTERNAL(tupdata + off) && !VARATT_IS_EXTERNAL_ONDISK(tupdata + off) && + !VARATT_IS_EXTERNAL_ORIOLEDB(tupdata + off) && !VARATT_IS_EXTERNAL_INDIRECT(tupdata + off)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), diff --git a/contrib/test_decoding/test_decoding.c b/contrib/test_decoding/test_decoding.c index 7c50d139698..02d5c2e07da 100644 --- a/contrib/test_decoding/test_decoding.c +++ b/contrib/test_decoding/test_decoding.c @@ -578,7 +578,7 @@ tuple_to_stringinfo(StringInfo s, TupleDesc tupdesc, HeapTuple tuple, bool skip_ /* print data */ if (isnull) appendStringInfoString(s, "null"); - else if (typisvarlena && VARATT_IS_EXTERNAL_ONDISK(origval)) + else if (typisvarlena && (VARATT_IS_EXTERNAL_ONDISK(origval) || VARATT_IS_EXTERNAL_ORIOLEDB(origval))) appendStringInfoString(s, "unchanged-toast-datum"); else if (!typisvarlena) print_literal(s, typid, diff --git a/src/backend/access/common/detoast.c 
b/src/backend/access/common/detoast.c index 3547cdba56e..d9ab4fb0956 100644 --- a/src/backend/access/common/detoast.c +++ b/src/backend/access/common/detoast.c @@ -26,7 +26,6 @@ static struct varlena *toast_fetch_datum(struct varlena *attr); static struct varlena *toast_fetch_datum_slice(struct varlena *attr, int32 sliceoffset, int32 slicelength); -static struct varlena *toast_decompress_datum(struct varlena *attr); static struct varlena *toast_decompress_datum_slice(struct varlena *attr, int32 slicelength); /* ---------- @@ -46,7 +45,7 @@ detoast_external_attr(struct varlena *attr) { struct varlena *result; - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ONDISK(attr) || VARATT_IS_EXTERNAL_ORIOLEDB(attr)) { /* * This is an external stored plain value @@ -115,7 +114,7 @@ detoast_external_attr(struct varlena *attr) struct varlena * detoast_attr(struct varlena *attr) { - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ONDISK(attr) || VARATT_IS_EXTERNAL_ORIOLEDB(attr)) { /* * This is an externally stored datum --- fetch it back from there @@ -332,6 +331,20 @@ detoast_attr_slice(struct varlena *attr, return result; } +static ToastFunc o_detoast_func = NULL; + +void +register_o_detoast_func(ToastFunc func) +{ + o_detoast_func = func; +} + +void +deregister_o_detoast_func(void) +{ + o_detoast_func = NULL; +} + /* ---------- * toast_fetch_datum - * @@ -347,6 +360,17 @@ toast_fetch_datum(struct varlena *attr) struct varatt_external toast_pointer; int32 attrsize; + if (VARATT_IS_EXTERNAL_ORIOLEDB(attr)) + { + if (o_detoast_func != NULL) + { + result = o_detoast_func(attr); + if (result == NULL) + elog(ERROR, "unexpected NULL detoast result"); + return result; + } + } + if (!VARATT_IS_EXTERNAL_ONDISK(attr)) elog(ERROR, "toast_fetch_datum shouldn't be called for non-ondisk datums"); @@ -467,7 +491,7 @@ toast_fetch_datum_slice(struct varlena *attr, int32 sliceoffset, * * Decompress a compressed version of a varlena datum */ -static struct varlena * 
+struct varlena * toast_decompress_datum(struct varlena *attr) { ToastCompressionId cmid; @@ -547,11 +571,17 @@ toast_raw_datum_size(Datum value) struct varlena *attr = (struct varlena *) DatumGetPointer(value); Size result; - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ORIOLEDB(attr)) + { + OToastExternal *toasted = (OToastExternal*) VARDATA_EXTERNAL(attr); + result = toasted->raw_size + VARHDRSZ; + } + else if (VARATT_IS_EXTERNAL_ONDISK(attr)) { - /* va_rawsize is the size of the original datum -- including header */ struct varatt_external toast_pointer; + /* va_rawsize is the size of the original datum -- including header */ + VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); result = toast_pointer.va_rawsize; } diff --git a/src/backend/access/common/toast_compression.c b/src/backend/access/common/toast_compression.c index 52230f31c68..0717947d689 100644 --- a/src/backend/access/common/toast_compression.c +++ b/src/backend/access/common/toast_compression.c @@ -260,7 +260,12 @@ toast_get_compression_id(struct varlena *attr) * the external toast pointer. If compressed inline, fetch it from the * toast compression header. 
*/ - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ORIOLEDB(attr)) + { + OToastExternal *toasted = (OToastExternal*) VARDATA_EXTERNAL(attr); + cmid = toasted->formatFlags >> ORIOLEDB_EXT_FORMAT_FLAGS_BITS; + } + else if (VARATT_IS_EXTERNAL_ONDISK(attr)) { struct varatt_external toast_pointer; diff --git a/src/backend/access/common/toast_internals.c b/src/backend/access/common/toast_internals.c index 90d0654e629..538a554c917 100644 --- a/src/backend/access/common/toast_internals.c +++ b/src/backend/access/common/toast_internals.c @@ -239,7 +239,7 @@ toast_save_datum(Relation rel, Datum value, { struct varatt_external old_toast_pointer; - Assert(VARATT_IS_EXTERNAL_ONDISK(oldexternal)); + Assert(VARATT_IS_EXTERNAL_ONDISK(oldexternal) || VARATT_IS_EXTERNAL_ORIOLEDB(oldexternal)); /* Must copy to access aligned fields */ VARATT_EXTERNAL_GET_POINTER(old_toast_pointer, oldexternal); if (old_toast_pointer.va_toastrelid == rel->rd_toastoid) @@ -395,7 +395,7 @@ toast_delete_datum(Relation rel, Datum value, bool is_speculative) int validIndex; SnapshotData SnapshotToast; - if (!VARATT_IS_EXTERNAL_ONDISK(attr)) + if (!VARATT_IS_EXTERNAL_ONDISK(attr) && !VARATT_IS_EXTERNAL_ORIOLEDB(attr)) return; /* Must copy to access aligned fields */ diff --git a/src/backend/access/table/toast_helper.c b/src/backend/access/table/toast_helper.c index 53224932f0d..a0738622657 100644 --- a/src/backend/access/table/toast_helper.c +++ b/src/backend/access/table/toast_helper.c @@ -71,10 +71,10 @@ toast_tuple_init(ToastTupleContext *ttc) * we have to delete it later. 
*/ if (att->attlen == -1 && !ttc->ttc_oldisnull[i] && - VARATT_IS_EXTERNAL_ONDISK(old_value)) + (VARATT_IS_EXTERNAL_ONDISK(old_value) || VARATT_IS_EXTERNAL_ORIOLEDB(old_value))) { if (ttc->ttc_isnull[i] || - !VARATT_IS_EXTERNAL_ONDISK(new_value) || + !(VARATT_IS_EXTERNAL_ONDISK(new_value) || VARATT_IS_EXTERNAL_ORIOLEDB(new_value)) || memcmp((char *) old_value, (char *) new_value, VARSIZE_EXTERNAL(old_value)) != 0) { @@ -330,7 +330,7 @@ toast_delete_external(Relation rel, const Datum *values, const bool *isnull, if (isnull[i]) continue; - else if (VARATT_IS_EXTERNAL_ONDISK(value)) + else if (VARATT_IS_EXTERNAL_ONDISK(value) || VARATT_IS_EXTERNAL_ORIOLEDB(value)) toast_delete_datum(rel, value, is_speculative); } } diff --git a/src/backend/replication/logical/proto.c b/src/backend/replication/logical/proto.c index 95c09c95167..db41c955ec1 100644 --- a/src/backend/replication/logical/proto.c +++ b/src/backend/replication/logical/proto.c @@ -814,7 +814,7 @@ logicalrep_write_tuple(StringInfo out, Relation rel, TupleTableSlot *slot, continue; } - if (att->attlen == -1 && VARATT_IS_EXTERNAL_ONDISK(values[i])) + if (att->attlen == -1 && (VARATT_IS_EXTERNAL_ONDISK(values[i]) || VARATT_IS_EXTERNAL_ORIOLEDB(values[i]))) { /* * Unchanged toasted datum. (Note that we don't promise to detect diff --git a/src/backend/replication/pgoutput/pgoutput.c b/src/backend/replication/pgoutput/pgoutput.c index 00e7024563e..e6a4f0063a1 100644 --- a/src/backend/replication/pgoutput/pgoutput.c +++ b/src/backend/replication/pgoutput/pgoutput.c @@ -1320,8 +1320,8 @@ pgoutput_row_filter(Relation relation, TupleTableSlot *old_slot, * VARTAG_INDIRECT. See ReorderBufferToastReplace. 
*/ if (att->attlen == -1 && - VARATT_IS_EXTERNAL_ONDISK(new_slot->tts_values[i]) && - !VARATT_IS_EXTERNAL_ONDISK(old_slot->tts_values[i])) + (VARATT_IS_EXTERNAL_ONDISK(new_slot->tts_values[i]) || VARATT_IS_EXTERNAL_ORIOLEDB(new_slot->tts_values[i])) && + !(VARATT_IS_EXTERNAL_ONDISK(old_slot->tts_values[i]) || VARATT_IS_EXTERNAL_ORIOLEDB(old_slot->tts_values[i])) ) { if (!tmp_new_slot) { diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index d2e2e9bbba0..66625735b21 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -5139,7 +5139,7 @@ pg_column_toast_chunk_id(PG_FUNCTION_ARGS) attr = (struct varlena *) DatumGetPointer(PG_GETARG_DATUM(0)); - if (!VARATT_IS_EXTERNAL_ONDISK(attr)) + if (!(VARATT_IS_EXTERNAL_ONDISK(attr) || VARATT_IS_EXTERNAL_ORIOLEDB(attr))) PG_RETURN_NULL(); VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); diff --git a/src/include/access/detoast.h b/src/include/access/detoast.h index 12d8cdb356a..9d78980e986 100644 --- a/src/include/access/detoast.h +++ b/src/include/access/detoast.h @@ -63,6 +63,13 @@ extern struct varlena *detoast_attr_slice(struct varlena *attr, int32 sliceoffset, int32 slicelength); +/* ---------- + * toast_decompress_datum - + * + * Decompress a compressed version of a varlena datum + */ +extern struct varlena *toast_decompress_datum(struct varlena *attr); + /* ---------- * toast_raw_datum_size - * @@ -79,4 +86,11 @@ extern Size toast_raw_datum_size(Datum value); */ extern Size toast_datum_size(Datum value); +/* + * for in_memory module + */ +typedef struct varlena* (*ToastFunc) (struct varlena *attr); +extern void register_o_detoast_func(ToastFunc func); +extern void deregister_o_detoast_func(void); + #endif /* DETOAST_H */ diff --git a/src/include/varatt.h b/src/include/varatt.h index f04435e9ef3..9da76dea1d6 100644 --- a/src/include/varatt.h +++ b/src/include/varatt.h @@ -38,6 +38,23 @@ typedef struct varatt_external Oid va_toastrelid; /* RelID of TOAST table 
containing it */ } varatt_external; +typedef struct OToastExternal +{ + uint16 data_size; /* length of OToastExternal data */ + int16 attnum; + int32 raw_size; /* original data size */ + int32 toasted_size; /* compressed original data size */ + /* for fetching data from TOAST tree */ + CommitSeqNo csn; + /* for finding TOAST tree */ + Oid datoid; + Oid relid; + Oid relnode; + /* for storing primary index tuple */ + uint8 formatFlags; /* primary index tuple flags */ + char data[FLEXIBLE_ARRAY_MEMBER]; /* data (primary index tuple) */ +} OToastExternal; + /* * These macros define the "saved size" portion of va_extinfo. Its remaining * two high-order bits identify the compression method. @@ -86,17 +103,21 @@ typedef enum vartag_external VARTAG_INDIRECT = 1, VARTAG_EXPANDED_RO = 2, VARTAG_EXPANDED_RW = 3, - VARTAG_ONDISK = 18 + VARTAG_ONDISK = 18, + VARTAG_ORIOLEDB = 34 } vartag_external; /* this test relies on the specific tag values above */ #define VARTAG_IS_EXPANDED(tag) \ (((tag) & ~1) == VARTAG_EXPANDED_RO) +#define O_TOAST_EXTERNAL_SZ offsetof(OToastExternal, data) + #define VARTAG_SIZE(tag) \ ((tag) == VARTAG_INDIRECT ? sizeof(varatt_indirect) : \ VARTAG_IS_EXPANDED(tag) ? sizeof(varatt_expanded) : \ (tag) == VARTAG_ONDISK ? sizeof(varatt_external) : \ + (tag) == VARTAG_ORIOLEDB ? O_TOAST_EXTERNAL_SZ : \ (AssertMacro(false), 0)) /* @@ -282,11 +303,16 @@ typedef struct #define VARDATA_SHORT(PTR) VARDATA_1B(PTR) #define VARTAG_EXTERNAL(PTR) VARTAG_1B_E(PTR) -#define VARSIZE_EXTERNAL(PTR) (VARHDRSZ_EXTERNAL + VARTAG_SIZE(VARTAG_EXTERNAL(PTR))) +#define VARSIZE_EXTERNAL(PTR) (VARHDRSZ_EXTERNAL + VARTAG_SIZE(VARTAG_EXTERNAL(PTR)) \ + + (VARATT_IS_EXTERNAL_ORIOLEDB(PTR) ? 
\ + *((uint16 *) VARDATA_1B_E(PTR)) \ + : 0)) + #define VARDATA_EXTERNAL(PTR) VARDATA_1B_E(PTR) #define VARATT_IS_COMPRESSED(PTR) VARATT_IS_4B_C(PTR) #define VARATT_IS_EXTERNAL(PTR) VARATT_IS_1B_E(PTR) + #define VARATT_IS_EXTERNAL_ONDISK(PTR) \ (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_ONDISK) #define VARATT_IS_EXTERNAL_INDIRECT(PTR) \ @@ -299,6 +325,9 @@ typedef struct (VARATT_IS_EXTERNAL(PTR) && VARTAG_IS_EXPANDED(VARTAG_EXTERNAL(PTR))) #define VARATT_IS_EXTERNAL_NON_EXPANDED(PTR) \ (VARATT_IS_EXTERNAL(PTR) && !VARTAG_IS_EXPANDED(VARTAG_EXTERNAL(PTR))) +#define VARATT_IS_EXTERNAL_ORIOLEDB(PTR) \ + (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_ORIOLEDB) + #define VARATT_IS_SHORT(PTR) VARATT_IS_1B(PTR) #define VARATT_IS_EXTENDED(PTR) (!VARATT_IS_4B_U(PTR)) diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c index 45a6ad3c49e..7b8e91d07b3 100644 --- a/src/test/regress/regress.c +++ b/src/test/regress/regress.c @@ -606,7 +606,7 @@ make_tuple_indirect(PG_FUNCTION_ARGS) continue; /* copy datum, so it still lives later */ - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ONDISK(attr) || VARATT_IS_EXTERNAL_ORIOLEDB(attr)) attr = detoast_external_attr(attr); else { From 7a2fffd4efa8a6ac18b9f29e3b259b03dcb0b455 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Thu, 23 Mar 2023 00:12:00 +0300 Subject: [PATCH 05/56] Allow locking updated tuples in tuple_update() and tuple_delete() Discussion: https://postgr.es/m/CAPpHfdua-YFw3XTprfutzGp28xXLigFtzNbuFY8yPhqeq6X5kg%40mail.gmail.com Reviewed-by: Aleksander Alekseev, Pavel Borisov, Vignesh C, Mason Sharp Reviewed-by: Andres Freund, Chris Travers --- src/backend/access/heap/heapam.c | 205 ++++++++++---- src/backend/access/heap/heapam_handler.c | 94 +++++-- src/backend/access/table/tableam.c | 26 +- src/backend/commands/trigger.c | 55 +--- src/backend/executor/execReplication.c | 19 +- src/backend/executor/nodeModifyTable.c | 331 +++++++++-------------- 
src/include/access/heapam.h | 19 +- src/include/access/tableam.h | 69 +++-- src/include/commands/trigger.h | 4 +- 9 files changed, 474 insertions(+), 348 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 91b20147a00..9d6b0ad10ae 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -2669,10 +2669,11 @@ xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask) } /* - * heap_delete - delete a tuple + * heap_delete - delete a tuple, optionally fetching it into a slot * * See table_tuple_delete() for an explanation of the parameters, except that - * this routine directly takes a tuple rather than a slot. + * this routine directly takes a tuple rather than a slot. Also, we don't + * place a lock on the tuple in this function, just fetch the existing version. * * In the failure cases, the routine fills *tmfd with the tuple's t_ctid, * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last @@ -2681,8 +2682,9 @@ xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask) */ TM_Result heap_delete(Relation relation, ItemPointer tid, - CommandId cid, Snapshot crosscheck, bool wait, - TM_FailureData *tmfd, bool changingPart) + CommandId cid, Snapshot crosscheck, int options, + TM_FailureData *tmfd, bool changingPart, + TupleTableSlot *oldSlot) { TM_Result result; TransactionId xid = GetCurrentTransactionId(); @@ -2760,7 +2762,7 @@ heap_delete(Relation relation, ItemPointer tid, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("attempted to delete invisible tuple"))); } - else if (result == TM_BeingModified && wait) + else if (result == TM_BeingModified && (options & TABLE_MODIFY_WAIT)) { TransactionId xwait; uint16 infomask; @@ -2901,7 +2903,30 @@ heap_delete(Relation relation, ItemPointer tid, tmfd->cmax = HeapTupleHeaderGetCmax(tp.t_data); else tmfd->cmax = InvalidCommandId; - UnlockReleaseBuffer(buffer); + + /* + * If we're asked to lock the updated 
tuple, we just fetch the + * existing tuple. That lets the caller save some resources on + * placing the lock. + */ + if (result == TM_Updated && + (options & TABLE_MODIFY_LOCK_UPDATED)) + { + BufferHeapTupleTableSlot *bslot; + + Assert(TTS_IS_BUFFERTUPLE(oldSlot)); + bslot = (BufferHeapTupleTableSlot *) oldSlot; + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + bslot->base.tupdata = tp; + ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, + oldSlot, + buffer); + } + else + { + UnlockReleaseBuffer(buffer); + } if (have_tuple_lock) UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive); if (vmbuffer != InvalidBuffer) @@ -3075,8 +3100,24 @@ */ CacheInvalidateHeapTuple(relation, &tp, NULL); - /* Now we can release the buffer */ - ReleaseBuffer(buffer); + /* Fetch the old tuple version if we're asked for that. */ + if (options & TABLE_MODIFY_FETCH_OLD_TUPLE) + { + BufferHeapTupleTableSlot *bslot; + + Assert(TTS_IS_BUFFERTUPLE(oldSlot)); + bslot = (BufferHeapTupleTableSlot *) oldSlot; + + bslot->base.tupdata = tp; + ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, + oldSlot, + buffer); + } + else + { + /* Now we can release the buffer */ + ReleaseBuffer(buffer); + } /* * Release the lmgr tuple lock, if we had it. @@ -3108,8 +3149,8 @@ simple_heap_delete(Relation relation, ItemPointer tid) result = heap_delete(relation, tid, GetCurrentCommandId(true), InvalidSnapshot, - true /* wait for commit */ , - &tmfd, false /* changingPart */ ); + TABLE_MODIFY_WAIT /* wait for commit */ , + &tmfd, false /* changingPart */ , NULL); switch (result) { case TM_SelfModified: @@ -3136,10 +3177,11 @@ simple_heap_delete(Relation relation, ItemPointer tid) } /* - * heap_update - replace a tuple + * heap_update - replace a tuple, optionally fetching it into a slot * * See table_tuple_update() for an explanation of the parameters, except that - * this routine directly takes a tuple rather than a slot. 
+ * this routine directly takes a tuple rather than a slot. Also, we don't + * place a lock on the tuple in this function, just fetch the existing version. * * In the failure cases, the routine fills *tmfd with the tuple's t_ctid, * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last @@ -3148,9 +3190,9 @@ simple_heap_delete(Relation relation, ItemPointer tid) */ TM_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, - CommandId cid, Snapshot crosscheck, bool wait, + CommandId cid, Snapshot crosscheck, int options, TM_FailureData *tmfd, LockTupleMode *lockmode, - TU_UpdateIndexes *update_indexes) + TU_UpdateIndexes *update_indexes, TupleTableSlot *oldSlot) { TM_Result result; TransactionId xid = GetCurrentTransactionId(); @@ -3327,7 +3369,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer); /* see below about the "no wait" case */ - Assert(result != TM_BeingModified || wait); + Assert(result != TM_BeingModified || (options & TABLE_MODIFY_WAIT)); if (result == TM_Invisible) { @@ -3336,7 +3378,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("attempted to update invisible tuple"))); } - else if (result == TM_BeingModified && wait) + else if (result == TM_BeingModified && (options & TABLE_MODIFY_WAIT)) { TransactionId xwait; uint16 infomask; @@ -3540,7 +3582,30 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data); else tmfd->cmax = InvalidCommandId; - UnlockReleaseBuffer(buffer); + + /* + * If we're asked to lock the updated tuple, we just fetch the + * existing tuple. That lets the caller save some resources on + * placing the lock. 
+ */ + if (result == TM_Updated && + (options & TABLE_MODIFY_LOCK_UPDATED)) + { + BufferHeapTupleTableSlot *bslot; + + Assert(TTS_IS_BUFFERTUPLE(oldSlot)); + bslot = (BufferHeapTupleTableSlot *) oldSlot; + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + bslot->base.tupdata = oldtup; + ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, + oldSlot, + buffer); + } + else + { + UnlockReleaseBuffer(buffer); + } if (have_tuple_lock) UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode); if (vmbuffer != InvalidBuffer) @@ -4019,7 +4084,26 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, /* Now we can release the buffer(s) */ if (newbuf != buffer) ReleaseBuffer(newbuf); - ReleaseBuffer(buffer); + + /* Fetch the old tuple version if we're asked for that. */ + if (options & TABLE_MODIFY_FETCH_OLD_TUPLE) + { + BufferHeapTupleTableSlot *bslot; + + Assert(TTS_IS_BUFFERTUPLE(oldSlot)); + bslot = (BufferHeapTupleTableSlot *) oldSlot; + + bslot->base.tupdata = oldtup; + ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, + oldSlot, + buffer); + } + else + { + /* Now we can release the buffer */ + ReleaseBuffer(buffer); + } + if (BufferIsValid(vmbuffer_new)) ReleaseBuffer(vmbuffer_new); if (BufferIsValid(vmbuffer)) @@ -4227,8 +4311,8 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup, result = heap_update(relation, otid, tup, GetCurrentCommandId(true), InvalidSnapshot, - true /* wait for commit */ , - &tmfd, &lockmode, update_indexes); + TABLE_MODIFY_WAIT /* wait for commit */ , + &tmfd, &lockmode, update_indexes, NULL); switch (result) { case TM_SelfModified: @@ -4291,12 +4375,14 @@ get_mxact_status_for_lock(LockTupleMode mode, bool is_update) * tuples. 
* * Output parameters: - * *tuple: all fields filled in - * *buffer: set to buffer holding tuple (pinned but not locked at exit) + * *slot: BufferHeapTupleTableSlot filled with tuple * *tmfd: filled in failure cases (see below) * * Function results are the same as the ones for table_tuple_lock(). * + * If *slot already contains the target tuple, it takes advantage on that by + * skipping the ReadBuffer() call. + * * In the failure cases other than TM_Invisible, the routine fills * *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact, * if necessary), and t_cmax (the last only for TM_SelfModified, @@ -4307,15 +4393,14 @@ get_mxact_status_for_lock(LockTupleMode mode, bool is_update) * See README.tuplock for a thorough explanation of this mechanism. */ TM_Result -heap_lock_tuple(Relation relation, HeapTuple tuple, +heap_lock_tuple(Relation relation, ItemPointer tid, TupleTableSlot *slot, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, - bool follow_updates, - Buffer *buffer, TM_FailureData *tmfd) + bool follow_updates, TM_FailureData *tmfd) { TM_Result result; - ItemPointer tid = &(tuple->t_self); ItemId lp; Page page; + Buffer buffer; Buffer vmbuffer = InvalidBuffer; BlockNumber block; TransactionId xid, @@ -4327,8 +4412,24 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, bool skip_tuple_lock = false; bool have_tuple_lock = false; bool cleared_all_frozen = false; + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + HeapTuple tuple = &bslot->base.tupdata; + + Assert(TTS_IS_BUFFERTUPLE(slot)); - *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); + /* Take advantage if slot already contains the relevant tuple */ + if (!TTS_EMPTY(slot) && + slot->tts_tableOid == relation->rd_id && + ItemPointerCompare(&slot->tts_tid, tid) == 0 && + BufferIsValid(bslot->buffer)) + { + buffer = bslot->buffer; + IncrBufferRefCount(buffer); + } + else + { + buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); + 
} block = ItemPointerGetBlockNumber(tid); /* @@ -4337,21 +4438,22 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, * in the middle of changing this, so we'll need to recheck after we have * the lock. */ - if (PageIsAllVisible(BufferGetPage(*buffer))) + if (PageIsAllVisible(BufferGetPage(buffer))) visibilitymap_pin(relation, block, &vmbuffer); - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - page = BufferGetPage(*buffer); + page = BufferGetPage(buffer); lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid)); Assert(ItemIdIsNormal(lp)); + tuple->t_self = *tid; tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp); tuple->t_len = ItemIdGetLength(lp); tuple->t_tableOid = RelationGetRelid(relation); l3: - result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer); + result = HeapTupleSatisfiesUpdate(tuple, cid, buffer); if (result == TM_Invisible) { @@ -4380,7 +4482,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, infomask2 = tuple->t_data->t_infomask2; ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid); - LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); /* * If any subtransaction of the current top transaction already holds @@ -4532,12 +4634,12 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, { result = res; /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto failed; } } - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* * Make sure it's still an appropriate lock, else start over. @@ -4572,7 +4674,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) && !HEAP_XMAX_IS_EXCL_LOCKED(infomask)) { - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* * Make sure it's still an appropriate lock, else start over. 
@@ -4600,7 +4702,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, * No conflict, but if the xmax changed under us in the * meantime, start over. */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), xwait)) @@ -4612,7 +4714,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, } else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask)) { - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* if the xmax changed in the meantime, start over */ if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || @@ -4640,7 +4742,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, TransactionIdIsCurrentTransactionId(xwait)) { /* ... but if the xmax changed in the meantime, start over */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), xwait)) @@ -4662,7 +4764,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, */ if (require_sleep && (result == TM_Updated || result == TM_Deleted)) { - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto failed; } else if (require_sleep) @@ -4687,7 +4789,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, */ result = TM_WouldBlock; /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto failed; } @@ -4713,7 +4815,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, { result = TM_WouldBlock; /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto failed; } break; @@ -4753,7 +4855,7 @@ heap_lock_tuple(Relation relation, 
HeapTuple tuple, { result = TM_WouldBlock; /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto failed; } break; @@ -4779,12 +4881,12 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, { result = res; /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto failed; } } - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* * xwait is done, but if xwait had just locked the tuple then some @@ -4806,7 +4908,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, * don't check for this in the multixact case, because some * locker transactions might still be running. */ - UpdateXmaxHintBits(tuple->t_data, *buffer, xwait); + UpdateXmaxHintBits(tuple->t_data, buffer, xwait); } } @@ -4865,9 +4967,9 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, */ if (vmbuffer == InvalidBuffer && PageIsAllVisible(page)) { - LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); visibilitymap_pin(relation, block, &vmbuffer); - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto l3; } @@ -4930,7 +5032,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, cleared_all_frozen = true; - MarkBufferDirty(*buffer); + MarkBufferDirty(buffer); /* * XLOG stuff. 
You might think that we don't need an XLOG record because @@ -4950,7 +5052,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, XLogRecPtr recptr; XLogBeginInsert(); - XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD); + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self); xlrec.xmax = xid; @@ -4971,7 +5073,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, result = TM_Ok; out_locked: - LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); out_unlocked: if (BufferIsValid(vmbuffer)) @@ -4989,6 +5091,9 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, if (have_tuple_lock) UnlockTupleTuplock(relation, tid, mode); + /* Put the target tuple to the slot */ + ExecStorePinnedBufferHeapTuple(tuple, slot, buffer); + return result; } diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 6f8b1b79298..ed830464aea 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -46,6 +46,12 @@ #include "utils/builtins.h" #include "utils/rel.h" +static TM_Result heapam_tuple_lock(Relation relation, ItemPointer tid, + Snapshot snapshot, TupleTableSlot *slot, + CommandId cid, LockTupleMode mode, + LockWaitPolicy wait_policy, uint8 flags, + TM_FailureData *tmfd); + static void reform_and_rewrite_tuple(HeapTuple tuple, Relation OldHeap, Relation NewHeap, Datum *values, bool *isnull, RewriteState rwstate); @@ -299,23 +305,55 @@ heapam_tuple_complete_speculative(Relation relation, TupleTableSlot *slot, static TM_Result heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid, - Snapshot snapshot, Snapshot crosscheck, bool wait, - TM_FailureData *tmfd, bool changingPart) + Snapshot snapshot, Snapshot crosscheck, int options, + TM_FailureData *tmfd, bool changingPart, + TupleTableSlot *oldSlot) { + TM_Result result; + /* * Currently Deleting of index tuples are handled at vacuum, in case if * 
the storage itself is cleaning the dead tuples by itself, it is the * time to call the index tuple deletion also. */ - return heap_delete(relation, tid, cid, crosscheck, wait, tmfd, changingPart); + result = heap_delete(relation, tid, cid, crosscheck, options, + tmfd, changingPart, oldSlot); + + /* + * If the tuple has been concurrently updated, then get the lock on it. + * (Do only if caller asked for this by setting the + * TABLE_MODIFY_LOCK_UPDATED option) With the lock held retry of the + * delete should succeed even if there are more concurrent update + * attempts. + */ + if (result == TM_Updated && (options & TABLE_MODIFY_LOCK_UPDATED)) + { + /* + * heapam_tuple_lock() will take advantage of tuple loaded into + * oldSlot by heap_delete(). + */ + result = heapam_tuple_lock(relation, tid, snapshot, + oldSlot, cid, LockTupleExclusive, + (options & TABLE_MODIFY_WAIT) ? + LockWaitBlock : + LockWaitSkip, + TUPLE_LOCK_FLAG_FIND_LAST_VERSION, + tmfd); + + if (result == TM_Ok) + return TM_Updated; + } + + return result; } static TM_Result heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, CommandId cid, Snapshot snapshot, Snapshot crosscheck, - bool wait, TM_FailureData *tmfd, - LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes) + int options, TM_FailureData *tmfd, + LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes, + TupleTableSlot *oldSlot) { bool shouldFree = true; HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); @@ -325,8 +363,8 @@ heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, slot->tts_tableOid = RelationGetRelid(relation); tuple->t_tableOid = slot->tts_tableOid; - result = heap_update(relation, otid, tuple, cid, crosscheck, wait, - tmfd, lockmode, update_indexes); + result = heap_update(relation, otid, tuple, cid, crosscheck, options, + tmfd, lockmode, update_indexes, oldSlot); ItemPointerCopy(&tuple->t_self, &slot->tts_tid); /* @@ -353,6 +391,31 @@ 
heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, if (shouldFree) pfree(tuple); + /* + * If the tuple has been concurrently updated, then get the lock on it. + * (Do only if caller asked for this by setting the + * TABLE_MODIFY_LOCK_UPDATED option) With the lock held retry of the + * update should succeed even if there are more concurrent update + * attempts. + */ + if (result == TM_Updated && (options & TABLE_MODIFY_LOCK_UPDATED)) + { + /* + * heapam_tuple_lock() will take advantage of tuple loaded into + * oldSlot by heap_update(). + */ + result = heapam_tuple_lock(relation, otid, snapshot, + oldSlot, cid, *lockmode, + (options & TABLE_MODIFY_WAIT) ? + LockWaitBlock : + LockWaitSkip, + TUPLE_LOCK_FLAG_FIND_LAST_VERSION, + tmfd); + + if (result == TM_Ok) + return TM_Updated; + } + return result; } @@ -364,7 +427,6 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, { BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; TM_Result result; - Buffer buffer; HeapTuple tuple = &bslot->base.tupdata; bool follow_updates; @@ -374,9 +436,8 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, Assert(TTS_IS_BUFFERTUPLE(slot)); tuple_lock_retry: - tuple->t_self = *tid; - result = heap_lock_tuple(relation, tuple, cid, mode, wait_policy, - follow_updates, &buffer, tmfd); + result = heap_lock_tuple(relation, tid, slot, cid, mode, wait_policy, + follow_updates, tmfd); if (result == TM_Updated && (flags & TUPLE_LOCK_FLAG_FIND_LAST_VERSION)) @@ -384,8 +445,6 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, /* Should not encounter speculative tuple on recheck */ Assert(!HeapTupleHeaderIsSpeculative(tuple->t_data)); - ReleaseBuffer(buffer); - if (!ItemPointerEquals(&tmfd->ctid, &tuple->t_self)) { SnapshotData SnapshotDirty; @@ -407,6 +466,8 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, InitDirtySnapshot(SnapshotDirty); for (;;) { + Buffer 
buffer = InvalidBuffer; + if (ItemPointerIndicatesMovedPartitions(tid)) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), @@ -501,7 +562,7 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, /* * This is a live tuple, so try to lock it again. */ - ReleaseBuffer(buffer); + ExecStorePinnedBufferHeapTuple(tuple, slot, buffer); goto tuple_lock_retry; } @@ -512,7 +573,7 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, */ if (tuple->t_data == NULL) { - Assert(!BufferIsValid(buffer)); + ReleaseBuffer(buffer); return TM_Deleted; } @@ -565,9 +626,6 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, slot->tts_tableOid = RelationGetRelid(relation); tuple->t_tableOid = slot->tts_tableOid; - /* store in slot, transferring existing pin */ - ExecStorePinnedBufferHeapTuple(tuple, slot, buffer); - return result; } diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index e57a0b7ea31..8d3675be959 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -287,16 +287,23 @@ simple_table_tuple_insert(Relation rel, TupleTableSlot *slot) * via ereport(). 
*/ void -simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot) +simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot, + TupleTableSlot *oldSlot) { TM_Result result; TM_FailureData tmfd; + int options = TABLE_MODIFY_WAIT; /* wait for commit */ + + /* Fetch old tuple if the relevant slot is provided */ + if (oldSlot) + options |= TABLE_MODIFY_FETCH_OLD_TUPLE; result = table_tuple_delete(rel, tid, GetCurrentCommandId(true), snapshot, InvalidSnapshot, - true /* wait for commit */ , - &tmfd, false /* changingPart */ ); + options, + &tmfd, false /* changingPart */ , + oldSlot); switch (result) { @@ -335,17 +342,24 @@ void simple_table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, Snapshot snapshot, - TU_UpdateIndexes *update_indexes) + TU_UpdateIndexes *update_indexes, + TupleTableSlot *oldSlot) { TM_Result result; TM_FailureData tmfd; LockTupleMode lockmode; + int options = TABLE_MODIFY_WAIT; /* wait for commit */ + + /* Fetch old tuple if the relevant slot is provided */ + if (oldSlot) + options |= TABLE_MODIFY_FETCH_OLD_TUPLE; result = table_tuple_update(rel, otid, slot, GetCurrentCommandId(true), snapshot, InvalidSnapshot, - true /* wait for commit */ , - &tmfd, &lockmode, update_indexes); + options, + &tmfd, &lockmode, update_indexes, + oldSlot); switch (result) { diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 58b7fc5bbd5..8ac9ccf5abf 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -2772,8 +2772,8 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, void ExecARDeleteTriggers(EState *estate, ResultRelInfo *relinfo, - ItemPointer tupleid, HeapTuple fdw_trigtuple, + TupleTableSlot *slot, TransitionCaptureState *transition_capture, bool is_crosspart_update) { @@ -2782,20 +2782,11 @@ ExecARDeleteTriggers(EState *estate, if ((trigdesc && trigdesc->trig_delete_after_row) || (transition_capture && 
transition_capture->tcs_delete_old_table)) { - TupleTableSlot *slot = ExecGetTriggerOldSlot(estate, relinfo); - - Assert(HeapTupleIsValid(fdw_trigtuple) ^ ItemPointerIsValid(tupleid)); - if (fdw_trigtuple == NULL) - GetTupleForTrigger(estate, - NULL, - relinfo, - tupleid, - LockTupleExclusive, - slot, - NULL, - NULL, - NULL); - else + /* + * Put the FDW old tuple to the slot. Otherwise, caller is expected + * to have old tuple alredy fetched to the slot. + */ + if (fdw_trigtuple != NULL) ExecForceStoreHeapTuple(fdw_trigtuple, slot, false); AfterTriggerSaveEvent(estate, relinfo, NULL, NULL, @@ -3086,18 +3077,17 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, * Note: 'src_partinfo' and 'dst_partinfo', when non-NULL, refer to the source * and destination partitions, respectively, of a cross-partition update of * the root partitioned table mentioned in the query, given by 'relinfo'. - * 'tupleid' in that case refers to the ctid of the "old" tuple in the source - * partition, and 'newslot' contains the "new" tuple in the destination - * partition. This interface allows to support the requirements of - * ExecCrossPartitionUpdateForeignKey(); is_crosspart_update must be true in - * that case. + * 'oldslot' contains the "old" tuple in the source partition, and 'newslot' + * contains the "new" tuple in the destination partition. This interface + * allows to support the requirements of ExecCrossPartitionUpdateForeignKey(); + * is_crosspart_update must be true in that case. */ void ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, ResultRelInfo *src_partinfo, ResultRelInfo *dst_partinfo, - ItemPointer tupleid, HeapTuple fdw_trigtuple, + TupleTableSlot *oldslot, TupleTableSlot *newslot, List *recheckIndexes, TransitionCaptureState *transition_capture, @@ -3116,29 +3106,14 @@ ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, * separately for DELETE and INSERT to capture transition table rows. 
* In such case, either old tuple or new tuple can be NULL. */ - TupleTableSlot *oldslot; - ResultRelInfo *tupsrc; - Assert((src_partinfo != NULL && dst_partinfo != NULL) || !is_crosspart_update); - tupsrc = src_partinfo ? src_partinfo : relinfo; - oldslot = ExecGetTriggerOldSlot(estate, tupsrc); - - if (fdw_trigtuple == NULL && ItemPointerIsValid(tupleid)) - GetTupleForTrigger(estate, - NULL, - tupsrc, - tupleid, - LockTupleExclusive, - oldslot, - NULL, - NULL, - NULL); - else if (fdw_trigtuple != NULL) + if (fdw_trigtuple != NULL) + { + Assert(oldslot); ExecForceStoreHeapTuple(fdw_trigtuple, oldslot, false); - else - ExecClearTuple(oldslot); + } AfterTriggerSaveEvent(estate, relinfo, src_partinfo, dst_partinfo, diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index d0a89cd5778..0cad843fb69 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -577,6 +577,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, { List *recheckIndexes = NIL; TU_UpdateIndexes update_indexes; + TupleTableSlot *oldSlot = NULL; /* Compute stored generated columns */ if (rel->rd_att->constr && @@ -590,8 +591,12 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, if (rel->rd_rel->relispartition) ExecPartitionCheck(resultRelInfo, slot, estate, true); + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_update_after_row) + oldSlot = ExecGetTriggerOldSlot(estate, resultRelInfo); + simple_table_tuple_update(rel, tid, slot, estate->es_snapshot, - &update_indexes); + &update_indexes, oldSlot); if (resultRelInfo->ri_NumIndices > 0 && (update_indexes != TU_None)) recheckIndexes = ExecInsertIndexTuples(resultRelInfo, @@ -602,7 +607,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, /* AFTER ROW UPDATE Triggers */ ExecARUpdateTriggers(estate, resultRelInfo, NULL, NULL, - tid, NULL, slot, + NULL, oldSlot, slot, recheckIndexes, NULL, false); list_free(recheckIndexes); @@ 
-636,12 +641,18 @@ ExecSimpleRelationDelete(ResultRelInfo *resultRelInfo, if (!skip_tuple) { + TupleTableSlot *oldSlot = NULL; + + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_delete_after_row) + oldSlot = ExecGetTriggerOldSlot(estate, resultRelInfo); + /* OK, delete the tuple */ - simple_table_tuple_delete(rel, tid, estate->es_snapshot); + simple_table_tuple_delete(rel, tid, estate->es_snapshot, oldSlot); /* AFTER ROW DELETE Triggers */ ExecARDeleteTriggers(estate, resultRelInfo, - tid, NULL, NULL, false); + NULL, oldSlot, NULL, false); } } diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 4913e493199..fded69d095e 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -136,7 +136,7 @@ static void ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, ResultRelInfo *sourcePartInfo, ResultRelInfo *destPartInfo, ItemPointer tupleid, - TupleTableSlot *oldslot, + TupleTableSlot *oldSlot, TupleTableSlot *newslot); static bool ExecOnConflictUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, @@ -576,6 +576,10 @@ ExecInitInsertProjection(ModifyTableState *mtstate, resultRelInfo->ri_newTupleSlot = table_slot_create(resultRelInfo->ri_RelationDesc, &estate->es_tupleTable); + if (node->onConflictAction == ONCONFLICT_UPDATE) + resultRelInfo->ri_oldTupleSlot = + table_slot_create(resultRelInfo->ri_RelationDesc, + &estate->es_tupleTable); /* Build ProjectionInfo if needed (it probably isn't). 
*/ if (need_projection) @@ -1165,7 +1169,7 @@ ExecInsert(ModifyTableContext *context, ExecARUpdateTriggers(estate, resultRelInfo, NULL, NULL, NULL, - NULL, + resultRelInfo->ri_oldTupleSlot, slot, NULL, mtstate->mt_transition_capture, @@ -1345,7 +1349,8 @@ ExecDeletePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ static TM_Result ExecDeleteAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, bool changingPart) + ItemPointer tupleid, bool changingPart, int options, + TupleTableSlot *oldSlot) { EState *estate = context->estate; @@ -1353,9 +1358,10 @@ ExecDeleteAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, estate->es_output_cid, estate->es_snapshot, estate->es_crosscheck_snapshot, - true /* wait for commit */ , + options /* wait for commit */ , &context->tmfd, - changingPart); + changingPart, + oldSlot); } /* @@ -1367,7 +1373,8 @@ ExecDeleteAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ static void ExecDeleteEpilogue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, bool changingPart) + ItemPointer tupleid, HeapTuple oldtuple, + TupleTableSlot *slot, bool changingPart) { ModifyTableState *mtstate = context->mtstate; EState *estate = context->estate; @@ -1385,8 +1392,8 @@ ExecDeleteEpilogue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, { ExecARUpdateTriggers(estate, resultRelInfo, NULL, NULL, - tupleid, oldtuple, - NULL, NULL, mtstate->mt_transition_capture, + oldtuple, + slot, NULL, NULL, mtstate->mt_transition_capture, false); /* @@ -1397,10 +1404,30 @@ ExecDeleteEpilogue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, } /* AFTER ROW DELETE Triggers */ - ExecARDeleteTriggers(estate, resultRelInfo, tupleid, oldtuple, + ExecARDeleteTriggers(estate, resultRelInfo, oldtuple, slot, ar_delete_trig_tcs, changingPart); } +/* + * Initializes the tuple slot in a ResultRelInfo for DELETE action. 
+ * + * We mark 'projectNewInfoValid' even though the projections themselves + * are not initialized here. + */ +static void +ExecInitDeleteTupleSlot(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo) +{ + EState *estate = mtstate->ps.state; + + Assert(!resultRelInfo->ri_projectNewInfoValid); + + resultRelInfo->ri_oldTupleSlot = + table_slot_create(resultRelInfo->ri_RelationDesc, + &estate->es_tupleTable); + resultRelInfo->ri_projectNewInfoValid = true; +} + /* ---------------------------------------------------------------- * ExecDelete * @@ -1428,6 +1455,7 @@ ExecDelete(ModifyTableContext *context, ResultRelInfo *resultRelInfo, ItemPointer tupleid, HeapTuple oldtuple, + TupleTableSlot *oldSlot, bool processReturning, bool changingPart, bool canSetTag, @@ -1491,6 +1519,11 @@ ExecDelete(ModifyTableContext *context, } else { + int options = TABLE_MODIFY_WAIT | TABLE_MODIFY_FETCH_OLD_TUPLE; + + if (!IsolationUsesXactSnapshot()) + options |= TABLE_MODIFY_LOCK_UPDATED; + /* * delete the tuple * @@ -1501,7 +1534,8 @@ ExecDelete(ModifyTableContext *context, * transaction-snapshot mode transactions. */ ldelete: - result = ExecDeleteAct(context, resultRelInfo, tupleid, changingPart); + result = ExecDeleteAct(context, resultRelInfo, tupleid, changingPart, + options, oldSlot); if (tmresult) *tmresult = result; @@ -1548,7 +1582,6 @@ ExecDelete(ModifyTableContext *context, case TM_Updated: { - TupleTableSlot *inputslot; TupleTableSlot *epqslot; if (IsolationUsesXactSnapshot()) @@ -1557,87 +1590,29 @@ ExecDelete(ModifyTableContext *context, errmsg("could not serialize access due to concurrent update"))); /* - * Already know that we're going to need to do EPQ, so - * fetch tuple directly into the right slot. + * We need to do EPQ. The latest tuple is already found + * and locked as a result of TABLE_MODIFY_LOCK_UPDATED. 
*/ - EvalPlanQualBegin(context->epqstate); - inputslot = EvalPlanQualSlot(context->epqstate, resultRelationDesc, - resultRelInfo->ri_RangeTableIndex); + Assert(context->tmfd.traversed); + epqslot = EvalPlanQual(context->epqstate, + resultRelationDesc, + resultRelInfo->ri_RangeTableIndex, + oldSlot); + if (TupIsNull(epqslot)) + /* Tuple not passing quals anymore, exiting... */ + return NULL; - result = table_tuple_lock(resultRelationDesc, tupleid, - estate->es_snapshot, - inputslot, estate->es_output_cid, - LockTupleExclusive, LockWaitBlock, - TUPLE_LOCK_FLAG_FIND_LAST_VERSION, - &context->tmfd); - - switch (result) + /* + * If requested, skip delete and pass back the updated + * row. + */ + if (epqreturnslot) { - case TM_Ok: - Assert(context->tmfd.traversed); - epqslot = EvalPlanQual(context->epqstate, - resultRelationDesc, - resultRelInfo->ri_RangeTableIndex, - inputslot); - if (TupIsNull(epqslot)) - /* Tuple not passing quals anymore, exiting... */ - return NULL; - - /* - * If requested, skip delete and pass back the - * updated row. - */ - if (epqreturnslot) - { - *epqreturnslot = epqslot; - return NULL; - } - else - goto ldelete; - - case TM_SelfModified: - - /* - * This can be reached when following an update - * chain from a tuple updated by another session, - * reaching a tuple that was already updated in - * this transaction. If previously updated by this - * command, ignore the delete, otherwise error - * out. - * - * See also TM_SelfModified response to - * table_tuple_delete() above. 
- */ - if (context->tmfd.cmax != estate->es_output_cid) - ereport(ERROR, - (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION), - errmsg("tuple to be deleted was already modified by an operation triggered by the current command"), - errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows."))); - return NULL; - - case TM_Deleted: - /* tuple already deleted; nothing to do */ - return NULL; - - default: - - /* - * TM_Invisible should be impossible because we're - * waiting for updated row versions, and would - * already have errored out if the first version - * is invisible. - * - * TM_Updated should be impossible, because we're - * locking the latest version via - * TUPLE_LOCK_FLAG_FIND_LAST_VERSION. - */ - elog(ERROR, "unexpected table_tuple_lock status: %u", - result); - return NULL; + *epqreturnslot = epqslot; + return NULL; } - - Assert(false); - break; + else + goto ldelete; } case TM_Deleted: @@ -1671,7 +1646,8 @@ ExecDelete(ModifyTableContext *context, if (tupleDeleted) *tupleDeleted = true; - ExecDeleteEpilogue(context, resultRelInfo, tupleid, oldtuple, changingPart); + ExecDeleteEpilogue(context, resultRelInfo, tupleid, oldtuple, + oldSlot, changingPart); /* Process RETURNING if present and if requested */ if (processReturning && resultRelInfo->ri_projectReturning) @@ -1689,17 +1665,13 @@ ExecDelete(ModifyTableContext *context, } else { + /* Copy old tuple to the returning slot */ slot = ExecGetReturningSlot(estate, resultRelInfo); if (oldtuple != NULL) - { ExecForceStoreHeapTuple(oldtuple, slot, false); - } else - { - if (!table_tuple_fetch_row_version(resultRelationDesc, tupleid, - SnapshotAny, slot)) - elog(ERROR, "failed to fetch deleted tuple for DELETE RETURNING"); - } + ExecCopySlot(slot, oldSlot); + Assert(!TupIsNull(slot)); } rslot = ExecProcessReturning(resultRelInfo, slot, context->planSlot); @@ -1799,12 +1771,16 @@ ExecCrossPartitionUpdate(ModifyTableContext *context, MemoryContextSwitchTo(oldcxt); } 
+ /* Make sure ri_oldTupleSlot is initialized. */ + if (unlikely(!resultRelInfo->ri_projectNewInfoValid)) + ExecInitUpdateProjection(mtstate, resultRelInfo); + /* * Row movement, part 1. Delete the tuple, but skip RETURNING processing. * We want to return rows from INSERT. */ ExecDelete(context, resultRelInfo, - tupleid, oldtuple, + tupleid, oldtuple, resultRelInfo->ri_oldTupleSlot, false, /* processReturning */ true, /* changingPart */ false, /* canSetTag */ @@ -1845,21 +1821,13 @@ ExecCrossPartitionUpdate(ModifyTableContext *context, return true; else { - /* Fetch the most recent version of old tuple. */ - TupleTableSlot *oldSlot; - - /* ... but first, make sure ri_oldTupleSlot is initialized. */ - if (unlikely(!resultRelInfo->ri_projectNewInfoValid)) - ExecInitUpdateProjection(mtstate, resultRelInfo); - oldSlot = resultRelInfo->ri_oldTupleSlot; - if (!table_tuple_fetch_row_version(resultRelInfo->ri_RelationDesc, - tupleid, - SnapshotAny, - oldSlot)) - elog(ERROR, "failed to fetch tuple being updated"); - /* and project the new tuple to retry the UPDATE with */ + /* + * ExecDelete already fetches the most recent version of old tuple + * to resultRelInfo->ri_RelationDesc. So, just project the new + * tuple to retry the UPDATE with. 
+ */ *retry_slot = ExecGetUpdateNewTuple(resultRelInfo, epqslot, - oldSlot); + resultRelInfo->ri_oldTupleSlot); return false; } } @@ -1978,7 +1946,8 @@ ExecUpdatePrepareSlot(ResultRelInfo *resultRelInfo, static TM_Result ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, ItemPointer tupleid, HeapTuple oldtuple, TupleTableSlot *slot, - bool canSetTag, UpdateContext *updateCxt) + bool canSetTag, int options, TupleTableSlot *oldSlot, + UpdateContext *updateCxt) { EState *estate = context->estate; Relation resultRelationDesc = resultRelInfo->ri_RelationDesc; @@ -2070,7 +2039,8 @@ ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, ExecCrossPartitionUpdateForeignKey(context, resultRelInfo, insert_destrel, - tupleid, slot, + tupleid, + resultRelInfo->ri_oldTupleSlot, inserted_tuple); return TM_Ok; @@ -2113,10 +2083,10 @@ ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, estate->es_output_cid, estate->es_snapshot, estate->es_crosscheck_snapshot, - true /* wait for commit */ , + options /* wait for commit */ , &context->tmfd, &updateCxt->lockmode, - &updateCxt->updateIndexes); - + &updateCxt->updateIndexes, + oldSlot); return result; } @@ -2129,7 +2099,8 @@ ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, static void ExecUpdateEpilogue(ModifyTableContext *context, UpdateContext *updateCxt, ResultRelInfo *resultRelInfo, ItemPointer tupleid, - HeapTuple oldtuple, TupleTableSlot *slot) + HeapTuple oldtuple, TupleTableSlot *slot, + TupleTableSlot *oldSlot) { ModifyTableState *mtstate = context->mtstate; List *recheckIndexes = NIL; @@ -2145,7 +2116,7 @@ ExecUpdateEpilogue(ModifyTableContext *context, UpdateContext *updateCxt, /* AFTER ROW UPDATE Triggers */ ExecARUpdateTriggers(context->estate, resultRelInfo, NULL, NULL, - tupleid, oldtuple, slot, + oldtuple, oldSlot, slot, recheckIndexes, mtstate->operation == CMD_INSERT ? 
mtstate->mt_oc_transition_capture : @@ -2234,7 +2205,7 @@ ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, /* Perform the root table's triggers. */ ExecARUpdateTriggers(context->estate, rootRelInfo, sourcePartInfo, destPartInfo, - tupleid, NULL, newslot, NIL, NULL, true); + NULL, oldslot, newslot, NIL, NULL, true); } /* ---------------------------------------------------------------- @@ -2256,6 +2227,7 @@ ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, * NULL when the foreign table has no relevant triggers. * * slot contains the new tuple value to be stored. + * oldSlot is the slot to store the old tuple. * planSlot is the output of the ModifyTable's subplan; we use it * to access values from other input tables (for RETURNING), * row-ID junk columns, etc. @@ -2268,7 +2240,7 @@ ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, static TupleTableSlot * ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, ItemPointer tupleid, HeapTuple oldtuple, TupleTableSlot *slot, - bool canSetTag) + TupleTableSlot *oldSlot, bool canSetTag, bool locked) { EState *estate = context->estate; Relation resultRelationDesc = resultRelInfo->ri_RelationDesc; @@ -2321,6 +2293,11 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, } else { + int options = TABLE_MODIFY_WAIT | TABLE_MODIFY_FETCH_OLD_TUPLE; + + if (!locked && !IsolationUsesXactSnapshot()) + options |= TABLE_MODIFY_LOCK_UPDATED; + /* * If we generate a new candidate tuple after EvalPlanQual testing, we * must loop back here to try again. 
(We don't need to redo triggers, @@ -2330,7 +2307,7 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ redo_act: result = ExecUpdateAct(context, resultRelInfo, tupleid, oldtuple, slot, - canSetTag, &updateCxt); + canSetTag, options, oldSlot, &updateCxt); /* * If ExecUpdateAct reports that a cross-partition update was done, @@ -2381,88 +2358,30 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, case TM_Updated: { - TupleTableSlot *inputslot; TupleTableSlot *epqslot; - TupleTableSlot *oldSlot; if (IsolationUsesXactSnapshot()) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); + Assert(!locked); /* - * Already know that we're going to need to do EPQ, so - * fetch tuple directly into the right slot. + * We need to do EPQ. The latest tuple is already found + * and locked as a result of TABLE_MODIFY_LOCK_UPDATED. */ - inputslot = EvalPlanQualSlot(context->epqstate, resultRelationDesc, - resultRelInfo->ri_RangeTableIndex); - - result = table_tuple_lock(resultRelationDesc, tupleid, - estate->es_snapshot, - inputslot, estate->es_output_cid, - updateCxt.lockmode, LockWaitBlock, - TUPLE_LOCK_FLAG_FIND_LAST_VERSION, - &context->tmfd); - - switch (result) - { - case TM_Ok: - Assert(context->tmfd.traversed); - - epqslot = EvalPlanQual(context->epqstate, - resultRelationDesc, - resultRelInfo->ri_RangeTableIndex, - inputslot); - if (TupIsNull(epqslot)) - /* Tuple not passing quals anymore, exiting... */ - return NULL; - - /* Make sure ri_oldTupleSlot is initialized. */ - if (unlikely(!resultRelInfo->ri_projectNewInfoValid)) - ExecInitUpdateProjection(context->mtstate, - resultRelInfo); - - /* Fetch the most recent version of old tuple. 
*/ - oldSlot = resultRelInfo->ri_oldTupleSlot; - if (!table_tuple_fetch_row_version(resultRelationDesc, - tupleid, - SnapshotAny, - oldSlot)) - elog(ERROR, "failed to fetch tuple being updated"); - slot = ExecGetUpdateNewTuple(resultRelInfo, - epqslot, oldSlot); - goto redo_act; - - case TM_Deleted: - /* tuple already deleted; nothing to do */ - return NULL; - - case TM_SelfModified: - - /* - * This can be reached when following an update - * chain from a tuple updated by another session, - * reaching a tuple that was already updated in - * this transaction. If previously modified by - * this command, ignore the redundant update, - * otherwise error out. - * - * See also TM_SelfModified response to - * table_tuple_update() above. - */ - if (context->tmfd.cmax != estate->es_output_cid) - ereport(ERROR, - (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION), - errmsg("tuple to be updated was already modified by an operation triggered by the current command"), - errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows."))); - return NULL; - - default: - /* see table_tuple_lock call in ExecDelete() */ - elog(ERROR, "unexpected table_tuple_lock status: %u", - result); - return NULL; - } + Assert(context->tmfd.traversed); + epqslot = EvalPlanQual(context->epqstate, + resultRelationDesc, + resultRelInfo->ri_RangeTableIndex, + oldSlot); + if (TupIsNull(epqslot)) + /* Tuple not passing quals anymore, exiting... 
*/ + return NULL; + slot = ExecGetUpdateNewTuple(resultRelInfo, + epqslot, + oldSlot); + goto redo_act; } break; @@ -2486,7 +2405,7 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, (estate->es_processed)++; ExecUpdateEpilogue(context, &updateCxt, resultRelInfo, tupleid, oldtuple, - slot); + slot, oldSlot); /* Process RETURNING if present */ if (resultRelInfo->ri_projectReturning) @@ -2704,7 +2623,8 @@ ExecOnConflictUpdate(ModifyTableContext *context, *returning = ExecUpdate(context, resultRelInfo, conflictTid, NULL, resultRelInfo->ri_onConflict->oc_ProjSlot, - canSetTag); + existing, + canSetTag, true); /* * Clear out existing tuple, as there might not be another conflict among @@ -2985,7 +2905,7 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, Assert(oldtuple == NULL); result = ExecUpdateAct(context, resultRelInfo, tupleid, - NULL, newslot, canSetTag, + NULL, newslot, canSetTag, TABLE_MODIFY_WAIT, NULL, &updateCxt); /* @@ -3007,7 +2927,8 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, if (result == TM_Ok) { ExecUpdateEpilogue(context, &updateCxt, resultRelInfo, - tupleid, NULL, newslot); + tupleid, NULL, newslot, + resultRelInfo->ri_oldTupleSlot); mtstate->mt_merge_updated += 1; } break; @@ -3037,13 +2958,13 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, Assert(oldtuple == NULL); result = ExecDeleteAct(context, resultRelInfo, tupleid, - false); + false, TABLE_MODIFY_WAIT, NULL); } if (result == TM_Ok) { ExecDeleteEpilogue(context, resultRelInfo, tupleid, NULL, - false); + resultRelInfo->ri_oldTupleSlot, false); mtstate->mt_merge_deleted += 1; } break; @@ -4108,12 +4029,18 @@ ExecModifyTable(PlanState *pstate) /* Now apply the update. 
*/ slot = ExecUpdate(&context, resultRelInfo, tupleid, oldtuple, - slot, node->canSetTag); + slot, resultRelInfo->ri_oldTupleSlot, + node->canSetTag, false); break; case CMD_DELETE: + /* Initialize slot for DELETE to fetch the old tuple */ + if (unlikely(!resultRelInfo->ri_projectNewInfoValid)) + ExecInitDeleteTupleSlot(node, resultRelInfo); + slot = ExecDelete(&context, resultRelInfo, tupleid, oldtuple, - true, false, node->canSetTag, NULL, NULL, NULL); + resultRelInfo->ri_oldTupleSlot, true, false, + node->canSetTag, NULL, NULL, NULL); break; case CMD_MERGE: diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 9e9aec88a62..871c640c8db 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -322,19 +322,22 @@ extern void heap_multi_insert(Relation relation, struct TupleTableSlot **slots, int ntuples, CommandId cid, int options, BulkInsertState bistate); extern TM_Result heap_delete(Relation relation, ItemPointer tid, - CommandId cid, Snapshot crosscheck, bool wait, - struct TM_FailureData *tmfd, bool changingPart); + CommandId cid, Snapshot crosscheck, int options, + struct TM_FailureData *tmfd, bool changingPart, + TupleTableSlot *oldSlot); extern void heap_finish_speculative(Relation relation, ItemPointer tid); extern void heap_abort_speculative(Relation relation, ItemPointer tid); extern TM_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, - CommandId cid, Snapshot crosscheck, bool wait, + CommandId cid, Snapshot crosscheck, int options, struct TM_FailureData *tmfd, LockTupleMode *lockmode, - TU_UpdateIndexes *update_indexes); -extern TM_Result heap_lock_tuple(Relation relation, HeapTuple tuple, - CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, - bool follow_updates, - Buffer *buffer, struct TM_FailureData *tmfd); + TU_UpdateIndexes *update_indexes, + TupleTableSlot *oldSlot); +extern TM_Result heap_lock_tuple(Relation relation, ItemPointer tid, + TupleTableSlot *slot, + 
CommandId cid, LockTupleMode mode, + LockWaitPolicy wait_policy, bool follow_updates, + struct TM_FailureData *tmfd); extern void heap_inplace_update(Relation relation, HeapTuple tuple); extern bool heap_prepare_freeze_tuple(HeapTupleHeader tuple, diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index da661289c1f..504cd383f57 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -267,6 +267,11 @@ typedef struct TM_IndexDeleteOp /* Follow update chain and lock latest version of tuple */ #define TUPLE_LOCK_FLAG_FIND_LAST_VERSION (1 << 1) +/* "options" flag bits for table_tuple_update and table_tuple_delete */ +#define TABLE_MODIFY_WAIT 0x0001 +#define TABLE_MODIFY_FETCH_OLD_TUPLE 0x0002 +#define TABLE_MODIFY_LOCK_UPDATED 0x0004 + /* Typedef for callback function for table_index_build_scan */ typedef void (*IndexBuildCallback) (Relation index, @@ -536,9 +541,10 @@ typedef struct TableAmRoutine CommandId cid, Snapshot snapshot, Snapshot crosscheck, - bool wait, + int options, TM_FailureData *tmfd, - bool changingPart); + bool changingPart, + TupleTableSlot *oldSlot); /* see table_tuple_update() for reference about parameters */ TM_Result (*tuple_update) (Relation rel, @@ -547,10 +553,11 @@ typedef struct TableAmRoutine CommandId cid, Snapshot snapshot, Snapshot crosscheck, - bool wait, + int options, TM_FailureData *tmfd, LockTupleMode *lockmode, - TU_UpdateIndexes *update_indexes); + TU_UpdateIndexes *update_indexes, + TupleTableSlot *oldSlot); /* see table_tuple_lock() for reference about parameters */ TM_Result (*tuple_lock) (Relation rel, @@ -1462,7 +1469,7 @@ table_multi_insert(Relation rel, TupleTableSlot **slots, int nslots, } /* - * Delete a tuple. + * Delete a tuple (and optionally lock the last tuple version). * * NB: do not call this directly unless prepared to deal with * concurrent-update conditions. Use simple_table_tuple_delete instead. 
@@ -1473,11 +1480,21 @@ table_multi_insert(Relation rel, TupleTableSlot **slots, int nslots, * cid - delete command ID (used for visibility test, and stored into * cmax if successful) * crosscheck - if not InvalidSnapshot, also check tuple against this - * wait - true if should wait for any conflicting update to commit/abort + * options: + * If TABLE_MODIFY_WAIT, wait for any conflicting update to commit/abort. + * If TABLE_MODIFY_FETCH_OLD_TUPLE option is given, the existing tuple is + * fetched into oldSlot when the update is successful. + * If TABLE_MODIFY_LOCK_UPDATED option is given and the tuple is + * concurrently updated, then the last tuple version is locked and fetched + * into oldSlot. + * * Output parameters: * tmfd - filled in failure cases (see below) * changingPart - true iff the tuple is being moved to another partition * table due to an update of the partition key. Otherwise, false. + * oldSlot - slot to save the deleted or locked tuple. Can be NULL if none of + * TABLE_MODIFY_FETCH_OLD_TUPLE or TABLE_MODIFY_LOCK_UPDATED options + * is specified. * * Normal, successful return value is TM_Ok, which means we did actually * delete it. Failure return codes are TM_SelfModified, TM_Updated, and @@ -1489,16 +1506,18 @@ table_multi_insert(Relation rel, TupleTableSlot **slots, int nslots, */ static inline TM_Result table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid, - Snapshot snapshot, Snapshot crosscheck, bool wait, - TM_FailureData *tmfd, bool changingPart) + Snapshot snapshot, Snapshot crosscheck, int options, + TM_FailureData *tmfd, bool changingPart, + TupleTableSlot *oldSlot) { return rel->rd_tableam->tuple_delete(rel, tid, cid, snapshot, crosscheck, - wait, tmfd, changingPart); + options, tmfd, changingPart, + oldSlot); } /* - * Update a tuple. + * Update a tuple (and optionally lock the last tuple version). * * NB: do not call this directly unless you are prepared to deal with * concurrent-update conditions. 
Use simple_table_tuple_update instead. @@ -1510,13 +1529,23 @@ table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid, * cid - update command ID (used for visibility test, and stored into * cmax/cmin if successful) * crosscheck - if not InvalidSnapshot, also check old tuple against this - * wait - true if should wait for any conflicting update to commit/abort + * options: + * If TABLE_MODIFY_WAIT, wait for any conflicting update to commit/abort. + * If TABLE_MODIFY_FETCH_OLD_TUPLE option is given, the existing tuple is + * fetched into oldSlot when the update is successful. + * If TABLE_MODIFY_LOCK_UPDATED option is given and the tuple is + * concurrently updated, then the last tuple version is locked and fetched + * into oldSlot. + * * Output parameters: * tmfd - filled in failure cases (see below) * lockmode - filled with lock mode acquired on tuple * update_indexes - in success cases this is set to true if new index entries * are required for this tuple - * + * oldSlot - slot to save the deleted or locked tuple. Can be NULL if none of + * TABLE_MODIFY_FETCH_OLD_TUPLE or TABLE_MODIFY_LOCK_UPDATED options + * is specified. + * Normal, successful return value is TM_Ok, which means we did actually * update it. Failure return codes are TM_SelfModified, TM_Updated, and * TM_BeingModified (the last only possible if wait == false). 
@@ -1534,13 +1563,15 @@ table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid, static inline TM_Result table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, CommandId cid, Snapshot snapshot, Snapshot crosscheck, - bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode, - TU_UpdateIndexes *update_indexes) + int options, TM_FailureData *tmfd, LockTupleMode *lockmode, + TU_UpdateIndexes *update_indexes, + TupleTableSlot *oldSlot) { return rel->rd_tableam->tuple_update(rel, otid, slot, cid, snapshot, crosscheck, - wait, tmfd, - lockmode, update_indexes); + options, tmfd, + lockmode, update_indexes, + oldSlot); } /* @@ -2054,10 +2085,12 @@ table_scan_sample_next_tuple(TableScanDesc scan, extern void simple_table_tuple_insert(Relation rel, TupleTableSlot *slot); extern void simple_table_tuple_delete(Relation rel, ItemPointer tid, - Snapshot snapshot); + Snapshot snapshot, + TupleTableSlot *oldSlot); extern void simple_table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, Snapshot snapshot, - TU_UpdateIndexes *update_indexes); + TU_UpdateIndexes *update_indexes, + TupleTableSlot *oldSlot); /* ---------------------------------------------------------------------------- diff --git a/src/include/commands/trigger.h b/src/include/commands/trigger.h index 8a5a9fe6422..cb968d03ecd 100644 --- a/src/include/commands/trigger.h +++ b/src/include/commands/trigger.h @@ -216,8 +216,8 @@ extern bool ExecBRDeleteTriggers(EState *estate, TM_FailureData *tmfd); extern void ExecARDeleteTriggers(EState *estate, ResultRelInfo *relinfo, - ItemPointer tupleid, HeapTuple fdw_trigtuple, + TupleTableSlot *slot, TransitionCaptureState *transition_capture, bool is_crosspart_update); extern bool ExecIRDeleteTriggers(EState *estate, @@ -240,8 +240,8 @@ extern void ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, ResultRelInfo *src_partinfo, ResultRelInfo *dst_partinfo, - ItemPointer tupleid, HeapTuple fdw_trigtuple, + TupleTableSlot 
*oldslot, TupleTableSlot *newslot, List *recheckIndexes, TransitionCaptureState *transition_capture, From 1d6745e222a52c2aef7a003bbc73efb86ba3325a Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Wed, 22 Mar 2023 16:47:09 -0700 Subject: [PATCH 06/56] Add EvalPlanQual delete returning isolation test Author: Andres Freund Reviewed-by: Pavel Borisov Discussion: https://www.postgresql.org/message-id/flat/CAPpHfdua-YFw3XTprfutzGp28xXLigFtzNbuFY8yPhqeq6X5kg%40mail.gmail.com --- .../isolation/expected/eval-plan-qual-2.out | 37 +++++++++++++++++++ src/test/isolation/isolation_schedule | 1 + .../isolation/specs/eval-plan-qual-2.spec | 30 +++++++++++++++ 3 files changed, 68 insertions(+) create mode 100644 src/test/isolation/expected/eval-plan-qual-2.out create mode 100644 src/test/isolation/specs/eval-plan-qual-2.spec diff --git a/src/test/isolation/expected/eval-plan-qual-2.out b/src/test/isolation/expected/eval-plan-qual-2.out new file mode 100644 index 00000000000..117a3d3be8d --- /dev/null +++ b/src/test/isolation/expected/eval-plan-qual-2.out @@ -0,0 +1,37 @@ +Parsed test spec with 3 sessions + +starting permutation: read_u wx2 wb1 c2 c1 read_u read +step read_u: SELECT * FROM accounts; +accountid|balance|balance2 +---------+-------+-------- +checking | 600| 1200 +savings | 600| 1200 +(2 rows) + +step wx2: UPDATE accounts SET balance = balance + 450 WHERE accountid = 'checking' RETURNING balance; +balance +------- + 1050 +(1 row) + +step wb1: DELETE FROM accounts WHERE balance = 600 RETURNING *; +step c2: COMMIT; +step wb1: <... 
completed> +accountid|balance|balance2 +---------+-------+-------- +savings | 600| 1200 +(1 row) + +step c1: COMMIT; +step read_u: SELECT * FROM accounts; +accountid|balance|balance2 +---------+-------+-------- +checking | 1050| 2100 +(1 row) + +step read: SELECT * FROM accounts ORDER BY accountid; +accountid|balance|balance2 +---------+-------+-------- +checking | 1050| 2100 +(1 row) + diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule index 143109aa4da..f4df2146488 100644 --- a/src/test/isolation/isolation_schedule +++ b/src/test/isolation/isolation_schedule @@ -36,6 +36,7 @@ test: fk-partitioned-2 test: fk-snapshot test: subxid-overflow test: eval-plan-qual +test: eval-plan-qual-2 test: eval-plan-qual-trigger test: inplace-inval test: intra-grant-inplace diff --git a/src/test/isolation/specs/eval-plan-qual-2.spec b/src/test/isolation/specs/eval-plan-qual-2.spec new file mode 100644 index 00000000000..30447bef24a --- /dev/null +++ b/src/test/isolation/specs/eval-plan-qual-2.spec @@ -0,0 +1,30 @@ +setup +{ + CREATE TABLE accounts (accountid text PRIMARY KEY, balance numeric not null, + balance2 numeric GENERATED ALWAYS AS (balance * 2) STORED); + INSERT INTO accounts VALUES ('checking', 600), ('savings', 600); +} + +teardown +{ + DROP TABLE accounts; +} + +session s1 +setup { BEGIN ISOLATION LEVEL READ COMMITTED; } +step wb1 { DELETE FROM accounts WHERE balance = 600 RETURNING *; } +step c1 { COMMIT; } + +session s2 +setup { BEGIN ISOLATION LEVEL READ COMMITTED; } +step wx2 { UPDATE accounts SET balance = balance + 450 WHERE accountid = 'checking' RETURNING balance; } +step c2 { COMMIT; } + +session s3 +setup { BEGIN ISOLATION LEVEL READ COMMITTED; } +step read { SELECT * FROM accounts ORDER BY accountid; } +step read_u { SELECT * FROM accounts; } + +teardown { COMMIT; } + +permutation read_u wx2 wb1 c2 c1 read_u read From b9a4b5d93f4bd3c1cfb1cc30930c1c260add73bf Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 
13 Dec 2021 00:04:21 +0300 Subject: [PATCH 07/56] Improvements to TableAM API --- contrib/amcheck/verify_nbtree.c | 2 +- src/backend/access/common/detoast.c | 20 +- src/backend/access/common/heaptuple.c | 4 + src/backend/access/common/reloptions.c | 6 +- src/backend/access/heap/heapam_handler.c | 343 ++++++++++++++++++- src/backend/access/table/tableam.c | 4 +- src/backend/access/table/tableamapi.c | 26 +- src/backend/catalog/aclchk.c | 2 +- src/backend/commands/analyze.c | 14 +- src/backend/commands/tablecmds.c | 58 ++-- src/backend/commands/trigger.c | 237 ++++++++++--- src/backend/executor/execExprInterp.c | 4 +- src/backend/executor/execMain.c | 28 +- src/backend/executor/execReplication.c | 10 +- src/backend/executor/nodeLockRows.c | 17 +- src/backend/executor/nodeModifyTable.c | 417 +++++++---------------- src/backend/executor/nodeTidscan.c | 2 +- src/backend/nodes/read.c | 11 + src/backend/optimizer/plan/planner.c | 16 +- src/backend/optimizer/prep/preptlist.c | 20 +- src/backend/optimizer/util/appendinfo.c | 32 +- src/backend/optimizer/util/inherit.c | 48 ++- src/backend/parser/parse_relation.c | 13 + src/backend/postmaster/autovacuum.c | 4 +- src/backend/rewrite/rewriteHandler.c | 1 + src/backend/utils/adt/ri_triggers.c | 5 +- src/backend/utils/cache/relcache.c | 38 ++- src/backend/utils/sort/tuplestore.c | 30 ++ src/include/access/reloptions.h | 2 + src/include/access/sysattr.h | 3 +- src/include/access/tableam.h | 189 ++++++---- src/include/commands/trigger.h | 4 +- src/include/commands/vacuum.h | 3 + src/include/foreign/fdwapi.h | 6 +- src/include/nodes/execnodes.h | 3 + src/include/nodes/parsenodes.h | 1 + src/include/nodes/plannodes.h | 4 +- src/include/nodes/primnodes.h | 7 + src/include/nodes/readfuncs.h | 1 + src/include/optimizer/appendinfo.h | 5 + src/include/optimizer/planner.h | 3 +- src/include/utils/tuplestore.h | 3 + src/include/varatt.h | 2 + 43 files changed, 1115 insertions(+), 533 deletions(-) diff --git a/contrib/amcheck/verify_nbtree.c 
b/contrib/amcheck/verify_nbtree.c index 34990c5cea3..ed4497f9620 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -994,7 +994,7 @@ heap_entry_is_visible(BtreeCheckState *state, ItemPointer tid) TupleTableSlot *slot = table_slot_create(state->heaprel, NULL); tid_visible = table_tuple_fetch_row_version(state->heaprel, - tid, state->snapshot, slot); + PointerGetDatum(tid), state->snapshot, slot); if (slot != NULL) ExecDropSingleTupleTableSlot(slot); diff --git a/src/backend/access/common/detoast.c b/src/backend/access/common/detoast.c index d9ab4fb0956..27d0e37607a 100644 --- a/src/backend/access/common/detoast.c +++ b/src/backend/access/common/detoast.c @@ -28,6 +28,8 @@ static struct varlena *toast_fetch_datum_slice(struct varlena *attr, int32 slicelength); static struct varlena *toast_decompress_datum_slice(struct varlena *attr, int32 slicelength); +static ToastFunc o_detoast_func = NULL; + /* ---------- * detoast_external_attr - * @@ -222,7 +224,14 @@ detoast_attr_slice(struct varlena *attr, else if (pg_add_s32_overflow(sliceoffset, slicelength, &slicelimit)) slicelength = slicelimit = -1; - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ORIOLEDB(attr)) + { + Assert(o_detoast_func != NULL); + preslice = o_detoast_func(attr); + if (preslice == NULL) + elog(ERROR, "unexpected NULL detoast result"); + } + else if (VARATT_IS_EXTERNAL_ONDISK(attr)) { struct varatt_external toast_pointer; @@ -331,8 +340,6 @@ detoast_attr_slice(struct varlena *attr, return result; } -static ToastFunc o_detoast_func = NULL; - void register_o_detoast_func(ToastFunc func) { @@ -633,7 +640,12 @@ toast_datum_size(Datum value) struct varlena *attr = (struct varlena *) DatumGetPointer(value); Size result; - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ORIOLEDB(attr)) + { + OToastExternal *toasted = (OToastExternal*) VARDATA_EXTERNAL(attr); + result = toasted->toasted_size - VARHDRSZ; + } + else if 
(VARATT_IS_EXTERNAL_ONDISK(attr)) { /* * Attribute is stored externally - return the extsize whether diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c index 9e3407bf987..a1b8a99b739 100644 --- a/src/backend/access/common/heaptuple.c +++ b/src/backend/access/common/heaptuple.c @@ -755,6 +755,10 @@ heap_getsysattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull) case TableOidAttributeNumber: result = ObjectIdGetDatum(tup->t_tableOid); break; + case RowIdAttributeNumber: + *isnull = true; + result = 0; + break; default: elog(ERROR, "invalid attnum: %d", attnum); result = 0; /* keep compiler quiet */ diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index d6eb5d85599..963995388bb 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -24,6 +24,7 @@ #include "access/nbtree.h" #include "access/reloptions.h" #include "access/spgist_private.h" +#include "access/tableam.h" #include "catalog/pg_type.h" #include "commands/defrem.h" #include "commands/tablespace.h" @@ -1377,7 +1378,7 @@ untransformRelOptions(Datum options) */ bytea * extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, - amoptions_function amoptions) + const TableAmRoutine *tableam, amoptions_function amoptions) { bytea *options; bool isnull; @@ -1399,7 +1400,8 @@ extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, case RELKIND_RELATION: case RELKIND_TOASTVALUE: case RELKIND_MATVIEW: - options = heap_reloptions(classForm->relkind, datum, false); + options = tableam_reloptions(tableam, classForm->relkind, + datum, false); break; case RELKIND_PARTITIONED_TABLE: options = partitioned_table_reloptions(datum, false); diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index ed830464aea..2c2c7061189 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -23,6 +23,7 @@ #include 
"access/heapam.h" #include "access/heaptoast.h" #include "access/multixact.h" +#include "access/reloptions.h" #include "access/rewriteheap.h" #include "access/syncscan.h" #include "access/tableam.h" @@ -46,7 +47,7 @@ #include "utils/builtins.h" #include "utils/rel.h" -static TM_Result heapam_tuple_lock(Relation relation, ItemPointer tid, +static TM_Result heapam_tuple_lock(Relation relation, Datum tid, Snapshot snapshot, TupleTableSlot *slot, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, uint8 flags, @@ -76,6 +77,20 @@ heapam_slot_callbacks(Relation relation) return &TTSOpsBufferHeapTuple; } +static RowRefType +heapam_get_row_ref_type(Relation rel) +{ + return ROW_REF_TID; +} + +static void +heapam_free_rd_amcache(Relation rel) +{ + if (rel->rd_amcache) + pfree(rel->rd_amcache); + rel->rd_amcache = NULL; +} + /* ------------------------------------------------------------------------ * Index Scan Callbacks for heap AM @@ -185,7 +200,7 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, static bool heapam_fetch_row_version(Relation relation, - ItemPointer tid, + Datum tupleid, Snapshot snapshot, TupleTableSlot *slot) { @@ -194,7 +209,7 @@ heapam_fetch_row_version(Relation relation, Assert(TTS_IS_BUFFERTUPLE(slot)); - bslot->base.tupdata.t_self = *tid; + bslot->base.tupdata.t_self = *DatumGetItemPointer(tupleid); if (heap_fetch(relation, snapshot, &bslot->base.tupdata, &buffer, false)) { /* store in slot, transferring existing pin */ @@ -244,7 +259,7 @@ heapam_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot, * ---------------------------------------------------------------------------- */ -static void +static TupleTableSlot * heapam_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid, int options, BulkInsertState bistate) { @@ -261,6 +276,8 @@ heapam_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid, if (shouldFree) pfree(tuple); + + return slot; } static void @@ -303,13 +320,285 @@ 
heapam_tuple_complete_speculative(Relation relation, TupleTableSlot *slot, pfree(tuple); } +/* + * ExecCheckTupleVisible -- verify tuple is visible + * + * It would not be consistent with guarantees of the higher isolation levels to + * proceed with avoiding insertion (taking speculative insertion's alternative + * path) on the basis of another tuple that is not visible to MVCC snapshot. + * Check for the need to raise a serialization failure, and do so as necessary. + */ +static void +ExecCheckTupleVisible(EState *estate, + Relation rel, + TupleTableSlot *slot) +{ + if (!IsolationUsesXactSnapshot()) + return; + + if (!table_tuple_satisfies_snapshot(rel, slot, estate->es_snapshot)) + { + Datum xminDatum; + TransactionId xmin; + bool isnull; + + xminDatum = slot_getsysattr(slot, MinTransactionIdAttributeNumber, &isnull); + Assert(!isnull); + xmin = DatumGetTransactionId(xminDatum); + + /* + * We should not raise a serialization failure if the conflict is + * against a tuple inserted by our own transaction, even if it's not + * visible to our snapshot. (This would happen, for example, if + * conflicting keys are proposed for insertion in a single command.) 
+ */ + if (!TransactionIdIsCurrentTransactionId(xmin)) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent update"))); + } +} + +/* + * ExecCheckTIDVisible -- convenience variant of ExecCheckTupleVisible() + */ +static void +ExecCheckTIDVisible(EState *estate, + Relation rel, + ItemPointer tid, + TupleTableSlot *tempSlot) +{ + /* Redundantly check isolation level */ + if (!IsolationUsesXactSnapshot()) + return; + + if (!table_tuple_fetch_row_version(rel, PointerGetDatum(tid), + SnapshotAny, tempSlot)) + elog(ERROR, "failed to fetch conflicting tuple for ON CONFLICT"); + ExecCheckTupleVisible(estate, rel, tempSlot); + ExecClearTuple(tempSlot); +} + +static inline TupleTableSlot * +heapam_tuple_insert_with_arbiter(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + CommandId cid, int options, + struct BulkInsertStateData *bistate, + List *arbiterIndexes, + EState *estate, + LockTupleMode lockmode, + TupleTableSlot *lockedSlot, + TupleTableSlot *tempSlot) +{ + Relation rel = resultRelInfo->ri_RelationDesc; + uint32 specToken; + ItemPointerData conflictTid; + bool specConflict; + List *recheckIndexes = NIL; + + while (true) + { + specConflict = false; + if (!ExecCheckIndexConstraints(resultRelInfo, slot, estate, &conflictTid, + arbiterIndexes)) + { + if (lockedSlot) + { + TM_Result test; + TM_FailureData tmfd; + Datum xminDatum; + TransactionId xmin; + bool isnull; + + /* Determine lock mode to use */ + lockmode = ExecUpdateLockMode(estate, resultRelInfo); + + /* + * Lock tuple for update. Don't follow updates when tuple cannot be + * locked without doing so. A row locking conflict here means our + * previous conclusion that the tuple is conclusively committed is not + * true anymore. + */ + test = table_tuple_lock(rel, PointerGetDatum(&conflictTid), + estate->es_snapshot, + lockedSlot, estate->es_output_cid, + lockmode, LockWaitBlock, 0, + &tmfd); + switch (test) + { + case TM_Ok: + /* success! 
*/ + break; + + case TM_Invisible: + + /* + * This can occur when a just inserted tuple is updated again in + * the same command. E.g. because multiple rows with the same + * conflicting key values are inserted. + * + * This is somewhat similar to the ExecUpdate() TM_SelfModified + * case. We do not want to proceed because it would lead to the + * same row being updated a second time in some unspecified order, + * and in contrast to plain UPDATEs there's no historical behavior + * to break. + * + * It is the user's responsibility to prevent this situation from + * occurring. These problems are why the SQL standard similarly + * specifies that for SQL MERGE, an exception must be raised in + * the event of an attempt to update the same row twice. + */ + xminDatum = slot_getsysattr(lockedSlot, + MinTransactionIdAttributeNumber, + &isnull); + Assert(!isnull); + xmin = DatumGetTransactionId(xminDatum); + + if (TransactionIdIsCurrentTransactionId(xmin)) + ereport(ERROR, + (errcode(ERRCODE_CARDINALITY_VIOLATION), + /* translator: %s is a SQL command name */ + errmsg("%s command cannot affect row a second time", + "ON CONFLICT DO UPDATE"), + errhint("Ensure that no rows proposed for insertion within the same command have duplicate constrained values."))); + + /* This shouldn't happen */ + elog(ERROR, "attempted to lock invisible tuple"); + break; + + case TM_SelfModified: + + /* + * This state should never be reached. As a dirty snapshot is used + * to find conflicting tuples, speculative insertion wouldn't have + * seen this row to conflict with. 
+ */ + elog(ERROR, "unexpected self-updated tuple"); + break; + + case TM_Updated: + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent update"))); + + /* + * As long as we don't support an UPDATE of INSERT ON CONFLICT for + * a partitioned table we shouldn't reach to a case where tuple to + * be lock is moved to another partition due to concurrent update + * of the partition key. + */ + Assert(!ItemPointerIndicatesMovedPartitions(&tmfd.ctid)); + + /* + * Tell caller to try again from the very start. + * + * It does not make sense to use the usual EvalPlanQual() style + * loop here, as the new version of the row might not conflict + * anymore, or the conflicting tuple has actually been deleted. + */ + ExecClearTuple(lockedSlot); + return false; + + case TM_Deleted: + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent delete"))); + + /* see TM_Updated case */ + Assert(!ItemPointerIndicatesMovedPartitions(&tmfd.ctid)); + ExecClearTuple(lockedSlot); + return false; + + default: + elog(ERROR, "unrecognized table_tuple_lock status: %u", test); + } + + /* Success, the tuple is locked. */ + + /* + * Verify that the tuple is visible to our MVCC snapshot if the current + * isolation level mandates that. + * + * It's not sufficient to rely on the check within ExecUpdate() as e.g. + * CONFLICT ... WHERE clause may prevent us from reaching that. + * + * This means we only ever continue when a new command in the current + * transaction could see the row, even though in READ COMMITTED mode the + * tuple will not be visible according to the current statement's + * snapshot. This is in line with the way UPDATE deals with newer tuple + * versions. 
+ */ + ExecCheckTupleVisible(estate, rel, lockedSlot); + return NULL; + } + else + { + ExecCheckTIDVisible(estate, rel, &conflictTid, tempSlot); + return NULL; + } + } + + /* + * Before we start insertion proper, acquire our "speculative + * insertion lock". Others can use that to wait for us to decide + * if we're going to go ahead with the insertion, instead of + * waiting for the whole transaction to complete. + */ + specToken = SpeculativeInsertionLockAcquire(GetCurrentTransactionId()); + + /* insert the tuple, with the speculative token */ + heapam_tuple_insert_speculative(rel, slot, + estate->es_output_cid, + 0, + NULL, + specToken); + + /* insert index entries for tuple */ + recheckIndexes = ExecInsertIndexTuples(resultRelInfo, + slot, estate, false, true, + &specConflict, + arbiterIndexes, + false); + + /* adjust the tuple's state accordingly */ + heapam_tuple_complete_speculative(rel, slot, + specToken, !specConflict); + + /* + * Wake up anyone waiting for our decision. They will re-check + * the tuple, see that it's no longer speculative, and wait on our + * XID as if this was a regularly inserted tuple all along. Or if + * we killed the tuple, they will see it's dead, and proceed as if + * the tuple never existed. + */ + SpeculativeInsertionLockRelease(GetCurrentTransactionId()); + + /* + * If there was a conflict, start from the beginning. We'll do + * the pre-check again, which will now find the conflicting tuple + * (unless it aborts before we get there). 
+ */ + if (specConflict) + { + list_free(recheckIndexes); + CHECK_FOR_INTERRUPTS(); + continue; + } + + return slot; + } +} + static TM_Result -heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid, +heapam_tuple_delete(Relation relation, Datum tupleid, CommandId cid, Snapshot snapshot, Snapshot crosscheck, int options, TM_FailureData *tmfd, bool changingPart, TupleTableSlot *oldSlot) { TM_Result result; + ItemPointer tid = DatumGetItemPointer(tupleid); /* * Currently Deleting of index tuples are handled at vacuum, in case if @@ -332,7 +621,7 @@ heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid, * heapam_tuple_lock() will take advantage of tuple loaded into * oldSlot by heap_delete(). */ - result = heapam_tuple_lock(relation, tid, snapshot, + result = heapam_tuple_lock(relation, tupleid, snapshot, oldSlot, cid, LockTupleExclusive, (options & TABLE_MODIFY_WAIT) ? LockWaitBlock : @@ -349,7 +638,7 @@ heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid, static TM_Result -heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, +heapam_tuple_update(Relation relation, Datum tupleid, TupleTableSlot *slot, CommandId cid, Snapshot snapshot, Snapshot crosscheck, int options, TM_FailureData *tmfd, LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes, @@ -358,6 +647,7 @@ heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, bool shouldFree = true; HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); TM_Result result; + ItemPointer otid = DatumGetItemPointer(tupleid); /* Update the tuple with table oid */ slot->tts_tableOid = RelationGetRelid(relation); @@ -404,7 +694,7 @@ heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, * heapam_tuple_lock() will take advantage of tuple loaded into * oldSlot by heap_update(). 
*/ - result = heapam_tuple_lock(relation, otid, snapshot, + result = heapam_tuple_lock(relation, tupleid, snapshot, oldSlot, cid, *lockmode, (options & TABLE_MODIFY_WAIT) ? LockWaitBlock : @@ -420,7 +710,7 @@ heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, } static TM_Result -heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, +heapam_tuple_lock(Relation relation, Datum tupleid, Snapshot snapshot, TupleTableSlot *slot, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, uint8 flags, TM_FailureData *tmfd) @@ -428,6 +718,7 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; TM_Result result; HeapTuple tuple = &bslot->base.tupdata; + ItemPointer tid = DatumGetItemPointer(tupleid); bool follow_updates; follow_updates = (flags & TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS) != 0; @@ -2641,6 +2932,29 @@ SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, } } +static bool +heapam_tuple_is_current(Relation rel, TupleTableSlot *slot) +{ + Datum xminDatum; + TransactionId xmin; + bool isnull; + + xminDatum = slot_getsysattr(slot, MinTransactionIdAttributeNumber, &isnull); + Assert(!isnull); + xmin = DatumGetTransactionId(xminDatum); + return TransactionIdIsCurrentTransactionId(xmin); +} + +static bytea * +heapam_reloptions(char relkind, Datum reloptions, bool validate) +{ + if (relkind == RELKIND_RELATION || + relkind == RELKIND_TOASTVALUE || + relkind == RELKIND_MATVIEW) + return heap_reloptions(relkind, reloptions, validate); + + return NULL; +} /* ------------------------------------------------------------------------ * Definition of the heap table access method. 
@@ -2651,6 +2965,8 @@ static const TableAmRoutine heapam_methods = { .type = T_TableAmRoutine, .slot_callbacks = heapam_slot_callbacks, + .get_row_ref_type = heapam_get_row_ref_type, + .free_rd_amcache = heapam_free_rd_amcache, .scan_begin = heap_beginscan, .scan_end = heap_endscan, @@ -2670,8 +2986,7 @@ static const TableAmRoutine heapam_methods = { .index_fetch_tuple = heapam_index_fetch_tuple, .tuple_insert = heapam_tuple_insert, - .tuple_insert_speculative = heapam_tuple_insert_speculative, - .tuple_complete_speculative = heapam_tuple_complete_speculative, + .tuple_insert_with_arbiter = heapam_tuple_insert_with_arbiter, .multi_insert = heap_multi_insert, .tuple_delete = heapam_tuple_delete, .tuple_update = heapam_tuple_update, @@ -2703,7 +3018,11 @@ static const TableAmRoutine heapam_methods = { .scan_bitmap_next_block = heapam_scan_bitmap_next_block, .scan_bitmap_next_tuple = heapam_scan_bitmap_next_tuple, .scan_sample_next_block = heapam_scan_sample_next_block, - .scan_sample_next_tuple = heapam_scan_sample_next_tuple + .scan_sample_next_tuple = heapam_scan_sample_next_tuple, + + .tuple_is_current = heapam_tuple_is_current, + + .reloptions = heapam_reloptions }; diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 8d3675be959..fed5ac6cd01 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -298,7 +298,7 @@ simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot, if (oldSlot) options |= TABLE_MODIFY_FETCH_OLD_TUPLE; - result = table_tuple_delete(rel, tid, + result = table_tuple_delete(rel, PointerGetDatum(tid), GetCurrentCommandId(true), snapshot, InvalidSnapshot, options, @@ -354,7 +354,7 @@ simple_table_tuple_update(Relation rel, ItemPointer otid, if (oldSlot) options |= TABLE_MODIFY_FETCH_OLD_TUPLE; - result = table_tuple_update(rel, otid, slot, + result = table_tuple_update(rel, PointerGetDatum(otid), slot, GetCurrentCommandId(true), snapshot, InvalidSnapshot, 
options, diff --git a/src/backend/access/table/tableamapi.c b/src/backend/access/table/tableamapi.c index e9b598256fb..cd01bd9934f 100644 --- a/src/backend/access/table/tableamapi.c +++ b/src/backend/access/table/tableamapi.c @@ -13,10 +13,11 @@ #include "access/tableam.h" #include "access/xact.h" +#include "catalog/pg_am.h" #include "commands/defrem.h" #include "miscadmin.h" #include "utils/guc_hooks.h" - +#include "utils/syscache.h" /* * GetTableAmRoutine @@ -68,8 +69,7 @@ GetTableAmRoutine(Oid amhandler) * Could be made optional, but would require throwing error during * parse-analysis. */ - Assert(routine->tuple_insert_speculative != NULL); - Assert(routine->tuple_complete_speculative != NULL); + Assert(routine->tuple_insert_with_arbiter != NULL); Assert(routine->multi_insert != NULL); Assert(routine->tuple_delete != NULL); @@ -97,9 +97,29 @@ GetTableAmRoutine(Oid amhandler) Assert(routine->scan_sample_next_block != NULL); Assert(routine->scan_sample_next_tuple != NULL); + Assert(routine->tuple_is_current != NULL); + return routine; } +const TableAmRoutine * +GetTableAmRoutineByAmOid(Oid amoid) +{ + HeapTuple ht_am; + Form_pg_am amrec; + const TableAmRoutine *tableam = NULL; + + ht_am = SearchSysCache1(AMOID, ObjectIdGetDatum(amoid)); + if (!HeapTupleIsValid(ht_am)) + elog(ERROR, "cache lookup failed for access method %u", + amoid); + amrec = (Form_pg_am)GETSTRUCT(ht_am); + + tableam = GetTableAmRoutine(amrec->amhandler); + ReleaseSysCache(ht_am); + return tableam; +} + /* check_hook: validate new default_table_access_method */ bool check_default_table_access_method(char **newval, void **extra, GucSource source) diff --git a/src/backend/catalog/aclchk.c b/src/backend/catalog/aclchk.c index a44ccee3b68..043303bc2e3 100644 --- a/src/backend/catalog/aclchk.c +++ b/src/backend/catalog/aclchk.c @@ -1638,7 +1638,7 @@ expand_all_col_privileges(Oid table_oid, Form_pg_class classForm, AttrNumber curr_att; Assert(classForm->relnatts - FirstLowInvalidHeapAttributeNumber < 
num_col_privileges); - for (curr_att = FirstLowInvalidHeapAttributeNumber + 1; + for (curr_att = FirstLowInvalidHeapAttributeNumber + 2; curr_att <= classForm->relnatts; curr_att++) { diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index c590a2adc35..f63faedfcfb 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -87,9 +87,6 @@ static void compute_index_stats(Relation onerel, double totalrows, MemoryContext col_context); static VacAttrStats *examine_attribute(Relation onerel, int attnum, Node *index_expr); -static int acquire_sample_rows(Relation onerel, int elevel, - HeapTuple *rows, int targrows, - double *totalrows, double *totaldeadrows); static int compare_rows(const void *a, const void *b, void *arg); static int acquire_inherited_sample_rows(Relation onerel, int elevel, HeapTuple *rows, int targrows, @@ -190,10 +187,7 @@ analyze_rel(Oid relid, RangeVar *relation, if (onerel->rd_rel->relkind == RELKIND_RELATION || onerel->rd_rel->relkind == RELKIND_MATVIEW) { - /* Regular table, so we'll use the regular row acquisition function */ - acquirefunc = acquire_sample_rows; - /* Also get regular table's size */ - relpages = RelationGetNumberOfBlocks(onerel); + table_analyze(onerel, &acquirefunc, &relpages); } else if (onerel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) { @@ -1154,7 +1148,7 @@ block_sampling_read_stream_next(ReadStream *stream, * block. The previous sampling method put too much credence in the row * density near the start of the table. 
*/ -static int +int acquire_sample_rows(Relation onerel, int elevel, HeapTuple *rows, int targrows, double *totalrows, double *totaldeadrows) @@ -1421,9 +1415,7 @@ acquire_inherited_sample_rows(Relation onerel, int elevel, if (childrel->rd_rel->relkind == RELKIND_RELATION || childrel->rd_rel->relkind == RELKIND_MATVIEW) { - /* Regular table, so use the regular row acquisition function */ - acquirefunc = acquire_sample_rows; - relpages = RelationGetNumberOfBlocks(childrel); + table_analyze(childrel, &acquirefunc, &relpages); } else if (childrel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) { diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 0ecdecc2564..75fde5a75d8 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -700,6 +700,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, ObjectAddress address; LOCKMODE parentLockmode; Oid accessMethodId = InvalidOid; + const TableAmRoutine *tableam = NULL; /* * Truncate relname to appropriate length (probably a waste of time, as @@ -835,6 +836,29 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, if (!OidIsValid(ownerId)) ownerId = GetUserId(); + + /* + * For relations with table AM and partitioned tables, select access + * method to use: an explicitly indicated one, or (in the case of a + * partitioned table) the parent's, if it has one. + */ + if (stmt->accessMethod != NULL) + { + Assert(RELKIND_HAS_TABLE_AM(relkind) || relkind == RELKIND_PARTITIONED_TABLE); + accessMethodId = get_table_am_oid(stmt->accessMethod, false); + } + else if (RELKIND_HAS_TABLE_AM(relkind) || relkind == RELKIND_PARTITIONED_TABLE) + { + if (stmt->partbound) + { + Assert(list_length(inheritOids) == 1); + accessMethodId = get_rel_relam(linitial_oid(inheritOids)); + } + + if (RELKIND_HAS_TABLE_AM(relkind) && !OidIsValid(accessMethodId)) + accessMethodId = get_table_am_oid(default_table_access_method, false); + } + /* * Parse and validate reloptions, if any. 
*/ @@ -843,6 +867,12 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, switch (relkind) { + case RELKIND_RELATION: + case RELKIND_TOASTVALUE: + case RELKIND_MATVIEW: + tableam = GetTableAmRoutineByAmOid(accessMethodId); + (void) tableam_reloptions(tableam, relkind, reloptions, true); + break; case RELKIND_VIEW: (void) view_reloptions(reloptions, true); break; @@ -851,6 +881,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, break; default: (void) heap_reloptions(relkind, reloptions, true); + break; } if (stmt->ofTypename) @@ -941,28 +972,6 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, } } - /* - * For relations with table AM and partitioned tables, select access - * method to use: an explicitly indicated one, or (in the case of a - * partitioned table) the parent's, if it has one. - */ - if (stmt->accessMethod != NULL) - { - Assert(RELKIND_HAS_TABLE_AM(relkind) || relkind == RELKIND_PARTITIONED_TABLE); - accessMethodId = get_table_am_oid(stmt->accessMethod, false); - } - else if (RELKIND_HAS_TABLE_AM(relkind) || relkind == RELKIND_PARTITIONED_TABLE) - { - if (stmt->partbound) - { - Assert(list_length(inheritOids) == 1); - accessMethodId = get_rel_relam(linitial_oid(inheritOids)); - } - - if (RELKIND_HAS_TABLE_AM(relkind) && !OidIsValid(accessMethodId)) - accessMethodId = get_table_am_oid(default_table_access_method, false); - } - /* * Create the relation. 
Inherited defaults and constraints are passed in * for immediate handling --- since they don't need parsing, they can be @@ -6304,8 +6313,10 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode) /* Write the tuple out to the new relation */ if (newrel) + { table_tuple_insert(newrel, insertslot, mycid, ti_options, bistate); + } ResetExprContext(econtext); @@ -14933,7 +14944,8 @@ ATExecSetRelOptions(Relation rel, List *defList, AlterTableType operation, case RELKIND_RELATION: case RELKIND_TOASTVALUE: case RELKIND_MATVIEW: - (void) heap_reloptions(rel->rd_rel->relkind, newOptions, true); + (void) table_reloptions(rel, rel->rd_rel->relkind, + newOptions, true); break; case RELKIND_PARTITIONED_TABLE: (void) partitioned_table_reloptions(newOptions, true); diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 8ac9ccf5abf..1aee9d64212 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -76,7 +76,7 @@ static void SetTriggerFlags(TriggerDesc *trigdesc, Trigger *trigger); static bool GetTupleForTrigger(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tid, + Datum tupleid, LockTupleMode lockmode, TupleTableSlot *oldslot, TupleTableSlot **epqslot, @@ -2681,7 +2681,7 @@ ExecASDeleteTriggers(EState *estate, ResultRelInfo *relinfo, bool ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple fdw_trigtuple, TupleTableSlot **epqslot, TM_Result *tmresult, @@ -2695,7 +2695,7 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, bool should_free = false; int i; - Assert(HeapTupleIsValid(fdw_trigtuple) ^ ItemPointerIsValid(tupleid)); + Assert(HeapTupleIsValid(fdw_trigtuple) ^ (DatumGetPointer(tupleid) != NULL)); if (fdw_trigtuple == NULL) { TupleTableSlot *epqslot_candidate = NULL; @@ -2923,7 +2923,7 @@ ExecASUpdateTriggers(EState *estate, ResultRelInfo *relinfo, bool ExecBRUpdateTriggers(EState 
*estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple fdw_trigtuple, TupleTableSlot *newslot, TM_Result *tmresult, @@ -2943,7 +2943,7 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, /* Determine lock mode to use */ lockmode = ExecUpdateLockMode(estate, relinfo); - Assert(HeapTupleIsValid(fdw_trigtuple) ^ ItemPointerIsValid(tupleid)); + Assert(HeapTupleIsValid(fdw_trigtuple) ^ (DatumGetPointer(tupleid) != NULL)); if (fdw_trigtuple == NULL) { TupleTableSlot *epqslot_candidate = NULL; @@ -3260,7 +3260,7 @@ static bool GetTupleForTrigger(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tid, + Datum tupleid, LockTupleMode lockmode, TupleTableSlot *oldslot, TupleTableSlot **epqslot, @@ -3285,7 +3285,9 @@ GetTupleForTrigger(EState *estate, */ if (!IsolationUsesXactSnapshot()) lockflags |= TUPLE_LOCK_FLAG_FIND_LAST_VERSION; - test = table_tuple_lock(relation, tid, estate->es_snapshot, oldslot, + + test = table_tuple_lock(relation, tupleid, + estate->es_snapshot, oldslot, estate->es_output_cid, lockmode, LockWaitBlock, lockflags, @@ -3381,8 +3383,8 @@ GetTupleForTrigger(EState *estate, * We expect the tuple to be present, thus very simple error handling * suffices. */ - if (!table_tuple_fetch_row_version(relation, tid, SnapshotAny, - oldslot)) + if (!table_tuple_fetch_row_version(relation, tupleid, + SnapshotAny, oldslot)) elog(ERROR, "failed to fetch tuple for trigger"); } @@ -3588,18 +3590,22 @@ typedef SetConstraintStateData *SetConstraintState; * cycles. So we need only ensure that ats_firing_id is zero when attaching * a new event to an existing AfterTriggerSharedData record. 
*/ -typedef uint32 TriggerFlags; +typedef uint64 TriggerFlags; -#define AFTER_TRIGGER_OFFSET 0x07FFFFFF /* must be low-order bits */ -#define AFTER_TRIGGER_DONE 0x80000000 -#define AFTER_TRIGGER_IN_PROGRESS 0x40000000 +#define AFTER_TRIGGER_SIZE UINT64CONST(0xFFFF000000000) /* must be low-order bits */ +#define AFTER_TRIGGER_SIZE_SHIFT (36) +#define AFTER_TRIGGER_OFFSET UINT64CONST(0x000000FFFFFFF) /* must be low-order bits */ +#define AFTER_TRIGGER_DONE UINT64CONST(0x0000800000000) +#define AFTER_TRIGGER_IN_PROGRESS UINT64CONST(0x0000400000000) /* bits describing the size and tuple sources of this event */ -#define AFTER_TRIGGER_FDW_REUSE 0x00000000 -#define AFTER_TRIGGER_FDW_FETCH 0x20000000 -#define AFTER_TRIGGER_1CTID 0x10000000 -#define AFTER_TRIGGER_2CTID 0x30000000 -#define AFTER_TRIGGER_CP_UPDATE 0x08000000 -#define AFTER_TRIGGER_TUP_BITS 0x38000000 +#define AFTER_TRIGGER_FDW_REUSE UINT64CONST(0x0000000000000) +#define AFTER_TRIGGER_FDW_FETCH UINT64CONST(0x0000200000000) +#define AFTER_TRIGGER_1CTID UINT64CONST(0x0000100000000) +#define AFTER_TRIGGER_ROWID1 UINT64CONST(0x0000010000000) +#define AFTER_TRIGGER_2CTID UINT64CONST(0x0000300000000) +#define AFTER_TRIGGER_ROWID2 UINT64CONST(0x0000020000000) +#define AFTER_TRIGGER_CP_UPDATE UINT64CONST(0x0000080000000) +#define AFTER_TRIGGER_TUP_BITS UINT64CONST(0x0000380000000) typedef struct AfterTriggerSharedData *AfterTriggerShared; typedef struct AfterTriggerSharedData @@ -3651,6 +3657,9 @@ typedef struct AfterTriggerEventDataZeroCtids } AfterTriggerEventDataZeroCtids; #define SizeofTriggerEvent(evt) \ + (((evt)->ate_flags & AFTER_TRIGGER_SIZE) >> AFTER_TRIGGER_SIZE_SHIFT) + +#define BasicSizeofTriggerEvent(evt) \ (((evt)->ate_flags & AFTER_TRIGGER_TUP_BITS) == AFTER_TRIGGER_CP_UPDATE ? \ sizeof(AfterTriggerEventData) : \ (((evt)->ate_flags & AFTER_TRIGGER_TUP_BITS) == AFTER_TRIGGER_2CTID ? 
\ @@ -4003,14 +4012,34 @@ afterTriggerCopyBitmap(Bitmapset *src) */ static void afterTriggerAddEvent(AfterTriggerEventList *events, - AfterTriggerEvent event, AfterTriggerShared evtshared) + AfterTriggerEvent event, AfterTriggerShared evtshared, + bytea *rowid1, bytea *rowid2) { - Size eventsize = SizeofTriggerEvent(event); - Size needed = eventsize + sizeof(AfterTriggerSharedData); + Size basiceventsize = MAXALIGN(BasicSizeofTriggerEvent(event)); + Size eventsize; + Size needed; AfterTriggerEventChunk *chunk; AfterTriggerShared newshared; AfterTriggerEvent newevent; + if (SizeofTriggerEvent(event) == 0) + { + eventsize = basiceventsize; + if (event->ate_flags & AFTER_TRIGGER_ROWID1) + eventsize += MAXALIGN(VARSIZE(rowid1)); + + if (event->ate_flags & AFTER_TRIGGER_ROWID2) + eventsize += MAXALIGN(VARSIZE(rowid2)); + + event->ate_flags |= eventsize << AFTER_TRIGGER_SIZE_SHIFT; + } + else + { + eventsize = SizeofTriggerEvent(event); + } + + needed = eventsize + sizeof(AfterTriggerSharedData); + /* * If empty list or not enough room in the tail chunk, make a new chunk. * We assume here that a new shared record will always be needed. @@ -4043,7 +4072,7 @@ afterTriggerAddEvent(AfterTriggerEventList *events, * sizes used should be MAXALIGN multiples, to ensure that the shared * records will be aligned safely. 
*/ -#define MIN_CHUNK_SIZE 1024 +#define MIN_CHUNK_SIZE (1024*4) #define MAX_CHUNK_SIZE (1024*1024) #if MAX_CHUNK_SIZE > (AFTER_TRIGGER_OFFSET+1) @@ -4062,6 +4091,7 @@ afterTriggerAddEvent(AfterTriggerEventList *events, chunksize *= 2; /* okay, double it */ else chunksize /= 2; /* too many shared records */ + chunksize = Max(chunksize, MIN_CHUNK_SIZE); chunksize = Min(chunksize, MAX_CHUNK_SIZE); } chunk = MemoryContextAlloc(afterTriggers.event_cxt, chunksize); @@ -4102,7 +4132,26 @@ afterTriggerAddEvent(AfterTriggerEventList *events, /* Insert the data */ newevent = (AfterTriggerEvent) chunk->freeptr; - memcpy(newevent, event, eventsize); + if (!rowid1 && !rowid2) + { + memcpy(newevent, event, eventsize); + } + else + { + Pointer ptr = chunk->freeptr; + + memcpy(newevent, event, basiceventsize); + ptr += basiceventsize; + + if (event->ate_flags & AFTER_TRIGGER_ROWID1) + { + memcpy(ptr, rowid1, MAXALIGN(VARSIZE(rowid1))); + ptr += MAXALIGN(VARSIZE(rowid1)); + } + + if (event->ate_flags & AFTER_TRIGGER_ROWID2) + memcpy(ptr, rowid2, MAXALIGN(VARSIZE(rowid2))); + } /* ... and link the new event to its shared record */ newevent->ate_flags &= ~AFTER_TRIGGER_OFFSET; newevent->ate_flags |= (char *) newshared - (char *) newevent; @@ -4262,6 +4311,7 @@ AfterTriggerExecute(EState *estate, int tgindx; bool should_free_trig = false; bool should_free_new = false; + Pointer ptr; /* * Locate trigger in trigdesc. 
It might not be present, and in fact the @@ -4297,15 +4347,17 @@ AfterTriggerExecute(EState *estate, { Tuplestorestate *fdw_tuplestore = GetCurrentFDWTuplestore(); - if (!tuplestore_gettupleslot(fdw_tuplestore, true, false, - trig_tuple_slot1)) + if (!tuplestore_force_gettupleslot(fdw_tuplestore, true, false, + trig_tuple_slot1)) elog(ERROR, "failed to fetch tuple1 for AFTER trigger"); if ((evtshared->ats_event & TRIGGER_EVENT_OPMASK) == TRIGGER_EVENT_UPDATE && - !tuplestore_gettupleslot(fdw_tuplestore, true, false, - trig_tuple_slot2)) + !tuplestore_force_gettupleslot(fdw_tuplestore, true, false, + trig_tuple_slot2)) elog(ERROR, "failed to fetch tuple2 for AFTER trigger"); + trig_tuple_slot1->tts_tid = event->ate_ctid1; + trig_tuple_slot2->tts_tid = event->ate_ctid2; } /* fall through */ case AFTER_TRIGGER_FDW_REUSE: @@ -4337,13 +4389,26 @@ AfterTriggerExecute(EState *estate, break; default: - if (ItemPointerIsValid(&(event->ate_ctid1))) + ptr = (Pointer) event + MAXALIGN(BasicSizeofTriggerEvent(event)); + if (ItemPointerIsValid(&(event->ate_ctid1)) || + (event->ate_flags & AFTER_TRIGGER_ROWID1)) { + Datum tupleid; + TupleTableSlot *src_slot = ExecGetTriggerOldSlot(estate, src_relInfo); - if (!table_tuple_fetch_row_version(src_rel, - &(event->ate_ctid1), + if (event->ate_flags & AFTER_TRIGGER_ROWID1) + { + tupleid = PointerGetDatum(ptr); + ptr += MAXALIGN(VARSIZE(ptr)); + } + else + { + tupleid = PointerGetDatum(&(event->ate_ctid1)); + } + + if (!table_tuple_fetch_row_version(src_rel, tupleid, SnapshotAny, src_slot)) elog(ERROR, "failed to fetch tuple1 for AFTER trigger"); @@ -4379,13 +4444,23 @@ AfterTriggerExecute(EState *estate, /* don't touch ctid2 if not there */ if (((event->ate_flags & AFTER_TRIGGER_TUP_BITS) == AFTER_TRIGGER_2CTID || (event->ate_flags & AFTER_TRIGGER_CP_UPDATE)) && - ItemPointerIsValid(&(event->ate_ctid2))) + (ItemPointerIsValid(&(event->ate_ctid2)) || + (event->ate_flags & AFTER_TRIGGER_ROWID2))) { + Datum tupleid; + TupleTableSlot 
*dst_slot = ExecGetTriggerNewSlot(estate, dst_relInfo); - if (!table_tuple_fetch_row_version(dst_rel, - &(event->ate_ctid2), + if (event->ate_flags & AFTER_TRIGGER_ROWID2) + { + tupleid = PointerGetDatum(ptr); + } + else + { + tupleid = PointerGetDatum(&(event->ate_ctid2)); + } + if (!table_tuple_fetch_row_version(dst_rel, tupleid, SnapshotAny, dst_slot)) elog(ERROR, "failed to fetch tuple2 for AFTER trigger"); @@ -4559,7 +4634,7 @@ afterTriggerMarkEvents(AfterTriggerEventList *events, { deferred_found = true; /* add it to move_list */ - afterTriggerAddEvent(move_list, event, evtshared); + afterTriggerAddEvent(move_list, event, evtshared, NULL, NULL); /* mark original copy "done" so we don't do it again */ event->ate_flags |= AFTER_TRIGGER_DONE; } @@ -4663,6 +4738,7 @@ afterTriggerInvokeEvents(AfterTriggerEventList *events, /* caution: trigdesc could be NULL here */ finfo = rInfo->ri_TrigFunctions; instr = rInfo->ri_TrigInstrument; + if (slot1 != NULL) { ExecDropSingleTupleTableSlot(slot1); @@ -6052,6 +6128,8 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, int tgtype_level; int i; Tuplestorestate *fdw_tuplestore = NULL; + bytea *rowId1 = NULL; + bytea *rowId2 = NULL; /* * Check state. We use a normal test not Assert because it is possible to @@ -6145,6 +6223,21 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, * if so. This preserves the behavior that statement-level triggers fire * just once per statement and fire after row-level triggers. 
*/ + + /* Determine flags */ + if (!(relkind == RELKIND_FOREIGN_TABLE && row_trigger)) + { + if (row_trigger && event == TRIGGER_EVENT_UPDATE) + { + if (relkind == RELKIND_PARTITIONED_TABLE) + new_event.ate_flags = AFTER_TRIGGER_CP_UPDATE; + else + new_event.ate_flags = AFTER_TRIGGER_2CTID; + } + else + new_event.ate_flags = AFTER_TRIGGER_1CTID; + } + switch (event) { case TRIGGER_EVENT_INSERT: @@ -6155,6 +6248,13 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, Assert(newslot != NULL); ItemPointerCopy(&(newslot->tts_tid), &(new_event.ate_ctid1)); ItemPointerSetInvalid(&(new_event.ate_ctid2)); + if (table_get_row_ref_type(rel) == ROW_REF_ROWID) + { + bool isnull; + rowId1 = DatumGetByteaP(slot_getsysattr(newslot, RowIdAttributeNumber, &isnull)); + new_event.ate_flags |= AFTER_TRIGGER_ROWID1; + Assert(!isnull); + } } else { @@ -6174,6 +6274,13 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, Assert(newslot == NULL); ItemPointerCopy(&(oldslot->tts_tid), &(new_event.ate_ctid1)); ItemPointerSetInvalid(&(new_event.ate_ctid2)); + if (table_get_row_ref_type(rel) == ROW_REF_ROWID) + { + bool isnull; + rowId1 = DatumGetByteaP(slot_getsysattr(oldslot, RowIdAttributeNumber, &isnull)); + new_event.ate_flags |= AFTER_TRIGGER_ROWID1; + Assert(!isnull); + } } else { @@ -6189,10 +6296,54 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, tgtype_event = TRIGGER_TYPE_UPDATE; if (row_trigger) { + bool src_rowid = false, + dst_rowid = false; Assert(oldslot != NULL); Assert(newslot != NULL); ItemPointerCopy(&(oldslot->tts_tid), &(new_event.ate_ctid1)); ItemPointerCopy(&(newslot->tts_tid), &(new_event.ate_ctid2)); + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + Relation src_rel = src_partinfo->ri_RelationDesc; + Relation dst_rel = dst_partinfo->ri_RelationDesc; + + src_rowid = table_get_row_ref_type(src_rel) == + ROW_REF_ROWID; + dst_rowid = table_get_row_ref_type(dst_rel) == + ROW_REF_ROWID; + } + else + { + if 
(table_get_row_ref_type(rel) == ROW_REF_ROWID) + { + src_rowid = true; + dst_rowid = true; + } + } + + if (src_rowid) + { + Datum val; + bool isnull; + val = slot_getsysattr(oldslot, + RowIdAttributeNumber, + &isnull); + rowId1 = DatumGetByteaP(val); + Assert(!isnull); + new_event.ate_flags |= AFTER_TRIGGER_ROWID1; + } + + if (dst_rowid) + { + Datum val; + bool isnull; + val = slot_getsysattr(newslot, + RowIdAttributeNumber, + &isnull); + rowId2 = DatumGetByteaP(val); + Assert(!isnull); + new_event.ate_flags |= AFTER_TRIGGER_ROWID2; + } /* * Also remember the OIDs of partitions to fetch these tuples @@ -6230,20 +6381,6 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, break; } - /* Determine flags */ - if (!(relkind == RELKIND_FOREIGN_TABLE && row_trigger)) - { - if (row_trigger && event == TRIGGER_EVENT_UPDATE) - { - if (relkind == RELKIND_PARTITIONED_TABLE) - new_event.ate_flags = AFTER_TRIGGER_CP_UPDATE; - else - new_event.ate_flags = AFTER_TRIGGER_2CTID; - } - else - new_event.ate_flags = AFTER_TRIGGER_1CTID; - } - /* else, we'll initialize ate_flags for each trigger */ tgtype_level = (row_trigger ? 
TRIGGER_TYPE_ROW : TRIGGER_TYPE_STATEMENT); @@ -6409,7 +6546,7 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, new_shared.ats_modifiedcols = afterTriggerCopyBitmap(modifiedCols); afterTriggerAddEvent(&afterTriggers.query_stack[afterTriggers.query_depth].events, - &new_event, &new_shared); + &new_event, &new_shared, rowId1, rowId2); } /* diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c index aa68c115ba9..d830006d61b 100644 --- a/src/backend/executor/execExprInterp.c +++ b/src/backend/executor/execExprInterp.c @@ -4994,7 +4994,9 @@ ExecEvalSysVar(ExprState *state, ExprEvalStep *op, ExprContext *econtext, op->resnull); *op->resvalue = d; /* this ought to be unreachable, but it's cheap enough to check */ - if (unlikely(*op->resnull)) + if (op->d.var.attnum != RowIdAttributeNumber && + op->d.var.attnum != SelfItemPointerAttributeNumber && + unlikely(*op->resnull)) elog(ERROR, "failed to fetch attribute from slot"); } diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 2365c6861be..26b3974b9fb 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -869,13 +869,15 @@ InitPlan(QueryDesc *queryDesc, int eflags) Oid relid; Relation relation; ExecRowMark *erm; + RangeTblEntry *rangeEntry; /* ignore "parent" rowmarks; they are irrelevant at runtime */ if (rc->isParent) continue; /* get relation's OID (will produce InvalidOid if subquery) */ - relid = exec_rt_fetch(rc->rti, estate)->relid; + rangeEntry = exec_rt_fetch(rc->rti, estate); + relid = rangeEntry->relid; /* open relation, if we need to access it for this mark type */ switch (rc->markType) @@ -908,6 +910,10 @@ InitPlan(QueryDesc *queryDesc, int eflags) erm->prti = rc->prti; erm->rowmarkId = rc->rowmarkId; erm->markType = rc->markType; + if (erm->markType == ROW_MARK_COPY) + erm->refType = ROW_REF_COPY; + else + erm->refType = rangeEntry->reftype; erm->strength = rc->strength; erm->waitPolicy = 
rc->waitPolicy; erm->ermActive = false; @@ -1273,6 +1279,8 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, resultRelInfo->ri_ChildToRootMap = NULL; resultRelInfo->ri_ChildToRootMapValid = false; resultRelInfo->ri_CopyMultiInsertBuffer = NULL; + + resultRelInfo->ri_RowRefType = table_get_row_ref_type(resultRelationDesc); } /* @@ -2407,17 +2415,28 @@ ExecBuildAuxRowMark(ExecRowMark *erm, List *targetlist) aerm->rowmark = erm; /* Look up the resjunk columns associated with this rowmark */ - if (erm->markType != ROW_MARK_COPY) + if (erm->refType == ROW_REF_TID) { + Assert(erm->markType != ROW_MARK_COPY); /* need ctid for all methods other than COPY */ snprintf(resname, sizeof(resname), "ctid%u", erm->rowmarkId); aerm->ctidAttNo = ExecFindJunkAttributeInTlist(targetlist, resname); if (!AttributeNumberIsValid(aerm->ctidAttNo)) elog(ERROR, "could not find junk %s column", resname); + } else if (erm->refType == ROW_REF_ROWID) + { + Assert(erm->markType != ROW_MARK_COPY); + /* need rowid for ROW_REF_ROWID tables */ + snprintf(resname, sizeof(resname), "rowid%u", erm->rowmarkId); + aerm->ctidAttNo = ExecFindJunkAttributeInTlist(targetlist, + resname); + if (!AttributeNumberIsValid(aerm->ctidAttNo)) + elog(ERROR, "could not find junk %s column", resname); } else { + Assert(erm->markType == ROW_MARK_COPY); /* need wholerow if COPY */ snprintf(resname, sizeof(resname), "wholerow%u", erm->rowmarkId); aerm->wholeAttNo = ExecFindJunkAttributeInTlist(targetlist, @@ -2705,8 +2724,9 @@ EvalPlanQualFetchRowMark(EPQState *epqstate, Index rti, TupleTableSlot *slot) { /* ordinary table, fetch the tuple */ if (!table_tuple_fetch_row_version(erm->relation, - (ItemPointer) DatumGetPointer(datum), - SnapshotAny, slot)) + datum, + SnapshotAny, + slot)) elog(ERROR, "failed to fetch tuple for EvalPlanQual recheck"); return true; } diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 0cad843fb69..1ace97b6d47 100644 --- 
a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -250,7 +250,8 @@ RelationFindReplTupleByIndex(Relation rel, Oid idxoid, PushActiveSnapshot(GetLatestSnapshot()); - res = table_tuple_lock(rel, &(outslot->tts_tid), GetLatestSnapshot(), + res = table_tuple_lock(rel, PointerGetDatum(&(outslot->tts_tid)), + GetLatestSnapshot(), outslot, GetCurrentCommandId(false), lockmode, @@ -434,7 +435,8 @@ RelationFindReplTupleSeq(Relation rel, LockTupleMode lockmode, PushActiveSnapshot(GetLatestSnapshot()); - res = table_tuple_lock(rel, &(outslot->tts_tid), GetLatestSnapshot(), + res = table_tuple_lock(rel, PointerGetDatum(&(outslot->tts_tid)), + GetLatestSnapshot(), outslot, GetCurrentCommandId(false), lockmode, @@ -569,7 +571,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigDesc->trig_update_before_row) { if (!ExecBRUpdateTriggers(estate, epqstate, resultRelInfo, - tid, NULL, slot, NULL, NULL)) + PointerGetDatum(tid), NULL, slot, NULL, NULL)) skip_tuple = true; /* "do nothing" */ } @@ -636,7 +638,7 @@ ExecSimpleRelationDelete(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigDesc->trig_delete_before_row) { skip_tuple = !ExecBRDeleteTriggers(estate, epqstate, resultRelInfo, - tid, NULL, NULL, NULL, NULL); + PointerGetDatum(tid), NULL, NULL, NULL, NULL); } if (!skip_tuple) diff --git a/src/backend/executor/nodeLockRows.c b/src/backend/executor/nodeLockRows.c index 41754ddfea9..ac401d7a470 100644 --- a/src/backend/executor/nodeLockRows.c +++ b/src/backend/executor/nodeLockRows.c @@ -27,6 +27,7 @@ #include "executor/nodeLockRows.h" #include "foreign/fdwapi.h" #include "miscadmin.h" +#include "utils/datum.h" #include "utils/rel.h" @@ -157,7 +158,16 @@ ExecLockRows(PlanState *pstate) } /* okay, try to lock (and fetch) the tuple */ - tid = *((ItemPointer) DatumGetPointer(datum)); + if (erm->refType == ROW_REF_TID) + { + tid = *((ItemPointer) DatumGetPointer(datum)); + datum = PointerGetDatum(&tid); + } + else + { 
+ Assert(erm->refType == ROW_REF_ROWID); + datum = datumCopy(datum, false, -1); + } switch (erm->markType) { case ROW_MARK_EXCLUSIVE: @@ -182,12 +192,15 @@ if (!IsolationUsesXactSnapshot()) lockflags |= TUPLE_LOCK_FLAG_FIND_LAST_VERSION; - test = table_tuple_lock(erm->relation, &tid, estate->es_snapshot, + test = table_tuple_lock(erm->relation, datum, estate->es_snapshot, markSlot, estate->es_output_cid, lockmode, erm->waitPolicy, lockflags, &tmfd); + if (erm->refType == ROW_REF_ROWID) + pfree(DatumGetPointer(datum)); + switch (test) { case TM_WouldBlock: diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index fded69d095e..4b3fbbe2009 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -135,12 +135,11 @@ static void ExecPendingInserts(EState *estate); static void ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, ResultRelInfo *sourcePartInfo, ResultRelInfo *destPartInfo, - ItemPointer tupleid, + Datum tupleid, TupleTableSlot *oldSlot, TupleTableSlot *newslot); static bool ExecOnConflictUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer conflictTid, TupleTableSlot *excludedSlot, bool canSetTag, TupleTableSlot **returning); @@ -153,13 +152,13 @@ static TupleTableSlot *ExecPrepareTupleRouting(ModifyTableState *mtstate, static TupleTableSlot *ExecMerge(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple oldtuple, bool canSetTag); static void ExecInitMerge(ModifyTableState *mtstate, EState *estate); static TupleTableSlot *ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple oldtuple, bool canSetTag, bool *matched); @@ -167,7 +166,6 @@ static TupleTableSlot *ExecMergeNotMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, bool canSetTag); - /* * Verify that the tuples 
to be produced by INSERT match the * target relation's rowtype @@ -276,66 +274,6 @@ ExecProcessReturning(ResultRelInfo *resultRelInfo, return ExecProject(projectReturning); } -/* - * ExecCheckTupleVisible -- verify tuple is visible - * - * It would not be consistent with guarantees of the higher isolation levels to - * proceed with avoiding insertion (taking speculative insertion's alternative - * path) on the basis of another tuple that is not visible to MVCC snapshot. - * Check for the need to raise a serialization failure, and do so as necessary. - */ -static void -ExecCheckTupleVisible(EState *estate, - Relation rel, - TupleTableSlot *slot) -{ - if (!IsolationUsesXactSnapshot()) - return; - - if (!table_tuple_satisfies_snapshot(rel, slot, estate->es_snapshot)) - { - Datum xminDatum; - TransactionId xmin; - bool isnull; - - xminDatum = slot_getsysattr(slot, MinTransactionIdAttributeNumber, &isnull); - Assert(!isnull); - xmin = DatumGetTransactionId(xminDatum); - - /* - * We should not raise a serialization failure if the conflict is - * against a tuple inserted by our own transaction, even if it's not - * visible to our snapshot. (This would happen, for example, if - * conflicting keys are proposed for insertion in a single command.) 
- */ - if (!TransactionIdIsCurrentTransactionId(xmin)) - ereport(ERROR, - (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), - errmsg("could not serialize access due to concurrent update"))); - } -} - -/* - * ExecCheckTIDVisible -- convenience variant of ExecCheckTupleVisible() - */ -static void -ExecCheckTIDVisible(EState *estate, - ResultRelInfo *relinfo, - ItemPointer tid, - TupleTableSlot *tempSlot) -{ - Relation rel = relinfo->ri_RelationDesc; - - /* Redundantly check isolation level */ - if (!IsolationUsesXactSnapshot()) - return; - - if (!table_tuple_fetch_row_version(rel, tid, SnapshotAny, tempSlot)) - elog(ERROR, "failed to fetch conflicting tuple for ON CONFLICT"); - ExecCheckTupleVisible(estate, rel, tempSlot); - ExecClearTuple(tempSlot); -} - /* * Initialize to compute stored generated columns for a tuple * @@ -1021,12 +959,19 @@ ExecInsert(ModifyTableContext *context, if (onconflict != ONCONFLICT_NONE && resultRelInfo->ri_NumIndices > 0) { /* Perform a speculative insertion. */ - uint32 specToken; - ItemPointerData conflictTid; - bool specConflict; List *arbiterIndexes; + TupleTableSlot *existing = NULL, + *returningSlot, + *inserted; + LockTupleMode lockmode = LockTupleExclusive; arbiterIndexes = resultRelInfo->ri_onConflictArbiterIndexes; + returningSlot = ExecGetReturningSlot(estate, resultRelInfo); + if (onconflict == ONCONFLICT_UPDATE) + { + lockmode = ExecUpdateLockMode(estate, resultRelInfo); + existing = resultRelInfo->ri_onConflict->oc_Existing; + } /* * Do a non-conclusive check for conflicts first. 
@@ -1043,23 +988,29 @@ ExecInsert(ModifyTableContext *context, */ vlock: CHECK_FOR_INTERRUPTS(); - specConflict = false; - if (!ExecCheckIndexConstraints(resultRelInfo, slot, estate, - &conflictTid, arbiterIndexes)) + + inserted = table_tuple_insert_with_arbiter(resultRelInfo, + slot, estate->es_output_cid, + 0, NULL, arbiterIndexes, estate, + lockmode, existing, returningSlot); + if (!inserted) { - /* committed conflict tuple found */ if (onconflict == ONCONFLICT_UPDATE) { + TupleTableSlot *returning = NULL; + + if (TTS_EMPTY(existing)) + goto vlock; + /* * In case of ON CONFLICT DO UPDATE, execute the UPDATE * part. Be prepared to retry if the UPDATE fails because * of another concurrent UPDATE/DELETE to the conflict * tuple. */ - TupleTableSlot *returning = NULL; if (ExecOnConflictUpdate(context, resultRelInfo, - &conflictTid, slot, canSetTag, + slot, canSetTag, &returning)) { InstrCountTuples2(&mtstate->ps, 1); @@ -1082,57 +1033,13 @@ ExecInsert(ModifyTableContext *context, * ExecGetReturningSlot() in the DO NOTHING case... */ Assert(onconflict == ONCONFLICT_NOTHING); - ExecCheckTIDVisible(estate, resultRelInfo, &conflictTid, - ExecGetReturningSlot(estate, resultRelInfo)); InstrCountTuples2(&mtstate->ps, 1); return NULL; } } - - /* - * Before we start insertion proper, acquire our "speculative - * insertion lock". Others can use that to wait for us to decide - * if we're going to go ahead with the insertion, instead of - * waiting for the whole transaction to complete. 
- */ - specToken = SpeculativeInsertionLockAcquire(GetCurrentTransactionId()); - - /* insert the tuple, with the speculative token */ - table_tuple_insert_speculative(resultRelationDesc, slot, - estate->es_output_cid, - 0, - NULL, - specToken); - - /* insert index entries for tuple */ - recheckIndexes = ExecInsertIndexTuples(resultRelInfo, - slot, estate, false, true, - &specConflict, - arbiterIndexes, - false); - - /* adjust the tuple's state accordingly */ - table_tuple_complete_speculative(resultRelationDesc, slot, - specToken, !specConflict); - - /* - * Wake up anyone waiting for our decision. They will re-check - * the tuple, see that it's no longer speculative, and wait on our - * XID as if this was a regularly inserted tuple all along. Or if - * we killed the tuple, they will see it's dead, and proceed as if - * the tuple never existed. - */ - SpeculativeInsertionLockRelease(GetCurrentTransactionId()); - - /* - * If there was a conflict, start from the beginning. We'll do - * the pre-check again, which will now find the conflicting tuple - * (unless it aborts before we get there). 
- */ - if (specConflict) + else { - list_free(recheckIndexes); - goto vlock; + slot = inserted; } /* Since there was no insertion conflict, we're done */ @@ -1140,9 +1047,9 @@ ExecInsert(ModifyTableContext *context, else { /* insert the tuple normally */ - table_tuple_insert(resultRelationDesc, slot, - estate->es_output_cid, - 0, NULL); + slot = table_tuple_insert(resultRelationDesc, slot, + estate->es_output_cid, + 0, NULL); /* insert index entries for tuple */ if (resultRelInfo->ri_NumIndices > 0) @@ -1318,7 +1225,7 @@ ExecPendingInserts(EState *estate) */ static bool ExecDeletePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, + Datum tupleid, HeapTuple oldtuple, TupleTableSlot **epqreturnslot, TM_Result *result) { if (result) @@ -1349,7 +1256,7 @@ ExecDeletePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ static TM_Result ExecDeleteAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, bool changingPart, int options, + Datum tupleid, bool changingPart, int options, TupleTableSlot *oldSlot) { EState *estate = context->estate; @@ -1373,7 +1280,7 @@ ExecDeleteAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ static void ExecDeleteEpilogue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, + HeapTuple oldtuple, TupleTableSlot *slot, bool changingPart) { ModifyTableState *mtstate = context->mtstate; @@ -1453,7 +1360,7 @@ ExecInitDeleteTupleSlot(ModifyTableState *mtstate, static TupleTableSlot * ExecDelete(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple oldtuple, TupleTableSlot *oldSlot, bool processReturning, @@ -1646,7 +1553,7 @@ ExecDelete(ModifyTableContext *context, if (tupleDeleted) *tupleDeleted = true; - ExecDeleteEpilogue(context, resultRelInfo, tupleid, oldtuple, + ExecDeleteEpilogue(context, resultRelInfo, oldtuple, 
oldSlot, changingPart); /* Process RETURNING if present and if requested */ @@ -1663,7 +1570,7 @@ ExecDelete(ModifyTableContext *context, /* FDW must have provided a slot containing the deleted row */ Assert(!TupIsNull(slot)); } - else + else if (!slot || TupIsNull(slot)) { /* Copy old tuple to the returning slot */ slot = ExecGetReturningSlot(estate, resultRelInfo); @@ -1712,7 +1619,7 @@ ExecDelete(ModifyTableContext *context, static bool ExecCrossPartitionUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, + Datum tupleid, HeapTuple oldtuple, TupleTableSlot *slot, bool canSetTag, UpdateContext *updateCxt, @@ -1868,7 +1775,7 @@ ExecCrossPartitionUpdate(ModifyTableContext *context, */ static bool ExecUpdatePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, TupleTableSlot *slot, + Datum tupleid, HeapTuple oldtuple, TupleTableSlot *slot, TM_Result *result) { Relation resultRelationDesc = resultRelInfo->ri_RelationDesc; @@ -1945,7 +1852,7 @@ ExecUpdatePrepareSlot(ResultRelInfo *resultRelInfo, */ static TM_Result ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, TupleTableSlot *slot, + Datum tupleid, HeapTuple oldtuple, TupleTableSlot *slot, bool canSetTag, int options, TupleTableSlot *oldSlot, UpdateContext *updateCxt) { @@ -2098,7 +2005,7 @@ ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ static void ExecUpdateEpilogue(ModifyTableContext *context, UpdateContext *updateCxt, - ResultRelInfo *resultRelInfo, ItemPointer tupleid, + ResultRelInfo *resultRelInfo, HeapTuple oldtuple, TupleTableSlot *slot, TupleTableSlot *oldSlot) { @@ -2148,7 +2055,7 @@ static void ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, ResultRelInfo *sourcePartInfo, ResultRelInfo *destPartInfo, - ItemPointer tupleid, + Datum tupleid, TupleTableSlot *oldslot, TupleTableSlot 
*newslot) { @@ -2239,7 +2146,7 @@ ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, */ static TupleTableSlot * ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, TupleTableSlot *slot, + Datum tupleid, HeapTuple oldtuple, TupleTableSlot *slot, TupleTableSlot *oldSlot, bool canSetTag, bool locked) { EState *estate = context->estate; @@ -2293,10 +2200,14 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, } else { - int options = TABLE_MODIFY_WAIT | TABLE_MODIFY_FETCH_OLD_TUPLE; + int options = TABLE_MODIFY_WAIT; - if (!locked && !IsolationUsesXactSnapshot()) - options |= TABLE_MODIFY_LOCK_UPDATED; + if (!locked) + { + options |= TABLE_MODIFY_FETCH_OLD_TUPLE; + if (!IsolationUsesXactSnapshot()) + options |= TABLE_MODIFY_LOCK_UPDATED; + } /* * If we generate a new candidate tuple after EvalPlanQual testing, we @@ -2404,7 +2315,7 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, if (canSetTag) (estate->es_processed)++; - ExecUpdateEpilogue(context, &updateCxt, resultRelInfo, tupleid, oldtuple, + ExecUpdateEpilogue(context, &updateCxt, resultRelInfo, oldtuple, slot, oldSlot); /* Process RETURNING if present */ @@ -2428,144 +2339,26 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, static bool ExecOnConflictUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer conflictTid, TupleTableSlot *excludedSlot, bool canSetTag, TupleTableSlot **returning) { ModifyTableState *mtstate = context->mtstate; ExprContext *econtext = mtstate->ps.ps_ExprContext; - Relation relation = resultRelInfo->ri_RelationDesc; ExprState *onConflictSetWhere = resultRelInfo->ri_onConflict->oc_WhereClause; TupleTableSlot *existing = resultRelInfo->ri_onConflict->oc_Existing; - TM_FailureData tmfd; - LockTupleMode lockmode; - TM_Result test; - Datum xminDatum; - TransactionId xmin; - bool isnull; + Datum tupleid; - /* Determine lock mode to 
use */ - lockmode = ExecUpdateLockMode(context->estate, resultRelInfo); - - /* - * Lock tuple for update. Don't follow updates when tuple cannot be - * locked without doing so. A row locking conflict here means our - * previous conclusion that the tuple is conclusively committed is not - * true anymore. - */ - test = table_tuple_lock(relation, conflictTid, - context->estate->es_snapshot, - existing, context->estate->es_output_cid, - lockmode, LockWaitBlock, 0, - &tmfd); - switch (test) + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) { - case TM_Ok: - /* success! */ - break; - - case TM_Invisible: - - /* - * This can occur when a just inserted tuple is updated again in - * the same command. E.g. because multiple rows with the same - * conflicting key values are inserted. - * - * This is somewhat similar to the ExecUpdate() TM_SelfModified - * case. We do not want to proceed because it would lead to the - * same row being updated a second time in some unspecified order, - * and in contrast to plain UPDATEs there's no historical behavior - * to break. - * - * It is the user's responsibility to prevent this situation from - * occurring. These problems are why the SQL standard similarly - * specifies that for SQL MERGE, an exception must be raised in - * the event of an attempt to update the same row twice. 
- */ - xminDatum = slot_getsysattr(existing, - MinTransactionIdAttributeNumber, - &isnull); - Assert(!isnull); - xmin = DatumGetTransactionId(xminDatum); - - if (TransactionIdIsCurrentTransactionId(xmin)) - ereport(ERROR, - (errcode(ERRCODE_CARDINALITY_VIOLATION), - /* translator: %s is a SQL command name */ - errmsg("%s command cannot affect row a second time", - "ON CONFLICT DO UPDATE"), - errhint("Ensure that no rows proposed for insertion within the same command have duplicate constrained values."))); - - /* This shouldn't happen */ - elog(ERROR, "attempted to lock invisible tuple"); - break; - - case TM_SelfModified: - - /* - * This state should never be reached. As a dirty snapshot is used - * to find conflicting tuples, speculative insertion wouldn't have - * seen this row to conflict with. - */ - elog(ERROR, "unexpected self-updated tuple"); - break; - - case TM_Updated: - if (IsolationUsesXactSnapshot()) - ereport(ERROR, - (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), - errmsg("could not serialize access due to concurrent update"))); - - /* - * As long as we don't support an UPDATE of INSERT ON CONFLICT for - * a partitioned table we shouldn't reach to a case where tuple to - * be lock is moved to another partition due to concurrent update - * of the partition key. - */ - Assert(!ItemPointerIndicatesMovedPartitions(&tmfd.ctid)); - - /* - * Tell caller to try again from the very start. - * - * It does not make sense to use the usual EvalPlanQual() style - * loop here, as the new version of the row might not conflict - * anymore, or the conflicting tuple has actually been deleted. 
- */ - ExecClearTuple(existing); - return false; - - case TM_Deleted: - if (IsolationUsesXactSnapshot()) - ereport(ERROR, - (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), - errmsg("could not serialize access due to concurrent delete"))); - - /* see TM_Updated case */ - Assert(!ItemPointerIndicatesMovedPartitions(&tmfd.ctid)); - ExecClearTuple(existing); - return false; - - default: - elog(ERROR, "unrecognized table_tuple_lock status: %u", test); + bool isnull; + tupleid = slot_getsysattr(existing, RowIdAttributeNumber, &isnull); + Assert(!isnull); + } + else + { + tupleid = PointerGetDatum(&existing->tts_tid); } - - /* Success, the tuple is locked. */ - - /* - * Verify that the tuple is visible to our MVCC snapshot if the current - * isolation level mandates that. - * - * It's not sufficient to rely on the check within ExecUpdate() as e.g. - * CONFLICT ... WHERE clause may prevent us from reaching that. - * - * This means we only ever continue when a new command in the current - * transaction could see the row, even though in READ COMMITTED mode the - * tuple will not be visible according to the current statement's - * snapshot. This is in line with the way UPDATE deals with newer tuple - * versions. 
- */ - ExecCheckTupleVisible(context->estate, relation, existing); /* * Make tuple and any needed join variables available to ExecQual and @@ -2621,7 +2414,7 @@ ExecOnConflictUpdate(ModifyTableContext *context, /* Execute UPDATE with projection */ *returning = ExecUpdate(context, resultRelInfo, - conflictTid, NULL, + tupleid, NULL, resultRelInfo->ri_onConflict->oc_ProjSlot, existing, canSetTag, true); @@ -2640,7 +2433,7 @@ ExecOnConflictUpdate(ModifyTableContext *context, */ static TupleTableSlot * ExecMerge(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, bool canSetTag) + Datum tupleid, HeapTuple oldtuple, bool canSetTag) { TupleTableSlot *rslot = NULL; bool matched; @@ -2706,7 +2499,7 @@ ExecMerge(ModifyTableContext *context, ResultRelInfo *resultRelInfo, * update chain and we never switch from ExecMergeNotMatched() to * ExecMergeMatched(), there is no risk of a livelock. */ - matched = tupleid != NULL || oldtuple != NULL; + matched = DatumGetPointer(tupleid) != NULL || oldtuple != NULL; if (matched) rslot = ExecMergeMatched(context, resultRelInfo, tupleid, oldtuple, canSetTag, &matched); @@ -2766,7 +2559,7 @@ ExecMerge(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ static TupleTableSlot * ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, bool canSetTag, + Datum tupleid, HeapTuple oldtuple, bool canSetTag, bool *matched) { ModifyTableState *mtstate = context->mtstate; @@ -2806,7 +2599,7 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, * either have the tupleid of the target row, or an old tuple from the * target wholerow junk attr. 
*/ - Assert(tupleid != NULL || oldtuple != NULL); + Assert(DatumGetPointer(tupleid) != NULL || oldtuple != NULL); if (oldtuple != NULL) ExecForceStoreHeapTuple(oldtuple, resultRelInfo->ri_oldTupleSlot, false); @@ -2927,7 +2720,7 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, if (result == TM_Ok) { ExecUpdateEpilogue(context, &updateCxt, resultRelInfo, - tupleid, NULL, newslot, + NULL, newslot, resultRelInfo->ri_oldTupleSlot); mtstate->mt_merge_updated += 1; } @@ -2963,7 +2756,7 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, if (result == TM_Ok) { - ExecDeleteEpilogue(context, resultRelInfo, tupleid, NULL, + ExecDeleteEpilogue(context, resultRelInfo, NULL, resultRelInfo->ri_oldTupleSlot, false); mtstate->mt_merge_deleted += 1; } @@ -3075,7 +2868,6 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, switch (result) { case TM_Ok: - /* * If the tuple was updated and migrated to * another partition concurrently, the current @@ -3117,9 +2909,13 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, * join quals no longer pass and we switch to * the NOT MATCHED BY SOURCE case. */ - (void) ExecGetJunkAttribute(epqslot, - resultRelInfo->ri_RowIdAttNo, - &isNull); + /* + * Update tupleid to that of the new tuple, for + * the refetch we do at the top. + */ + tupleid = ExecGetJunkAttribute(epqslot, + resultRelInfo->ri_RowIdAttNo, + &isNull); if (isNull) *matched = false; @@ -3128,8 +2924,8 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, * we need to switch to the NOT MATCHED BY * SOURCE case. 
*/ - if (!table_tuple_fetch_row_version(resultRelationDesc, - &context->tmfd.ctid, + if (!isNull && !table_tuple_fetch_row_version(resultRelationDesc, + tupleid, SnapshotAny, resultRelInfo->ri_oldTupleSlot)) elog(ERROR, "failed to fetch the target tuple"); @@ -3146,6 +2942,11 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, /* * Loop back and process the MATCHED or NOT * MATCHED BY SOURCE actions from the start. + * A non-NULL ctid means that we are still dealing + * with MATCHED case. Restart the loop so that we + * apply all the MATCHED rules again, to ensure + * that the first qualifying WHEN MATCHED action + * is executed. */ goto lmerge_matched; @@ -3684,10 +3485,10 @@ ExecModifyTable(PlanState *pstate) PlanState *subplanstate; TupleTableSlot *slot; TupleTableSlot *oldSlot; + Datum tupleid; ItemPointerData tuple_ctid; HeapTupleData oldtupdata; HeapTuple oldtuple; - ItemPointer tupleid; CHECK_FOR_INTERRUPTS(); @@ -3736,6 +3537,8 @@ ExecModifyTable(PlanState *pstate) */ for (;;) { + RowRefType refType; + /* * Reset the per-output-tuple exprcontext. This is needed because * triggers expect to use that context as workspace. It's a bit ugly @@ -3811,7 +3614,7 @@ ExecModifyTable(PlanState *pstate) EvalPlanQualSetSlot(&node->mt_epqstate, context.planSlot); slot = ExecMerge(&context, node->resultRelInfo, - NULL, NULL, node->canSetTag); + PointerGetDatum(NULL), NULL, node->canSetTag); /* * If we got a RETURNING result, return it to the caller. 
@@ -3855,7 +3658,8 @@ ExecModifyTable(PlanState *pstate) EvalPlanQualSetSlot(&node->mt_epqstate, context.planSlot); slot = context.planSlot; - tupleid = NULL; + refType = resultRelInfo->ri_RowRefType; + tupleid = PointerGetDatum(NULL); oldtuple = NULL; /* @@ -3898,7 +3702,7 @@ ExecModifyTable(PlanState *pstate) EvalPlanQualSetSlot(&node->mt_epqstate, context.planSlot); slot = ExecMerge(&context, node->resultRelInfo, - NULL, NULL, node->canSetTag); + PointerGetDatum(NULL), NULL, node->canSetTag); /* * If we got a RETURNING result, return it to the @@ -3913,9 +3717,24 @@ ExecModifyTable(PlanState *pstate) elog(ERROR, "ctid is NULL"); } - tupleid = (ItemPointer) DatumGetPointer(datum); - tuple_ctid = *tupleid; /* be sure we don't free ctid!! */ - tupleid = &tuple_ctid; + if (refType == ROW_REF_TID) + { + /* shouldn't ever get a null result... */ + if (isNull) + elog(ERROR, "ctid is NULL"); + + tuple_ctid = *((ItemPointer) DatumGetPointer(datum)); /* be sure we don't free ctid!! */ + tupleid = PointerGetDatum(&tuple_ctid); + } + else + { + Assert(refType == ROW_REF_ROWID); + /* shouldn't ever get a null result... */ + if (isNull) + elog(ERROR, "rowid is NULL"); + + tupleid = datumCopy(datum, false, -1); + } } /* @@ -3955,7 +3774,7 @@ ExecModifyTable(PlanState *pstate) EvalPlanQualSetSlot(&node->mt_epqstate, context.planSlot); slot = ExecMerge(&context, node->resultRelInfo, - NULL, NULL, node->canSetTag); + PointerGetDatum(NULL), NULL, node->canSetTag); /* * If we got a RETURNING result, return it to the @@ -4019,6 +3838,7 @@ ExecModifyTable(PlanState *pstate) /* Fetch the most recent version of old tuple. 
*/ Relation relation = resultRelInfo->ri_RelationDesc; + Assert(DatumGetPointer(tupleid) != NULL); if (!table_tuple_fetch_row_version(relation, tupleid, SnapshotAny, oldSlot)) @@ -4053,6 +3873,9 @@ ExecModifyTable(PlanState *pstate) break; } + if (refType == ROW_REF_ROWID && DatumGetPointer(tupleid) != NULL) + pfree(DatumGetPointer(tupleid)); + /* * If we got a RETURNING result, return it to caller. We'll continue * the work on next call. @@ -4297,10 +4120,20 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) relkind == RELKIND_MATVIEW || relkind == RELKIND_PARTITIONED_TABLE) { - resultRelInfo->ri_RowIdAttNo = - ExecFindJunkAttributeInTlist(subplan->targetlist, "ctid"); - if (!AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo)) - elog(ERROR, "could not find junk ctid column"); + if (resultRelInfo->ri_RowRefType == ROW_REF_TID) + { + resultRelInfo->ri_RowIdAttNo = + ExecFindJunkAttributeInTlist(subplan->targetlist, "ctid"); + if (!AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo)) + elog(ERROR, "could not find junk ctid column"); + } + else + { + resultRelInfo->ri_RowIdAttNo = + ExecFindJunkAttributeInTlist(subplan->targetlist, "rowid"); + if (!AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo)) + elog(ERROR, "could not find junk rowid column"); + } } else if (relkind == RELKIND_FOREIGN_TABLE) { @@ -4610,6 +4443,8 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) estate->es_auxmodifytables = lcons(mtstate, estate->es_auxmodifytables); + + return mtstate; } diff --git a/src/backend/executor/nodeTidscan.c b/src/backend/executor/nodeTidscan.c index 864a9013b62..f4a124ac4eb 100644 --- a/src/backend/executor/nodeTidscan.c +++ b/src/backend/executor/nodeTidscan.c @@ -377,7 +377,7 @@ TidNext(TidScanState *node) if (node->tss_isCurrentOf) table_tuple_get_latest_tid(scan, &tid); - if (table_tuple_fetch_row_version(heapRelation, &tid, snapshot, slot)) + if (table_tuple_fetch_row_version(heapRelation, PointerGetDatum(&tid), 
snapshot, slot)) return slot; /* Bad TID or failed snapshot qual; try next */ diff --git a/src/backend/nodes/read.c b/src/backend/nodes/read.c index 4eb42445c52..ffa147ee4c8 100644 --- a/src/backend/nodes/read.c +++ b/src/backend/nodes/read.c @@ -205,6 +205,17 @@ pg_strtok(int *length) return ret_str; } +bool +pg_str_hasfield(void) +{ + const char *local_str = pg_strtok_ptr; + + while (*local_str == ' ' || *local_str == '\n' || *local_str == '\t') + local_str++; + + return (*local_str == ':'); +} + /* * debackslash - * create a palloc'd string holding the given token. diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 0c7273b9ccd..2d5b3978ca0 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -2314,6 +2314,7 @@ preprocess_rowmarks(PlannerInfo *root) RowMarkClause *rc = lfirst_node(RowMarkClause, l); RangeTblEntry *rte = rt_fetch(rc->rti, parse->rtable); PlanRowMark *newrc; + RowRefType refType; /* * Currently, it is syntactically impossible to have FOR UPDATE et al @@ -2336,8 +2337,8 @@ preprocess_rowmarks(PlannerInfo *root) newrc = makeNode(PlanRowMark); newrc->rti = newrc->prti = rc->rti; newrc->rowmarkId = ++(root->glob->lastRowMarkId); - newrc->markType = select_rowmark_type(rte, rc->strength); - newrc->allMarkTypes = (1 << newrc->markType); + newrc->markType = select_rowmark_type(rte, rc->strength, &refType); + newrc->allRefTypes = (1 << refType); newrc->strength = rc->strength; newrc->waitPolicy = rc->waitPolicy; newrc->isParent = false; @@ -2353,6 +2354,7 @@ preprocess_rowmarks(PlannerInfo *root) { RangeTblEntry *rte = lfirst_node(RangeTblEntry, l); PlanRowMark *newrc; + RowRefType refType = ROW_REF_TID; i++; if (!bms_is_member(i, rels)) @@ -2361,8 +2363,8 @@ preprocess_rowmarks(PlannerInfo *root) newrc = makeNode(PlanRowMark); newrc->rti = newrc->prti = i; newrc->rowmarkId = ++(root->glob->lastRowMarkId); - newrc->markType = select_rowmark_type(rte, LCS_NONE); - 
newrc->allMarkTypes = (1 << newrc->markType); + newrc->markType = select_rowmark_type(rte, LCS_NONE, &refType); + newrc->allRefTypes = (1 << refType); newrc->strength = LCS_NONE; newrc->waitPolicy = LockWaitBlock; /* doesn't matter */ newrc->isParent = false; @@ -2377,11 +2379,13 @@ preprocess_rowmarks(PlannerInfo *root) * Select RowMarkType to use for a given table */ RowMarkType -select_rowmark_type(RangeTblEntry *rte, LockClauseStrength strength) +select_rowmark_type(RangeTblEntry *rte, LockClauseStrength strength, + RowRefType *refType) { if (rte->rtekind != RTE_RELATION) { /* If it's not a table at all, use ROW_MARK_COPY */ + *refType = ROW_REF_COPY; return ROW_MARK_COPY; } else if (rte->relkind == RELKIND_FOREIGN_TABLE) @@ -2392,10 +2396,12 @@ select_rowmark_type(RangeTblEntry *rte, LockClauseStrength strength) if (fdwroutine->GetForeignRowMarkType != NULL) return fdwroutine->GetForeignRowMarkType(rte, strength); /* Otherwise, use ROW_MARK_COPY by default */ + *refType = ROW_REF_COPY; return ROW_MARK_COPY; } else { + *refType = rte->reftype; /* Regular table, apply the appropriate lock type */ switch (strength) { diff --git a/src/backend/optimizer/prep/preptlist.c b/src/backend/optimizer/prep/preptlist.c index 931b9c09bda..9c4671c817e 100644 --- a/src/backend/optimizer/prep/preptlist.c +++ b/src/backend/optimizer/prep/preptlist.c @@ -234,7 +234,7 @@ preprocess_targetlist(PlannerInfo *root) if (rc->rti != rc->prti) continue; - if (rc->allMarkTypes & ~(1 << ROW_MARK_COPY)) + if (rc->allRefTypes & (1 << ROW_REF_TID)) { /* Need to fetch TID */ var = makeVar(rc->rti, @@ -250,7 +250,23 @@ preprocess_targetlist(PlannerInfo *root) true); tlist = lappend(tlist, tle); } - if (rc->allMarkTypes & (1 << ROW_MARK_COPY)) + if (rc->allRefTypes & (1 << ROW_REF_ROWID)) + { + /* Need to fetch TID */ + var = makeVar(rc->rti, + RowIdAttributeNumber, + BYTEAOID, + -1, + InvalidOid, + 0); + snprintf(resname, sizeof(resname), "rowid%u", rc->rowmarkId); + tle = makeTargetEntry((Expr 
*) var, + list_length(tlist) + 1, + pstrdup(resname), + true); + tlist = lappend(tlist, tle); + } + if (rc->allRefTypes & (1 << ROW_REF_COPY)) { /* Need the whole row as a junk var */ var = makeWholeRowVar(rt_fetch(rc->rti, range_table), diff --git a/src/backend/optimizer/util/appendinfo.c b/src/backend/optimizer/util/appendinfo.c index 6ba4eba224a..ea012b2c164 100644 --- a/src/backend/optimizer/util/appendinfo.c +++ b/src/backend/optimizer/util/appendinfo.c @@ -895,17 +895,35 @@ add_row_identity_columns(PlannerInfo *root, Index rtindex, relkind == RELKIND_MATVIEW || relkind == RELKIND_PARTITIONED_TABLE) { + RowRefType refType = ROW_REF_TID; + + refType = table_get_row_ref_type(target_relation); + /* * Emit CTID so that executor can find the row to merge, update or * delete. */ - var = makeVar(rtindex, - SelfItemPointerAttributeNumber, - TIDOID, - -1, - InvalidOid, - 0); - add_row_identity_var(root, var, rtindex, "ctid"); + if (refType == ROW_REF_TID) + { + var = makeVar(rtindex, + SelfItemPointerAttributeNumber, + TIDOID, + -1, + InvalidOid, + 0); + add_row_identity_var(root, var, rtindex, "ctid"); + } + else + { + Assert(refType == ROW_REF_ROWID); + var = makeVar(rtindex, + RowIdAttributeNumber, + BYTEAOID, + -1, + InvalidOid, + 0); + add_row_identity_var(root, var, rtindex, "rowid"); + } } else if (relkind == RELKIND_FOREIGN_TABLE) { diff --git a/src/backend/optimizer/util/inherit.c b/src/backend/optimizer/util/inherit.c index c5b906a9d43..17c36c03202 100644 --- a/src/backend/optimizer/util/inherit.c +++ b/src/backend/optimizer/util/inherit.c @@ -16,6 +16,7 @@ #include "access/sysattr.h" #include "access/table.h" +#include "access/tableam.h" #include "catalog/partition.h" #include "catalog/pg_inherits.h" #include "catalog/pg_type.h" @@ -91,7 +92,7 @@ expand_inherited_rtentry(PlannerInfo *root, RelOptInfo *rel, LOCKMODE lockmode; PlanRowMark *oldrc; bool old_isParent = false; - int old_allMarkTypes = 0; + int old_allRefTypes = 0; Assert(rte->inh); /* else caller 
error */ @@ -131,8 +132,8 @@ expand_inherited_rtentry(PlannerInfo *root, RelOptInfo *rel, { old_isParent = oldrc->isParent; oldrc->isParent = true; - /* Save initial value of allMarkTypes before children add to it */ - old_allMarkTypes = oldrc->allMarkTypes; + /* Save initial value of allRefTypes before children add to it */ + old_allRefTypes = oldrc->allRefTypes; } /* Scan the inheritance set and expand it */ @@ -239,15 +240,15 @@ expand_inherited_rtentry(PlannerInfo *root, RelOptInfo *rel, */ if (oldrc) { - int new_allMarkTypes = oldrc->allMarkTypes; + int new_allRefTypes = oldrc->allRefTypes; Var *var; TargetEntry *tle; char resname[32]; List *newvars = NIL; /* Add TID junk Var if needed, unless we had it already */ - if (new_allMarkTypes & ~(1 << ROW_MARK_COPY) && - !(old_allMarkTypes & ~(1 << ROW_MARK_COPY))) + if (new_allRefTypes & (1 << ROW_REF_TID) && + !(old_allRefTypes & (1 << ROW_REF_TID))) { /* Need to fetch TID */ var = makeVar(oldrc->rti, @@ -266,8 +267,8 @@ expand_inherited_rtentry(PlannerInfo *root, RelOptInfo *rel, } /* Add whole-row junk Var if needed, unless we had it already */ - if ((new_allMarkTypes & (1 << ROW_MARK_COPY)) && - !(old_allMarkTypes & (1 << ROW_MARK_COPY))) + if ((new_allRefTypes & (1 << ROW_REF_COPY)) && + !(old_allRefTypes & (1 << ROW_REF_COPY))) { var = makeWholeRowVar(planner_rt_fetch(oldrc->rti, root), oldrc->rti, @@ -282,6 +283,24 @@ expand_inherited_rtentry(PlannerInfo *root, RelOptInfo *rel, newvars = lappend(newvars, var); } + if ((new_allRefTypes & (1 << ROW_REF_ROWID)) && + !(old_allRefTypes & (1 << ROW_REF_ROWID))) + { + var = makeVar(oldrc->rti, + RowIdAttributeNumber, + BYTEAOID, + -1, + InvalidOid, + 0); + snprintf(resname, sizeof(resname), "rowid%u", oldrc->rowmarkId); + tle = makeTargetEntry((Expr *) var, + list_length(root->processed_tlist) + 1, + pstrdup(resname), + true); + root->processed_tlist = lappend(root->processed_tlist, tle); + newvars = lappend(newvars, var); + } + /* Add tableoid junk Var, unless we 
had it already */ if (!old_isParent) { @@ -450,7 +469,7 @@ expand_partitioned_rtentry(PlannerInfo *root, RelOptInfo *relinfo, * where the hierarchy is flattened during RTE expansion.) * * PlanRowMarks still carry the top-parent's RTI, and the top-parent's - * allMarkTypes field still accumulates values from all descendents. + * allRefTypes field still accumulates values from all descendents. * * "parentrte" and "parentRTindex" are immediate parent's RTE and * RTI. "top_parentrc" is top parent's PlanRowMark. @@ -494,6 +513,7 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, Assert(parentrte->rtekind == RTE_RELATION); /* else this is dubious */ childrte->relid = childOID; childrte->relkind = childrel->rd_rel->relkind; + childrte->reftype = table_get_row_ref_type(childrel); /* A partitioned child will need to be expanded further. */ if (childrte->relkind == RELKIND_PARTITIONED_TABLE) { @@ -583,14 +603,16 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, if (top_parentrc) { PlanRowMark *childrc = makeNode(PlanRowMark); + RowRefType refType; childrc->rti = childRTindex; childrc->prti = top_parentrc->rti; childrc->rowmarkId = top_parentrc->rowmarkId; /* Reselect rowmark type, because relkind might not match parent */ childrc->markType = select_rowmark_type(childrte, - top_parentrc->strength); - childrc->allMarkTypes = (1 << childrc->markType); + top_parentrc->strength, + &refType); + childrc->allRefTypes = (1 << refType); childrc->strength = top_parentrc->strength; childrc->waitPolicy = top_parentrc->waitPolicy; @@ -601,8 +623,8 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, */ childrc->isParent = (childrte->relkind == RELKIND_PARTITIONED_TABLE); - /* Include child's rowmark type in top parent's allMarkTypes */ - top_parentrc->allMarkTypes |= childrc->allMarkTypes; + /* Include child's rowmark type in top parent's allRefTypes */ + top_parentrc->allRefTypes |= 
childrc->allRefTypes; root->rowMarks = lappend(root->rowMarks, childrc); } diff --git a/src/backend/parser/parse_relation.c b/src/backend/parser/parse_relation.c index 2f64eaf0e37..37d9b072b38 100644 --- a/src/backend/parser/parse_relation.c +++ b/src/backend/parser/parse_relation.c @@ -20,6 +20,7 @@ #include "access/relation.h" #include "access/sysattr.h" #include "access/table.h" +#include "access/tableam.h" #include "catalog/heap.h" #include "catalog/namespace.h" #include "catalog/pg_type.h" @@ -1503,6 +1504,7 @@ addRangeTableEntry(ParseState *pstate, rte->inh = inh; rte->relkind = rel->rd_rel->relkind; rte->rellockmode = lockmode; + rte->reftype = table_get_row_ref_type(rel); /* * Build the list of effective column names using user-supplied aliases @@ -1588,6 +1590,7 @@ addRangeTableEntryForRelation(ParseState *pstate, rte->inh = inh; rte->relkind = rel->rd_rel->relkind; rte->rellockmode = lockmode; + rte->reftype = table_get_row_ref_type(rel); /* * Build the list of effective column names using user-supplied aliases @@ -1656,6 +1659,7 @@ addRangeTableEntryForSubquery(ParseState *pstate, rte->rtekind = RTE_SUBQUERY; rte->subquery = subquery; rte->alias = alias; + rte->reftype = ROW_REF_COPY; eref = alias ? copyObject(alias) : makeAlias("unnamed_subquery", NIL); numaliases = list_length(eref->colnames); @@ -1763,6 +1767,7 @@ addRangeTableEntryForFunction(ParseState *pstate, rte->functions = NIL; /* we'll fill this list below */ rte->funcordinality = rangefunc->ordinality; rte->alias = alias; + rte->reftype = ROW_REF_COPY; /* * Choose the RTE alias name. We default to using the first function's @@ -2079,6 +2084,7 @@ addRangeTableEntryForTableFunc(ParseState *pstate, rte->coltypmods = tf->coltypmods; rte->colcollations = tf->colcollations; rte->alias = alias; + rte->reftype = ROW_REF_COPY; refname = alias ? alias->aliasname : pstrdup(tf->functype == TFT_XMLTABLE ? 
"xmltable" : "json_table"); @@ -2156,6 +2162,7 @@ addRangeTableEntryForValues(ParseState *pstate, rte->coltypmods = coltypmods; rte->colcollations = colcollations; rte->alias = alias; + rte->reftype = ROW_REF_COPY; eref = alias ? copyObject(alias) : makeAlias(refname, NIL); @@ -2252,6 +2259,7 @@ addRangeTableEntryForJoin(ParseState *pstate, rte->joinrightcols = rightcols; rte->join_using_alias = join_using_alias; rte->alias = alias; + rte->reftype = ROW_REF_COPY; eref = alias ? copyObject(alias) : makeAlias("unnamed_join", NIL); numaliases = list_length(eref->colnames); @@ -2332,6 +2340,7 @@ addRangeTableEntryForCTE(ParseState *pstate, rte->rtekind = RTE_CTE; rte->ctename = cte->ctename; rte->ctelevelsup = levelsup; + rte->reftype = ROW_REF_COPY; /* Self-reference if and only if CTE's parse analysis isn't completed */ rte->self_reference = !IsA(cte->ctequery, Query); @@ -2494,6 +2503,7 @@ addRangeTableEntryForENR(ParseState *pstate, * if they access transition tables linked to a table that is altered. */ rte->relid = enrmd->reliddesc; + rte->reftype = ROW_REF_COPY; /* * Build the list of effective column names using user-supplied aliases @@ -3262,6 +3272,9 @@ get_rte_attribute_name(RangeTblEntry *rte, AttrNumber attnum) attnum > 0 && attnum <= list_length(rte->alias->colnames)) return strVal(list_nth(rte->alias->colnames, attnum - 1)); + if (attnum == RowIdAttributeNumber) + return "rowid"; + /* * If the RTE is a relation, go to the system catalogs not the * eref->colnames list. 
This is a little slower but it will give the diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 8f27026d193..287c6d31556 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -2680,7 +2680,9 @@ extract_autovac_opts(HeapTuple tup, TupleDesc pg_class_desc) ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_MATVIEW || ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_TOASTVALUE); - relopts = extractRelOptions(tup, pg_class_desc, NULL); + relopts = extractRelOptions(tup, pg_class_desc, + GetTableAmRoutineByAmOid(((Form_pg_class) GETSTRUCT(tup))->relam), + NULL); if (relopts == NULL) return NULL; diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index 6d59a2bb8dc..e9696b52d9f 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -23,6 +23,7 @@ #include "access/relation.h" #include "access/sysattr.h" #include "access/table.h" +#include "access/tableam.h" #include "catalog/dependency.h" #include "commands/trigger.h" #include "executor/executor.h" diff --git a/src/backend/utils/adt/ri_triggers.c b/src/backend/utils/adt/ri_triggers.c index 62601a6d80c..9760febe7cc 100644 --- a/src/backend/utils/adt/ri_triggers.c +++ b/src/backend/utils/adt/ri_triggers.c @@ -244,6 +244,7 @@ RI_FKey_check(TriggerData *trigdata) TupleTableSlot *newslot; RI_QueryKey qkey; SPIPlanPtr qplan; + Relation rel = trigdata->tg_relation; riinfo = ri_FetchConstraintInfo(trigdata->tg_trigger, trigdata->tg_relation, false); @@ -261,7 +262,7 @@ RI_FKey_check(TriggerData *trigdata) * and lock on the buffer to call HeapTupleSatisfiesVisibility. Caller * should be holding pin, but not lock. 
*/ - if (!table_tuple_satisfies_snapshot(trigdata->tg_relation, newslot, SnapshotSelf)) + if (!table_tuple_satisfies_snapshot(rel, newslot, SnapshotSelf)) return PointerGetDatum(NULL); /* @@ -1327,7 +1328,7 @@ RI_FKey_fk_upd_check_required(Trigger *trigger, Relation fk_rel, * this if we knew the INSERT trigger already fired, but there is no easy * way to know that.) */ - if (slot_is_current_xact_tuple(oldslot)) + if (table_tuple_is_current(fk_rel, oldslot)) return true; /* If all old and new key values are equal, no check is needed */ diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 930cc03ee20..558e428e9b7 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -33,6 +33,7 @@ #include "access/htup_details.h" #include "access/multixact.h" #include "access/parallel.h" +#include "access/relation.h" #include "access/reloptions.h" #include "access/sysattr.h" #include "access/table.h" @@ -319,6 +320,7 @@ static OpClassCacheEnt *LookupOpclassInfo(Oid operatorClassOid, StrategyNumber numSupport); static void RelationCacheInitFileRemoveInDir(const char *tblspcpath); static void unlink_initfile(const char *initfilename, int elevel); +static void release_rd_amcache(Relation rel); /* @@ -463,8 +465,9 @@ AllocateRelationDesc(Form_pg_class relp) static void RelationParseRelOptions(Relation relation, HeapTuple tuple) { - bytea *options; - amoptions_function amoptsfn; + bytea *options; + amoptions_function amoptsfn; + const TableAmRoutine *tableam = NULL; relation->rd_options = NULL; @@ -476,9 +479,10 @@ RelationParseRelOptions(Relation relation, HeapTuple tuple) { case RELKIND_RELATION: case RELKIND_TOASTVALUE: - case RELKIND_VIEW: case RELKIND_MATVIEW: + case RELKIND_VIEW: case RELKIND_PARTITIONED_TABLE: + tableam = relation->rd_tableam; amoptsfn = NULL; break; case RELKIND_INDEX: @@ -490,11 +494,12 @@ RelationParseRelOptions(Relation relation, HeapTuple tuple) } /* - * Fetch reloptions from tuple; 
have to use a hardwired descriptor because - * we might not have any other for pg_class yet (consider executing this - * code for pg_class itself) - */ - options = extractRelOptions(tuple, GetPgClassDescriptor(), amoptsfn); + * Fetch reloptions from tuple; have to use a hardwired descriptor because + * we might not have any other for pg_class yet (consider executing this + * code for pg_class itself) + */ + options = extractRelOptions(tuple, GetPgClassDescriptor(), + tableam, amoptsfn); /* * Copy parsed data into CacheMemoryContext. To guard against the @@ -2270,9 +2275,7 @@ RelationReloadIndexInfo(Relation relation) RelationCloseSmgr(relation); /* Must free any AM cached data upon relcache flush */ - if (relation->rd_amcache) - pfree(relation->rd_amcache); - relation->rd_amcache = NULL; + release_rd_amcache(relation); /* * If it's a shared index, we might be called before backend startup has @@ -2492,8 +2495,7 @@ RelationDestroyRelation(Relation relation, bool remember_tupdesc) pfree(relation->rd_options); if (relation->rd_indextuple) pfree(relation->rd_indextuple); - if (relation->rd_amcache) - pfree(relation->rd_amcache); + release_rd_amcache(relation); if (relation->rd_fdwroutine) pfree(relation->rd_fdwroutine); if (relation->rd_indexcxt) @@ -2580,9 +2582,7 @@ RelationClearRelation(Relation relation, bool rebuild) RelationCloseSmgr(relation); /* Free AM cached data, if any */ - if (relation->rd_amcache) - pfree(relation->rd_amcache); - relation->rd_amcache = NULL; + release_rd_amcache(relation); /* * Treat nailed-in system relations separately, they always need to be @@ -6896,3 +6896,9 @@ ResOwnerReleaseRelation(Datum res) RelationCloseCleanup((Relation) res); } + +static void +release_rd_amcache(Relation rel) +{ + table_free_rd_amcache(rel); +} diff --git a/src/backend/utils/sort/tuplestore.c b/src/backend/utils/sort/tuplestore.c index 947a868e569..d3a41533552 100644 --- a/src/backend/utils/sort/tuplestore.c +++ b/src/backend/utils/sort/tuplestore.c @@ -1100,6 
+1100,36 @@ tuplestore_gettupleslot(Tuplestorestate *state, bool forward, } } +/* + * Same as tuplestore_gettupleslot(), but forces tuple storage to slot. Thus, + * it can work with slot types different from minimal tuple. + */ +bool +tuplestore_force_gettupleslot(Tuplestorestate *state, bool forward, + bool copy, TupleTableSlot *slot) +{ + MinimalTuple tuple; + bool should_free; + + tuple = (MinimalTuple) tuplestore_gettuple(state, forward, &should_free); + + if (tuple) + { + if (copy && !should_free) + { + tuple = heap_copy_minimal_tuple(tuple); + should_free = true; + } + ExecForceStoreMinimalTuple(tuple, slot, should_free); + return true; + } + else + { + ExecClearTuple(slot); + return false; + } +} + /* * tuplestore_advance - exported function to adjust position without fetching * diff --git a/src/include/access/reloptions.h b/src/include/access/reloptions.h index 81829b8270a..8ddc75df287 100644 --- a/src/include/access/reloptions.h +++ b/src/include/access/reloptions.h @@ -21,6 +21,7 @@ #include "access/amapi.h" #include "access/htup.h" +#include "access/tableam.h" #include "access/tupdesc.h" #include "nodes/pg_list.h" #include "storage/lock.h" @@ -224,6 +225,7 @@ extern Datum transformRelOptions(Datum oldOptions, List *defList, bool acceptOidsOff, bool isReset); extern List *untransformRelOptions(Datum options); extern bytea *extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, + const TableAmRoutine *tableam, amoptions_function amoptions); extern void *build_reloptions(Datum reloptions, bool validate, relopt_kind kind, diff --git a/src/include/access/sysattr.h b/src/include/access/sysattr.h index e88dec71ee9..867b5eb489e 100644 --- a/src/include/access/sysattr.h +++ b/src/include/access/sysattr.h @@ -24,6 +24,7 @@ #define MaxTransactionIdAttributeNumber (-4) #define MaxCommandIdAttributeNumber (-5) #define TableOidAttributeNumber (-6) -#define FirstLowInvalidHeapAttributeNumber (-7) +#define RowIdAttributeNumber (-7) +#define 
FirstLowInvalidHeapAttributeNumber (-8) #endif /* SYSATTR_H */ diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 504cd383f57..f30c507abb1 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -17,11 +17,15 @@ #ifndef TABLEAM_H #define TABLEAM_H +#include "access/amapi.h" #include "access/relscan.h" #include "access/sdir.h" #include "access/xact.h" #include "executor/tuptable.h" #include "storage/read_stream.h" +#include "nodes/execnodes.h" +#include "storage/bufmgr.h" +#include "utils/guc.h" #include "utils/rel.h" #include "utils/snapshot.h" @@ -40,6 +44,16 @@ struct TBMIterateResult; struct VacuumParams; struct ValidateIndexState; +typedef int (*AcquireSampleRowsFunc) (Relation relation, int elevel, + HeapTuple *rows, int targrows, + double *totalrows, + double *totaldeadrows); + +/* in commands/analyze.c */ +extern int acquire_sample_rows(Relation onerel, int elevel, + HeapTuple *rows, int targrows, + double *totalrows, double *totaldeadrows); + /* * Bitmask values for the flags argument to the scan_begin callback. */ @@ -308,6 +322,9 @@ typedef struct TableAmRoutine */ const TupleTableSlotOps *(*slot_callbacks) (Relation rel); + RowRefType (*get_row_ref_type) (Relation rel); + + void (*free_rd_amcache) (Relation rel); /* ------------------------------------------------------------------------ * Table scan callbacks. @@ -477,7 +494,7 @@ typedef struct TableAmRoutine * test, returns true, false otherwise. 
*/ bool (*tuple_fetch_row_version) (Relation rel, - ItemPointer tid, + Datum tupleid, Snapshot snapshot, TupleTableSlot *slot); @@ -513,23 +530,19 @@ typedef struct TableAmRoutine */ /* see table_tuple_insert() for reference about parameters */ - void (*tuple_insert) (Relation rel, TupleTableSlot *slot, + TupleTableSlot *(*tuple_insert) (Relation rel, TupleTableSlot *slot, CommandId cid, int options, struct BulkInsertStateData *bistate); - /* see table_tuple_insert_speculative() for reference about parameters */ - void (*tuple_insert_speculative) (Relation rel, - TupleTableSlot *slot, - CommandId cid, - int options, - struct BulkInsertStateData *bistate, - uint32 specToken); - - /* see table_tuple_complete_speculative() for reference about parameters */ - void (*tuple_complete_speculative) (Relation rel, - TupleTableSlot *slot, - uint32 specToken, - bool succeeded); + TupleTableSlot *(*tuple_insert_with_arbiter) (ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + CommandId cid, int options, + struct BulkInsertStateData *bistate, + List *arbiterIndexes, + EState *estate, + LockTupleMode lockmode, + TupleTableSlot *lockedSlot, + TupleTableSlot *tempSlot); /* see table_multi_insert() for reference about parameters */ void (*multi_insert) (Relation rel, TupleTableSlot **slots, int nslots, @@ -537,7 +550,7 @@ typedef struct TableAmRoutine /* see table_tuple_delete() for reference about parameters */ TM_Result (*tuple_delete) (Relation rel, - ItemPointer tid, + Datum tupleid, CommandId cid, Snapshot snapshot, Snapshot crosscheck, @@ -548,7 +561,7 @@ typedef struct TableAmRoutine /* see table_tuple_update() for reference about parameters */ TM_Result (*tuple_update) (Relation rel, - ItemPointer otid, + Datum tupleid, TupleTableSlot *slot, CommandId cid, Snapshot snapshot, @@ -561,7 +574,7 @@ typedef struct TableAmRoutine /* see table_tuple_lock() for reference about parameters */ TM_Result (*tuple_lock) (Relation rel, - ItemPointer tid, + Datum tupleid, Snapshot 
snapshot, TupleTableSlot *slot, CommandId cid, @@ -880,6 +893,14 @@ typedef struct TableAmRoutine struct SampleScanState *scanstate, TupleTableSlot *slot); + /* Check if tuple in the slot belongs to the current transaction */ + bool (*tuple_is_current) (Relation rel, TupleTableSlot *slot); + + void (*analyze_table) (Relation relation, + AcquireSampleRowsFunc *func, + BlockNumber *totalpages); + + bytea *(*reloptions) (char relkind, Datum reloptions, bool validate); } TableAmRoutine; @@ -1293,7 +1314,7 @@ extern bool table_index_fetch_tuple_check(Relation rel, */ static inline bool table_tuple_fetch_row_version(Relation rel, - ItemPointer tid, + Datum tupleid, Snapshot snapshot, TupleTableSlot *slot) { @@ -1305,7 +1326,7 @@ table_tuple_fetch_row_version(Relation rel, if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) elog(ERROR, "unexpected table_tuple_fetch_row_version call during logical decoding"); - return rel->rd_tableam->tuple_fetch_row_version(rel, tid, snapshot, slot); + return rel->rd_tableam->tuple_fetch_row_version(rel, tupleid, snapshot, slot); } /* @@ -1405,45 +1426,32 @@ table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate) * insertion. But note that any toasting of fields within the slot is NOT * reflected in the slots contents. */ -static inline void +static inline TupleTableSlot * table_tuple_insert(Relation rel, TupleTableSlot *slot, CommandId cid, int options, struct BulkInsertStateData *bistate) { - rel->rd_tableam->tuple_insert(rel, slot, cid, options, - bistate); -} - -/* - * Perform a "speculative insertion". These can be backed out afterwards - * without aborting the whole transaction. Other sessions can wait for the - * speculative insertion to be confirmed, turning it into a regular tuple, or - * aborted, as if it never existed. Speculatively inserted tuples behave as - * "value locks" of short duration, used to implement INSERT .. ON CONFLICT. 
- * - * A transaction having performed a speculative insertion has to either abort, - * or finish the speculative insertion with - * table_tuple_complete_speculative(succeeded = ...). - */ -static inline void -table_tuple_insert_speculative(Relation rel, TupleTableSlot *slot, - CommandId cid, int options, - struct BulkInsertStateData *bistate, - uint32 specToken) -{ - rel->rd_tableam->tuple_insert_speculative(rel, slot, cid, options, - bistate, specToken); + return rel->rd_tableam->tuple_insert(rel, slot, cid, options, bistate); } -/* - * Complete "speculative insertion" started in the same transaction. If - * succeeded is true, the tuple is fully inserted, if false, it's removed. - */ -static inline void -table_tuple_complete_speculative(Relation rel, TupleTableSlot *slot, - uint32 specToken, bool succeeded) +static inline TupleTableSlot * +table_tuple_insert_with_arbiter(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + CommandId cid, int options, + struct BulkInsertStateData *bistate, + List *arbiterIndexes, + EState *estate, + LockTupleMode lockmode, + TupleTableSlot *lockedSlot, + TupleTableSlot *tempSlot) { - rel->rd_tableam->tuple_complete_speculative(rel, slot, specToken, - succeeded); + Relation rel = resultRelInfo->ri_RelationDesc; + + return rel->rd_tableam->tuple_insert_with_arbiter(resultRelInfo, + slot, cid, options, + bistate, arbiterIndexes, + estate, + lockmode, lockedSlot, + tempSlot); } /* @@ -1505,12 +1513,12 @@ table_multi_insert(Relation rel, TupleTableSlot **slots, int nslots, * TM_FailureData for additional info. 
*/ static inline TM_Result -table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid, +table_tuple_delete(Relation rel, Datum tupleid, CommandId cid, Snapshot snapshot, Snapshot crosscheck, int options, TM_FailureData *tmfd, bool changingPart, TupleTableSlot *oldSlot) { - return rel->rd_tableam->tuple_delete(rel, tid, cid, + return rel->rd_tableam->tuple_delete(rel, tupleid, cid, snapshot, crosscheck, options, tmfd, changingPart, oldSlot); @@ -1561,13 +1569,13 @@ table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid, * for additional info. */ static inline TM_Result -table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, +table_tuple_update(Relation rel, Datum tupleid, TupleTableSlot *slot, CommandId cid, Snapshot snapshot, Snapshot crosscheck, int options, TM_FailureData *tmfd, LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes, TupleTableSlot *oldSlot) { - return rel->rd_tableam->tuple_update(rel, otid, slot, + return rel->rd_tableam->tuple_update(rel, tupleid, slot, cid, snapshot, crosscheck, options, tmfd, lockmode, update_indexes, @@ -1608,12 +1616,12 @@ table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, * comments for struct TM_FailureData for additional info. 
*/ static inline TM_Result -table_tuple_lock(Relation rel, ItemPointer tid, Snapshot snapshot, +table_tuple_lock(Relation rel, Datum tupleid, Snapshot snapshot, TupleTableSlot *slot, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, uint8 flags, TM_FailureData *tmfd) { - return rel->rd_tableam->tuple_lock(rel, tid, snapshot, slot, + return rel->rd_tableam->tuple_lock(rel, tupleid, snapshot, slot, cid, mode, wait_policy, flags, tmfd); } @@ -2077,6 +2085,11 @@ table_scan_sample_next_tuple(TableScanDesc scan, slot); } +static inline bool +table_tuple_is_current(Relation rel, TupleTableSlot *slot) +{ + return rel->rd_tableam->tuple_is_current(rel, slot); +} /* ---------------------------------------------------------------------------- * Functions to make modifications a bit simpler. @@ -2131,12 +2144,60 @@ extern void table_block_relation_estimate_size(Relation rel, */ extern const TableAmRoutine *GetTableAmRoutine(Oid amhandler); +extern const TableAmRoutine *GetTableAmRoutineByAmOid(Oid amoid); +extern const TableAmRoutine *GetHeapamTableAmRoutine(void); -/* ---------------------------------------------------------------------------- - * Functions in heapam_handler.c - * ---------------------------------------------------------------------------- - */ +static inline RowRefType +table_get_row_ref_type(Relation rel) +{ + if (rel->rd_tableam) + return rel->rd_tableam->get_row_ref_type(rel); + else + return ROW_REF_TID; +} -extern const TableAmRoutine *GetHeapamTableAmRoutine(void); +static inline void +table_free_rd_amcache(Relation rel) +{ + if (rel->rd_tableam) + { + rel->rd_tableam->free_rd_amcache(rel); + } + else + { + if (rel->rd_amcache) + pfree(rel->rd_amcache); + rel->rd_amcache = NULL; + } +} + +static inline void +table_analyze(Relation relation, AcquireSampleRowsFunc *func, + BlockNumber *totalpages) +{ + if (relation->rd_tableam->analyze_table) + { + relation->rd_tableam->analyze_table(relation, func, totalpages); + } + else + { + *func = 
acquire_sample_rows; + *totalpages = RelationGetNumberOfBlocks(relation); + } +} + +static inline bytea * +table_reloptions(Relation rel, char relkind, + Datum reloptions, bool validate) +{ + return rel->rd_tableam->reloptions(relkind, reloptions, validate); +} + +static inline bytea * +tableam_reloptions(const TableAmRoutine *tableam, char relkind, + Datum reloptions, bool validate) +{ + return tableam->reloptions(relkind, reloptions, validate); +} #endif /* TABLEAM_H */ diff --git a/src/include/commands/trigger.h b/src/include/commands/trigger.h index cb968d03ecd..c16e6b6e5a0 100644 --- a/src/include/commands/trigger.h +++ b/src/include/commands/trigger.h @@ -209,7 +209,7 @@ extern void ExecASDeleteTriggers(EState *estate, extern bool ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple fdw_trigtuple, TupleTableSlot **epqslot, TM_Result *tmresult, @@ -231,7 +231,7 @@ extern void ExecASUpdateTriggers(EState *estate, extern bool ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple fdw_trigtuple, TupleTableSlot *newslot, TM_Result *tmresult, diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index 759f9a87d38..dfea1e93e33 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -378,6 +378,9 @@ extern void parallel_vacuum_main(dsm_segment *seg, shm_toc *toc); extern void analyze_rel(Oid relid, RangeVar *relation, VacuumParams *params, List *va_cols, bool in_outer_xact, BufferAccessStrategy bstrategy); +extern int acquire_sample_rows(Relation onerel, int elevel, + HeapTuple *rows, int targrows, + double *totalrows, double *totaldeadrows); extern bool std_typanalyze(VacAttrStats *stats); /* in utils/misc/sampling.c --- duplicate of declarations in utils/sampling.h */ diff --git a/src/include/foreign/fdwapi.h b/src/include/foreign/fdwapi.h index 
fcde3876b28..777e59c86e9 100644 --- a/src/include/foreign/fdwapi.h +++ b/src/include/foreign/fdwapi.h @@ -13,6 +13,7 @@ #define FDWAPI_H #include "access/parallel.h" +#include "access/tableam.h" #include "nodes/execnodes.h" #include "nodes/pathnodes.h" @@ -148,11 +149,6 @@ typedef void (*ExplainForeignModify_function) (ModifyTableState *mtstate, typedef void (*ExplainDirectModify_function) (ForeignScanState *node, struct ExplainState *es); -typedef int (*AcquireSampleRowsFunc) (Relation relation, int elevel, - HeapTuple *rows, int targrows, - double *totalrows, - double *totaldeadrows); - typedef bool (*AnalyzeForeignTable_function) (Relation relation, AcquireSampleRowsFunc *func, BlockNumber *totalpages); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index cd1b16296b5..48c7fec14ac 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -457,6 +457,8 @@ typedef struct ResultRelInfo /* relation descriptor for result relation */ Relation ri_RelationDesc; + RowRefType ri_RowRefType; + /* # of indices existing on result relation */ int ri_NumIndices; @@ -754,6 +756,7 @@ typedef struct ExecRowMark Index prti; /* parent range table index, if child */ Index rowmarkId; /* unique identifier for resjunk columns */ RowMarkType markType; /* see enum in nodes/plannodes.h */ + RowRefType refType; LockClauseStrength strength; /* LockingClause's strength, or LCS_NONE */ LockWaitPolicy waitPolicy; /* NOWAIT and SKIP LOCKED */ bool ermActive; /* is this mark relevant for current tuple? 
*/ diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 67c90a2bd32..ddc80007b34 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -1103,6 +1103,7 @@ typedef struct RangeTblEntry Index perminfoindex pg_node_attr(query_jumble_ignore); /* sampling info, or NULL */ struct TableSampleClause *tablesample; + RowRefType reftype; /* * Fields valid for a subquery RTE (else NULL): diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 1aeeaec95e1..9b41e298b0b 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -1353,7 +1353,7 @@ typedef enum RowMarkType * child relations will also have entries with isParent = true. The child * entries have rti == child rel's RT index and prti == top parent's RT index, * and can therefore be recognized as children by the fact that prti != rti. - * The parent's allMarkTypes field gets the OR of (1< Date: Mon, 13 Dec 2021 00:19:41 +0300 Subject: [PATCH 08/56] Hook for custom error cleanup --- src/backend/access/transam/xact.c | 2 ++ src/backend/postmaster/autovacuum.c | 1 + src/backend/postmaster/auxprocess.c | 1 + src/backend/postmaster/bgwriter.c | 1 + src/backend/postmaster/checkpointer.c | 2 ++ src/backend/postmaster/walwriter.c | 1 + src/backend/replication/walsender.c | 1 + src/backend/storage/lmgr/proc.c | 2 ++ src/backend/utils/error/elog.c | 9 +++++++++ src/include/utils/elog.h | 6 ++++++ 10 files changed, 26 insertions(+) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 4cecf630060..da7b20b3f05 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -2770,6 +2770,7 @@ AbortTransaction(void) * while cleaning up! */ LWLockReleaseAll(); + CustomErrorCleanup(); /* Clear wait information and command progress indicator */ pgstat_report_wait_end(); @@ -5180,6 +5181,7 @@ AbortSubTransaction(void) * Buffer locks, for example? 
I don't think so but I'm not sure. */ LWLockReleaseAll(); + CustomErrorCleanup(); pgstat_report_wait_end(); pgstat_progress_end_command(); diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 287c6d31556..0f85dc13407 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -458,6 +458,7 @@ AutoVacLauncherMain(char *startup_data, size_t startup_data_len) * transaction. */ LWLockReleaseAll(); + CustomErrorCleanup(); pgstat_report_wait_end(); UnlockBuffers(); /* this is probably dead code, but let's be safe: */ diff --git a/src/backend/postmaster/auxprocess.c b/src/backend/postmaster/auxprocess.c index 78f4263eeb1..4dae7ce9c3c 100644 --- a/src/backend/postmaster/auxprocess.c +++ b/src/backend/postmaster/auxprocess.c @@ -101,6 +101,7 @@ static void ShutdownAuxiliaryProcess(int code, Datum arg) { LWLockReleaseAll(); + CustomErrorCleanup(); ConditionVariableCancelSleep(); pgstat_report_wait_end(); } diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 0f75548759a..74cc63cc89f 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -167,6 +167,7 @@ BackgroundWriterMain(char *startup_data, size_t startup_data_len) * about in bgwriter, but we do have LWLocks, buffers, and temp files. */ LWLockReleaseAll(); + CustomErrorCleanup(); ConditionVariableCancelSleep(); UnlockBuffers(); ReleaseAuxProcessResources(false); diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index 199f008bcda..ef75041de37 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -204,6 +204,7 @@ CheckpointerMain(char *startup_data, size_t startup_data_len) */ pqsignal(SIGCHLD, SIG_DFL); + /* * Initialize so that first time-driven event happens at the correct time. */ @@ -266,6 +267,7 @@ CheckpointerMain(char *startup_data, size_t startup_data_len) * files. 
*/ LWLockReleaseAll(); + CustomErrorCleanup(); ConditionVariableCancelSleep(); pgstat_report_wait_end(); UnlockBuffers(); diff --git a/src/backend/postmaster/walwriter.c b/src/backend/postmaster/walwriter.c index 6e7918a78d4..3cb439d377a 100644 --- a/src/backend/postmaster/walwriter.c +++ b/src/backend/postmaster/walwriter.c @@ -164,6 +164,7 @@ WalWriterMain(char *startup_data, size_t startup_data_len) * about in walwriter, but we do have LWLocks, and perhaps buffers? */ LWLockReleaseAll(); + CustomErrorCleanup(); ConditionVariableCancelSleep(); pgstat_report_wait_end(); UnlockBuffers(); diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index c3181e3295e..71be0e15f61 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -327,6 +327,7 @@ void WalSndErrorCleanup(void) { LWLockReleaseAll(); + CustomErrorCleanup(); ConditionVariableCancelSleep(); pgstat_report_wait_end(); diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index ce29da90121..d083a102178 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -861,6 +861,7 @@ ProcKill(int code, Datum arg) * facility by releasing our PGPROC ... */ LWLockReleaseAll(); + CustomErrorCleanup(); /* Cancel any pending condition variable sleep, too */ ConditionVariableCancelSleep(); @@ -982,6 +983,7 @@ AuxiliaryProcKill(int code, Datum arg) /* Release any LW locks I am holding (see notes above) */ LWLockReleaseAll(); + CustomErrorCleanup(); /* Cancel any pending condition variable sleep, too */ ConditionVariableCancelSleep(); diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c index b924b524d0b..9524530282e 100644 --- a/src/backend/utils/error/elog.c +++ b/src/backend/utils/error/elog.c @@ -3770,3 +3770,12 @@ write_stderr(const char *fmt,...) 
#endif va_end(ap); } + +CustomErrorCleanupHookType CustomErrorCleanupHook = NULL; + +void +CustomErrorCleanup(void) +{ + if (CustomErrorCleanupHook) + CustomErrorCleanupHook(); +} diff --git a/src/include/utils/elog.h b/src/include/utils/elog.h index e54eca5b489..f583eca37ee 100644 --- a/src/include/utils/elog.h +++ b/src/include/utils/elog.h @@ -537,4 +537,10 @@ extern void write_jsonlog(ErrorData *edata); */ extern void write_stderr(const char *fmt,...) pg_attribute_printf(1, 2); +typedef void (*CustomErrorCleanupHookType) (void); + +extern CustomErrorCleanupHookType CustomErrorCleanupHook; + +extern void CustomErrorCleanup(void); + #endif /* ELOG_H */ From ae09f7059ca6c1e602e98902df7cf9c7e6f72b29 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 01:51:03 +0300 Subject: [PATCH 09/56] Snapshot extension and hooks Snapshot have two pairing heap nodes: for data and system undos. --- src/backend/access/transam/xact.c | 11 ++++++++ src/backend/access/transam/xlog.c | 3 ++ src/backend/storage/ipc/procarray.c | 8 ++++++ src/backend/utils/time/snapmgr.c | 44 +++++++++++++++++++++++++++++ src/include/access/transam.h | 11 ++++++-- src/include/access/xlog.h | 1 + src/include/storage/proc.h | 1 + src/include/storage/procarray.h | 2 ++ src/include/utils/snapmgr.h | 11 +++++++- src/include/utils/snapshot.h | 13 +++++++++ 10 files changed, 102 insertions(+), 3 deletions(-) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index da7b20b3f05..c740d46023a 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -212,6 +212,7 @@ typedef struct TransactionStateData bool parallelChildXact; /* is any parent transaction parallel? */ bool chain; /* start a new block after this one */ bool topXidLogged; /* for a subxact: is top-level XID logged? 
*/ + CommitSeqNo csn; struct TransactionStateData *parent; /* back link to parent */ } TransactionStateData; @@ -245,6 +246,7 @@ static TransactionStateData TopTransactionStateData = { .state = TRANS_DEFAULT, .blockState = TBLOCK_DEFAULT, .topXidLogged = false, + .csn = COMMITSEQNO_INPROGRESS }; /* @@ -2035,6 +2037,7 @@ StartTransaction(void) */ s->state = TRANS_START; s->fullTransactionId = InvalidFullTransactionId; /* until assigned */ + s->csn = COMMITSEQNO_INPROGRESS; /* Determine if statements are logged in this transaction */ xact_is_sampled = log_xact_sample_rate != 0 && @@ -2336,7 +2339,9 @@ CommitTransaction(void) * must be done _before_ releasing locks we hold and _after_ * RecordTransactionCommit. */ + MyProc->lastCommittedCSN = s->csn; ProcArrayEndTransaction(MyProc, latestXid); + s->csn = MyProc->lastCommittedCSN; /* * This is all post-commit cleanup. Note that if an error is raised here, @@ -6384,3 +6389,9 @@ xact_redo(XLogReaderState *record) else elog(PANIC, "xact_redo: unknown op code %u", info); } + +CommitSeqNo +GetCurrentCSN(void) +{ + return TopTransactionStateData.csn; +} diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 7f136026277..ac21ddc22fe 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -135,6 +135,7 @@ int wal_retrieve_retry_interval = 5000; int max_slot_wal_keep_size_mb = -1; int wal_decode_buffer_size = 512 * 1024; bool track_wal_io_timing = false; +CommitSeqNo startupCommitSeqNo = COMMITSEQNO_FIRST_NORMAL + 1; #ifdef WAL_DEBUG bool XLOG_DEBUG = false; @@ -5068,6 +5069,7 @@ BootStrapXLOG(void) TransamVariables->nextXid = checkPoint.nextXid; TransamVariables->nextOid = checkPoint.nextOid; TransamVariables->oidCount = 0; + pg_atomic_write_u64(&TransamVariables->nextCommitSeqNo, COMMITSEQNO_FIRST_NORMAL + 1); MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); AdvanceOldestClogXid(checkPoint.oldestXid); 
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); @@ -5544,6 +5546,7 @@ StartupXLOG(void) TransamVariables->nextXid = checkPoint.nextXid; TransamVariables->nextOid = checkPoint.nextOid; TransamVariables->oidCount = 0; + pg_atomic_write_u64(&TransamVariables->nextCommitSeqNo, startupCommitSeqNo); MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); AdvanceOldestClogXid(checkPoint.oldestXid); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 9fc930e98f8..8da12c98346 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -306,6 +306,8 @@ static GlobalVisState GlobalVisTempRels; */ static TransactionId ComputeXidHorizonsResultLastXmin; +snapshot_hook_type snapshot_hook = NULL; + #ifdef XIDCACHE_DEBUG /* counters for XidCache measurement */ @@ -749,6 +751,7 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid) proc->delayChkptFlags = 0; proc->recoveryConflictPending = false; + proc->lastCommittedCSN = pg_atomic_fetch_add_u64(&TransamVariables->nextCommitSeqNo, 1); /* must be cleared with xid/xmin: */ /* avoid unnecessarily dirtying shared cachelines */ @@ -2234,6 +2237,8 @@ GetSnapshotData(Snapshot snapshot) if (GetSnapshotDataReuse(snapshot)) { + if (snapshot_hook) + snapshot_hook(snapshot); LWLockRelease(ProcArrayLock); return snapshot; } @@ -2415,6 +2420,9 @@ GetSnapshotData(Snapshot snapshot) if (!TransactionIdIsValid(MyProc->xmin)) MyProc->xmin = TransactionXmin = xmin; + if (snapshot_hook) + snapshot_hook(snapshot); + LWLockRelease(ProcArrayLock); /* maintain state for GlobalVis* */ diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 7d2b34d4f20..0ad250a959b 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -101,6 +101,10 @@ TransactionId RecentXmin = FirstNormalTransactionId; /* 
(table, ctid) => (cmin, cmax) mapping during timetravel */ static HTAB *tuplecid_data = NULL; +snapshot_hook_type snapshot_register_hook = NULL; +snapshot_hook_type snapshot_deregister_hook = NULL; +reset_xmin_hook_type reset_xmin_hook = NULL; + /* * Elements of the active snapshot stack. * @@ -201,6 +205,11 @@ typedef struct SerializedSnapshotData CommandId curcid; TimestampTz whenTaken; XLogRecPtr lsn; + CommitSeqNo snapshotcsn; + uint64 undoRegularLocation; + uint64 undoRegularXmin; + uint64 undoSystemLocation; + uint64 undoSystemXmin; } SerializedSnapshotData; /* @@ -263,6 +272,8 @@ GetTransactionSnapshot(void) /* Mark it as "registered" in FirstXactSnapshot */ FirstXactSnapshot->regd_count++; pairingheap_add(&RegisteredSnapshots, &FirstXactSnapshot->ph_node); + if (snapshot_register_hook) + snapshot_register_hook(FirstXactSnapshot); } else CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData); @@ -403,6 +414,8 @@ GetNonHistoricCatalogSnapshot(Oid relid) * CatalogSnapshot pointer is already valid. 
*/ pairingheap_add(&RegisteredSnapshots, &CatalogSnapshot->ph_node); + if (snapshot_register_hook) + snapshot_register_hook(CatalogSnapshot); } return CatalogSnapshot; @@ -424,6 +437,8 @@ InvalidateCatalogSnapshot(void) if (CatalogSnapshot) { pairingheap_remove(&RegisteredSnapshots, &CatalogSnapshot->ph_node); + if (snapshot_deregister_hook) + snapshot_deregister_hook(CatalogSnapshot); CatalogSnapshot = NULL; SnapshotResetXmin(); } @@ -558,6 +573,8 @@ SetTransactionSnapshot(Snapshot sourcesnap, VirtualTransactionId *sourcevxid, /* Mark it as "registered" in FirstXactSnapshot */ FirstXactSnapshot->regd_count++; pairingheap_add(&RegisteredSnapshots, &FirstXactSnapshot->ph_node); + if (snapshot_register_hook) + snapshot_register_hook(FirstXactSnapshot); } FirstSnapshotSet = true; @@ -820,7 +837,11 @@ RegisterSnapshotOnOwner(Snapshot snapshot, ResourceOwner owner) ResourceOwnerRememberSnapshot(owner, snap); if (snap->regd_count == 1) + { pairingheap_add(&RegisteredSnapshots, &snap->ph_node); + if (snapshot_register_hook) + snapshot_register_hook(snap); + } return snap; } @@ -863,7 +884,11 @@ UnregisterSnapshotNoOwner(Snapshot snapshot) snapshot->regd_count--; if (snapshot->regd_count == 0) + { pairingheap_remove(&RegisteredSnapshots, &snapshot->ph_node); + if (snapshot_deregister_hook) + snapshot_deregister_hook(snapshot); + } if (snapshot->regd_count == 0 && snapshot->active_count == 0) { @@ -915,6 +940,9 @@ SnapshotResetXmin(void) { Snapshot minSnapshot; + if (reset_xmin_hook) + reset_xmin_hook(); + if (ActiveSnapshot != NULL) return; @@ -1008,6 +1036,8 @@ AtEOXact_Snapshot(bool isCommit, bool resetXmin) Assert(FirstXactSnapshot->regd_count > 0); Assert(!pairingheap_is_empty(&RegisteredSnapshots)); pairingheap_remove(&RegisteredSnapshots, &FirstXactSnapshot->ph_node); + if (snapshot_deregister_hook) + snapshot_deregister_hook(FirstXactSnapshot); } FirstXactSnapshot = NULL; @@ -1039,6 +1069,8 @@ AtEOXact_Snapshot(bool isCommit, bool resetXmin) 
pairingheap_remove(&RegisteredSnapshots, &esnap->snapshot->ph_node); + if (snapshot_deregister_hook) + snapshot_deregister_hook(esnap->snapshot); } exportedSnapshots = NIL; @@ -1167,6 +1199,8 @@ ExportSnapshot(Snapshot snapshot) snapshot->regd_count++; pairingheap_add(&RegisteredSnapshots, &snapshot->ph_node); + if (snapshot_register_hook) + snapshot_register_hook(snapshot); /* * Fill buf with a text serialization of the snapshot, plus identification @@ -1729,6 +1763,11 @@ SerializeSnapshot(Snapshot snapshot, char *start_address) serialized_snapshot.curcid = snapshot->curcid; serialized_snapshot.whenTaken = snapshot->whenTaken; serialized_snapshot.lsn = snapshot->lsn; + serialized_snapshot.snapshotcsn = snapshot->snapshotcsn; + serialized_snapshot.undoRegularXmin = snapshot->undoRegularLocationPhNode.xmin; + serialized_snapshot.undoRegularLocation = snapshot->undoRegularLocationPhNode.undoLocation; + serialized_snapshot.undoSystemXmin = snapshot->undoSystemLocationPhNode.xmin; + serialized_snapshot.undoSystemLocation = snapshot->undoSystemLocationPhNode.undoLocation; /* * Ignore the SubXID array if it has overflowed, unless the snapshot was @@ -1804,6 +1843,11 @@ RestoreSnapshot(char *start_address) snapshot->whenTaken = serialized_snapshot.whenTaken; snapshot->lsn = serialized_snapshot.lsn; snapshot->snapXactCompletionCount = 0; + snapshot->snapshotcsn = serialized_snapshot.snapshotcsn; + snapshot->undoRegularLocationPhNode.xmin = serialized_snapshot.undoRegularXmin; + snapshot->undoRegularLocationPhNode.undoLocation = serialized_snapshot.undoRegularLocation; + snapshot->undoSystemLocationPhNode.xmin = serialized_snapshot.undoSystemXmin; + snapshot->undoSystemLocationPhNode.undoLocation = serialized_snapshot.undoSystemLocation; /* Copy XIDs, if present. 
*/ if (serialized_snapshot.xcnt > 0) diff --git a/src/include/access/transam.h b/src/include/access/transam.h index 2ce2fe4dc3f..bd6430c2865 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -15,7 +15,9 @@ #define TRANSAM_H #include "access/xlogdefs.h" - +#ifndef FRONTEND +#include "port/atomics.h" +#endif /* ---------------- * Special transaction ID values @@ -268,9 +270,13 @@ typedef struct TransamVariablesData */ TransactionId oldestClogXid; /* oldest it's safe to look up in clog */ +#ifndef FRONTEND + pg_atomic_uint64 nextCommitSeqNo; +#else + CommitSeqNo nextCommitSeqNo; +#endif } TransamVariablesData; - /* ---------------- * extern declarations * ---------------- @@ -310,6 +316,7 @@ extern void AdvanceOldestClogXid(TransactionId oldest_datfrozenxid); extern bool ForceTransactionIdLimitUpdate(void); extern Oid GetNewObjectId(void); extern void StopGeneratingPinnedObjectIds(void); +extern CommitSeqNo GetCurrentCSN(void); #ifdef USE_ASSERT_CHECKING extern void AssertTransactionIdInAllowableRange(TransactionId xid); diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 2c507ea618c..a88968ca648 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -56,6 +56,7 @@ extern PGDLLIMPORT bool track_wal_io_timing; extern PGDLLIMPORT int wal_decode_buffer_size; extern PGDLLIMPORT int CheckPointSegments; +extern PGDLLIMPORT CommitSeqNo startupCommitSeqNo; /* Archive modes */ typedef enum ArchiveMode diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 9488bf1857c..2ba462202dc 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -298,6 +298,7 @@ struct PGPROC bool fpVXIDLock; /* are we holding a fast-path VXID lock? */ LocalTransactionId fpLocalTransactionId; /* lxid for fast-path VXID * lock */ + CommitSeqNo lastCommittedCSN; /* * Support for lock groups. 
Use LockHashPartitionLockByProc on the group diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index 8ca60504622..5d065eebd42 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -100,4 +100,6 @@ extern void ProcArraySetReplicationSlotXmin(TransactionId xmin, extern void ProcArrayGetReplicationSlotXmin(TransactionId *xmin, TransactionId *catalog_xmin); +extern snapshot_hook_type snapshot_hook; + #endif /* PROCARRAY_H */ diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h index 9398a84051c..3f6952d9895 100644 --- a/src/include/utils/snapmgr.h +++ b/src/include/utils/snapmgr.h @@ -18,6 +18,9 @@ #include "utils/resowner.h" #include "utils/snapshot.h" +#ifndef SNAPSHOT_H +typedef void (*snapshot_hook_type) (Snapshot snapshot); +#endif extern PGDLLIMPORT bool FirstSnapshotSet; @@ -78,7 +81,7 @@ extern void PushActiveSnapshotWithLevel(Snapshot snapshot, int snap_level); extern void PushCopiedSnapshot(Snapshot snapshot); extern void UpdateActiveSnapshotCommandId(void); extern void PopActiveSnapshot(void); -extern Snapshot GetActiveSnapshot(void); +extern PGDLLIMPORT Snapshot GetActiveSnapshot(void); extern bool ActiveSnapshotSet(void); extern Snapshot RegisterSnapshot(Snapshot snapshot); @@ -127,4 +130,10 @@ extern void SerializeSnapshot(Snapshot snapshot, char *start_address); extern Snapshot RestoreSnapshot(char *start_address); extern void RestoreTransactionSnapshot(Snapshot snapshot, void *source_pgproc); +typedef void (*reset_xmin_hook_type) (void); + +extern snapshot_hook_type snapshot_register_hook; +extern snapshot_hook_type snapshot_deregister_hook; +extern reset_xmin_hook_type reset_xmin_hook; + #endif /* SNAPMGR_H */ diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index 8d1e31e888e..49c913b12f8 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -122,6 +122,13 @@ typedef struct SnapshotData *Snapshot; #define InvalidSnapshot 
((Snapshot) NULL) +typedef struct +{ + uint64 undoLocation; /* undo log location retained by this snapshot */ + uint64 xmin; + pairingheap_node ph_node; +} RetainUndoLocationPHNode; + /* * Struct representing all kind of possible snapshots. * @@ -214,6 +221,12 @@ typedef struct SnapshotData * transactions completed since the last GetSnapshotData(). */ uint64 snapXactCompletionCount; + + RetainUndoLocationPHNode undoRegularLocationPhNode; + RetainUndoLocationPHNode undoSystemLocationPhNode; + CommitSeqNo snapshotcsn; } SnapshotData; +typedef void (*snapshot_hook_type) (Snapshot snapshot); + #endif /* SNAPSHOT_H */ From 41c631243653521879e2b7037b3575e5d9ab09a9 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 01:57:10 +0300 Subject: [PATCH 10/56] Hooks for builtin functions and datatypes and orioledb recovery * Added SearchCatCacheInternal_hook, SearchCatCacheList_hook * Added SysCacheGetAttr_hook --- src/backend/commands/indexcmds.c | 4 ++++ src/backend/executor/execExpr.c | 2 ++ src/backend/utils/cache/catcache.c | 25 +++++++++++++++++++++++++ src/backend/utils/cache/syscache.c | 10 ++++++++-- src/backend/utils/cache/typcache.c | 14 ++++++++++++++ src/backend/utils/fmgr/fmgr.c | 4 ++-- src/include/commands/defrem.h | 3 +++ src/include/utils/catcache.h | 24 ++++++++++++++++++++++++ src/include/utils/fmgrtab.h | 3 +++ src/include/utils/typcache.h | 5 +++++ 10 files changed, 90 insertions(+), 4 deletions(-) diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index b987e023849..ec6e0df200d 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -71,6 +71,7 @@ #include "utils/snapmgr.h" #include "utils/syscache.h" +GetDefaultOpClass_hook_type GetDefaultOpClass_hook = NULL; /* non-export function prototypes */ static bool CompareOpclassOptions(const Datum *opts1, const Datum *opts2, int natts); @@ -2317,6 +2318,9 @@ GetDefaultOpClass(Oid type_id, Oid am_id) /* If it's a domain, look at 
the base type instead */ type_id = getBaseType(type_id); + if (GetDefaultOpClass_hook) + return GetDefaultOpClass_hook(type_id, am_id); + tcategory = TypeCategory(type_id); /* diff --git a/src/backend/executor/execExpr.c b/src/backend/executor/execExpr.c index a5395536a13..6913e3b7a6d 100644 --- a/src/backend/executor/execExpr.c +++ b/src/backend/executor/execExpr.c @@ -48,6 +48,8 @@ #include "utils/array.h" #include "utils/builtins.h" #include "utils/jsonfuncs.h" +#include "utils/json.h" +#include "utils/jsonb.h" #include "utils/jsonpath.h" #include "utils/lsyscache.h" #include "utils/typcache.h" diff --git a/src/backend/utils/cache/catcache.c b/src/backend/utils/cache/catcache.c index 111d8a280a0..6f15161e426 100644 --- a/src/backend/utils/cache/catcache.c +++ b/src/backend/utils/cache/catcache.c @@ -64,6 +64,10 @@ /* Cache management header --- pointer is NULL until created */ static CatCacheHeader *CacheHdr = NULL; +SearchCatCacheInternal_hook_type SearchCatCacheInternal_hook = NULL; +SearchCatCacheList_hook_type SearchCatCacheList_hook = NULL; +GetCatCacheHashValue_hook_type GetCatCacheHashValue_hook = NULL; + static inline HeapTuple SearchCatCacheInternal(CatCache *cache, int nkeys, Datum v1, Datum v2, @@ -1324,6 +1328,14 @@ SearchCatCacheInternal(CatCache *cache, dlist_head *bucket; CatCTup *ct; + if (SearchCatCacheInternal_hook) + { + ct = SearchCatCacheInternal_hook(cache, nkeys, v1, v2, v3, v4); + + if (ct) + return &ct->tuple; + } + /* Make sure we're in an xact, even if this ends up being a cache hit */ Assert(IsTransactionState()); @@ -1616,6 +1628,11 @@ GetCatCacheHashValue(CatCache *cache, Datum v3, Datum v4) { + if (GetCatCacheHashValue_hook) + { + return GetCatCacheHashValue_hook(cache, cache->cc_nkeys, + v1, v2, v3, v4); + } /* * one-time startup overhead for each cache */ @@ -1666,6 +1683,14 @@ SearchCatCacheList(CatCache *cache, MemoryContext oldcxt; int i; + if (SearchCatCacheList_hook) + { + cl = SearchCatCacheList_hook(cache, nkeys, v1, v2, 
v3); + + if (cl) + return cl; + } + /* * one-time startup overhead for each cache */ diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c index 3e03dfc9910..802ec4b218f 100644 --- a/src/backend/utils/cache/syscache.c +++ b/src/backend/utils/cache/syscache.c @@ -94,6 +94,7 @@ static int SysCacheSupportingRelOidSize; static int oid_compare(const void *a, const void *b); +SysCacheGetAttr_hook_type SysCacheGetAttr_hook = NULL; /* * InitCatalogCache - initialize the caches @@ -480,6 +481,7 @@ SysCacheGetAttr(int cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull) { + TupleDesc cc_tupdesc = SysCache[cacheId]->cc_tupdesc; /* * We just need to get the TupleDesc out of the cache entry, and then we * can apply heap_getattr(). Normally the cache control data is already @@ -489,14 +491,18 @@ SysCacheGetAttr(int cacheId, HeapTuple tup, if (cacheId < 0 || cacheId >= SysCacheSize || !PointerIsValid(SysCache[cacheId])) elog(ERROR, "invalid cache ID: %d", cacheId); - if (!PointerIsValid(SysCache[cacheId]->cc_tupdesc)) + + if (!PointerIsValid(cc_tupdesc) && SysCacheGetAttr_hook) + cc_tupdesc = SysCacheGetAttr_hook(SysCache[cacheId]); + if (!PointerIsValid(cc_tupdesc)) { InitCatCachePhase2(SysCache[cacheId], false); Assert(PointerIsValid(SysCache[cacheId]->cc_tupdesc)); + cc_tupdesc = SysCache[cacheId]->cc_tupdesc; } return heap_getattr(tup, attributeNumber, - SysCache[cacheId]->cc_tupdesc, + cc_tupdesc, isNull); } diff --git a/src/backend/utils/cache/typcache.c b/src/backend/utils/cache/typcache.c index aa4720cb598..b18e50df27d 100644 --- a/src/backend/utils/cache/typcache.c +++ b/src/backend/utils/cache/typcache.c @@ -292,6 +292,8 @@ static int32 NextRecordTypmod = 0; /* number of entries used */ * as identifiers, so we start the counter at INVALID_TUPLEDESC_IDENTIFIER. 
*/ static uint64 tupledesc_id_counter = INVALID_TUPLEDESC_IDENTIFIER; +load_typcache_tupdesc_hook_type load_typcache_tupdesc_hook = NULL; +load_enum_cache_data_hook_type load_enum_cache_data_hook = NULL; static void load_typcache_tupdesc(TypeCacheEntry *typentry); static void load_rangetype_info(TypeCacheEntry *typentry); @@ -881,6 +883,12 @@ load_typcache_tupdesc(TypeCacheEntry *typentry) { Relation rel; + if (load_typcache_tupdesc_hook) + { + load_typcache_tupdesc_hook(typentry); + return; + } + if (!OidIsValid(typentry->typrelid)) /* should not happen */ elog(ERROR, "invalid typrelid for composite type %u", typentry->type_id); @@ -2563,6 +2571,12 @@ load_enum_cache_data(TypeCacheEntry *tcache) int bm_size, start_pos; + if (load_enum_cache_data_hook) + { + load_enum_cache_data_hook(tcache); + return; + } + /* Check that this is actually an enum */ if (tcache->typtype != TYPTYPE_ENUM) ereport(ERROR, diff --git a/src/backend/utils/fmgr/fmgr.c b/src/backend/utils/fmgr/fmgr.c index e48a86be54b..5b7888c705f 100644 --- a/src/backend/utils/fmgr/fmgr.c +++ b/src/backend/utils/fmgr/fmgr.c @@ -72,7 +72,7 @@ extern Datum fmgr_security_definer(PG_FUNCTION_ARGS); * or name, but search by Oid is much faster. */ -static const FmgrBuiltin * +const FmgrBuiltin * fmgr_isbuiltin(Oid id) { uint16 index; @@ -97,7 +97,7 @@ fmgr_isbuiltin(Oid id) * the array with the same name, but they should all point to the same * routine. 
*/ -static const FmgrBuiltin * +const FmgrBuiltin * fmgr_lookupByName(const char *name) { int i; diff --git a/src/include/commands/defrem.h b/src/include/commands/defrem.h index 29c511e3196..a1ade77b732 100644 --- a/src/include/commands/defrem.h +++ b/src/include/commands/defrem.h @@ -158,4 +158,7 @@ extern int defGetTypeLength(DefElem *def); extern List *defGetStringList(DefElem *def); extern void errorConflictingDefElem(DefElem *defel, ParseState *pstate) pg_attribute_noreturn(); +typedef Oid (*GetDefaultOpClass_hook_type)(Oid type_id, Oid am_id); +extern PGDLLIMPORT GetDefaultOpClass_hook_type GetDefaultOpClass_hook; + #endif /* DEFREM_H */ diff --git a/src/include/utils/catcache.h b/src/include/utils/catcache.h index 3fb9647b87c..8b692cafea1 100644 --- a/src/include/utils/catcache.h +++ b/src/include/utils/catcache.h @@ -227,4 +227,28 @@ extern void PrepareToInvalidateCacheTuple(Relation relation, HeapTuple newtuple, void (*function) (int, uint32, Oid)); +typedef CatCTup *(*SearchCatCacheInternal_hook_type)(CatCache *cache, + int nkeys, + Datum v1, Datum v2, + Datum v3, Datum v4); +extern SearchCatCacheInternal_hook_type SearchCatCacheInternal_hook; + +typedef CatCList *(*SearchCatCacheList_hook_type)(CatCache *cache, + int nkeys, + Datum v1, + Datum v2, + Datum v3); +extern SearchCatCacheList_hook_type SearchCatCacheList_hook; + +typedef TupleDesc (*SysCacheGetAttr_hook_type)(CatCache *SysCache); +extern SysCacheGetAttr_hook_type SysCacheGetAttr_hook; + +typedef uint32 (*GetCatCacheHashValue_hook_type)(CatCache *cache, + int nkeys, + Datum v1, + Datum v2, + Datum v3, + Datum v4); +extern GetCatCacheHashValue_hook_type GetCatCacheHashValue_hook; + #endif /* CATCACHE_H */ diff --git a/src/include/utils/fmgrtab.h b/src/include/utils/fmgrtab.h index 151dd74055d..f8666ba7087 100644 --- a/src/include/utils/fmgrtab.h +++ b/src/include/utils/fmgrtab.h @@ -46,4 +46,7 @@ extern PGDLLIMPORT const Oid fmgr_last_builtin_oid; /* highest function OID in #define 
InvalidOidBuiltinMapping PG_UINT16_MAX extern PGDLLIMPORT const uint16 fmgr_builtin_oid_index[]; +extern const FmgrBuiltin *fmgr_isbuiltin(Oid id); +extern const FmgrBuiltin *fmgr_lookupByName(const char *name); + #endif /* FMGRTAB_H */ diff --git a/src/include/utils/typcache.h b/src/include/utils/typcache.h index f506cc4aa35..7c84978b7fa 100644 --- a/src/include/utils/typcache.h +++ b/src/include/utils/typcache.h @@ -207,4 +207,9 @@ extern void SharedRecordTypmodRegistryInit(SharedRecordTypmodRegistry *, extern void SharedRecordTypmodRegistryAttach(SharedRecordTypmodRegistry *); +typedef void (*load_typcache_tupdesc_hook_type)(TypeCacheEntry *typentry); +extern PGDLLIMPORT load_typcache_tupdesc_hook_type load_typcache_tupdesc_hook; +typedef void (*load_enum_cache_data_hook_type)(TypeCacheEntry *tcache); +extern PGDLLIMPORT load_enum_cache_data_hook_type load_enum_cache_data_hook; + #endif /* TYPCACHE_H */ From c13889ae21fdf0dd91a6cc1a6b870943611a5304 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 02:01:16 +0300 Subject: [PATCH 11/56] Recovery and checkpointer hooks --- src/backend/access/transam/transam.c | 1 + src/backend/access/transam/xact.c | 4 ++++ src/backend/access/transam/xlog.c | 18 ++++++++++++++++++ src/backend/access/transam/xlogrecovery.c | 2 ++ src/backend/storage/buffer/bufmgr.c | 6 +++++- src/include/access/xact.h | 3 +++ src/include/access/xlog.h | 10 ++++++++++ 7 files changed, 43 insertions(+), 1 deletion(-) diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c index 75b5325df8b..95647a357ea 100644 --- a/src/backend/access/transam/transam.c +++ b/src/backend/access/transam/transam.c @@ -22,6 +22,7 @@ #include "access/clog.h" #include "access/subtrans.h" #include "access/transam.h" +#include "storage/proc.h" #include "utils/snapmgr.h" /* diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index c740d46023a..198f1b403c5 100644 --- 
a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -325,6 +325,7 @@ typedef struct SubXactCallbackItem static SubXactCallbackItem *SubXact_callbacks = NULL; +xact_redo_hook_type xact_redo_hook = NULL; /* local function prototypes */ static void AssignTransactionId(TransactionState s); @@ -6080,6 +6081,9 @@ xact_redo_commit(xl_xact_parsed_commit *parsed, TransactionId max_xid; TimestampTz commit_time; + if (xact_redo_hook) + xact_redo_hook(xid, lsn); + Assert(TransactionIdIsValid(xid)); max_xid = TransactionIdLatest(xid, parsed->nsubxacts, parsed->subxacts); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index ac21ddc22fe..d0759f4dad1 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -143,6 +143,11 @@ bool XLOG_DEBUG = false; int wal_segment_size = DEFAULT_XLOG_SEG_SIZE; +/* Hook for plugins to get control in CheckPointGuts() */ +CheckPoint_hook_type CheckPoint_hook = NULL; +double CheckPointProgress; +after_checkpoint_cleanup_hook_type after_checkpoint_cleanup_hook = NULL; + /* * Number of WAL insertion locks to use. A higher value allows more insertions * to happen concurrently, but adds some CPU overhead to flushing the WAL, @@ -5417,6 +5422,7 @@ StartupXLOG(void) XLogRecPtr missingContrecPtr; TransactionId oldestActiveXID; bool promoted = false; + bool wasInRecovery; /* * We should have an aux process resource owner to use, and we should not @@ -6045,6 +6051,8 @@ StartupXLOG(void) */ PreallocXlogFiles(EndOfLog, newTLI); + wasInRecovery = InRecovery; + /* * Okay, we're officially UP. */ @@ -6123,6 +6131,9 @@ StartupXLOG(void) */ CompleteCommitTsInitialization(); + if (wasInRecovery && after_checkpoint_cleanup_hook) + after_checkpoint_cleanup_hook(EndOfLog, 0); + /* * All done with end-of-recovery actions. 
* @@ -7318,6 +7329,9 @@ CreateCheckPoint(int flags) if (!RecoveryInProgress()) TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning()); + if (after_checkpoint_cleanup_hook) + after_checkpoint_cleanup_hook(ProcLastRecPtr, flags); + /* Real work is done; log and update stats. */ LogCheckpointEnd(false); @@ -7477,6 +7491,8 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags) { + if (CheckPoint_hook) + CheckPoint_hook(checkPointRedo, flags); CheckPointRelationMap(); CheckPointReplicationSlots(flags & CHECKPOINT_IS_SHUTDOWN); CheckPointSnapBuild(); @@ -9478,3 +9494,5 @@ SetWalWriterSleeping(bool sleeping) XLogCtl->WalWriterSleeping = sleeping; SpinLockRelease(&XLogCtl->info_lck); } + +void (*RedoShutdownHook) (void) = NULL; diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index b45b8331720..a3e7fa810f8 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -1856,6 +1856,8 @@ PerformWalRecovery(void) * exit with special return code to request shutdown of * postmaster. Log messages issued from postmaster. */ + if (RedoShutdownHook != NULL) + RedoShutdownHook(); proc_exit(3); case RECOVERY_TARGET_ACTION_PAUSE: diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 61816730955..e179056de9f 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -3086,6 +3086,7 @@ BufferSync(int flags) BufferDesc *bufHdr = NULL; CkptTsStatus *ts_stat = (CkptTsStatus *) DatumGetPointer(binaryheap_first(ts_heap)); + double progress; buf_id = CkptBufferIds[ts_stat->index].buf_id; Assert(buf_id != -1); @@ -3140,7 +3141,10 @@ BufferSync(int flags) * * (This will check for barrier events even if it doesn't sleep.) 
*/ - CheckpointWriteDelay(flags, (double) num_processed / num_to_scan); + progress = (double) num_processed / num_to_scan; + progress = CheckPointProgress + progress * (1 - CheckPointProgress); + + CheckpointWriteDelay(flags, progress); } /* diff --git a/src/include/access/xact.h b/src/include/access/xact.h index 6d4439f0524..327328da54c 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -527,4 +527,7 @@ extern void EnterParallelMode(void); extern void ExitParallelMode(void); extern bool IsInParallelMode(void); +typedef void (*xact_redo_hook_type) (TransactionId xid, XLogRecPtr lsn); +extern xact_redo_hook_type xact_redo_hook; + #endif /* XACT_H */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index a88968ca648..bd9eff2709a 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -308,4 +308,14 @@ extern SessionBackupState get_backup_status(void); /* files to signal promotion to primary */ #define PROMOTE_SIGNAL_FILE "promote" +typedef void (*CheckPoint_hook_type) (XLogRecPtr checkPointRedo, int flags); +extern PGDLLIMPORT CheckPoint_hook_type CheckPoint_hook; +extern double CheckPointProgress; +typedef void (*after_checkpoint_cleanup_hook_type)(XLogRecPtr checkPointRedo, + int flags); +extern PGDLLIMPORT after_checkpoint_cleanup_hook_type + after_checkpoint_cleanup_hook; + +extern void (*RedoShutdownHook) (void); + #endif /* XLOG_H */ From d27315d52fe7f7d579b1378ce25448e79c0214c8 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 02:07:13 +0300 Subject: [PATCH 12/56] Allow skipping logging for AccessExclusiveLock --- src/backend/storage/lmgr/lock.c | 14 ++++++++++++-- src/include/storage/lock.h | 1 + 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 0400a507779..a4cd2aaf626 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -784,7 +784,7 @@ 
LockAcquireExtended(const LOCKTAG *locktag, bool reportMemoryError, LOCALLOCK **locallockp) { - LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid; + LOCKMETHODID lockmethodid; LockMethod lockMethodTable; LOCALLOCKTAG localtag; LOCALLOCK *locallock; @@ -796,6 +796,15 @@ LockAcquireExtended(const LOCKTAG *locktag, LWLock *partitionLock; bool found_conflict; bool log_lock = false; + bool no_log_lock = false; + + if (locktag->locktag_lockmethodid == NO_LOG_LOCKMETHOD) + { + ((LOCKTAG *)locktag)->locktag_lockmethodid = DEFAULT_LOCKMETHOD; + no_log_lock = true; + } + + lockmethodid = locktag->locktag_lockmethodid; if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) elog(ERROR, "unrecognized lock method: %d", lockmethodid); @@ -910,7 +919,8 @@ LockAcquireExtended(const LOCKTAG *locktag, if (lockmode >= AccessExclusiveLock && locktag->locktag_type == LOCKTAG_RELATION && !RecoveryInProgress() && - XLogStandbyInfoActive()) + XLogStandbyInfoActive() && + !no_log_lock) { LogAccessExclusiveLockPrepare(); log_lock = true; diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h index cc1f6e78c39..29b7226fe50 100644 --- a/src/include/storage/lock.h +++ b/src/include/storage/lock.h @@ -124,6 +124,7 @@ typedef uint16 LOCKMETHODID; /* These identify the known lock methods */ #define DEFAULT_LOCKMETHOD 1 #define USER_LOCKMETHOD 2 +#define NO_LOG_LOCKMETHOD 255 /* Skip logging of AccessExclusiveLock */ /* * LOCKTAG is the key information needed to look up a LOCK item in the From 0e79d27645c6a6bce493ed26bb827f7f743fb1cc Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 02:08:21 +0300 Subject: [PATCH 13/56] Add convenience functions IsFatalError() have_backup_in_progress() SnapBuildNextPhaseAt() DoLocalLockExist() --- src/backend/access/transam/xlog.c | 13 +++++++++++++ src/backend/postmaster/postmaster.c | 6 ++++++ src/backend/replication/logical/snapbuild.c | 11 +++++++++++ src/backend/storage/lmgr/lock.c | 21 
+++++++++++++++++++++ src/include/access/xlog.h | 1 + src/include/postmaster/postmaster.h | 1 + src/include/replication/snapbuild.h | 1 + src/include/storage/lock.h | 1 + 8 files changed, 55 insertions(+) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index d0759f4dad1..42312c75830 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -9087,6 +9087,19 @@ get_backup_status(void) return sessionBackupState; } +/* + * Check if there is a backup in progress. + * + * We do this check without lock assuming 32-bit reads are atomic. In fact, + * the false result means that there was at least a moment of time when there + * were no backups. + */ +bool +have_backup_in_progress(void) +{ + return (XLogCtl->Insert.runningBackups > 0); +} + /* * do_pg_backup_stop * diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index bf0241aed0c..0d703ad35f7 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -482,6 +482,12 @@ int postmaster_alive_fds[2] = {-1, -1}; HANDLE PostmasterHandle; #endif +bool +IsFatalError(void) +{ + return FatalError; +} + /* * Postmaster main entry point */ diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index ae676145e60..f4a81de2f4e 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -418,6 +418,17 @@ SnapBuildCurrentState(SnapBuild *builder) return builder->state; } +/* + * An which transaction id the next phase of initial snapshot building will + * happen? + */ +TransactionId +SnapBuildNextPhaseAt(SnapBuild *builder) +{ + return builder->next_phase_at; +} + + /* * Return the LSN at which the two-phase decoding was first enabled. 
*/ diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index a4cd2aaf626..8c5b1c70b54 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -635,6 +635,27 @@ GetLockMethodLocalHash(void) } #endif +/* + * Returns true if any LOCKMODE lock with given locktag exist in LocalMethodLocalHash. + */ +bool +DoLocalLockExist(const LOCKTAG *locktag) +{ + HASH_SEQ_STATUS scan_status; + LOCALLOCK* locallock; + + hash_seq_init(&scan_status, LockMethodLocalHash); + while ((locallock = (LOCALLOCK *) hash_seq_search(&scan_status)) != NULL) + { + if (memcmp(&locallock->tag.lock, locktag, sizeof(LOCKTAG)) == 0) + { + hash_seq_term(&scan_status); + return true; + } + } + return false; +} + /* * LockHasWaiters -- look up 'locktag' and check if releasing this * lock would wake up other processes waiting for it. diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index bd9eff2709a..da077b00ee1 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -293,6 +293,7 @@ extern void do_pg_backup_start(const char *backupidstr, bool fast, StringInfo tblspcmapfile); extern void do_pg_backup_stop(BackupState *state, bool waitforarchive); extern void do_pg_abort_backup(int code, Datum arg); +extern bool have_backup_in_progress(void); extern void register_persistent_abort_backup_handler(void); extern SessionBackupState get_backup_status(void); diff --git a/src/include/postmaster/postmaster.h b/src/include/postmaster/postmaster.h index 89ad13b788b..9f1d8d7cd6c 100644 --- a/src/include/postmaster/postmaster.h +++ b/src/include/postmaster/postmaster.h @@ -54,6 +54,7 @@ extern PGDLLIMPORT const char *progname; extern PGDLLIMPORT bool LoadedSSL; +extern bool IsFatalError(void); extern void PostmasterMain(int argc, char *argv[]) pg_attribute_noreturn(); extern void ClosePostmasterPorts(bool am_syslogger); extern void InitProcessGlobals(void); diff --git a/src/include/replication/snapbuild.h 
b/src/include/replication/snapbuild.h index caa5113ff81..6eee98557ad 100644 --- a/src/include/replication/snapbuild.h +++ b/src/include/replication/snapbuild.h @@ -74,6 +74,7 @@ extern void SnapBuildClearExportedSnapshot(void); extern void SnapBuildResetExportedSnapshotState(void); extern SnapBuildState SnapBuildCurrentState(SnapBuild *builder); +extern TransactionId SnapBuildNextPhaseAt(SnapBuild *builder); extern Snapshot SnapBuildGetOrBuildSnapshot(SnapBuild *builder); extern bool SnapBuildXactNeedsSkip(SnapBuild *builder, XLogRecPtr ptr); diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h index 29b7226fe50..d271c32cd31 100644 --- a/src/include/storage/lock.h +++ b/src/include/storage/lock.h @@ -550,6 +550,7 @@ extern LockMethod GetLocksMethodTable(const LOCK *lock); extern LockMethod GetLockTagsMethodTable(const LOCKTAG *locktag); extern uint32 LockTagHashCode(const LOCKTAG *locktag); extern bool DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2); +extern bool DoLocalLockExist(const LOCKTAG *locktag); extern LockAcquireResult LockAcquire(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock, From f5b575767557afb3b783a1a0bcbd93674e358908 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 02:11:14 +0300 Subject: [PATCH 14/56] PERFORM_DELETION_OF_RELATION flag for object hooks --- src/backend/catalog/dependency.c | 36 +++++++++++++++++++++++++++++++- src/include/catalog/dependency.h | 2 ++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index 0489cbabcb8..b3873fbd2ac 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -186,6 +186,7 @@ deleteObjectsInList(ObjectAddresses *targetObjects, Relation *depRel, int flags) { int i; + bool *depends_on_relation; /* * Keep track of objects for event triggers, if necessary. 
@@ -213,6 +214,33 @@ deleteObjectsInList(ObjectAddresses *targetObjects, Relation *depRel, } } + depends_on_relation = palloc0(sizeof(bool) * targetObjects->numrefs); + + for (i = targetObjects->numrefs - 1; i >= 0; i--) + { + ObjectAddressExtra *thisextra = targetObjects->extras + i; + int j; + + if (thisextra->dependee.classId == RelationRelationId && + thisextra->dependee.objectSubId == 0) + { + depends_on_relation[i] = true; + continue; + } + + for (j = i + 1; j < targetObjects->numrefs; j++) + { + ObjectAddress *depobj = targetObjects->refs + j; + if (depobj->classId == thisextra->dependee.classId && + depobj->objectId == thisextra->dependee.objectId && + depobj->objectSubId == thisextra->dependee.objectSubId) + { + depends_on_relation[i] = depends_on_relation[j]; + break; + } + } + } + /* * Delete all the objects in the proper order, except that if told to, we * should skip the original object(s). @@ -221,13 +249,19 @@ deleteObjectsInList(ObjectAddresses *targetObjects, Relation *depRel, { ObjectAddress *thisobj = targetObjects->refs + i; ObjectAddressExtra *thisextra = targetObjects->extras + i; + int temp_flags = flags; if ((flags & PERFORM_DELETION_SKIP_ORIGINAL) && (thisextra->flags & DEPFLAG_ORIGINAL)) continue; - deleteOneObject(thisobj, depRel, flags); + if (depends_on_relation[i]) + temp_flags |= PERFORM_DELETION_OF_RELATION; + + deleteOneObject(thisobj, depRel, temp_flags); } + + pfree(depends_on_relation); } /* diff --git a/src/include/catalog/dependency.h b/src/include/catalog/dependency.h index 6908ca7180a..c9b59706373 100644 --- a/src/include/catalog/dependency.h +++ b/src/include/catalog/dependency.h @@ -96,6 +96,8 @@ typedef struct ObjectAddresses ObjectAddresses; #define PERFORM_DELETION_SKIP_EXTENSIONS 0x0010 /* keep extensions */ #define PERFORM_DELETION_CONCURRENT_LOCK 0x0020 /* normal drop with * concurrent lock mode */ +#define PERFORM_DELETION_OF_RELATION 0x0040 /* used for orioledb + * extension */ /* in dependency.c */ From 
1317a3e91c90d8fd638229caa88517590231f82f Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 02:20:32 +0300 Subject: [PATCH 15/56] Expose existing planning funcs and structs --- src/backend/catalog/index.c | 5 +---- src/backend/commands/explain.c | 14 +++----------- src/backend/commands/indexcmds.c | 8 ++------ src/backend/optimizer/path/indxpath.c | 13 +------------ src/backend/optimizer/plan/createplan.c | 16 ++++++++++------ src/include/catalog/index.h | 2 ++ src/include/commands/defrem.h | 4 ++++ src/include/commands/explain.h | 8 ++++++++ src/include/optimizer/paths.h | 12 ++++++++++++ src/include/optimizer/planmain.h | 5 +++++ 10 files changed, 48 insertions(+), 39 deletions(-) diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index a819b4197ce..3e88bd877af 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -119,9 +119,6 @@ static void UpdateIndexRelation(Oid indexoid, Oid heapoid, bool immediate, bool isvalid, bool isready); -static void index_update_stats(Relation rel, - bool hasindex, - double reltuples); static void IndexCheckExclusion(Relation heapRelation, Relation indexRelation, IndexInfo *indexInfo); @@ -2777,7 +2774,7 @@ FormIndexDatum(IndexInfo *indexInfo, * index. When updating an index, it's important because some index AMs * expect a relcache flush to occur after REINDEX. 
*/ -static void +void index_update_stats(Relation rel, bool hasindex, double reltuples) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 8086607710e..e6c989aea19 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -81,9 +81,6 @@ static void report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es); static double elapsed_time(instr_time *starttime); static bool ExplainPreScanNode(PlanState *planstate, Bitmapset **rels_used); -static void ExplainNode(PlanState *planstate, List *ancestors, - const char *relationship, const char *plan_name, - ExplainState *es); static void show_plan_tlist(PlanState *planstate, List *ancestors, ExplainState *es); static void show_expression(Node *node, const char *qlabel, @@ -92,9 +89,6 @@ static void show_expression(Node *node, const char *qlabel, static void show_qual(List *qual, const char *qlabel, PlanState *planstate, List *ancestors, bool useprefix, ExplainState *es); -static void show_scan_qual(List *qual, const char *qlabel, - PlanState *planstate, List *ancestors, - ExplainState *es); static void show_upper_qual(List *qual, const char *qlabel, PlanState *planstate, List *ancestors, ExplainState *es); @@ -131,8 +125,6 @@ static void show_memoize_info(MemoizeState *mstate, List *ancestors, static void show_hashagg_info(AggState *aggstate, ExplainState *es); static void show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es); -static void show_instrumentation_count(const char *qlabel, int which, - PlanState *planstate, ExplainState *es); static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es); static const char *explain_get_index_name(Oid indexId); static bool peek_buffer_usage(ExplainState *es, const BufferUsage *usage); @@ -1363,7 +1355,7 @@ ExplainPreScanNode(PlanState *planstate, Bitmapset **rels_used) * to the nesting depth of logical output groups, and therefore is controlled * by 
ExplainOpenGroup/ExplainCloseGroup. */ -static void +void ExplainNode(PlanState *planstate, List *ancestors, const char *relationship, const char *plan_name, ExplainState *es) @@ -2527,7 +2519,7 @@ show_qual(List *qual, const char *qlabel, /* * Show a qualifier expression for a scan plan node */ -static void +void show_scan_qual(List *qual, const char *qlabel, PlanState *planstate, List *ancestors, ExplainState *es) @@ -3618,7 +3610,7 @@ show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es) * * "which" identifies which instrumentation counter to print */ -static void +void show_instrumentation_count(const char *qlabel, int which, PlanState *planstate, ExplainState *es) { diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index ec6e0df200d..78d8bda48a0 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -92,11 +92,7 @@ static void ComputeIndexAttrs(IndexInfo *indexInfo, Oid ddl_userid, int ddl_sec_context, int *ddl_save_nestlevel); -static char *ChooseIndexName(const char *tabname, Oid namespaceId, - const List *colnames, const List *exclusionOpNames, - bool primary, bool isconstraint); static char *ChooseIndexNameAddition(const List *colnames); -static List *ChooseIndexColumnNames(const List *indexElems); static void ReindexIndex(const ReindexStmt *stmt, const ReindexParams *params, bool isTopLevel); static void RangeVarCallbackForReindexIndex(const RangeVar *relation, @@ -2536,7 +2532,7 @@ ChooseRelationName(const char *name1, const char *name2, * * The argument list is pretty ad-hoc :-( */ -static char * +char * ChooseIndexName(const char *tabname, Oid namespaceId, const List *colnames, const List *exclusionOpNames, bool primary, bool isconstraint) @@ -2625,7 +2621,7 @@ ChooseIndexNameAddition(const List *colnames) * * Returns a List of plain strings (char *, not String nodes). 
*/ -static List * +List * ChooseIndexColumnNames(const List *indexElems) { List *result = NIL; diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index c0fcc7d78df..7c043c53133 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -48,14 +48,6 @@ typedef enum ST_ANYSCAN, /* either is okay */ } ScanTypeControl; -/* Data structure for collecting qual clauses that match an index */ -typedef struct -{ - bool nonempty; /* True if lists are not all empty */ - /* Lists of IndexClause nodes, one list per index column */ - List *indexclauses[INDEX_MAX_KEYS]; -} IndexClauseSet; - /* Per-path data used within choose_bitmap_and() */ typedef struct { @@ -129,9 +121,6 @@ static double adjust_rowcount_for_semijoins(PlannerInfo *root, Index outer_relid, double rowcount); static double approximate_joinrel_size(PlannerInfo *root, Relids relids); -static void match_restriction_clauses_to_index(PlannerInfo *root, - IndexOptInfo *index, - IndexClauseSet *clauseset); static void match_join_clauses_to_index(PlannerInfo *root, RelOptInfo *rel, IndexOptInfo *index, IndexClauseSet *clauseset, @@ -1964,7 +1953,7 @@ approximate_joinrel_size(PlannerInfo *root, Relids relids) * Identify restriction clauses for the rel that match the index. * Matching clauses are added to *clauseset. 
*/ -static void +void match_restriction_clauses_to_index(PlannerInfo *root, IndexOptInfo *index, IndexClauseSet *clauseset) diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index c0af10ebd34..4bb56f50c16 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -165,16 +165,12 @@ static MergeJoin *create_mergejoin_plan(PlannerInfo *root, MergePath *best_path) static HashJoin *create_hashjoin_plan(PlannerInfo *root, HashPath *best_path); static Node *replace_nestloop_params(PlannerInfo *root, Node *expr); static Node *replace_nestloop_params_mutator(Node *node, PlannerInfo *root); -static void fix_indexqual_references(PlannerInfo *root, IndexPath *index_path, - List **stripped_indexquals_p, - List **fixed_indexquals_p); static List *fix_indexorderby_references(PlannerInfo *root, IndexPath *index_path); static Node *fix_indexqual_clause(PlannerInfo *root, IndexOptInfo *index, int indexcol, Node *clause, List *indexcolnos); static Node *fix_indexqual_operand(Node *node, IndexOptInfo *index, int indexcol); static List *get_switched_clauses(List *clauses, Relids outerrelids); -static List *order_qual_clauses(PlannerInfo *root, List *clauses); static void copy_generic_path_info(Plan *dest, Path *src); static void copy_plan_costsize(Plan *dest, Plan *src); static void label_sort_with_costsize(PlannerInfo *root, Sort *plan, @@ -4939,6 +4935,14 @@ replace_nestloop_params(PlannerInfo *root, Node *expr) return replace_nestloop_params_mutator(expr, root); } +Node * +replace_nestloop_params_compat(PlannerInfo *root, Node *expr) +{ + /* No setup needed for tree walk, so away we go */ + return replace_nestloop_params_mutator(expr, root); +} + + static Node * replace_nestloop_params_mutator(Node *node, PlannerInfo *root) { @@ -5019,7 +5023,7 @@ replace_nestloop_params_mutator(Node *node, PlannerInfo *root) * are subplans in it (we need two separate copies of the subplan tree, or * things will 
go awry). */ -static void +void fix_indexqual_references(PlannerInfo *root, IndexPath *index_path, List **stripped_indexquals_p, List **fixed_indexquals_p) { @@ -5312,7 +5316,7 @@ get_switched_clauses(List *clauses, Relids outerrelids) * instead of bare clauses. This is another reason why trying to consider * selectivity in the ordering would likely do the wrong thing. */ -static List * +List * order_qual_clauses(PlannerInfo *root, List *clauses) { typedef struct diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 7d434f8e653..0beab397c79 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -215,4 +215,6 @@ itemptr_decode(ItemPointer itemptr, int64 encoded) ItemPointerSet(itemptr, block, offset); } +extern void index_update_stats(Relation rel, bool hasindex, double reltuples); + #endif /* INDEX_H */ diff --git a/src/include/commands/defrem.h b/src/include/commands/defrem.h index a1ade77b732..628e43dc33f 100644 --- a/src/include/commands/defrem.h +++ b/src/include/commands/defrem.h @@ -41,6 +41,10 @@ extern char *makeObjectName(const char *name1, const char *name2, extern char *ChooseRelationName(const char *name1, const char *name2, const char *label, Oid namespaceid, bool isconstraint); +extern List *ChooseIndexColumnNames(const List *indexElems); +extern char *ChooseIndexName(const char *tabname, Oid namespaceId, + const List *colnames, const List *exclusionOpNames, + bool primary, bool isconstraint); extern bool CheckIndexCompatible(Oid oldId, const char *accessMethodName, const List *attributeList, diff --git a/src/include/commands/explain.h b/src/include/commands/explain.h index 9b8b351d9a2..5a6fabe8ed9 100644 --- a/src/include/commands/explain.h +++ b/src/include/commands/explain.h @@ -107,6 +107,14 @@ extern void ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, const instr_time *planduration, const BufferUsage *bufusage, const MemoryContextCounters *mem_counters); +extern void ExplainNode(PlanState 
*planstate, List *ancestors, + const char *relationship, const char *plan_name, + ExplainState *es); +extern void show_scan_qual(List *qual, const char *qlabel, + PlanState *planstate, List *ancestors, + ExplainState *es); +extern void show_instrumentation_count(const char *qlabel, int which, + PlanState *planstate, ExplainState *es); extern void ExplainPrintPlan(ExplainState *es, QueryDesc *queryDesc); extern void ExplainPrintTriggers(ExplainState *es, QueryDesc *queryDesc); diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 5e88c0224a4..58a2deb0094 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -64,6 +64,14 @@ extern void create_partial_bitmap_paths(PlannerInfo *root, RelOptInfo *rel, extern void generate_partitionwise_join_paths(PlannerInfo *root, RelOptInfo *rel); +/* Data structure for collecting qual clauses that match an index */ +typedef struct +{ + bool nonempty; /* True if lists are not all empty */ + /* Lists of IndexClause nodes, one list per index column */ + List *indexclauses[INDEX_MAX_KEYS]; +} IndexClauseSet; + /* * indxpath.c * routines to generate index paths @@ -79,6 +87,10 @@ extern bool match_index_to_operand(Node *operand, int indexcol, IndexOptInfo *index); extern void check_index_predicates(PlannerInfo *root, RelOptInfo *rel); +extern void match_restriction_clauses_to_index(PlannerInfo *root, + IndexOptInfo *index, + IndexClauseSet *clauseset); + /* * tidpath.c * routines to generate tid paths diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index aafc1737921..ef7658c378e 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -39,6 +39,11 @@ extern void preprocess_minmax_aggregates(PlannerInfo *root); * prototypes for plan/createplan.c */ extern Plan *create_plan(PlannerInfo *root, Path *best_path); +extern List *order_qual_clauses(PlannerInfo *root, List *clauses); +extern void 
fix_indexqual_references(PlannerInfo *root, IndexPath *index_path, + List **stripped_indexquals_p, + List **fixed_indexquals_p); +extern Node *replace_nestloop_params_compat(PlannerInfo *root, Node *expr); extern ForeignScan *make_foreignscan(List *qptlist, List *qpqual, Index scanrelid, List *fdw_exprs, List *fdw_private, List *fdw_scan_tlist, List *fdw_recheck_quals, From a19e20cb7e3820e3ae672ee5bc2f61fa3358b21f Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 02:22:17 +0300 Subject: [PATCH 16/56] Allow locks in checkpointer --- src/backend/postmaster/checkpointer.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index ef75041de37..b6767a39911 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -54,11 +54,20 @@ #include "storage/proc.h" #include "storage/procsignal.h" #include "storage/shmem.h" +#include "storage/sinvaladt.h" #include "storage/smgr.h" #include "storage/spin.h" #include "utils/guc.h" #include "utils/memutils.h" #include "utils/resowner.h" +#include "utils/syscache.h" + +/* + * Included for InitializeTimeouts and RegisterTimeout functions that + * needed for correct working of OrioleDB checkpoint. + * See comment for InitializeTimeouts call in CheckpointerMain for details. + */ +#include "utils/timeout.h" /*---------- @@ -204,6 +213,20 @@ CheckpointerMain(char *startup_data, size_t startup_data_len) */ pqsignal(SIGCHLD, SIG_DFL); + /* + * To use OrioleDB checkpoint, we must initialize the data for the primary + * lock mechanism (lock.h) to work correctly. Because locks of this type are + * needed by the OrioleDB module for debug events and relation locks, but + * they are not used by the postgres checkpointer and are not initialized + * for it. 
+ */ + InitializeTimeouts(); /* establishes SIGALRM handler */ + InitDeadLockChecking(); + RegisterTimeout(DEADLOCK_TIMEOUT, CheckDeadLockAlert); + RelationCacheInitialize(); + InitCatalogCache(); + SharedInvalBackendInit(false); + /* * Initialize so that first time-driven event happens at the correct time. From f316acfe92b06236657ad3bccd3c94358339325a Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 03:14:57 +0300 Subject: [PATCH 17/56] Add base_init_startup_hook and HandleStartupProcInterrupts_hook --- src/backend/postmaster/startup.c | 5 +++++ src/backend/utils/init/postinit.c | 5 ++++- src/include/postmaster/postmaster.h | 4 ++++ src/include/postmaster/startup.h | 3 +++ 4 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c index ef6f98ebcd7..5cea0f97a30 100644 --- a/src/backend/postmaster/startup.c +++ b/src/backend/postmaster/startup.c @@ -75,6 +75,8 @@ static volatile sig_atomic_t startup_progress_timer_expired = false; */ int log_startup_progress_interval = 10000; /* 10 sec */ +HandleStartupProcInterrupts_hook_type HandleStartupProcInterrupts_hook = NULL; + /* Signal handlers */ static void StartupProcTriggerHandler(SIGNAL_ARGS); static void StartupProcSigHupHandler(SIGNAL_ARGS); @@ -157,6 +159,9 @@ HandleStartupProcInterrupts(void) static uint32 postmaster_poll_count = 0; #endif + if (HandleStartupProcInterrupts_hook) + HandleStartupProcInterrupts_hook(); + /* * Process any requests or signals received recently. 
*/ diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 0805398e24d..e4f9e14a91e 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -81,7 +81,7 @@ static void ClientCheckTimeoutHandler(void); static bool ThereIsAtLeastOneRole(void); static void process_startup_options(Port *port, bool am_superuser); static void process_settings(Oid databaseid, Oid roleid); - +base_init_startup_hook_type base_init_startup_hook = NULL; /*** InitPostgres support ***/ @@ -657,6 +657,9 @@ BaseInit(void) */ InitFileAccess(); + if (base_init_startup_hook) + base_init_startup_hook(); + /* * Initialize statistics reporting. This needs to happen early to ensure * that pgstat's shutdown callback runs after the shutdown callbacks of diff --git a/src/include/postmaster/postmaster.h b/src/include/postmaster/postmaster.h index 9f1d8d7cd6c..67c9b39423a 100644 --- a/src/include/postmaster/postmaster.h +++ b/src/include/postmaster/postmaster.h @@ -65,6 +65,10 @@ extern bool PostmasterMarkPIDForWorkerNotify(int); extern void processCancelRequest(int backendPID, int32 cancelAuthCode); +typedef void (*base_init_startup_hook_type)(void); + +extern PGDLLIMPORT base_init_startup_hook_type base_init_startup_hook; + #ifdef EXEC_BACKEND extern Size ShmemBackendArraySize(void); extern void ShmemBackendArrayAllocation(void); diff --git a/src/include/postmaster/startup.h b/src/include/postmaster/startup.h index dde7ebde881..17b60012a90 100644 --- a/src/include/postmaster/startup.h +++ b/src/include/postmaster/startup.h @@ -23,7 +23,10 @@ ereport(LOG, errmsg(msg, secs, (usecs / 10000), __VA_ARGS__ )); \ } while(0) +typedef void (*HandleStartupProcInterrupts_hook_type)(void); + extern PGDLLIMPORT int log_startup_progress_interval; +extern PGDLLIMPORT HandleStartupProcInterrupts_hook_type HandleStartupProcInterrupts_hook; extern void HandleStartupProcInterrupts(void); extern void StartupProcessMain(char *startup_data, size_t 
startup_data_len) pg_attribute_noreturn(); From 118de6f3290b79d2bfe367c9aed443b714e1240b Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 03:17:36 +0300 Subject: [PATCH 18/56] Don't cancel recovery processes because of deadlocks --- src/backend/storage/lmgr/proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index d083a102178..bbfafd2a73e 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -1253,7 +1253,7 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable, bool dontWait) * If InHotStandby we set lock waits slightly later for clarity with other * code. */ - if (!InHotStandby) + if (!InHotStandby && !InRecovery) { if (LockTimeout > 0) { @@ -1613,7 +1613,7 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable, bool dontWait) * already caused QueryCancelPending to become set, we want the cancel to * be reported as a lock timeout, not a user cancel. 
*/ - if (!InHotStandby) + if (!InHotStandby && !InRecovery) { if (LockTimeout > 0) { From 6b6eb10f5c9eff8cffb9eb2efec0bd87f5bac222 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 03:42:18 +0300 Subject: [PATCH 19/56] set_plain_rel_pathlist_hook --- src/backend/optimizer/path/allpaths.c | 7 +++++-- src/include/optimizer/paths.h | 4 ++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 4895cee9944..7e02b670931 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -83,6 +83,7 @@ int min_parallel_index_scan_size; /* Hook for plugins to get control in set_rel_pathlist() */ set_rel_pathlist_hook_type set_rel_pathlist_hook = NULL; +set_plain_rel_pathlist_hook_type set_plain_rel_pathlist_hook = NULL; /* Hook for plugins to replace standard_join_search() */ join_search_hook_type join_search_hook = NULL; @@ -772,8 +773,10 @@ set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) */ required_outer = rel->lateral_relids; - /* Consider sequential scan */ - add_path(rel, create_seqscan_path(root, rel, required_outer, 0)); + if (!set_plain_rel_pathlist_hook || + set_plain_rel_pathlist_hook(root, rel, rte)) + /* Consider sequential scan */ + add_path(rel, create_seqscan_path(root, rel, required_outer, 0)); /* If appropriate, consider parallel sequential scan */ if (rel->consider_parallel && required_outer == NULL) diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 58a2deb0094..e15e83bc7a3 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -32,6 +32,10 @@ typedef void (*set_rel_pathlist_hook_type) (PlannerInfo *root, Index rti, RangeTblEntry *rte); extern PGDLLIMPORT set_rel_pathlist_hook_type set_rel_pathlist_hook; +typedef bool (*set_plain_rel_pathlist_hook_type)(PlannerInfo *root, + RelOptInfo *rel, + RangeTblEntry *rte); +extern 
PGDLLIMPORT set_plain_rel_pathlist_hook_type set_plain_rel_pathlist_hook; /* Hook for plugins to get control in add_paths_to_joinrel() */ typedef void (*set_join_pathlist_hook_type) (PlannerInfo *root, From 190e648ed9325e7d07b93b8c17771e78dd75a47a Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 13 Dec 2021 14:17:57 +0300 Subject: [PATCH 20/56] Let locker tolerate being removed from the waiting queue without obtaining a lock. --- src/backend/storage/lmgr/lock.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 8c5b1c70b54..98421b6dda5 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -1118,6 +1118,8 @@ LockAcquireExtended(const LOCKTAG *locktag, */ if (!(proclock->holdMask & LOCKBIT_ON(lockmode))) { + int i; + AbortStrongLockAcquire(); if (dontWait) @@ -1167,7 +1169,27 @@ LockAcquireExtended(const LOCKTAG *locktag, PROCLOCK_PRINT("LockAcquire: INCONSISTENT", proclock); LOCK_PRINT("LockAcquire: INCONSISTENT", lock, lockmode); LWLockRelease(partitionLock); - elog(ERROR, "LockAcquire failed"); + /* + * We've been removed from the queue without obtaining a lock. + * That's OK, we're going to return LOCKACQUIRE_NOT_AVAIL, but + * need to release a local lock first. + */ + locallock->nLocks--; + for (i = 0; i < locallock->numLockOwners; i++) + { + if (locallock->lockOwners[i].owner == owner) + { + locallock->lockOwners[i].nLocks--; + if (locallock->lockOwners[i].nLocks == 0) + { + ResourceOwnerForgetLock(owner, locallock); + locallock->lockOwners[i] = locallock->lockOwners[--locallock->numLockOwners]; + } + break; + } + } + + return LOCKACQUIRE_NOT_AVAIL; } } PROCLOCK_PRINT("LockAcquire: granted", proclock); @@ -4677,8 +4699,8 @@ VirtualXactLock(VirtualTransactionId vxid, bool wait) LWLockRelease(&proc->fpInfoLock); /* Time to wait. 
*/ - (void) LockAcquire(&tag, ShareLock, false, false); - + if (LockAcquire(&tag, ShareLock, false, false) == LOCKACQUIRE_NOT_AVAIL) + return false; LockRelease(&tag, ShareLock, false); return XactLockForVirtualXact(vxid, xid, wait); } From fd1de8402bd61a8233d4956361e5fbb550543be3 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Thu, 17 Feb 2022 07:46:49 +0300 Subject: [PATCH 21/56] Count extension wait events in pg_isolation_test_session_is_blocked() --- src/backend/utils/adt/lockfuncs.c | 3 +++ src/backend/utils/adt/waitfuncs.c | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/src/backend/utils/adt/lockfuncs.c b/src/backend/utils/adt/lockfuncs.c index e790f856ab3..b26e51246c1 100644 --- a/src/backend/utils/adt/lockfuncs.c +++ b/src/backend/utils/adt/lockfuncs.c @@ -16,8 +16,11 @@ #include "funcapi.h" #include "miscadmin.h" #include "storage/predicate_internals.h" +#include "storage/proc.h" +#include "storage/procarray.h" #include "utils/array.h" #include "utils/builtins.h" +#include "utils/wait_event.h" /* diff --git a/src/backend/utils/adt/waitfuncs.c b/src/backend/utils/adt/waitfuncs.c index e135c9e5e45..c68b36121e3 100644 --- a/src/backend/utils/adt/waitfuncs.c +++ b/src/backend/utils/adt/waitfuncs.c @@ -38,6 +38,7 @@ Datum pg_isolation_test_session_is_blocked(PG_FUNCTION_ARGS) { + PGPROC *blocked_proc; int blocked_pid = PG_GETARG_INT32(0); ArrayType *interesting_pids_a = PG_GETARG_ARRAYTYPE_P(1); PGPROC *proc; @@ -109,5 +110,9 @@ pg_isolation_test_session_is_blocked(PG_FUNCTION_ARGS) if (GetSafeSnapshotBlockingPids(blocked_pid, &dummy, 1) > 0) PG_RETURN_BOOL(true); + blocked_proc = BackendPidGetProc(blocked_pid); + if ((blocked_proc->wait_event_info & 0xFF000000) == PG_WAIT_EXTENSION) + PG_RETURN_BOOL(true); + PG_RETURN_BOOL(false); } From 9733d32f282e12d1554a3ccf12f3ef8a8b1fc2a8 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Thu, 24 Feb 2022 03:19:39 +0300 Subject: [PATCH 22/56] Support for custom table AM in pgbench --- 
src/bin/pgbench/pgbench.c | 45 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c index 86ffb3c8683..a53cd9fd236 100644 --- a/src/bin/pgbench/pgbench.c +++ b/src/bin/pgbench/pgbench.c @@ -210,6 +210,11 @@ double throttle_delay = 0; */ int64 latency_limit = 0; +/* + * tableam selection + */ +char *tableam = NULL; + /* * tablespace selection */ @@ -893,6 +898,7 @@ usage(void) " --partition-method=(range|hash)\n" " partition pgbench_accounts with this method (default: range)\n" " --partitions=NUM partition pgbench_accounts into NUM parts (default: 0)\n" + " --tableam=TABLEAM create tables using the specified tableam\n" " --tablespace=TABLESPACE create tables in the specified tablespace\n" " --unlogged-tables create tables as unlogged tables\n" "\nOptions to select what to run:\n" @@ -4778,14 +4784,34 @@ createPartitions(PGconn *con) appendPQExpBufferStr(&query, "maxvalue"); appendPQExpBufferChar(&query, ')'); + + if (tableam != NULL) + { + char *escape_tableam; + + escape_tableam = PQescapeIdentifier(con, tableam, strlen(tableam)); + appendPQExpBuffer(&query, " using %s", escape_tableam); + PQfreemem(escape_tableam); + } } else if (partition_method == PART_HASH) + { printfPQExpBuffer(&query, "create%s table pgbench_accounts_%d\n" " partition of pgbench_accounts\n" " for values with (modulus %d, remainder %d)", unlogged_tables ? 
" unlogged" : "", p, partitions, p - 1); + + if (tableam != NULL) + { + char *escape_tableam; + + escape_tableam = PQescapeIdentifier(con, tableam, strlen(tableam)); + appendPQExpBuffer(&query, " using %s", escape_tableam); + PQfreemem(escape_tableam); + } + } else /* cannot get there */ Assert(0); @@ -4872,10 +4898,20 @@ initCreateTables(PGconn *con) if (partition_method != PART_NONE && strcmp(ddl->table, "pgbench_accounts") == 0) appendPQExpBuffer(&query, " partition by %s (aid)", PARTITION_METHOD[partition_method]); - else if (ddl->declare_fillfactor) + else { + if (tableam != NULL) + { + char *escape_tableam; + + escape_tableam = PQescapeIdentifier(con, tableam, strlen(tableam)); + appendPQExpBuffer(&query, " using %s", escape_tableam); + PQfreemem(escape_tableam); + } + /* fillfactor is only expected on actual tables */ - appendPQExpBuffer(&query, " with (fillfactor=%d)", fillfactor); + if (ddl->declare_fillfactor) + appendPQExpBuffer(&query, " with (fillfactor=%d)", fillfactor); } if (tablespace != NULL) @@ -6663,6 +6699,7 @@ main(int argc, char **argv) {"verbose-errors", no_argument, NULL, 15}, {"exit-on-abort", no_argument, NULL, 16}, {"debug", no_argument, NULL, 17}, + {"tableam", required_argument, NULL, 18}, {NULL, 0, NULL, 0} }; @@ -7003,6 +7040,10 @@ main(int argc, char **argv) case 17: /* debug */ pg_logging_increase_verbosity(); break; + case 18: /* tableam */ + initialization_option_set = true; + tableam = pg_strdup(optarg); + break; default: /* getopt_long already emitted a complaint */ pg_log_error_hint("Try \"%s --help\" for more information.", progname); From af1759617093c5f12edeb4c49b2ecb36f7e29408 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Wed, 2 Mar 2022 14:49:29 +0300 Subject: [PATCH 23/56] Support for outline atomics on aarch64 Outline-atomics is a gcc compilation flag that enables runtime detection of CPU support for atomic instructions. 
Performance on CPUs that do support atomic instructions is improved, while compatibility and performance on CPUs without atomic instructions is not hurt. Discussion: https://postgr.es/m/flat/099F69EE-51D3-4214-934A-1F28C0A1A7A7%40amazon.com Author: Tsahi Zidenberg --- configure | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++ configure.ac | 4 +++ 2 files changed, 97 insertions(+) diff --git a/configure b/configure index 6db03e4a228..921417232e6 100755 --- a/configure +++ b/configure @@ -6663,6 +6663,99 @@ fi if test -n "$NOT_THE_CFLAGS"; then CFLAGS="$CFLAGS -Wno-cast-function-type-strict" fi + if test x"$host_cpu" == x"aarch64"; then + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CC} supports -moutline-atomics, for CFLAGS" >&5 +$as_echo_n "checking whether ${CC} supports -moutline-atomics, for CFLAGS... " >&6; } +if ${pgac_cv_prog_CC_cflags__moutline_atomics+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +pgac_save_CC=$CC +CC=${CC} +CFLAGS="${CFLAGS} -moutline-atomics" +ac_save_c_werror_flag=$ac_c_werror_flag +ac_c_werror_flag=yes +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + pgac_cv_prog_CC_cflags__moutline_atomics=yes +else + pgac_cv_prog_CC_cflags__moutline_atomics=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_c_werror_flag=$ac_save_c_werror_flag +CFLAGS="$pgac_save_CFLAGS" +CC="$pgac_save_CC" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_prog_CC_cflags__moutline_atomics" >&5 +$as_echo "$pgac_cv_prog_CC_cflags__moutline_atomics" >&6; } +if test x"$pgac_cv_prog_CC_cflags__moutline_atomics" = x"yes"; then + CFLAGS="${CFLAGS} -moutline-atomics" +fi + + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CXX} supports -moutline-atomics, for CXXFLAGS" >&5 +$as_echo_n "checking whether ${CXX} supports -moutline-atomics, for CXXFLAGS... 
" >&6; } +if ${pgac_cv_prog_CXX_cxxflags__moutline_atomics+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CXXFLAGS=$CXXFLAGS +pgac_save_CXX=$CXX +CXX=${CXX} +CXXFLAGS="${CXXFLAGS} -moutline-atomics" +ac_save_cxx_werror_flag=$ac_cxx_werror_flag +ac_cxx_werror_flag=yes +ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + pgac_cv_prog_CXX_cxxflags__moutline_atomics=yes +else + pgac_cv_prog_CXX_cxxflags__moutline_atomics=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + +ac_cxx_werror_flag=$ac_save_cxx_werror_flag +CXXFLAGS="$pgac_save_CXXFLAGS" +CXX="$pgac_save_CXX" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_prog_CXX_cxxflags__moutline_atomics" >&5 +$as_echo "$pgac_cv_prog_CXX_cxxflags__moutline_atomics" >&6; } +if test x"$pgac_cv_prog_CXX_cxxflags__moutline_atomics" = x"yes"; then + CXXFLAGS="${CXXFLAGS} -moutline-atomics" +fi + + + fi elif test "$ICC" = yes; then # Intel's compiler has a bug/misoptimization in checking for # division by NAN (NaN == 0), -mp1 fixes it, so add it to the CFLAGS. 
diff --git a/configure.ac b/configure.ac index 7531366b758..1fb80d13f8a 100644 --- a/configure.ac +++ b/configure.ac @@ -580,6 +580,10 @@ if test "$GCC" = yes -a "$ICC" = no; then if test -n "$NOT_THE_CFLAGS"; then CFLAGS="$CFLAGS -Wno-cast-function-type-strict" fi + if test x"$host_cpu" == x"aarch64"; then + PGAC_PROG_CC_CFLAGS_OPT([-moutline-atomics]) + PGAC_PROG_CXX_CFLAGS_OPT([-moutline-atomics]) + fi elif test "$ICC" = yes; then # Intel's compiler has a bug/misoptimization in checking for # division by NAN (NaN == 0), -mp1 fixes it, so add it to the CFLAGS. From 710d0cd020ba61fae7143b406143a299ad3a7109 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Thu, 17 Feb 2022 08:43:32 +0300 Subject: [PATCH 24/56] OrioleDB specific CI --- .github/workflows/build.yml | 31 +++++++++++++++++++++++++++++++ ci/build.sh | 21 +++++++++++++++++++++ ci/check.sh | 11 +++++++++++ ci/check_output.sh | 30 ++++++++++++++++++++++++++++++ ci/prerequisites.sh | 22 ++++++++++++++++++++++ configure | 5 +++++ configure.ac | 4 ++++ meson.build | 1 + src/Makefile.global.in | 3 +++ src/bin/pg_rewind/meson.build | 6 ++++++ src/makefiles/meson.build | 1 + 11 files changed, 135 insertions(+) create mode 100644 .github/workflows/build.yml create mode 100644 ci/build.sh create mode 100644 ci/check.sh create mode 100644 ci/check_output.sh create mode 100644 ci/prerequisites.sh diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 00000000000..c6f1bef64aa --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,31 @@ +name: build + +on: + push: + pull_request: + +jobs: + test: + runs-on: + - ubuntu-20.04 + strategy: + fail-fast: false + matrix: + compiler: [clang, gcc] + check_type: [normal, debug] + env: + LLVM_VER: 10 + COMPILER: ${{ matrix.compiler }} + CHECK_TYPE: ${{ matrix.check_type }} + steps: + - name: Checkout code into workspace directory + uses: actions/checkout@v2 + - name: Setup prerequisites + run: bash ./ci/prerequisites.sh + - 
name: Build + run: bash ./ci/build.sh + - name: Check + run: bash ./ci/check.sh + - name: Check output + run: bash ./ci/check_output.sh + if: ${{ success() || failure() }} diff --git a/ci/build.sh b/ci/build.sh new file mode 100644 index 00000000000..f541929e69c --- /dev/null +++ b/ci/build.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -eu + +if [ $COMPILER = "clang" ]; then + export CC=clang-$LLVM_VER +else + export CC=gcc +fi + +# configure & build +if [ $CHECK_TYPE = "debug" ]; then + CFLAGS="-O0" ./configure --enable-debug --enable-cassert --enable-tap-tests --with-icu +else + ./configure --disable-debug --disable-cassert --enable-tap-tests --with-icu +fi + +make -sj4 +cd contrib +make -sj4 +cd .. diff --git a/ci/check.sh b/ci/check.sh new file mode 100644 index 00000000000..faa8c25e84a --- /dev/null +++ b/ci/check.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +set -eu + +# unsets limit for coredumps size +ulimit -c unlimited -S +# sets a coredump file pattern +mkdir -p /tmp/cores-$GITHUB_SHA-$TIMESTAMP +sudo sh -c "echo \"/tmp/cores-$GITHUB_SHA-$TIMESTAMP/%t_%p_%s.core\" > /proc/sys/kernel/core_pattern" + +make check-world -j4 diff --git a/ci/check_output.sh b/ci/check_output.sh new file mode 100644 index 00000000000..ae26cf63d68 --- /dev/null +++ b/ci/check_output.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +set -eu + +status=0 + +# show diff if it exists +for f in ` find . 
-name regression.diffs ` ; do + echo "========= Contents of $f" + cat $f + status=1 +done + +# check core dumps if any +cores=$(find /tmp/cores-$GITHUB_SHA-$TIMESTAMP/ -name '*.core' 2>/dev/null) + +if [ -n "$cores" ]; then + for corefile in $cores ; do + if [[ $corefile != *_3.core ]]; then + binary=$(gdb -quiet -core $corefile -batch -ex 'info auxv' | grep AT_EXECFN | perl -pe "s/^.*\"(.*)\"\$/\$1/g") + echo dumping $corefile for $binary + gdb --batch --quiet -ex "thread apply all bt full" -ex "quit" $binary $corefile + status=1 + fi + done +fi + +rm -rf /tmp/cores-$GITHUB_SHA-$TIMESTAMP + +exit $status diff --git a/ci/prerequisites.sh b/ci/prerequisites.sh new file mode 100644 index 00000000000..b26251b711c --- /dev/null +++ b/ci/prerequisites.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +set -eu + +# print the hostname to be able to identify runner by logs +echo "HOSTNAME=`hostname`" +TIMESTAMP=$(date +%s) +echo "TIMESTAMP=$TIMESTAMP" >> $GITHUB_ENV +echo "TIMESTAMP=$TIMESTAMP" + +sudo apt-get -y install -qq wget ca-certificates + +sudo apt-get update -qq + +apt_packages="build-essential flex bison pkg-config libreadline-dev make gdb libipc-run-perl libicu-dev python3 python3-dev python3-pip python3-setuptools python3-testresources" + +if [ $COMPILER = "clang" ]; then + apt_packages="$apt_packages llvm-$LLVM_VER clang-$LLVM_VER clang-tools-$LLVM_VER" +fi + +# install required packages +sudo apt-get -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" -y install -qq $apt_packages diff --git a/configure b/configure index 921417232e6..3c22c471a38 100755 --- a/configure +++ b/configure @@ -628,6 +628,7 @@ ac_includes_default="\ ac_subst_vars='LTLIBOBJS vpath_build PG_SYSROOT +ORIOLEDB_PATCHSET_VERSION PG_VERSION_NUM LDFLAGS_EX_BE PROVE @@ -19296,6 +19297,10 @@ _ACEOF +# Needed to check postgresql patches git tag during orioledb extension build +ORIOLEDB_PATCHSET_VERSION=`git describe --tags | cut -d'_' -f2` + + # If we are inserting PG_SYSROOT into 
CPPFLAGS, do so symbolically not # literally, so that it's possible to override it at build time using # a command like "make ... PG_SYSROOT=path". This has to be done after diff --git a/configure.ac b/configure.ac index 1fb80d13f8a..34c74e947c0 100644 --- a/configure.ac +++ b/configure.ac @@ -2451,6 +2451,10 @@ $AWK '{printf "%d%04d", $1, $2}'`"] AC_DEFINE_UNQUOTED(PG_VERSION_NUM, $PG_VERSION_NUM, [PostgreSQL version as a number]) AC_SUBST(PG_VERSION_NUM) +# Needed to check postgresql patches git tag during orioledb extension build +[ORIOLEDB_PATCHSET_VERSION=`git describe --tags | cut -d'_' -f2`] +AC_SUBST(ORIOLEDB_PATCHSET_VERSION) + # If we are inserting PG_SYSROOT into CPPFLAGS, do so symbolically not # literally, so that it's possible to override it at build time using # a command like "make ... PG_SYSROOT=path". This has to be done after diff --git a/meson.build b/meson.build index 4c2769dee0a..c5b87441c58 100644 --- a/meson.build +++ b/meson.build @@ -153,6 +153,7 @@ cdata.set('PG_VERSION_NUM', pg_version_num) # PG_VERSION_STR is built later, it depends on compiler test results cdata.set_quoted('CONFIGURE_ARGS', '') +orioledb_patchset_version = '22' ############################################################### diff --git a/src/Makefile.global.in b/src/Makefile.global.in index a00c909681e..8c7ee1c7217 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -44,6 +44,9 @@ VERSION_NUM = @PG_VERSION_NUM@ PACKAGE_URL = @PACKAGE_URL@ +# OrioleDB patchset git tag number +ORIOLEDB_PATCHSET_VERSION = @ORIOLEDB_PATCHSET_VERSION@ + # Set top_srcdir, srcdir, and VPATH. 
ifdef PGXS top_srcdir = $(top_builddir) diff --git a/src/bin/pg_rewind/meson.build b/src/bin/pg_rewind/meson.build index e0f88bde221..e56d5ae24f6 100644 --- a/src/bin/pg_rewind/meson.build +++ b/src/bin/pg_rewind/meson.build @@ -2,6 +2,7 @@ pg_rewind_sources = files( 'datapagemap.c', + 'extension.c', 'file_ops.c', 'filemap.c', 'libpq_source.c', @@ -23,6 +24,7 @@ pg_rewind = executable('pg_rewind', pg_rewind_sources, dependencies: [frontend_code, libpq, lz4, zstd], c_args: ['-DFRONTEND'], # needed for xlogreader et al + export_dynamic: true, kwargs: default_bin_args, ) bin_targets += pg_rewind @@ -48,3 +50,7 @@ tests += { } subdir('po', if_found: libintl) + +install_headers( + 'pg_rewind_ext.h' +) \ No newline at end of file diff --git a/src/makefiles/meson.build b/src/makefiles/meson.build index 5618050b306..192d3303f55 100644 --- a/src/makefiles/meson.build +++ b/src/makefiles/meson.build @@ -37,6 +37,7 @@ pgxs_kv = { 'PACKAGE_VERSION': pg_version, 'PG_MAJORVERSION': pg_version_major, 'PG_VERSION_NUM': pg_version_num, + 'ORIOLEDB_PATCHSET_VERSION': orioledb_patchset_version, 'configure_input': 'meson', 'vpath_build': 'yes', From 7081c30f668f212b6c51047e6b6b6ba0a7368100 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sun, 9 Apr 2023 01:57:21 +0300 Subject: [PATCH 25/56] Close indices in AttachPartitionEnsureIndexes() before DefineIndex() --- src/backend/commands/tablecmds.c | 43 +++++++++++++++++++------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 75fde5a75d8..80e9048d6de 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -18641,12 +18641,14 @@ static void AttachPartitionEnsureIndexes(List **wqueue, Relation rel, Relation attachrel) { List *idxes; + List *buildIdxes = NIL; List *attachRelIdxs; Relation *attachrelIdxRels; IndexInfo **attachInfos; ListCell *cell; MemoryContext cxt; MemoryContext oldcxt; + AttrMap *attmap; 
cxt = AllocSetContextCreate(CurrentMemoryContext, "AttachPartitionEnsureIndexes", @@ -18695,6 +18697,10 @@ AttachPartitionEnsureIndexes(List **wqueue, Relation rel, Relation attachrel) goto out; } + attmap = build_attrmap_by_name(RelationGetDescr(attachrel), + RelationGetDescr(rel), + false); + /* * For each index on the partitioned table, find a matching one in the * partition-to-be; if one is not found, create one. @@ -18704,7 +18710,6 @@ AttachPartitionEnsureIndexes(List **wqueue, Relation rel, Relation attachrel) Oid idx = lfirst_oid(cell); Relation idxRel = index_open(idx, AccessShareLock); IndexInfo *info; - AttrMap *attmap; bool found = false; Oid constraintOid; @@ -18720,9 +18725,6 @@ AttachPartitionEnsureIndexes(List **wqueue, Relation rel, Relation attachrel) /* construct an indexinfo to compare existing indexes against */ info = BuildIndexInfo(idxRel); - attmap = build_attrmap_by_name(RelationGetDescr(attachrel), - RelationGetDescr(rel), - false); constraintOid = get_relation_idx_constraint_oid(RelationGetRelid(rel), idx); /* @@ -18788,19 +18790,7 @@ AttachPartitionEnsureIndexes(List **wqueue, Relation rel, Relation attachrel) * now. */ if (!found) - { - IndexStmt *stmt; - Oid conOid; - - stmt = generateClonedIndexStmt(NULL, - idxRel, attmap, - &conOid); - DefineIndex(RelationGetRelid(attachrel), stmt, InvalidOid, - RelationGetRelid(idxRel), - conOid, - -1, - true, false, false, false, false); - } + buildIdxes = lappend_oid(buildIdxes, RelationGetRelid(idxRel)); index_close(idxRel, AccessShareLock); } @@ -18809,6 +18799,25 @@ AttachPartitionEnsureIndexes(List **wqueue, Relation rel, Relation attachrel) /* Clean up. 
*/ for (int i = 0; i < list_length(attachRelIdxs); i++) index_close(attachrelIdxRels[i], AccessShareLock); + + foreach(cell, buildIdxes) + { + Oid idx = lfirst_oid(cell); + Relation idxRel = index_open(idx, AccessShareLock); + IndexStmt *stmt; + Oid conOid; + + stmt = generateClonedIndexStmt(NULL, + idxRel, attmap, + &conOid); + DefineIndex(RelationGetRelid(attachrel), stmt, InvalidOid, + RelationGetRelid(idxRel), + conOid, + -1, + true, false, false, false, false); + index_close(idxRel, AccessShareLock); + } + MemoryContextSwitchTo(oldcxt); MemoryContextDelete(cxt); } From e5acf68ef3e4a23517db871759d0b06125c0bb06 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Fri, 30 Jun 2023 01:35:54 +0300 Subject: [PATCH 26/56] New BGWORKER_CLASS_SYSTEM bgworkers class They are allowed to stay during shutdown checkpointing and help checkpointer do its work. --- src/backend/postmaster/postmaster.c | 39 +++++++++++++++++++++-------- src/include/postmaster/bgworker.h | 6 +++++ 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 0d703ad35f7..5f0a09d465e 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -137,7 +137,8 @@ #define BACKEND_TYPE_AUTOVAC 0x0002 /* autovacuum worker process */ #define BACKEND_TYPE_WALSND 0x0004 /* walsender process */ #define BACKEND_TYPE_BGWORKER 0x0008 /* bgworker process */ -#define BACKEND_TYPE_ALL 0x000F /* OR of all the above */ +#define BACKEND_TYPE_SYSTEM_BGWORKER 0x0010 /* system bgworker process */ +#define BACKEND_TYPE_ALL 0x001F /* OR of all the above */ /* * List of active backends (or child processes anyway; we don't actually @@ -1894,8 +1895,9 @@ processCancelRequest(int backendPID, int32 cancelAuthCode) /* * canAcceptConnections --- check to see if database state allows connections * of the specified type. backend_type can be BACKEND_TYPE_NORMAL, - * BACKEND_TYPE_AUTOVAC, or BACKEND_TYPE_BGWORKER. 
(Note that we don't yet - * know whether a NORMAL connection might turn into a walsender.) + * BACKEND_TYPE_AUTOVAC, BACKEND_TYPE_BGWORKER or BACKEND_TYPE_SYSTEM_BGWORKER. + * (Note that we don't yet know whether a NORMAL connection might turn into + * a walsender.) */ static CAC_state canAcceptConnections(int backend_type) @@ -1909,7 +1911,8 @@ canAcceptConnections(int backend_type) * bgworker_should_start_now() decided whether the DB state allows them. */ if (pmState != PM_RUN && pmState != PM_HOT_STANDBY && - backend_type != BACKEND_TYPE_BGWORKER) + backend_type != BACKEND_TYPE_BGWORKER && + backend_type != BACKEND_TYPE_SYSTEM_BGWORKER) { if (Shutdown > NoShutdown) return CAC_SHUTDOWN; /* shutdown is pending */ @@ -2540,6 +2543,13 @@ process_pm_child_exit(void) if (PgArchPID != 0) signal_child(PgArchPID, SIGUSR2); + /* + * Terminate system background workers since checpoint is + * complete. + */ + SignalSomeChildren(SIGTERM, + BACKEND_TYPE_SYSTEM_BGWORKER); + /* * Waken walsenders for the last time. No regular backends * should be around anymore. @@ -2971,7 +2981,8 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) * Background workers were already processed above; ignore them * here. */ - if (bp->bkend_type == BACKEND_TYPE_BGWORKER) + if (bp->bkend_type == BACKEND_TYPE_BGWORKER || + bp->bkend_type == BACKEND_TYPE_SYSTEM_BGWORKER) continue; if (take_action) @@ -3162,7 +3173,7 @@ PostmasterStateMachine(void) /* Signal all backend children except walsenders */ SignalSomeChildren(SIGTERM, - BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND); + BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND - BACKEND_TYPE_SYSTEM_BGWORKER); /* and the autovac launcher too */ if (AutoVacPID != 0) signal_child(AutoVacPID, SIGTERM); @@ -3204,7 +3215,7 @@ PostmasterStateMachine(void) * here. Walsenders and archiver are also disregarded, they will be * terminated later after writing the checkpoint record. 
*/ - if (CountChildren(BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND) == 0 && + if (CountChildren(BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND - BACKEND_TYPE_SYSTEM_BGWORKER) == 0 && StartupPID == 0 && WalReceiverPID == 0 && WalSummarizerPID == 0 && @@ -4306,16 +4317,20 @@ do_start_bgworker(RegisteredBgWorker *rw) * specified start_time? */ static bool -bgworker_should_start_now(BgWorkerStartTime start_time) +bgworker_should_start_now(BgWorkerStartTime start_time, int flags) { switch (pmState) { case PM_NO_CHILDREN: case PM_WAIT_DEAD_END: case PM_SHUTDOWN_2: + break; + case PM_SHUTDOWN: case PM_WAIT_BACKENDS: case PM_STOP_BACKENDS: + if (flags & BGWORKER_CLASS_SYSTEM) + return true; break; case PM_RUN: @@ -4390,7 +4405,10 @@ assign_backendlist_entry(RegisteredBgWorker *rw) bn->cancel_key = MyCancelKey; bn->child_slot = MyPMChildSlot = AssignPostmasterChildSlot(); - bn->bkend_type = BACKEND_TYPE_BGWORKER; + if (rw->rw_worker.bgw_flags & BGWORKER_CLASS_SYSTEM) + bn->bkend_type = BACKEND_TYPE_SYSTEM_BGWORKER; + else + bn->bkend_type = BACKEND_TYPE_BGWORKER; bn->dead_end = false; bn->bgworker_notify = false; @@ -4488,7 +4506,8 @@ maybe_start_bgworkers(void) } } - if (bgworker_should_start_now(rw->rw_worker.bgw_start_time)) + if (bgworker_should_start_now(rw->rw_worker.bgw_start_time, + rw->rw_worker.bgw_flags)) { /* reset crash time before trying to start worker */ rw->rw_crashed_at = 0; diff --git a/src/include/postmaster/bgworker.h b/src/include/postmaster/bgworker.h index 22fc49ec27f..9a1cac58dee 100644 --- a/src/include/postmaster/bgworker.h +++ b/src/include/postmaster/bgworker.h @@ -66,6 +66,12 @@ * background workers should not use this class. */ #define BGWORKER_CLASS_PARALLEL 0x0010 + +/* + * This class of bgworkers are allowed to stay working during shutdown + * checkpointing. 
+ */ +#define BGWORKER_CLASS_SYSTEM 0x0020 /* add additional bgworker classes here */ From 188c810728ab437dac6494d12a8922d0454720c4 Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Thu, 7 Sep 2023 21:33:03 +0200 Subject: [PATCH 27/56] Add pg_newlocale_from_collation_hook to perform stricter collation checks --- src/backend/utils/adt/pg_locale.c | 7 ++++++- src/include/utils/pg_locale.h | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 1c57f12695e..b3b396b6a78 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -133,6 +133,7 @@ typedef struct static HTAB *collation_cache = NULL; +pg_newlocale_from_collation_hook_type pg_newlocale_from_collation_hook = NULL; #if defined(WIN32) && defined(LC_MESSAGES) static char *IsoLocaleName(const char *); @@ -1673,6 +1674,7 @@ pg_newlocale_from_collation(Oid collid) { char *actual_versionstr; char *collversionstr; + int level = WARNING; collversionstr = TextDatumGetCString(datum); @@ -1695,8 +1697,11 @@ pg_newlocale_from_collation(Oid collid) NameStr(collform->collname)))); } + if (pg_newlocale_from_collation_hook && pg_newlocale_from_collation_hook()) + level = ERROR; + if (strcmp(actual_versionstr, collversionstr) != 0) - ereport(WARNING, + ereport(level, (errmsg("collation \"%s\" has version mismatch", NameStr(collform->collname)), errdetail("The collation in the database was created using version %s, " diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 040968d6ff2..9ce2a266dce 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -101,6 +101,8 @@ extern void make_icu_collator(const char *iculocstr, extern bool pg_locale_deterministic(pg_locale_t locale); extern pg_locale_t pg_newlocale_from_collation(Oid collid); +typedef bool (*pg_newlocale_from_collation_hook_type)(); +extern pg_newlocale_from_collation_hook_type 
pg_newlocale_from_collation_hook; extern char *get_collation_actual_version(char collprovider, const char *collcollate); extern int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale); From bfeade10e88309e87bc2bf8a7d6133924ee6cf6b Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Wed, 12 Jul 2023 23:40:12 +0300 Subject: [PATCH 28/56] Archive preload callback --- src/backend/postmaster/pgarch.c | 16 ++++++++++++++++ src/include/archive/archive_module.h | 6 +++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c index 02f91431f5f..35af55cd678 100644 --- a/src/backend/postmaster/pgarch.c +++ b/src/backend/postmaster/pgarch.c @@ -760,6 +760,22 @@ pgarch_readyXlog(char *xlog) for (int i = 0; i < arch_files->arch_files_size; i++) arch_files->arch_files[i] = DatumGetCString(binaryheap_remove_first(arch_files->arch_heap)); + /* + * Preload the WAL files if the relevant callback is provided. + */ + if (ArchiveCallbacks->archive_preload_file_cb) + { + for (int i = 0; i < arch_files->arch_files_size; i++) + { + char *xlog1 = arch_files->arch_files[i]; + char pathname[MAXPGPATH]; + + snprintf(pathname, MAXPGPATH, XLOGDIR "/%s", xlog1); + ArchiveCallbacks->archive_preload_file_cb(archive_module_state, + xlog1, pathname); + } + } + /* Return the highest priority file. 
*/ arch_files->arch_files_size--; strcpy(xlog, arch_files->arch_files[arch_files->arch_files_size]); diff --git a/src/include/archive/archive_module.h b/src/include/archive/archive_module.h index 763af76e542..d73b9661a4f 100644 --- a/src/include/archive/archive_module.h +++ b/src/include/archive/archive_module.h @@ -37,13 +37,17 @@ typedef struct ArchiveModuleState */ typedef void (*ArchiveStartupCB) (ArchiveModuleState *state); typedef bool (*ArchiveCheckConfiguredCB) (ArchiveModuleState *state); -typedef bool (*ArchiveFileCB) (ArchiveModuleState *state, const char *file, const char *path); +typedef void (*ArchivePreloadFileCB) (ArchiveModuleState *state, + const char *file, const char *path); +typedef bool (*ArchiveFileCB) (ArchiveModuleState *state, + const char *file, const char *path); typedef void (*ArchiveShutdownCB) (ArchiveModuleState *state); typedef struct ArchiveModuleCallbacks { ArchiveStartupCB startup_cb; ArchiveCheckConfiguredCB check_configured_cb; + ArchivePreloadFileCB archive_preload_file_cb; ArchiveFileCB archive_file_cb; ArchiveShutdownCB shutdown_cb; } ArchiveModuleCallbacks; From c542112051362a6454a8eda2bd8a6795dc329379 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sun, 18 Feb 2024 06:10:50 +0200 Subject: [PATCH 29/56] Remove pthread_is_threaded_np() call To use curl during shared_preload_libraries initialization. 
--- configure | 2 +- configure.ac | 1 - meson.build | 1 - src/backend/postmaster/postmaster.c | 46 ----------------------------- src/include/pg_config.h.in | 3 -- 5 files changed, 1 insertion(+), 52 deletions(-) diff --git a/configure b/configure index 3c22c471a38..9da3f6a9af1 100755 --- a/configure +++ b/configure @@ -15357,7 +15357,7 @@ fi LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -for ac_func in backtrace_symbols copyfile copy_file_range getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll pthread_is_threaded_np setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range uselocale wcstombs_l +for ac_func in backtrace_symbols copyfile copy_file_range getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range uselocale wcstombs_l do : as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" diff --git a/configure.ac b/configure.ac index 34c74e947c0..c1531abdd38 100644 --- a/configure.ac +++ b/configure.ac @@ -1762,7 +1762,6 @@ AC_CHECK_FUNCS(m4_normalize([ memset_s posix_fallocate ppoll - pthread_is_threaded_np setproctitle setproctitle_fast strchrnul diff --git a/meson.build b/meson.build index c5b87441c58..8ded179b23e 100644 --- a/meson.build +++ b/meson.build @@ -2690,7 +2690,6 @@ func_checks = [ ['posix_fallocate'], ['ppoll'], ['pthread_barrier_wait', {'dependencies': [thread_dep]}], - ['pthread_is_threaded_np', {'dependencies': [thread_dep]}], ['sem_init', {'dependencies': [rt_dep, thread_dep], 'skip': sema_kind != 'unnamed_posix', 'define': false}], ['setproctitle', {'dependencies': [util_dep]}], ['setproctitle_fast'], diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 5f0a09d465e..82b23791f31 100644 --- a/src/backend/postmaster/postmaster.c +++ 
b/src/backend/postmaster/postmaster.c @@ -85,10 +85,6 @@ #include #endif -#ifdef HAVE_PTHREAD_IS_THREADED_NP -#include -#endif - #include "access/xlog.h" #include "access/xlogrecovery.h" #include "common/file_perm.h" @@ -1330,24 +1326,6 @@ PostmasterMain(int argc, char *argv[]) */ } -#ifdef HAVE_PTHREAD_IS_THREADED_NP - - /* - * On macOS, libintl replaces setlocale() with a version that calls - * CFLocaleCopyCurrent() when its second argument is "" and every relevant - * environment variable is unset or empty. CFLocaleCopyCurrent() makes - * the process multithreaded. The postmaster calls sigprocmask() and - * calls fork() without an immediate exec(), both of which have undefined - * behavior in a multithreaded program. A multithreaded postmaster is the - * normal case on Windows, which offers neither fork() nor sigprocmask(). - */ - if (pthread_is_threaded_np() != 0) - ereport(FATAL, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("postmaster became multithreaded during startup"), - errhint("Set the LC_ALL environment variable to a valid locale."))); -#endif - /* * Remember postmaster startup time */ @@ -1756,15 +1734,6 @@ ServerLoop(void) if (StartWorkerNeeded || HaveCrashedWorker) maybe_start_bgworkers(); -#ifdef HAVE_PTHREAD_IS_THREADED_NP - - /* - * With assertions enabled, check regularly for appearance of - * additional threads. All builds check at start and exit. - */ - Assert(pthread_is_threaded_np() == 0); -#endif - /* * Lastly, check to see if it's time to do some things that we don't * want to do every single time through the loop, because they're a @@ -3684,21 +3653,6 @@ report_fork_failure_to_client(ClientSocket *client_sock, int errnum) static void ExitPostmaster(int status) { -#ifdef HAVE_PTHREAD_IS_THREADED_NP - - /* - * There is no known cause for a postmaster to become multithreaded after - * startup. Recheck to account for the possibility of unknown causes. 
- * This message uses LOG level, because an unclean shutdown at this point - * would usually not look much different from a clean shutdown. - */ - if (pthread_is_threaded_np() != 0) - ereport(LOG, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg_internal("postmaster became multithreaded"), - errdetail("Please report this to <%s>.", PACKAGE_BUGREPORT))); -#endif - /* should cleanup shared memory and kill all backends */ /* diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 6d9a0d001f4..7b7604d912a 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -327,9 +327,6 @@ /* Define to 1 if you have the `pthread_barrier_wait' function. */ #undef HAVE_PTHREAD_BARRIER_WAIT -/* Define to 1 if you have the `pthread_is_threaded_np' function. */ -#undef HAVE_PTHREAD_IS_THREADED_NP - /* Have PTHREAD_PRIO_INHERIT. */ #undef HAVE_PTHREAD_PRIO_INHERIT From a29f91179c1dfb9b580aa7e7e6c8df55b6bf5d39 Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Fri, 8 Dec 2023 01:37:02 +0100 Subject: [PATCH 30/56] Added option to pg_rewind to perform extension specific rewind - added option --extension for pg_rewind - extracted SimpleXLogRead from extractPageMap for generic wal iteration in pg_rewind --- doc/src/sgml/ref/pg_rewind.sgml | 5 ++ src/bin/pg_rewind/Makefile | 7 +- src/bin/pg_rewind/extension.c | 132 ++++++++++++++++++++++++++++++ src/bin/pg_rewind/filemap.c | 40 +++++++++ src/bin/pg_rewind/parsexlog.c | 36 +++++--- src/bin/pg_rewind/pg_rewind.c | 15 +++- src/bin/pg_rewind/pg_rewind.h | 10 +++ src/bin/pg_rewind/pg_rewind_ext.h | 44 ++++++++++ 8 files changed, 273 insertions(+), 16 deletions(-) create mode 100644 src/bin/pg_rewind/extension.c create mode 100644 src/bin/pg_rewind/pg_rewind_ext.h diff --git a/doc/src/sgml/ref/pg_rewind.sgml b/doc/src/sgml/ref/pg_rewind.sgml index dc039d87566..0c8e7dd2cc3 100644 --- a/doc/src/sgml/ref/pg_rewind.sgml +++ b/doc/src/sgml/ref/pg_rewind.sgml @@ -302,6 +302,11 @@ PostgreSQL documentation This option has no 
effect when is used. + + + + + Load shared library that performs custom rewind for postgres extension. The path may be full or relative to PKGLIBDIR. File extension is optional. Multiple extensions can be selected by multiple switches. diff --git a/src/bin/pg_rewind/Makefile b/src/bin/pg_rewind/Makefile index 12b138b2f2c..4f93864cf7e 100644 --- a/src/bin/pg_rewind/Makefile +++ b/src/bin/pg_rewind/Makefile @@ -21,6 +21,7 @@ LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils $(libpq_pgport) OBJS = \ $(WIN32RES) \ datapagemap.o \ + extension.o \ file_ops.o \ filemap.o \ libpq_source.o \ @@ -35,19 +36,21 @@ EXTRA_CLEAN = xlogreader.c all: pg_rewind pg_rewind: $(OBJS) | submake-libpq submake-libpgport - $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X) + $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LDFLAGS_EX_BE) $(LIBS) -o $@$(X) xlogreader.c: % : $(top_srcdir)/src/backend/access/transam/% rm -f $@ && $(LN_S) $< . install: all installdirs $(INSTALL_PROGRAM) pg_rewind$(X) '$(DESTDIR)$(bindir)/pg_rewind$(X)' + $(INSTALL_DATA) $(srcdir)/pg_rewind_ext.h '$(DESTDIR)$(includedir)' installdirs: - $(MKDIR_P) '$(DESTDIR)$(bindir)' + $(MKDIR_P) '$(DESTDIR)$(bindir)' '$(DESTDIR)$(includedir)' uninstall: rm -f '$(DESTDIR)$(bindir)/pg_rewind$(X)' + rm -f '$(DESTDIR)$(includedir)/pg_rewind_ext.h' clean distclean: rm -f pg_rewind$(X) $(OBJS) xlogreader.c diff --git a/src/bin/pg_rewind/extension.c b/src/bin/pg_rewind/extension.c new file mode 100644 index 00000000000..29ec4b5a6f6 --- /dev/null +++ b/src/bin/pg_rewind/extension.c @@ -0,0 +1,132 @@ +/*------------------------------------------------------------------------- + * + * extension.c + * Functions for processing shared libraries loaded by pg_rewind. + * + * Copyright (c) 2013-2023, PostgreSQL Global Development Group + * + *------------------------------------------------------------------------- + */ + +#include "postgres_fe.h" + +#ifndef WIN32 +#include + +/* + * On macOS, insists on including . 
If we're not + * using stdbool, undef bool to undo the damage. + */ +#ifndef PG_USE_STDBOOL +#ifdef bool +#undef bool +#endif +#endif +#endif /* !WIN32 */ + +#include + +#include "access/xlog_internal.h" +#include "pg_rewind.h" + +/* signature for pg_rewind extension library rewind function */ +typedef void (*PG_rewind_t) (const char *datadir_target, char *datadir_source, + char *connstr_source, XLogRecPtr startpoint, + int tliIndex, XLogRecPtr endpoint, + const char *restoreCommand, const char *argv0, + bool debug); + +static bool +file_exists(const char *argv0, const char *name) +{ + struct stat st; + + Assert(name != NULL); + + if (stat(name, &st) == 0) + return !S_ISDIR(st.st_mode); + else if (!(errno == ENOENT || errno == ENOTDIR || errno == EACCES)) + { + const char *progname; + + progname = get_progname(argv0); + pg_log_error("could not access file \"%s\": %m", name); + pg_log_error_hint("Try \"%s --help\" for more information.", progname); + exit(1); + } + + return false; +} + +static char * +expand_dynamic_library_name(const char *argv0, const char *name) +{ + char *full; + char my_exec_path[MAXPGPATH]; + char pkglib_path[MAXPGPATH]; + + Assert(name); + + if (find_my_exec(argv0, my_exec_path) < 0) + pg_fatal("%s: could not locate my own executable path", argv0); + get_pkglib_path(my_exec_path, pkglib_path); + full = palloc(strlen(pkglib_path) + 1 + strlen(name) + 1); + sprintf(full, "%s/%s", pkglib_path, name); + if (file_exists(argv0, full)) + return full; + pfree(full); + + full = palloc(strlen(pkglib_path) + 1 + strlen(name) + 1 + + strlen(DLSUFFIX) + 1); + sprintf(full, "%s/%s%s", pkglib_path, name, DLSUFFIX); + if (file_exists(argv0, full)) + return full; + pfree(full); + + return pstrdup(name); +} + +void +process_extensions(SimpleStringList *extensions, const char *datadir_target, + char *datadir_source, char *connstr_source, + XLogRecPtr startpoint, int tliIndex, XLogRecPtr endpoint, + const char *restoreCommand, const char *argv0, + bool debug) +{ 
+ SimpleStringListCell *cell; + + if (extensions->head == NULL) + return; /* nothing to do */ + + for (cell = extensions->head; cell; cell = cell->next) + { + char *filename = cell->val; + char *fullname; + void *lib_handle; + PG_rewind_t PG_rewind; + char *load_error; + + fullname = expand_dynamic_library_name(argv0, filename); + + lib_handle = dlopen(fullname, RTLD_NOW | RTLD_GLOBAL); + if (lib_handle == NULL) + { + load_error = dlerror(); + pg_fatal("could not load library \"%s\": %s", fullname, load_error); + } + + PG_rewind = dlsym(lib_handle, "_PG_rewind"); + + if (PG_rewind == NULL) + pg_fatal("could not find function \"_PG_rewind\" in \"%s\"", + fullname); + pfree(fullname); + + if (showprogress) + pg_log_info("performing rewind for '%s' extension", filename); + PG_rewind(datadir_target, datadir_source, connstr_source, startpoint, + tliIndex, endpoint, restoreCommand, argv0, debug); + + pg_log_debug("loaded library \"%s\"", filename); + } +} diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c index 4458324c9d8..83a2476a7e1 100644 --- a/src/bin/pg_rewind/filemap.c +++ b/src/bin/pg_rewind/filemap.c @@ -53,6 +53,7 @@ #define FILEHASH_INITIAL_SIZE 1000 static filehash_hash *filehash; +static SimpleStringList extensions_exclude = {NULL, NULL}; static bool isRelDataFile(const char *path); static char *datasegpath(RelFileLocator rlocator, ForkNumber forknum, @@ -260,6 +261,8 @@ process_target_file(const char *path, file_type_t type, size_t size, * from the target data folder all paths which have been filtered out from * the source data folder when processing the source files. */ + if (check_file_excluded(path, false)) + return; /* * Like in process_source_file, pretend that pg_wal is always a directory. 
@@ -404,6 +407,31 @@ check_file_excluded(const char *path, bool is_source) } } + /* + * Exclude extensions directories + */ + if (extensions_exclude.head != NULL) + { + SimpleStringListCell *cell; + + for (cell = extensions_exclude.head; cell; cell = cell->next) + { + char *exclude_dir = cell->val; + + snprintf(localpath, sizeof(localpath), "%s/", exclude_dir); + if (strstr(path, localpath) == path) + { + if (is_source) + pg_log_debug("entry \"%s\" excluded from source file list", + path); + else + pg_log_debug("entry \"%s\" excluded from target file list", + path); + return true; + } + } + } + return false; } @@ -820,3 +848,15 @@ decide_file_actions(void) return filemap; } + +void +extensions_exclude_add(char **exclude_dirs) +{ + int i; + + for (i = 0; exclude_dirs[i] != NULL; i++) + { + simple_string_list_append(&extensions_exclude, + pstrdup(exclude_dirs[i])); + } +} diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index 22f7351fdcd..ca8ec05220e 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -38,7 +38,7 @@ static const char *const RmgrNames[RM_MAX_ID + 1] = { #define RmgrName(rmid) (((rmid) <= RM_MAX_BUILTIN_ID) ? \ RmgrNames[rmid] : "custom") -static void extractPageInfo(XLogReaderState *record); +static void extractPageInfo(XLogReaderState *record, void *arg); static int xlogreadfd = -1; static XLogSegNo xlogreadsegno = 0; @@ -54,17 +54,11 @@ static int SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *readBuf); -/* - * Read WAL from the datadir/pg_wal, starting from 'startpoint' on timeline - * index 'tliIndex' in target timeline history, until 'endpoint'. Make note of - * the data blocks touched by the WAL records, and return them in a page map. - * - * 'endpoint' is the end of the last record to read. The record starting at - * 'endpoint' is the first one that is not read. 
- */ void -extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, - XLogRecPtr endpoint, const char *restoreCommand) +SimpleXLogRead(const char *datadir, XLogRecPtr startpoint, int tliIndex, + XLogRecPtr endpoint, const char *restoreCommand, + void (*page_callback) (XLogReaderState *, void *arg), + void *arg) { XLogRecord *record; XLogReaderState *xlogreader; @@ -97,7 +91,7 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, LSN_FORMAT_ARGS(errptr)); } - extractPageInfo(xlogreader); + page_callback(xlogreader, arg); } while (xlogreader->EndRecPtr < endpoint); /* @@ -116,6 +110,22 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, } } +/* + * Read WAL from the datadir/pg_wal, starting from 'startpoint' on timeline + * index 'tliIndex' in target timeline history, until 'endpoint'. Make note of + * the data blocks touched by the WAL records, and return them in a page map. + * + * 'endpoint' is the end of the last record to read. The record starting at + * 'endpoint' is the first one that is not read. + */ +void +extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, + XLogRecPtr endpoint, const char *restoreCommand) +{ + SimpleXLogRead(datadir, startpoint, tliIndex, endpoint, restoreCommand, + extractPageInfo, NULL); +} + /* * Reads one WAL record. Returns the end position of the record, without * doing anything with the record itself. @@ -365,7 +375,7 @@ SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, * Extract information on which blocks the current record modifies. 
*/ static void -extractPageInfo(XLogReaderState *record) +extractPageInfo(XLogReaderState *record, void *arg) { int block_id; RmgrId rmid = XLogRecGetRmid(record); diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index 052c83b8757..9b0a340d14b 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -77,6 +77,8 @@ bool do_sync = true; bool restore_wal = false; DataDirSyncMethod sync_method = DATA_DIR_SYNC_METHOD_FSYNC; +static SimpleStringList extensions = {NULL, NULL}; + /* Target history */ TimeLineHistoryEntry *targetHistory; int targetNentries; @@ -110,6 +112,7 @@ usage(const char *progname) printf(_(" --debug write a lot of debug messages\n")); printf(_(" --no-ensure-shutdown do not automatically fix unclean shutdown\n")); printf(_(" --sync-method=METHOD set method for syncing files to disk\n")); + printf(_(" -e, --extension=PATH path to library performing rewind for extension\n")); printf(_(" -V, --version output version information, then exit\n")); printf(_(" -?, --help show this help, then exit\n")); printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT); @@ -135,6 +138,7 @@ main(int argc, char **argv) {"progress", no_argument, NULL, 'P'}, {"debug", no_argument, NULL, 3}, {"sync-method", required_argument, NULL, 6}, + {"extension", required_argument, NULL, 'e'}, {NULL, 0, NULL, 0} }; int option_index; @@ -173,7 +177,7 @@ main(int argc, char **argv) } } - while ((c = getopt_long(argc, argv, "cD:nNPR", long_options, &option_index)) != -1) + while ((c = getopt_long(argc, argv, "cD:nNPRe", long_options, &option_index)) != -1) { switch (c) { @@ -225,6 +229,9 @@ main(int argc, char **argv) case 6: if (!parse_sync_method(optarg, &sync_method)) exit(1); + + case 'e': /* -e or --extension */ + simple_string_list_append(&extensions, optarg); break; default: @@ -463,6 +470,12 @@ main(int argc, char **argv) /* Initialize the hash table to track the status of each file */ filehash_init(); + if (extensions.head != 
NULL) + process_extensions(&extensions, datadir_target, datadir_source, + connstr_source, chkptrec, lastcommontliIndex, + target_wal_endrec, restore_command, argv[0], + debug); + /* * Collect information about all files in the both data directories. */ diff --git a/src/bin/pg_rewind/pg_rewind.h b/src/bin/pg_rewind/pg_rewind.h index ec43cbe2c67..4397259e0d0 100644 --- a/src/bin/pg_rewind/pg_rewind.h +++ b/src/bin/pg_rewind/pg_rewind.h @@ -15,7 +15,9 @@ #include "common/logging.h" #include "common/file_utils.h" #include "datapagemap.h" +#include "fe_utils/simple_list.h" #include "libpq-fe.h" +#include "pg_rewind_ext.h" #include "storage/block.h" #include "storage/relfilelocator.h" @@ -55,4 +57,12 @@ extern TimeLineHistoryEntry *rewind_parseTimeLineHistory(char *buffer, TimeLineID targetTLI, int *nentries); +/* in extension.c */ +extern void process_extensions(SimpleStringList *extensions, + const char *datadir_target, char *datadir_source, + char *connstr_source, XLogRecPtr startpoint, + int tliIndex, XLogRecPtr endpoint, + const char *restoreCommand, const char *argv0, + bool debug); + #endif /* PG_REWIND_H */ diff --git a/src/bin/pg_rewind/pg_rewind_ext.h b/src/bin/pg_rewind/pg_rewind_ext.h new file mode 100644 index 00000000000..3616d94f588 --- /dev/null +++ b/src/bin/pg_rewind/pg_rewind_ext.h @@ -0,0 +1,44 @@ +/*------------------------------------------------------------------------- + * + * pg_rewind_ext.h + * + * + * Copyright (c) 1996-2023, PostgreSQL Global Development Group + * + *------------------------------------------------------------------------- + */ +#ifndef PG_REWIND_EXT_H +#define PG_REWIND_EXT_H + +#include "access/xlogreader.h" + +/* in parsexlog.c */ +/* + * Read WAL from the datadir/pg_wal, starting from 'startpoint' on timeline + * index 'tliIndex' in target timeline history, until 'endpoint'. + * Pass all WAL records to 'page_callback'. + * + * 'endpoint' is the end of the last record to read. 
The record starting at + * 'endpoint' is the first one that is not read. + */ +extern void SimpleXLogRead(const char *datadir, XLogRecPtr startpoint, + int tliIndex, XLogRecPtr endpoint, + const char *restoreCommand, + void (*page_callback) (XLogReaderState *, + void *arg), + void *arg); + + +/* in filemap.c */ +/* Add NULL-terminated list of dirs that pg_rewind can skip copying */ +extern void extensions_exclude_add(char **exclude_dirs); + +/* signature for pg_rewind extension library rewind function */ +extern PGDLLEXPORT void _PG_rewind(const char *datadir_target, + char *datadir_source, char *connstr_source, + XLogRecPtr startpoint, int tliIndex, + XLogRecPtr endpoint, + const char *restoreCommand, + const char *argv0, bool debug); + +#endif /* PG_REWIND_EXT_H */ From a8790b51886bf7dde9d7a2ab2b66209e89358f81 Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Fri, 3 May 2024 22:05:35 +0200 Subject: [PATCH 31/56] Index scan and index only scan with rowid --- src/backend/access/heap/heapam_handler.c | 3 +- src/backend/access/index/genam.c | 2 + src/backend/access/index/indexam.c | 88 +++++++++++++++++++++--- src/backend/access/table/tableam.c | 2 +- src/backend/commands/constraint.c | 2 +- src/backend/executor/nodeIndexonlyscan.c | 33 +++++++-- src/backend/utils/adt/selfuncs.c | 28 ++++++-- src/include/access/genam.h | 3 + src/include/access/relscan.h | 2 + src/include/access/tableam.h | 6 +- 10 files changed, 147 insertions(+), 22 deletions(-) diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 2c2c7061189..7d6828db403 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -132,7 +132,7 @@ heapam_index_fetch_end(IndexFetchTableData *scan) static bool heapam_index_fetch_tuple(struct IndexFetchTableData *scan, - ItemPointer tid, + Datum tupleid, Snapshot snapshot, TupleTableSlot *slot, bool *call_again, bool *all_dead) @@ -140,6 +140,7 @@ 
heapam_index_fetch_tuple(struct IndexFetchTableData *scan, IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; bool got_heap_tuple; + ItemPointer tid = DatumGetItemPointer(tupleid); Assert(TTS_IS_BUFFERTUPLE(slot)); diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index de751e8e4a3..e162df6dfd1 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -101,6 +101,7 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->orderByData = NULL; scan->xs_want_itup = false; /* may be set later */ + scan->xs_want_rowid = false; /* may be set later */ /* * During recovery we ignore killed tuples and don't bother to kill them @@ -122,6 +123,7 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->xs_itupdesc = NULL; scan->xs_hitup = NULL; scan->xs_hitupdesc = NULL; + scan->xs_rowid.isnull = true; return scan; } diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index dcd04b813d8..596773a5c11 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -610,6 +610,55 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) return &scan->xs_heaptid; } +/* ---------------- + * index_getnext_rowid - get the next ROWID from a scan + * + * The result is the next ROWID satisfying the scan keys, + * or isnull if no more matching tuples exist. + * ---------------- + */ +NullableDatum +index_getnext_rowid(IndexScanDesc scan, ScanDirection direction) +{ + NullableDatum result; + bool found; + + SCAN_CHECKS; + CHECK_SCAN_PROCEDURE(amgettuple); + + /* XXX: we should assert that a snapshot is pushed or registered */ + Assert(TransactionIdIsValid(RecentXmin)); + + /* + * The AM's amgettuple proc finds the next index entry matching the scan + * keys, and puts the TID into scan->xs_heaptid. 
It should also set + * scan->xs_recheck and possibly scan->xs_itup/scan->xs_hitup, though we + * pay no attention to those fields here. + */ + found = scan->indexRelation->rd_indam->amgettuple(scan, direction); + + /* Reset kill flag immediately for safety */ + scan->kill_prior_tuple = false; + scan->xs_heap_continue = false; + + /* If we're out of index entries, we're done */ + if (!found) + { + /* release resources (like buffer pins) from table accesses */ + if (scan->xs_heapfetch) + table_index_fetch_reset(scan->xs_heapfetch); + + result.isnull = true; + return result; + } + /* Assert(RowidIsValid(&scan->xs_rowid)); */ + + pgstat_count_index_tuples(scan->indexRelation, 1); + + /* Return the ROWID of the tuple we found. */ + return scan->xs_rowid; +} + /* ---------------- * index_fetch_heap - get the scan's next heap tuple * @@ -633,8 +682,17 @@ index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot) { bool all_dead = false; bool found; + Datum tupleid; + + if (scan->xs_want_rowid) + { + Assert(!scan->xs_rowid.isnull); + tupleid = scan->xs_rowid.value; + } + else + tupleid = PointerGetDatum(&scan->xs_heaptid); - found = table_index_fetch_tuple(scan->xs_heapfetch, &scan->xs_heaptid, + found = table_index_fetch_tuple(scan->xs_heapfetch, tupleid, scan->xs_snapshot, slot, &scan->xs_heap_continue, &all_dead); @@ -676,16 +734,30 @@ index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot * { if (!scan->xs_heap_continue) { - ItemPointer tid; + if (scan->xs_want_rowid) + { + NullableDatum rowid; + /* Time to fetch the next TID from the index */ + rowid = index_getnext_rowid(scan, direction); - /* Time to fetch the next TID from the index */ - tid = index_getnext_tid(scan, direction); + /* If we're out of index entries, we're done */ + if (rowid.isnull) + break; - /* If we're out of index entries, we're done */ - if (tid == NULL) - break; + /* Assert(RowidEquals(rowid, &scan->xs_rowid)); */ + } + else + { + ItemPointer tid; + /* Time to fetch the 
next TID from the index */ + tid = index_getnext_tid(scan, direction); - Assert(ItemPointerEquals(tid, &scan->xs_heaptid)); + /* If we're out of index entries, we're done */ + if (tid == NULL) + break; + + Assert(ItemPointerEquals(tid, &scan->xs_heaptid)); + } } /* diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index fed5ac6cd01..d5eb19be3bf 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -217,7 +217,7 @@ table_index_fetch_tuple_check(Relation rel, slot = table_slot_create(rel, NULL); scan = table_index_fetch_begin(rel); - found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again, + found = table_index_fetch_tuple(scan, PointerGetDatum(tid), snapshot, slot, &call_again, all_dead); table_index_fetch_end(scan); ExecDropSingleTupleTableSlot(slot); diff --git a/src/backend/commands/constraint.c b/src/backend/commands/constraint.c index f7dc42f7452..ea5a1f365b1 100644 --- a/src/backend/commands/constraint.c +++ b/src/backend/commands/constraint.c @@ -109,7 +109,7 @@ unique_key_recheck(PG_FUNCTION_ARGS) IndexFetchTableData *scan = table_index_fetch_begin(trigdata->tg_relation); bool call_again = false; - if (!table_index_fetch_tuple(scan, &tmptid, SnapshotSelf, slot, + if (!table_index_fetch_tuple(scan, PointerGetDatum(&tmptid), SnapshotSelf, slot, &call_again, NULL)) { /* diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index b49194c0167..a8424922ccc 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -65,7 +65,7 @@ IndexOnlyNext(IndexOnlyScanState *node) ScanDirection direction; IndexScanDesc scandesc; TupleTableSlot *slot; - ItemPointer tid; + ItemPointer tid = NULL; /* * extract necessary information from index scan node @@ -117,12 +117,36 @@ IndexOnlyNext(IndexOnlyScanState *node) /* * OK, now that we have what we need, fetch the next tuple. 
*/ - while ((tid = index_getnext_tid(scandesc, direction)) != NULL) + while (true) { bool tuple_from_heap = false; CHECK_FOR_INTERRUPTS(); + if (scandesc->xs_want_rowid) + { + NullableDatum rowid; + /* Time to fetch the next TID from the index */ + rowid = index_getnext_rowid(scandesc, direction); + + /* If we're out of index entries, we're done */ + if (rowid.isnull) + break; + + /* Assert(RowidEquals(rowid, &scan->xs_rowid)); */ + } + else + { + /* Time to fetch the next TID from the index */ + tid = index_getnext_tid(scandesc, direction); + + /* If we're out of index entries, we're done */ + if (tid == NULL) + break; + + Assert(ItemPointerEquals(tid, &scandesc->xs_heaptid)); + } + /* * We can skip the heap fetch if the TID references a heap page on * which all tuples are known visible to everybody. In any case, @@ -157,7 +181,8 @@ IndexOnlyNext(IndexOnlyScanState *node) * It's worth going through this complexity to avoid needing to lock * the VM buffer, which could cause significant contention. */ - if (!VM_ALL_VISIBLE(scandesc->heapRelation, + if (!scandesc->xs_want_rowid && + !VM_ALL_VISIBLE(scandesc->heapRelation, ItemPointerGetBlockNumber(tid), &node->ioss_VMBuffer)) { @@ -242,7 +267,7 @@ IndexOnlyNext(IndexOnlyScanState *node) * If we didn't access the heap, then we'll need to take a predicate * lock explicitly, as if we had. For now we do that at page level. 
*/ - if (!tuple_from_heap) + if (!tuple_from_heap && !scandesc->xs_want_rowid) PredicateLockPage(scandesc->heapRelation, ItemPointerGetBlockNumber(tid), estate->es_snapshot); diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 5f5d7959d8e..884d12da88c 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -6339,12 +6339,32 @@ get_actual_variable_endpoint(Relation heapRel, index_scan->xs_want_itup = true; index_rescan(index_scan, scankeys, 1, NULL, 0); - /* Fetch first/next tuple in specified direction */ - while ((tid = index_getnext_tid(index_scan, indexscandir)) != NULL) + while (true) { - BlockNumber block = ItemPointerGetBlockNumber(tid); + BlockNumber block = InvalidBlockNumber; - if (!VM_ALL_VISIBLE(heapRel, + /* Fetch first/next tuple in specified direction */ + if (index_scan->xs_want_rowid) + { + NullableDatum rowid; + rowid = index_getnext_rowid(index_scan, indexscandir); + + if (rowid.isnull) + break; + } + else + { + tid = index_getnext_tid(index_scan, indexscandir); + + if (tid == NULL) + break; + + Assert(ItemPointerEquals(tid, &index_scan->xs_heaptid)); + block = ItemPointerGetBlockNumber(tid); + } + + if (!index_scan->xs_want_rowid && + !VM_ALL_VISIBLE(heapRel, block, &vmbuffer)) { diff --git a/src/include/access/genam.h b/src/include/access/genam.h index fdcfbe8db74..1d74c87cd20 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -176,6 +176,9 @@ extern IndexScanDesc index_beginscan_parallel(Relation heaprel, ParallelIndexScanDesc pscan); extern ItemPointer index_getnext_tid(IndexScanDesc scan, ScanDirection direction); +extern NullableDatum index_getnext_rowid(IndexScanDesc scan, + ScanDirection direction); +extern Datum index_getnext_tupleid(IndexScanDesc scan, ScanDirection direction); struct TupleTableSlot; extern bool index_fetch_heap(IndexScanDesc scan, struct TupleTableSlot *slot); extern bool index_getnext_slot(IndexScanDesc scan, ScanDirection 
direction, diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 521043304ab..24b04709012 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -122,6 +122,7 @@ typedef struct IndexScanDescData struct ScanKeyData *keyData; /* array of index qualifier descriptors */ struct ScanKeyData *orderByData; /* array of ordering op descriptors */ bool xs_want_itup; /* caller requests index tuples */ + bool xs_want_rowid; /* caller requests index tuples */ bool xs_temp_snap; /* unregister snapshot at scan end? */ /* signaling to index AM about killing index tuples */ @@ -145,6 +146,7 @@ typedef struct IndexScanDescData struct TupleDescData *xs_hitupdesc; /* rowtype descriptor of xs_hitup */ ItemPointerData xs_heaptid; /* result */ + NullableDatum xs_rowid; /* result if xs_want_rowid */ bool xs_heap_continue; /* T if must keep walking, potential * further results */ IndexFetchTableData *xs_heapfetch; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index f30c507abb1..d681c7636cb 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -477,7 +477,7 @@ typedef struct TableAmRoutine * future searches. 
*/ bool (*index_fetch_tuple) (struct IndexFetchTableData *scan, - ItemPointer tid, + Datum tupleid, Snapshot snapshot, TupleTableSlot *slot, bool *call_again, bool *all_dead); @@ -1267,7 +1267,7 @@ table_index_fetch_end(struct IndexFetchTableData *scan) */ static inline bool table_index_fetch_tuple(struct IndexFetchTableData *scan, - ItemPointer tid, + Datum tupleid, Snapshot snapshot, TupleTableSlot *slot, bool *call_again, bool *all_dead) @@ -1280,7 +1280,7 @@ table_index_fetch_tuple(struct IndexFetchTableData *scan, if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) elog(ERROR, "unexpected table_index_fetch_tuple call during logical decoding"); - return scan->rel->rd_tableam->index_fetch_tuple(scan, tid, snapshot, + return scan->rel->rd_tableam->index_fetch_tuple(scan, tupleid, snapshot, slot, call_again, all_dead); } From 7b7268e2067e7dac26bfc07d3af0679f4abc9370 Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Wed, 8 May 2024 04:09:19 +0200 Subject: [PATCH 32/56] Remove primary index am check --- src/backend/access/index/indexam.c | 3 ++- src/backend/catalog/index.c | 3 --- src/backend/parser/parse_utilcmd.c | 13 ------------- 3 files changed, 2 insertions(+), 17 deletions(-) diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 596773a5c11..8e6fbd2555d 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -765,7 +765,8 @@ index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot * * If we don't find anything, loop around and grab the next TID from * the index. 
*/ - Assert(ItemPointerIsValid(&scan->xs_heaptid)); + if (!scan->xs_want_rowid) + Assert(ItemPointerIsValid(&scan->xs_heaptid)); if (index_fetch_heap(scan, slot)) return true; } diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 3e88bd877af..91a115003b9 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -2648,9 +2648,6 @@ BuildSpeculativeIndexInfo(Relation index, IndexInfo *ii) */ Assert(ii->ii_Unique); - if (index->rd_rel->relam != BTREE_AM_OID) - elog(ERROR, "unexpected non-btree speculative unique index"); - ii->ii_UniqueOps = (Oid *) palloc(sizeof(Oid) * indnkeyatts); ii->ii_UniqueProcs = (Oid *) palloc(sizeof(Oid) * indnkeyatts); ii->ii_UniqueStrats = (uint16 *) palloc(sizeof(uint16) * indnkeyatts); diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index eaf46ab6871..ad207acae60 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -2310,19 +2310,6 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt) errdetail("Cannot create a non-deferrable constraint using a deferrable index."), parser_errposition(cxt->pstate, constraint->location))); - /* - * Insist on it being a btree. That's the only kind that supports - * uniqueness at the moment anyway; but we must have an index that - * exactly matches what you'd get from plain ADD CONSTRAINT syntax, - * else dump and reload will produce a different index (breaking - * pg_upgrade in particular). 
- */ - if (index_rel->rd_rel->relam != get_index_am_oid(DEFAULT_INDEX_TYPE, false)) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("index \"%s\" is not a btree", index_name), - parser_errposition(cxt->pstate, constraint->location))); - /* Must get indclass the hard way */ indclassDatum = SysCacheGetAttrNotNull(INDEXRELID, index_rel->rd_indextuple, From 836031358a1b128fa445f94d11f1f099bc3a53cc Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Mon, 13 May 2024 20:33:54 +0200 Subject: [PATCH 33/56] Passing tupleid to insert now --- contrib/bloom/blinsert.c | 3 ++- contrib/bloom/bloom.h | 2 +- src/backend/access/brin/brin.c | 3 ++- src/backend/access/common/toast_internals.c | 2 +- src/backend/access/gin/gininsert.c | 3 ++- src/backend/access/gist/gist.c | 3 ++- src/backend/access/hash/hash.c | 3 ++- src/backend/access/heap/heapam_handler.c | 2 +- src/backend/access/index/indexam.c | 4 ++-- src/backend/access/nbtree/nbtree.c | 3 ++- src/backend/access/spgist/spginsert.c | 3 ++- src/backend/catalog/indexing.c | 2 +- src/backend/commands/constraint.c | 2 +- src/backend/executor/execIndexing.c | 18 +++++++++++++++--- src/backend/executor/nodeModifyTable.c | 4 ++-- src/include/access/amapi.h | 2 +- src/include/access/brin_internal.h | 2 +- src/include/access/genam.h | 2 +- src/include/access/gin_private.h | 2 +- src/include/access/gist_private.h | 2 +- src/include/access/hash.h | 2 +- src/include/access/nbtree.h | 2 +- src/include/access/spgist.h | 2 +- .../modules/dummy_index_am/dummy_index_am.c | 2 +- 24 files changed, 47 insertions(+), 28 deletions(-) diff --git a/contrib/bloom/blinsert.c b/contrib/bloom/blinsert.c index f8a1061abb9..7873118d112 100644 --- a/contrib/bloom/blinsert.c +++ b/contrib/bloom/blinsert.c @@ -172,7 +172,7 @@ blbuildempty(Relation index) */ bool blinsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, 
IndexInfo *indexInfo) @@ -189,6 +189,7 @@ blinsert(Relation index, Datum *values, bool *isnull, BlockNumber blkno = InvalidBlockNumber; OffsetNumber nStart; GenericXLogState *state; + ItemPointer ht_ctid = DatumGetItemPointer(tupleid); insertCtx = AllocSetContextCreate(CurrentMemoryContext, "Bloom insert temporary context", diff --git a/contrib/bloom/bloom.h b/contrib/bloom/bloom.h index fba3ba77711..b9aaca16fa2 100644 --- a/contrib/bloom/bloom.h +++ b/contrib/bloom/bloom.h @@ -189,7 +189,7 @@ extern bool blvalidate(Oid opclassoid); /* index access method interface functions */ extern bool blinsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 6467bed604a..c33c3636801 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -333,7 +333,7 @@ initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo) */ bool brininsert(Relation idxRel, Datum *values, bool *nulls, - ItemPointer heaptid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) @@ -348,6 +348,7 @@ brininsert(Relation idxRel, Datum *values, bool *nulls, MemoryContext tupcxt = NULL; MemoryContext oldcxt = CurrentMemoryContext; bool autosummarize = BrinGetAutoSummarize(idxRel); + ItemPointer heaptid = DatumGetItemPointer(tupleid); /* * If first time through in this statement, initialize the insert state diff --git a/src/backend/access/common/toast_internals.c b/src/backend/access/common/toast_internals.c index 538a554c917..2b4fa1fb25a 100644 --- a/src/backend/access/common/toast_internals.c +++ b/src/backend/access/common/toast_internals.c @@ -338,7 +338,7 @@ toast_save_datum(Relation rel, Datum value, /* Only index relations marked as ready can be updated */ if 
(toastidxs[i]->rd_index->indisready) index_insert(toastidxs[i], t_values, t_isnull, - &(toasttup->t_self), + ItemPointerGetDatum(&(toasttup->t_self)), toastrel, toastidxs[i]->rd_index->indisunique ? UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 71f38be90c3..690c744d9a9 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -481,7 +481,7 @@ ginHeapTupleInsert(GinState *ginstate, OffsetNumber attnum, bool gininsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) @@ -490,6 +490,7 @@ gininsert(Relation index, Datum *values, bool *isnull, MemoryContext oldCtx; MemoryContext insertCtx; int i; + ItemPointer ht_ctid = DatumGetItemPointer(tupleid); /* Initialize GinState cache if first call in this statement */ if (ginstate == NULL) diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index ed4ffa63a77..0da8ab31046 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -157,7 +157,7 @@ gistbuildempty(Relation index) */ bool gistinsert(Relation r, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) @@ -165,6 +165,7 @@ gistinsert(Relation r, Datum *values, bool *isnull, GISTSTATE *giststate = (GISTSTATE *) indexInfo->ii_AmCache; IndexTuple itup; MemoryContext oldCxt; + ItemPointer ht_ctid = DatumGetItemPointer(tupleid); /* Initialize GISTSTATE cache if first call in this statement */ if (giststate == NULL) diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 01d06b7c328..1dc15d2a53b 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -249,7 +249,7 @@ hashbuildCallback(Relation 
index, */ bool hashinsert(Relation rel, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) @@ -257,6 +257,7 @@ hashinsert(Relation rel, Datum *values, bool *isnull, Datum index_values[1]; bool index_isnull[1]; IndexTuple itup; + ItemPointer ht_ctid = DatumGetItemPointer(tupleid); /* convert data to a hash key; on failure, do not insert anything */ if (!_hash_convert_tuple(rel, diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 7d6828db403..6f0464896c0 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -2313,7 +2313,7 @@ heapam_index_validate_scan(Relation heapRelation, index_insert(indexRelation, values, isnull, - &rootTuple, + ItemPointerGetDatum(&rootTuple), heapRelation, indexInfo->ii_Unique ? UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 8e6fbd2555d..d4d1bf52739 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -213,7 +213,7 @@ bool index_insert(Relation indexRelation, Datum *values, bool *isnull, - ItemPointer heap_t_ctid, + Datum tupleid, Relation heapRelation, IndexUniqueCheck checkUnique, bool indexUnchanged, @@ -228,7 +228,7 @@ index_insert(Relation indexRelation, InvalidBlockNumber); return indexRelation->rd_indam->aminsert(indexRelation, values, isnull, - heap_t_ctid, heapRelation, + tupleid, heapRelation, checkUnique, indexUnchanged, indexInfo); } diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 59155a7bea6..4acb3c73089 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -180,13 +180,14 @@ btbuildempty(Relation index) */ bool btinsert(Relation rel, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum 
tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) { bool result; IndexTuple itup; + ItemPointer ht_ctid = DatumGetItemPointer(tupleid); /* generate an index tuple */ itup = index_form_tuple(RelationGetDescr(rel), values, isnull); diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 1bec19c2b88..57004e79f54 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -181,7 +181,7 @@ spgbuildempty(Relation index) */ bool spginsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) @@ -189,6 +189,7 @@ spginsert(Relation index, Datum *values, bool *isnull, SpGistState spgstate; MemoryContext oldCtx; MemoryContext insertCtx; + ItemPointer ht_ctid = DatumGetItemPointer(tupleid); insertCtx = AllocSetContextCreate(CurrentMemoryContext, "SP-GiST insert temporary context", diff --git a/src/backend/catalog/indexing.c b/src/backend/catalog/indexing.c index d0d1abda58a..cd78b1ea55e 100644 --- a/src/backend/catalog/indexing.c +++ b/src/backend/catalog/indexing.c @@ -170,7 +170,7 @@ CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple, index_insert(index, /* index relation */ values, /* array of index Datums */ isnull, /* is-null flags */ - &(heapTuple->t_self), /* tid of heap tuple */ + ItemPointerGetDatum(&(heapTuple->t_self)), /* tid of heap tuple */ heapRelation, index->rd_index->indisunique ? UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, diff --git a/src/backend/commands/constraint.c b/src/backend/commands/constraint.c index ea5a1f365b1..43618646861 100644 --- a/src/backend/commands/constraint.c +++ b/src/backend/commands/constraint.c @@ -171,7 +171,7 @@ unique_key_recheck(PG_FUNCTION_ARGS) * the row is now dead, because that is the TID the index will know * about. 
*/ - index_insert(indexRel, values, isnull, &checktid, + index_insert(indexRel, values, isnull, ItemPointerGetDatum(&checktid), trigdata->tg_relation, UNIQUE_CHECK_EXISTING, false, indexInfo); diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index 9f05b3654c1..a176a69a76b 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -304,7 +304,6 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, List *arbiterIndexes, bool onlySummarizing) { - ItemPointer tupleid = &slot->tts_tid; List *result = NIL; int i; int numIndices; @@ -314,8 +313,20 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, ExprContext *econtext; Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; + Datum tupleid; - Assert(ItemPointerIsValid(tupleid)); + + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) + { + bool isnull; + tupleid = slot_getsysattr(slot, RowIdAttributeNumber, &isnull); + Assert(!isnull); + } + else + { + Assert(ItemPointerIsValid(&slot->tts_tid)); + tupleid = PointerGetDatum(&slot->tts_tid); + } /* * Get information from the result relation info structure. @@ -462,6 +473,7 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, { bool violationOK; CEOUC_WAIT_MODE waitMode; + ItemPointer raw_tupleid = DatumGetItemPointer(tupleid); if (applyNoDupErr) { @@ -482,7 +494,7 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, satisfiesConstraint = check_exclusion_or_unique_constraint(heapRelation, indexRelation, indexInfo, - tupleid, values, isnull, + raw_tupleid, values, isnull, estate, false, waitMode, violationOK, NULL); } diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 4b3fbbe2009..7f74d0961c8 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -1752,8 +1752,8 @@ ExecCrossPartitionUpdate(ModifyTableContext *context, /* Tuple routing starts from the root table. 
*/ context->cpUpdateReturningSlot = - ExecInsert(context, mtstate->rootResultRelInfo, slot, canSetTag, - inserted_tuple, insert_destrel); + ExecInsert(context, mtstate->rootResultRelInfo, + slot, canSetTag, inserted_tuple, insert_destrel); /* * Reset the transition state that may possibly have been written by diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index f25c9d58a7d..7fc4ff9a379 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -107,7 +107,7 @@ typedef void (*ambuildempty_function) (Relation indexRelation); typedef bool (*aminsert_function) (Relation indexRelation, Datum *values, bool *isnull, - ItemPointer heap_tid, + Datum tupleid, Relation heapRelation, IndexUniqueCheck checkUnique, bool indexUnchanged, diff --git a/src/include/access/brin_internal.h b/src/include/access/brin_internal.h index a5a9772621c..442d2c96b7b 100644 --- a/src/include/access/brin_internal.h +++ b/src/include/access/brin_internal.h @@ -92,7 +92,7 @@ extern IndexBuildResult *brinbuild(Relation heap, Relation index, struct IndexInfo *indexInfo); extern void brinbuildempty(Relation index); extern bool brininsert(Relation idxRel, Datum *values, bool *nulls, - ItemPointer heaptid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 1d74c87cd20..fd569bdd5f0 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -144,7 +144,7 @@ extern void index_close(Relation relation, LOCKMODE lockmode); extern bool index_insert(Relation indexRelation, Datum *values, bool *isnull, - ItemPointer heap_t_ctid, + Datum tupleid, Relation heapRelation, IndexUniqueCheck checkUnique, bool indexUnchanged, diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h index 3013a44bae1..2e81017f014 100644 --- a/src/include/access/gin_private.h +++ 
b/src/include/access/gin_private.h @@ -115,7 +115,7 @@ extern IndexBuildResult *ginbuild(Relation heap, Relation index, struct IndexInfo *indexInfo); extern void ginbuildempty(Relation index); extern bool gininsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 7b8749c8db0..284fb49c517 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -401,7 +401,7 @@ typedef struct GiSTOptions /* gist.c */ extern void gistbuildempty(Relation index); extern bool gistinsert(Relation r, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git a/src/include/access/hash.h b/src/include/access/hash.h index 9c7d81525b4..e787974a3cf 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -364,7 +364,7 @@ extern IndexBuildResult *hashbuild(Relation heap, Relation index, struct IndexInfo *indexInfo); extern void hashbuildempty(Relation index); extern bool hashinsert(Relation rel, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 74930433480..049ebf72b7b 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1164,7 +1164,7 @@ typedef struct BTOptions */ extern void btbuildempty(Relation index); extern bool btinsert(Relation rel, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git 
a/src/include/access/spgist.h b/src/include/access/spgist.h index d6a49531200..b9cc48aba37 100644 --- a/src/include/access/spgist.h +++ b/src/include/access/spgist.h @@ -197,7 +197,7 @@ extern IndexBuildResult *spgbuild(Relation heap, Relation index, struct IndexInfo *indexInfo); extern void spgbuildempty(Relation index); extern bool spginsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git a/src/test/modules/dummy_index_am/dummy_index_am.c b/src/test/modules/dummy_index_am/dummy_index_am.c index 18185d02067..80c6668666a 100644 --- a/src/test/modules/dummy_index_am/dummy_index_am.c +++ b/src/test/modules/dummy_index_am/dummy_index_am.c @@ -164,7 +164,7 @@ dibuildempty(Relation index) */ static bool diinsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) From db45c025b15859a9477d1e97e7a88ef1a0a386c6 Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Fri, 17 May 2024 00:27:02 +0200 Subject: [PATCH 34/56] Methods for index update and delete Also validates compatability of index AM with table AM at index creation --- src/backend/access/index/indexam.c | 60 ++++ src/backend/executor/execIndexing.c | 401 +++++++++++++++++++++++++ src/backend/executor/nodeModifyTable.c | 22 +- src/backend/parser/gram.y | 16 +- src/include/access/amapi.h | 23 ++ src/include/access/genam.h | 15 + src/include/executor/executor.h | 10 + src/include/nodes/parsenodes.h | 1 + 8 files changed, 542 insertions(+), 6 deletions(-) diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index d4d1bf52739..fe1efe283c2 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -247,6 +247,66 @@ index_insert_cleanup(Relation indexRelation, 
indexRelation->rd_indam->aminsertcleanup(indexRelation, indexInfo); } +/* ---------------- + * index_update - update an index tuple in a relation + * ---------------- + */ +bool +index_update(Relation indexRelation, + bool new_valid, + bool old_valid, + Datum *values, + bool *isnull, + Datum tupleid, + Datum *valuesOld, + bool *isnullOld, + Datum oldTupleid, + Relation heapRelation, + IndexUniqueCheck checkUnique, + IndexInfo *indexInfo) +{ + RELATION_CHECKS; + CHECK_REL_PROCEDURE(amupdate); + + if (!(indexRelation->rd_indam->ampredlocks)) + CheckForSerializableConflictIn(indexRelation, + (ItemPointer) NULL, + InvalidBlockNumber); + + return indexRelation->rd_indam->amupdate(indexRelation, + new_valid, old_valid, + values, isnull, tupleid, + valuesOld, isnullOld, oldTupleid, + heapRelation, + checkUnique, + indexInfo); +} + + +/* ---------------- + * index_delete - delete an index tuple from a relation + * ---------------- + */ +bool +index_delete(Relation indexRelation, + Datum *values, bool *isnull, Datum tupleid, + Relation heapRelation, + IndexInfo *indexInfo) +{ + RELATION_CHECKS; + CHECK_REL_PROCEDURE(amdelete); + + if (!(indexRelation->rd_indam->ampredlocks)) + CheckForSerializableConflictIn(indexRelation, + (ItemPointer) NULL, + InvalidBlockNumber); + + return indexRelation->rd_indam->amdelete(indexRelation, + values, isnull, tupleid, + heapRelation, + indexInfo); +} + /* * index_beginscan - start a scan of an index with amgettuple * diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index a176a69a76b..b2a8412205a 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -518,6 +518,407 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, return result; } +List * +ExecUpdateIndexTuples(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + TupleTableSlot *oldSlot, + EState *estate, + bool noDupErr, + bool *specConflict, + List *arbiterIndexes, + bool onlySummarizing) +{ + List 
*result = NIL; + int i; + int numIndices; + RelationPtr relationDescs; + Relation heapRelation; + IndexInfo **indexInfoArray; + ExprContext *econtext; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + Datum tupleid; + + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) + { + bool isnull; + tupleid = slot_getsysattr(slot, RowIdAttributeNumber, &isnull); + Assert(!isnull); + } + else + { + Assert(ItemPointerIsValid(&slot->tts_tid)); + tupleid = PointerGetDatum(&slot->tts_tid); + } + + /* + * Get information from the result relation info structure. + */ + numIndices = resultRelInfo->ri_NumIndices; + relationDescs = resultRelInfo->ri_IndexRelationDescs; + indexInfoArray = resultRelInfo->ri_IndexRelationInfo; + heapRelation = resultRelInfo->ri_RelationDesc; + + /* Sanity check: slot must belong to the same rel as the resultRelInfo. */ + Assert(slot->tts_tableOid == RelationGetRelid(heapRelation)); + + /* + * for each index, form and insert the index tuple + */ + for (i = 0; i < numIndices; i++) + { + Relation indexRelation = relationDescs[i]; + IndexInfo *indexInfo; + bool applyNoDupErr; + IndexUniqueCheck checkUnique; + bool satisfiesConstraint; + bool new_valid = true; + + if (indexRelation == NULL) + continue; + + indexInfo = indexInfoArray[i]; + + /* If the index is marked as read-only, ignore it */ + if (!indexInfo->ii_ReadyForInserts) + continue; + + /* + * Skip processing of non-summarizing indexes if we only update + * summarizing indexes + */ + if (onlySummarizing && !indexInfo->ii_Summarizing) + continue; + + /* + * We will use the EState's per-tuple context for evaluating predicates + * and index expressions (creating it if it's not already there). 
+ */ + econtext = GetPerTupleExprContext(estate); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* Check for partial index */ + if (indexInfo->ii_Predicate != NIL) + { + ExprState *predicate; + + /* + * If predicate state not set up yet, create it (in the estate's + * per-query context) + */ + predicate = indexInfo->ii_PredicateState; + if (predicate == NULL) + { + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + indexInfo->ii_PredicateState = predicate; + } + + /* Skip this index-update if the predicate isn't satisfied */ + if (!ExecQual(predicate, econtext)) + { + if (!indexRelation->rd_indam->ammvccaware) + continue; + new_valid = false; + } + } + + /* + * FormIndexDatum fills in its values and isnull parameters with the + * appropriate values for the column(s) of the index. + */ + FormIndexDatum(indexInfo, + slot, + estate, + values, + isnull); + + /* Check whether to apply noDupErr to this index */ + applyNoDupErr = noDupErr && + (arbiterIndexes == NIL || + list_member_oid(arbiterIndexes, + indexRelation->rd_index->indexrelid)); + + /* + * The index AM does the actual insertion, plus uniqueness checking. + * + * For an immediate-mode unique index, we just tell the index AM to + * throw error if not unique. + * + * For a deferrable unique index, we tell the index AM to just detect + * possible non-uniqueness, and we add the index OID to the result + * list if further checking is needed. + * + * For a speculative insertion (used by INSERT ... ON CONFLICT), do + * the same as for a deferrable unique index. 
+ */ + if (!indexRelation->rd_index->indisunique) + checkUnique = UNIQUE_CHECK_NO; + else if (applyNoDupErr) + checkUnique = UNIQUE_CHECK_PARTIAL; + else if (indexRelation->rd_index->indimmediate) + checkUnique = UNIQUE_CHECK_YES; + else + checkUnique = UNIQUE_CHECK_PARTIAL; + + if (indexRelation->rd_indam->ammvccaware) + { + Datum valuesOld[INDEX_MAX_KEYS]; + bool isnullOld[INDEX_MAX_KEYS]; + Datum oldTupleid; + bool old_valid = true; + + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) + { + bool isnull; + oldTupleid = slot_getsysattr(oldSlot, RowIdAttributeNumber, &isnull); + Assert(!isnull); + } + else + { + Assert(ItemPointerIsValid(&oldSlot->tts_tid)); + oldTupleid = PointerGetDatum(&oldSlot->tts_tid); + } + + econtext = GetPerTupleExprContext(estate); + econtext->ecxt_scantuple = oldSlot; + + /* Check for partial index */ + if (indexInfo->ii_Predicate != NIL) + { + ExprState *predicate; + + /* + * If predicate state not set up yet, create it (in the estate's + * per-query context) + */ + predicate = indexInfo->ii_PredicateState; + if (predicate == NULL) + { + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + indexInfo->ii_PredicateState = predicate; + } + + /* Skip this index-update if the predicate isn't satisfied */ + if (!ExecQual(predicate, econtext)) + old_valid = false; + } + + FormIndexDatum(indexInfo, + oldSlot, + estate, + valuesOld, + isnullOld); + + satisfiesConstraint = + index_update(indexRelation, /* index relation */ + new_valid, + old_valid, + values, /* array of index Datums */ + isnull, /* null flags */ + tupleid, /* tid of heap tuple */ + valuesOld, + isnullOld, + oldTupleid, + heapRelation, /* heap relation */ + checkUnique, /* type of uniqueness check to do */ + indexInfo); /* index AM may need this */ + + } + else + { + bool indexUnchanged; + /* + * There's definitely going to be an index_insert() call for this + * index. 
If we're being called as part of an UPDATE statement, + * consider if the 'indexUnchanged' = true hint should be passed. + */ + indexUnchanged = index_unchanged_by_update(resultRelInfo, + estate, + indexInfo, + indexRelation); + + satisfiesConstraint = + index_insert(indexRelation, /* index relation */ + values, /* array of index Datums */ + isnull, /* null flags */ + tupleid, /* tid of heap tuple */ + heapRelation, /* heap relation */ + checkUnique, /* type of uniqueness check to do */ + indexUnchanged, /* UPDATE without logical change? */ + indexInfo); /* index AM may need this */ + } + + /* + * If the index has an associated exclusion constraint, check that. + * This is simpler than the process for uniqueness checks since we + * always insert first and then check. If the constraint is deferred, + * we check now anyway, but don't throw error on violation or wait for + * a conclusive outcome from a concurrent insertion; instead we'll + * queue a recheck event. Similarly, noDupErr callers (speculative + * inserters) will recheck later, and wait for a conclusive outcome + * then. + * + * An index for an exclusion constraint can't also be UNIQUE (not an + * essential property, we just don't allow it in the grammar), so no + * need to preserve the prior state of satisfiesConstraint. 
+ */ + if (indexInfo->ii_ExclusionOps != NULL) + { + bool violationOK; + CEOUC_WAIT_MODE waitMode; + ItemPointer raw_tupleid = DatumGetItemPointer(tupleid); + + if (applyNoDupErr) + { + violationOK = true; + waitMode = CEOUC_LIVELOCK_PREVENTING_WAIT; + } + else if (!indexRelation->rd_index->indimmediate) + { + violationOK = true; + waitMode = CEOUC_NOWAIT; + } + else + { + violationOK = false; + waitMode = CEOUC_WAIT; + } + + satisfiesConstraint = + check_exclusion_or_unique_constraint(heapRelation, + indexRelation, indexInfo, + raw_tupleid, values, isnull, + estate, false, + waitMode, violationOK, NULL); + } + + if ((checkUnique == UNIQUE_CHECK_PARTIAL || + indexInfo->ii_ExclusionOps != NULL) && + !satisfiesConstraint) + { + /* + * The tuple potentially violates the uniqueness or exclusion + * constraint, so make a note of the index so that we can re-check + * it later. Speculative inserters are told if there was a + * speculative conflict, since that always requires a restart. + */ + result = lappend_oid(result, RelationGetRelid(indexRelation)); + if (indexRelation->rd_index->indimmediate && specConflict) + *specConflict = true; + } + } + + return result; +} + +void +ExecDeleteIndexTuples(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, + EState *estate) +{ + int i; + int numIndices; + RelationPtr relationDescs; + Relation heapRelation; + IndexInfo **indexInfoArray; + ExprContext *econtext; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + Datum tupleid; + + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) + { + bool isnull; + tupleid = slot_getsysattr(slot, RowIdAttributeNumber, &isnull); + Assert(!isnull); + } + else + { + Assert(ItemPointerIsValid(&slot->tts_tid)); + tupleid = PointerGetDatum(&slot->tts_tid); + } + + /* + * Get information from the result relation info structure. 
+ */ + numIndices = resultRelInfo->ri_NumIndices; + relationDescs = resultRelInfo->ri_IndexRelationDescs; + indexInfoArray = resultRelInfo->ri_IndexRelationInfo; + heapRelation = resultRelInfo->ri_RelationDesc; + + /* Sanity check: slot must belong to the same rel as the resultRelInfo. */ + Assert(slot->tts_tableOid == RelationGetRelid(heapRelation)); + + /* + * for each index, form and insert the index tuple + */ + for (i = 0; i < numIndices; i++) + { + Relation indexRelation = relationDescs[i]; + IndexInfo *indexInfo; + + if (indexRelation == NULL) + continue; + + indexInfo = indexInfoArray[i]; + + /* If the index is marked as read-only, ignore it */ + if (!indexInfo->ii_ReadyForInserts) + continue; + + if (!indexRelation->rd_indam->ammvccaware) + continue; + + /* + * We will use the EState's per-tuple context for evaluating predicates + * and index expressions (creating it if it's not already there). + */ + econtext = GetPerTupleExprContext(estate); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* Check for partial index */ + if (indexInfo->ii_Predicate != NIL) + { + ExprState *predicate; + + /* + * If predicate state not set up yet, create it (in the estate's + * per-query context) + */ + predicate = indexInfo->ii_PredicateState; + if (predicate == NULL) + { + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + indexInfo->ii_PredicateState = predicate; + } + + /* Skip this index-update if the predicate isn't satisfied */ + if (!ExecQual(predicate, econtext)) + continue; + } + + /* + * FormIndexDatum fills in its values and isnull parameters with the + * appropriate values for the column(s) of the index. 
+ */ + FormIndexDatum(indexInfo, + slot, + estate, + values, + isnull); + + index_delete(indexRelation, /* index relation */ + values, /* array of index Datums */ + isnull, /* null flags */ + tupleid, /* tid of heap tuple */ + heapRelation, /* heap relation */ + indexInfo); /* index AM may need this */ + } +} + /* ---------------------------------------------------------------- * ExecCheckIndexConstraints * diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 7f74d0961c8..fb0997af2d4 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -1231,6 +1231,14 @@ ExecDeletePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, if (result) *result = TM_Ok; + /* + * Open the table's indexes, if we have not done so already, so that we + * can delete index entries. + */ + if (resultRelInfo->ri_RelationDesc->rd_rel->relhasindex && + resultRelInfo->ri_IndexRelationDescs == NULL) + ExecOpenIndices(resultRelInfo, false); + /* BEFORE ROW DELETE triggers */ if (resultRelInfo->ri_TrigDesc && resultRelInfo->ri_TrigDesc->trig_delete_before_row) @@ -1287,6 +1295,10 @@ ExecDeleteEpilogue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, EState *estate = context->estate; TransitionCaptureState *ar_delete_trig_tcs; + /* delete index entries if necessary */ + if (resultRelInfo->ri_NumIndices > 0) + ExecDeleteIndexTuples(resultRelInfo, slot, context->estate); + /* * If this delete is the result of a partition key update that moved the * tuple to a new partition, put this row into the transition OLD TABLE, @@ -2014,11 +2026,15 @@ ExecUpdateEpilogue(ModifyTableContext *context, UpdateContext *updateCxt, /* insert index entries for tuple if necessary */ if (resultRelInfo->ri_NumIndices > 0 && (updateCxt->updateIndexes != TU_None)) - recheckIndexes = ExecInsertIndexTuples(resultRelInfo, - slot, context->estate, - true, false, + { + recheckIndexes = 
ExecUpdateIndexTuples(resultRelInfo, + slot, + oldSlot, + context->estate, + false, NULL, NIL, (updateCxt->updateIndexes == TU_Summarizing)); + } /* AFTER ROW UPDATE Triggers */ ExecARUpdateTriggers(context->estate, resultRelInfo, diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index bca627c5463..b437e0f7dff 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -373,6 +373,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type OptSchemaEltList parameter_name_list %type am_type +%type opt_for_tableam %type TriggerForSpec TriggerForType %type TriggerActionTime @@ -5870,17 +5871,21 @@ row_security_cmd: /***************************************************************************** * * QUERY: - * CREATE ACCESS METHOD name HANDLER handler_name + * CREATE ACCESS METHOD name TYPE am_type + * [FOR tableam_name] + * HANDLER handler_name * *****************************************************************************/ -CreateAmStmt: CREATE ACCESS METHOD name TYPE_P am_type HANDLER handler_name +CreateAmStmt: CREATE ACCESS METHOD name TYPE_P am_type + opt_for_tableam HANDLER handler_name { CreateAmStmt *n = makeNode(CreateAmStmt); n->amname = $4; - n->handler_name = $8; n->amtype = $6; + n->tableam_name = $7; + n->handler_name = $9; $$ = (Node *) n; } ; @@ -5890,6 +5895,11 @@ am_type: | TABLE { $$ = AMTYPE_TABLE; } ; +opt_for_tableam: + FOR name { $$ = $2; } + | /*EMPTY*/ { $$ = NULL; } + ; + /***************************************************************************** * * QUERIES : diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 7fc4ff9a379..11f84bbaf50 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -112,6 +112,25 @@ typedef bool (*aminsert_function) (Relation indexRelation, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); +/* update this tuple */ +typedef bool (*amupdate_function) (Relation indexRelation, + bool 
new_valid, + bool old_valid, + Datum *values, + bool *isnull, + Datum tupleid, + Datum *valuesOld, + bool *isnullOld, + Datum oldTupleid, + Relation heapRelation, + IndexUniqueCheck checkUnique, + struct IndexInfo *indexInfo); +/* delete this tuple */ +typedef bool (*amdelete_function) (Relation indexRelation, + Datum *values, bool *isnull, + Datum tupleid, + Relation heapRelation, + struct IndexInfo *indexInfo); /* cleanup after insert */ typedef void (*aminsertcleanup_function) (Relation indexRelation, @@ -252,6 +271,8 @@ typedef struct IndexAmRoutine bool amusemaintenanceworkmem; /* does AM store tuple information only at block granularity? */ bool amsummarizing; + /* does AM can provide MVCC */ + bool ammvccaware; /* OR of parallel vacuum flags. See vacuum.h for flags. */ uint8 amparallelvacuumoptions; /* type of data stored in index, or InvalidOid if variable */ @@ -268,6 +289,8 @@ typedef struct IndexAmRoutine ambuildempty_function ambuildempty; aminsert_function aminsert; aminsertcleanup_function aminsertcleanup; + amupdate_function amupdate; + amdelete_function amdelete; ambulkdelete_function ambulkdelete; amvacuumcleanup_function amvacuumcleanup; amcanreturn_function amcanreturn; /* can be NULL */ diff --git a/src/include/access/genam.h b/src/include/access/genam.h index fd569bdd5f0..7cb6d2aa3a2 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -151,6 +151,21 @@ extern bool index_insert(Relation indexRelation, struct IndexInfo *indexInfo); extern void index_insert_cleanup(Relation indexRelation, struct IndexInfo *indexInfo); +extern bool index_update(Relation indexRelation, + bool new_valid, + bool old_valid, + Datum *values, + bool *isnull, + Datum tupleid, + Datum *valuesOld, + bool *isnullOld, + Datum oldTupleid, + Relation heapRelation, + IndexUniqueCheck checkUnique, + struct IndexInfo *indexInfo); +extern bool index_delete(Relation indexRelation, Datum *values, bool *isnull, + Datum tupleid, Relation heapRelation, + struct 
IndexInfo *indexInfo); extern IndexScanDesc index_beginscan(Relation heapRelation, Relation indexRelation, diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 9770752ea3c..1833f4d84b1 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -633,6 +633,16 @@ extern List *ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, bool noDupErr, bool *specConflict, List *arbiterIndexes, bool onlySummarizing); +extern List *ExecUpdateIndexTuples(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + TupleTableSlot *oldSlot, + EState *estate, + bool noDupErr, + bool *specConflict, List *arbiterIndexes, + bool onlySummarizing); +extern void ExecDeleteIndexTuples(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + EState *estate); extern bool ExecCheckIndexConstraints(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate, ItemPointer conflictTid, diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index ddc80007b34..82443390a85 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -2993,6 +2993,7 @@ typedef struct CreateAmStmt char *amname; /* access method name */ List *handler_name; /* handler function name */ char amtype; /* type of access method */ + char *tableam_name; /* table AM name */ } CreateAmStmt; /* ---------------------- From 6a6e9ffa7e2f8e6be0896d8249d302b75809ae7d Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 12 Aug 2024 12:30:00 +0300 Subject: [PATCH 35/56] Hook to override index AM routine --- src/backend/access/index/amapi.c | 67 ++++++++++++++++++---- src/backend/catalog/index.c | 2 +- src/backend/commands/indexcmds.c | 4 +- src/backend/commands/opclasscmds.c | 9 +-- src/backend/executor/execAmi.c | 2 +- src/backend/replication/logical/relation.c | 2 +- src/backend/utils/adt/amutils.c | 4 +- src/backend/utils/adt/ruleutils.c | 2 +- src/backend/utils/cache/relcache.c | 2 +- src/include/access/amapi.h | 9 ++- 10 
files changed, 76 insertions(+), 27 deletions(-) diff --git a/src/backend/access/index/amapi.c b/src/backend/access/index/amapi.c index 079fb7cba65..40fb78e71d2 100644 --- a/src/backend/access/index/amapi.c +++ b/src/backend/access/index/amapi.c @@ -16,25 +16,27 @@ #include "access/amapi.h" #include "access/htup_details.h" #include "catalog/pg_am.h" +#include "catalog/pg_class.h" +#include "catalog/pg_index.h" #include "catalog/pg_opclass.h" #include "utils/fmgrprotos.h" #include "utils/syscache.h" +IndexAMRoutineHookType IndexAMRoutineHook = NULL; -/* - * GetIndexAmRoutine - call the specified access method handler routine to get - * its IndexAmRoutine struct, which will be palloc'd in the caller's context. - * - * Note that if the amhandler function is built-in, this will not involve - * any catalog access. It's therefore safe to use this while bootstrapping - * indexes for the system catalogs. relcache.c relies on that. - */ IndexAmRoutine * -GetIndexAmRoutine(Oid amhandler) +GetIndexAmRoutineWithTableAM(Oid tamoid, Oid amhandler) { Datum datum; IndexAmRoutine *routine; + if (IndexAMRoutineHook != NULL) + { + routine = IndexAMRoutineHook(tamoid, amhandler); + if (routine) + return routine; + } + datum = OidFunctionCall0(amhandler); routine = (IndexAmRoutine *) DatumGetPointer(datum); @@ -45,6 +47,47 @@ GetIndexAmRoutine(Oid amhandler) return routine; } + +/* + * GetIndexAmRoutine - call the specified access method handler routine to get + * its IndexAmRoutine struct, which will be palloc'd in the caller's context. + * + * Note that if the amhandler function is built-in, this will not involve + * any catalog access. It's therefore safe to use this while bootstrapping + * indexes for the system catalogs. relcache.c relies on that. 
+ */ +IndexAmRoutine * +GetIndexAmRoutine(Oid indoid, Oid amhandler) +{ + HeapTuple ht_idx; + HeapTuple ht_tblrel; + Form_pg_index idxrec; + Form_pg_class tblrelrec; + Oid indrelid; + Oid tamoid; + + if (!OidIsValid((indoid)) || indoid < FirstNormalObjectId) + return GetIndexAmRoutineWithTableAM(HEAP_TABLE_AM_OID, amhandler); + + ht_idx = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indoid)); + if (!HeapTupleIsValid(ht_idx)) + elog(ERROR, "cache lookup failed for index %u", indoid); + idxrec = (Form_pg_index) GETSTRUCT(ht_idx); + Assert(indoid == idxrec->indexrelid); + indrelid = idxrec->indrelid; + + ht_tblrel = SearchSysCache1(RELOID, ObjectIdGetDatum(indrelid)); + if (!HeapTupleIsValid(ht_tblrel)) + elog(ERROR, "cache lookup failed for relation %u", indrelid); + tblrelrec = (Form_pg_class) GETSTRUCT(ht_tblrel); + tamoid = tblrelrec->relam; + + ReleaseSysCache(ht_tblrel); + ReleaseSysCache(ht_idx); + + return GetIndexAmRoutineWithTableAM(tamoid, amhandler); +} + /* * GetIndexAmRoutineByAmId - look up the handler of the index access method * with the given OID, and get its IndexAmRoutine struct. @@ -53,7 +96,7 @@ GetIndexAmRoutine(Oid amhandler) * noerror is true, else throws error. */ IndexAmRoutine * -GetIndexAmRoutineByAmId(Oid amoid, bool noerror) +GetIndexAmRoutineByAmId(Oid indoid, Oid amoid, bool noerror) { HeapTuple tuple; Form_pg_am amform; @@ -103,7 +146,7 @@ GetIndexAmRoutineByAmId(Oid amoid, bool noerror) ReleaseSysCache(tuple); /* And finally, call the handler function to get the API struct. 
*/ - return GetIndexAmRoutine(amhandler); + return GetIndexAmRoutine(indoid, amhandler); } @@ -129,7 +172,7 @@ amvalidate(PG_FUNCTION_ARGS) ReleaseSysCache(classtup); - amroutine = GetIndexAmRoutineByAmId(amoid, false); + amroutine = GetIndexAmRoutineByAmId(InvalidOid, amoid, false); if (amroutine->amvalidate == NULL) elog(ERROR, "function amvalidate is not defined for index access method %u", diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 91a115003b9..92211c04d57 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -292,7 +292,7 @@ ConstructTupleDescriptor(Relation heapRelation, int i; /* We need access to the index AM's API struct */ - amroutine = GetIndexAmRoutineByAmId(accessMethodId, false); + amroutine = GetIndexAmRoutineByAmId(InvalidOid, accessMethodId, false); /* ... and to the table's tuple descriptor */ heapTupDesc = RelationGetDescr(heapRelation); diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 78d8bda48a0..89a9fecb30e 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -220,7 +220,7 @@ CheckIndexCompatible(Oid oldId, accessMethodName))); accessMethodForm = (Form_pg_am) GETSTRUCT(tuple); accessMethodId = accessMethodForm->oid; - amRoutine = GetIndexAmRoutine(accessMethodForm->amhandler); + amRoutine = GetIndexAmRoutine(oldId, accessMethodForm->amhandler); ReleaseSysCache(tuple); amcanorder = amRoutine->amcanorder; @@ -841,7 +841,7 @@ DefineIndex(Oid tableId, } accessMethodForm = (Form_pg_am) GETSTRUCT(tuple); accessMethodId = accessMethodForm->oid; - amRoutine = GetIndexAmRoutine(accessMethodForm->amhandler); + amRoutine = GetIndexAmRoutineWithTableAM(rel->rd_rel->relam, accessMethodForm->amhandler); pgstat_progress_update_param(PROGRESS_CREATEIDX_ACCESS_METHOD_OID, accessMethodId); diff --git a/src/backend/commands/opclasscmds.c b/src/backend/commands/opclasscmds.c index b8b5c147c5d..fe91b816c32 100644 --- 
a/src/backend/commands/opclasscmds.c +++ b/src/backend/commands/opclasscmds.c @@ -42,6 +42,7 @@ #include "parser/parse_oper.h" #include "parser/parse_type.h" #include "utils/acl.h" +#include "postgres_ext.h" #include "utils/builtins.h" #include "utils/fmgroids.h" #include "utils/lsyscache.h" @@ -376,7 +377,7 @@ DefineOpClass(CreateOpClassStmt *stmt) amform = (Form_pg_am) GETSTRUCT(tup); amoid = amform->oid; - amroutine = GetIndexAmRoutineByAmId(amoid, false); + amroutine = GetIndexAmRoutineByAmId(InvalidOid, amoid, false); ReleaseSysCache(tup); maxOpNumber = amroutine->amstrategies; @@ -834,7 +835,7 @@ AlterOpFamily(AlterOpFamilyStmt *stmt) amform = (Form_pg_am) GETSTRUCT(tup); amoid = amform->oid; - amroutine = GetIndexAmRoutineByAmId(amoid, false); + amroutine = GetIndexAmRoutineByAmId(InvalidOid, amoid, false); ReleaseSysCache(tup); maxOpNumber = amroutine->amstrategies; @@ -881,7 +882,7 @@ AlterOpFamilyAdd(AlterOpFamilyStmt *stmt, Oid amoid, Oid opfamilyoid, int maxOpNumber, int maxProcNumber, int optsProcNumber, List *items) { - IndexAmRoutine *amroutine = GetIndexAmRoutineByAmId(amoid, false); + IndexAmRoutine *amroutine = GetIndexAmRoutineByAmId(InvalidOid, amoid, false); List *operators; /* OpFamilyMember list for operators */ List *procedures; /* OpFamilyMember list for support procs */ ListCell *l; @@ -1164,7 +1165,7 @@ assignOperTypes(OpFamilyMember *member, Oid amoid, Oid typeoid) * the family has been created but not yet populated with the required * operators.) 
*/ - IndexAmRoutine *amroutine = GetIndexAmRoutineByAmId(amoid, false); + IndexAmRoutine *amroutine = GetIndexAmRoutineByAmId(InvalidOid, amoid, false); if (!amroutine->amcanorderbyop) ereport(ERROR, diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c index 3289e3e0219..1a7f6ae2c9b 100644 --- a/src/backend/executor/execAmi.c +++ b/src/backend/executor/execAmi.c @@ -613,7 +613,7 @@ IndexSupportsBackwardScan(Oid indexid) idxrelrec = (Form_pg_class) GETSTRUCT(ht_idxrel); /* Fetch the index AM's API struct */ - amroutine = GetIndexAmRoutineByAmId(idxrelrec->relam, false); + amroutine = GetIndexAmRoutineByAmId(indexid, idxrelrec->relam, false); result = amroutine->amcanbackward; diff --git a/src/backend/replication/logical/relation.c b/src/backend/replication/logical/relation.c index f139e7b01e9..4429127c434 100644 --- a/src/backend/replication/logical/relation.c +++ b/src/backend/replication/logical/relation.c @@ -834,7 +834,7 @@ IsIndexUsableForReplicaIdentityFull(IndexInfo *indexInfo, AttrMap *attrmap) IndexAmRoutine *amroutine; /* The given index access method must implement amgettuple. */ - amroutine = GetIndexAmRoutineByAmId(indexInfo->ii_Am, false); + amroutine = GetIndexAmRoutineByAmId(InvalidOid, indexInfo->ii_Am, false); Assert(amroutine->amgettuple != NULL); } #endif diff --git a/src/backend/utils/adt/amutils.c b/src/backend/utils/adt/amutils.c index dd39a994c8d..b7ebe6a5f76 100644 --- a/src/backend/utils/adt/amutils.c +++ b/src/backend/utils/adt/amutils.c @@ -195,7 +195,7 @@ indexam_property(FunctionCallInfo fcinfo, /* * Get AM information. If we don't have a valid AM OID, return NULL. 
*/ - routine = GetIndexAmRoutineByAmId(amoid, true); + routine = GetIndexAmRoutineByAmId(index_oid, amoid, true); if (routine == NULL) PG_RETURN_NULL(); @@ -455,7 +455,7 @@ pg_indexam_progress_phasename(PG_FUNCTION_ARGS) IndexAmRoutine *routine; char *name; - routine = GetIndexAmRoutineByAmId(amoid, true); + routine = GetIndexAmRoutineByAmId(InvalidOid, amoid, true); if (routine == NULL || !routine->ambuildphasename) PG_RETURN_NULL(); diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index cd9c3eddd1d..16aebc1cb90 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -1323,7 +1323,7 @@ pg_get_indexdef_worker(Oid indexrelid, int colno, amrec = (Form_pg_am) GETSTRUCT(ht_am); /* Fetch the index AM's API struct */ - amroutine = GetIndexAmRoutine(amrec->amhandler); + amroutine = GetIndexAmRoutine(indexrelid, amrec->amhandler); /* * Get the index expressions, if any. (NOTE: we do not use the relcache diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 558e428e9b7..43884c43e80 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -1413,7 +1413,7 @@ InitIndexAmRoutine(Relation relation) * Call the amhandler in current, short-lived memory context, just in case * it leaks anything (it probably won't, but let's be paranoid). */ - tmp = GetIndexAmRoutine(relation->rd_amhandler); + tmp = GetIndexAmRoutine(relation->rd_id, relation->rd_amhandler); /* OK, now transfer the data into relation's rd_indexcxt. 
*/ cached = (IndexAmRoutine *) MemoryContextAlloc(relation->rd_indexcxt, diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 11f84bbaf50..3d7682be540 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -316,7 +316,12 @@ typedef struct IndexAmRoutine /* Functions in access/index/amapi.c */ -extern IndexAmRoutine *GetIndexAmRoutine(Oid amhandler); -extern IndexAmRoutine *GetIndexAmRoutineByAmId(Oid amoid, bool noerror); +extern IndexAmRoutine *GetIndexAmRoutineWithTableAM(Oid tamoid, Oid amhandler); +extern IndexAmRoutine *GetIndexAmRoutine(Oid indoid, Oid amhandler); +extern IndexAmRoutine *GetIndexAmRoutineByAmId(Oid indoid, Oid amoid, bool noerror); + +typedef IndexAmRoutine *(*IndexAMRoutineHookType) (Oid tamoid, Oid amhandler); + +extern IndexAMRoutineHookType IndexAMRoutineHook; #endif /* AMAPI_H */ From 9793c08fc1f4ee491a157ec3b9d44345456c205b Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Thu, 5 Sep 2024 00:03:23 +0200 Subject: [PATCH 36/56] Always building child/root maps for relations with ROW_REF_ROWID --- src/backend/executor/execUtils.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index 5737f9f4ebd..5cbe3bf46d1 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -1211,9 +1211,19 @@ ExecGetChildToRootMap(ResultRelInfo *resultRelInfo) ResultRelInfo *rootRelInfo = resultRelInfo->ri_RootResultRelInfo; if (rootRelInfo) - resultRelInfo->ri_ChildToRootMap = - convert_tuples_by_name(RelationGetDescr(resultRelInfo->ri_RelationDesc), - RelationGetDescr(rootRelInfo->ri_RelationDesc)); + { + TupleDesc indesc = RelationGetDescr(resultRelInfo->ri_RelationDesc); + TupleDesc outdesc = RelationGetDescr(rootRelInfo->ri_RelationDesc); + AttrMap *attrMap; + + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) != ROW_REF_ROWID) + attrMap = 
build_attrmap_by_name_if_req(indesc, outdesc, false); + else + attrMap = build_attrmap_by_name(indesc, outdesc, false); + if (attrMap) + resultRelInfo->ri_ChildToRootMap = + convert_tuples_by_name_attrmap(indesc, outdesc, attrMap); + } else /* this isn't a child result rel */ resultRelInfo->ri_ChildToRootMap = NULL; @@ -1250,8 +1260,10 @@ ExecGetRootToChildMap(ResultRelInfo *resultRelInfo, EState *estate) * to ignore by passing true for missing_ok. */ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); - attrMap = build_attrmap_by_name_if_req(indesc, outdesc, - !childrel->rd_rel->relispartition); + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) != ROW_REF_ROWID) + attrMap = build_attrmap_by_name_if_req(indesc, outdesc, !childrel->rd_rel->relispartition); + else + attrMap = build_attrmap_by_name(indesc, outdesc, !childrel->rd_rel->relispartition); if (attrMap) resultRelInfo->ri_RootToChildMap = convert_tuples_by_name_attrmap(indesc, outdesc, attrMap); From 6f7f21612c99df810dbcb1e0338ab71435df7cb8 Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Tue, 20 Aug 2024 14:09:51 +0200 Subject: [PATCH 37/56] Don't run internal btree _bt_getrootheight on non-btree in get_relation_info --- src/backend/optimizer/util/plancat.c | 3 ++- src/include/optimizer/plancat.h | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 86655f05dc8..6a45058cbbd 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -58,6 +58,7 @@ int constraint_exclusion = CONSTRAINT_EXCLUSION_PARTITION; /* Hook for plugins to get control in get_relation_info() */ get_relation_info_hook_type get_relation_info_hook = NULL; +skip_tree_height_hook_type skip_tree_height_hook = NULL; static void get_relation_foreign_keys(PlannerInfo *root, RelOptInfo *rel, @@ -485,7 +486,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, info->tuples = 
rel->tuples; } - if (info->relam == BTREE_AM_OID) + if (info->relam == BTREE_AM_OID && (!skip_tree_height_hook || !skip_tree_height_hook(indexRelation))) { /* * For btrees, get tree height while we have the index diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h index f59b77b1012..e44fff74f5e 100644 --- a/src/include/optimizer/plancat.h +++ b/src/include/optimizer/plancat.h @@ -24,6 +24,9 @@ typedef void (*get_relation_info_hook_type) (PlannerInfo *root, RelOptInfo *rel); extern PGDLLIMPORT get_relation_info_hook_type get_relation_info_hook; +typedef bool (*skip_tree_height_hook_type) (Relation indexRelation); +extern PGDLLIMPORT skip_tree_height_hook_type skip_tree_height_hook; + extern void get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, RelOptInfo *rel); From 5dd30272465c144d22f9da1c4b7c18521061502f Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Tue, 17 Sep 2024 01:58:24 +0300 Subject: [PATCH 38/56] Fix handling tupleid in logical replication --- src/backend/access/table/tableam.c | 8 ++-- src/backend/executor/execReplication.c | 54 +++++++++++++++++------- src/backend/replication/logical/worker.c | 15 +++---- src/include/access/tableam.h | 4 +- 4 files changed, 50 insertions(+), 31 deletions(-) diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index d5eb19be3bf..8168bb78021 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -287,7 +287,7 @@ simple_table_tuple_insert(Relation rel, TupleTableSlot *slot) * via ereport(). 
*/ void -simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot, +simple_table_tuple_delete(Relation rel, Datum tupleid, Snapshot snapshot, TupleTableSlot *oldSlot) { TM_Result result; @@ -298,7 +298,7 @@ simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot, if (oldSlot) options |= TABLE_MODIFY_FETCH_OLD_TUPLE; - result = table_tuple_delete(rel, PointerGetDatum(tid), + result = table_tuple_delete(rel, tupleid, GetCurrentCommandId(true), snapshot, InvalidSnapshot, options, @@ -339,7 +339,7 @@ simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot, * via ereport(). */ void -simple_table_tuple_update(Relation rel, ItemPointer otid, +simple_table_tuple_update(Relation rel, Datum tupleid, TupleTableSlot *slot, Snapshot snapshot, TU_UpdateIndexes *update_indexes, @@ -354,7 +354,7 @@ simple_table_tuple_update(Relation rel, ItemPointer otid, if (oldSlot) options |= TABLE_MODIFY_FETCH_OLD_TUPLE; - result = table_tuple_update(rel, PointerGetDatum(otid), slot, + result = table_tuple_update(rel, tupleid, slot, GetCurrentCommandId(true), snapshot, InvalidSnapshot, options, diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 1ace97b6d47..252efe51738 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -166,6 +166,25 @@ build_replindex_scan_key(ScanKey skey, Relation rel, Relation idxrel, return skey_attoff; } +static Datum +slot_get_tupleid(Relation rel, TupleTableSlot *slot) +{ + Datum tupleid; + + if (table_get_row_ref_type(rel) == ROW_REF_ROWID) + { + bool isnull; + tupleid = slot_getsysattr(slot, RowIdAttributeNumber, &isnull); + Assert(!isnull); + } + else + { + tupleid = PointerGetDatum(&slot->tts_tid); + } + + return tupleid; +} + /* * Search the relation 'rel' for tuple using the index. 
* @@ -250,7 +269,7 @@ RelationFindReplTupleByIndex(Relation rel, Oid idxoid, PushActiveSnapshot(GetLatestSnapshot()); - res = table_tuple_lock(rel, PointerGetDatum(&(outslot->tts_tid)), + res = table_tuple_lock(rel, slot_get_tupleid(rel, outslot), GetLatestSnapshot(), outslot, GetCurrentCommandId(false), @@ -435,7 +454,7 @@ RelationFindReplTupleSeq(Relation rel, LockTupleMode lockmode, PushActiveSnapshot(GetLatestSnapshot()); - res = table_tuple_lock(rel, PointerGetDatum(&(outslot->tts_tid)), + res = table_tuple_lock(rel, slot_get_tupleid(rel, outslot), GetLatestSnapshot(), outslot, GetCurrentCommandId(false), @@ -559,7 +578,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, { bool skip_tuple = false; Relation rel = resultRelInfo->ri_RelationDesc; - ItemPointer tid = &(searchslot->tts_tid); + Datum tupleid = slot_get_tupleid(rel, searchslot); /* For now we support only tables. */ Assert(rel->rd_rel->relkind == RELKIND_RELATION); @@ -571,7 +590,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigDesc->trig_update_before_row) { if (!ExecBRUpdateTriggers(estate, epqstate, resultRelInfo, - PointerGetDatum(tid), NULL, slot, NULL, NULL)) + tupleid, NULL, slot, NULL, NULL)) skip_tuple = true; /* "do nothing" */ } @@ -593,16 +612,17 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, if (rel->rd_rel->relispartition) ExecPartitionCheck(resultRelInfo, slot, estate, true); - if (resultRelInfo->ri_TrigDesc && - resultRelInfo->ri_TrigDesc->trig_update_after_row) - oldSlot = ExecGetTriggerOldSlot(estate, resultRelInfo); + oldSlot = ExecGetTriggerOldSlot(estate, resultRelInfo); - simple_table_tuple_update(rel, tid, slot, estate->es_snapshot, + simple_table_tuple_update(rel, tupleid, slot, estate->es_snapshot, &update_indexes, oldSlot); if (resultRelInfo->ri_NumIndices > 0 && (update_indexes != TU_None)) - recheckIndexes = ExecInsertIndexTuples(resultRelInfo, - slot, estate, true, false, + recheckIndexes = 
ExecUpdateIndexTuples(resultRelInfo, + slot, + oldSlot, + estate, + false, NULL, NIL, (update_indexes == TU_Summarizing)); @@ -629,7 +649,7 @@ ExecSimpleRelationDelete(ResultRelInfo *resultRelInfo, { bool skip_tuple = false; Relation rel = resultRelInfo->ri_RelationDesc; - ItemPointer tid = &searchslot->tts_tid; + Datum tupleid = slot_get_tupleid(rel, searchslot); CheckCmdReplicaIdentity(rel, CMD_DELETE); @@ -638,19 +658,21 @@ ExecSimpleRelationDelete(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigDesc->trig_delete_before_row) { skip_tuple = !ExecBRDeleteTriggers(estate, epqstate, resultRelInfo, - PointerGetDatum(tid), NULL, NULL, NULL, NULL); + tupleid, NULL, NULL, NULL, NULL); } if (!skip_tuple) { TupleTableSlot *oldSlot = NULL; - if (resultRelInfo->ri_TrigDesc && - resultRelInfo->ri_TrigDesc->trig_delete_after_row) - oldSlot = ExecGetTriggerOldSlot(estate, resultRelInfo); + oldSlot = ExecGetTriggerOldSlot(estate, resultRelInfo); /* OK, delete the tuple */ - simple_table_tuple_delete(rel, tid, estate->es_snapshot, oldSlot); + simple_table_tuple_delete(rel, tupleid, estate->es_snapshot, oldSlot); + + /* delete index entries if necessary */ + if (resultRelInfo->ri_NumIndices > 0) + ExecDeleteIndexTuples(resultRelInfo, oldSlot, estate); /* AFTER ROW DELETE Triggers */ ExecARDeleteTriggers(estate, resultRelInfo, diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index d091a1dd27c..f5a83e52eef 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -2417,9 +2417,8 @@ apply_handle_insert(StringInfo s) /* Initialize the executor state. 
*/ edata = create_edata_for_relation(rel); estate = edata->estate; - remoteslot = ExecInitExtraTupleSlot(estate, - RelationGetDescr(rel->localrel), - &TTSOpsVirtual); + remoteslot = table_slot_create(rel->localrel, + &estate->es_tupleTable); /* Process and store remote tuple in the slot */ oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); @@ -2573,9 +2572,8 @@ apply_handle_update(StringInfo s) /* Initialize the executor state. */ edata = create_edata_for_relation(rel); estate = edata->estate; - remoteslot = ExecInitExtraTupleSlot(estate, - RelationGetDescr(rel->localrel), - &TTSOpsVirtual); + remoteslot = table_slot_create(rel->localrel, + &estate->es_tupleTable); /* * Populate updatedCols so that per-column triggers can fire, and so @@ -2753,9 +2751,8 @@ apply_handle_delete(StringInfo s) /* Initialize the executor state. */ edata = create_edata_for_relation(rel); estate = edata->estate; - remoteslot = ExecInitExtraTupleSlot(estate, - RelationGetDescr(rel->localrel), - &TTSOpsVirtual); + remoteslot = table_slot_create(rel->localrel, + &estate->es_tupleTable); /* Build the search tuple. 
*/ oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index d681c7636cb..573a2576935 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -2097,10 +2097,10 @@ table_tuple_is_current(Relation rel, TupleTableSlot *slot) */ extern void simple_table_tuple_insert(Relation rel, TupleTableSlot *slot); -extern void simple_table_tuple_delete(Relation rel, ItemPointer tid, +extern void simple_table_tuple_delete(Relation rel, Datum tupleid, Snapshot snapshot, TupleTableSlot *oldSlot); -extern void simple_table_tuple_update(Relation rel, ItemPointer otid, +extern void simple_table_tuple_update(Relation rel, Datum tupleid, TupleTableSlot *slot, Snapshot snapshot, TU_UpdateIndexes *update_indexes, TupleTableSlot *oldSlot); From 2ebae2aa8659d424cbe41cf1888f6f249b9d139d Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Fri, 27 Sep 2024 14:26:40 +0200 Subject: [PATCH 39/56] New csn snapshot format Added xlogptr and xmin to determine right order of transactions when decoding on replica --- src/backend/utils/time/snapmgr.c | 10 +++++++--- src/include/utils/snapshot.h | 8 +++++++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 0ad250a959b..ea14754a418 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -205,7 +205,7 @@ typedef struct SerializedSnapshotData CommandId curcid; TimestampTz whenTaken; XLogRecPtr lsn; - CommitSeqNo snapshotcsn; + CSNSnapshotData csnSnapshotData; uint64 undoRegularLocation; uint64 undoRegularXmin; uint64 undoSystemLocation; @@ -1763,7 +1763,9 @@ SerializeSnapshot(Snapshot snapshot, char *start_address) serialized_snapshot.curcid = snapshot->curcid; serialized_snapshot.whenTaken = snapshot->whenTaken; serialized_snapshot.lsn = snapshot->lsn; - serialized_snapshot.snapshotcsn = snapshot->snapshotcsn; + 
serialized_snapshot.csnSnapshotData.xmin = snapshot->csnSnapshotData.xmin; + serialized_snapshot.csnSnapshotData.snapshotcsn = snapshot->csnSnapshotData.snapshotcsn; + serialized_snapshot.csnSnapshotData.xlogptr = snapshot->csnSnapshotData.xlogptr; serialized_snapshot.undoRegularXmin = snapshot->undoRegularLocationPhNode.xmin; serialized_snapshot.undoRegularLocation = snapshot->undoRegularLocationPhNode.undoLocation; serialized_snapshot.undoSystemXmin = snapshot->undoSystemLocationPhNode.xmin; @@ -1843,7 +1845,9 @@ RestoreSnapshot(char *start_address) snapshot->whenTaken = serialized_snapshot.whenTaken; snapshot->lsn = serialized_snapshot.lsn; snapshot->snapXactCompletionCount = 0; - snapshot->snapshotcsn = serialized_snapshot.snapshotcsn; + snapshot->csnSnapshotData.xmin = serialized_snapshot.csnSnapshotData.xmin; + snapshot->csnSnapshotData.snapshotcsn = serialized_snapshot.csnSnapshotData.snapshotcsn; + snapshot->csnSnapshotData.xlogptr = serialized_snapshot.csnSnapshotData.xlogptr; snapshot->undoRegularLocationPhNode.xmin = serialized_snapshot.undoRegularXmin; snapshot->undoRegularLocationPhNode.undoLocation = serialized_snapshot.undoRegularLocation; snapshot->undoSystemLocationPhNode.xmin = serialized_snapshot.undoSystemXmin; diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index 49c913b12f8..6052c760056 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -129,6 +129,12 @@ typedef struct pairingheap_node ph_node; } RetainUndoLocationPHNode; +typedef struct CSNSnapshotData { + uint64 xmin; + CommitSeqNo snapshotcsn; + XLogRecPtr xlogptr; +} CSNSnapshotData; + /* * Struct representing all kind of possible snapshots. 
* @@ -224,7 +230,7 @@ typedef struct SnapshotData RetainUndoLocationPHNode undoRegularLocationPhNode; RetainUndoLocationPHNode undoSystemLocationPhNode; - CommitSeqNo snapshotcsn; + CSNSnapshotData csnSnapshotData; } SnapshotData; typedef void (*snapshot_hook_type) (Snapshot snapshot); From c539b5cd2f3af168f838182ab0849cea12996e93 Mon Sep 17 00:00:00 2001 From: Pavel Borisov Date: Mon, 7 Oct 2024 18:55:44 +0400 Subject: [PATCH 40/56] expose functions that became private in PG17 due to ResourceOwner and SAOP changes in PG17 --- src/backend/access/nbtree/nbtutils.c | 3 +-- src/backend/utils/cache/catcache.c | 5 +++-- src/include/access/nbtree.h | 1 + src/include/utils/resowner_private.h | 33 ++++++++++++++++++++++++++++ 4 files changed, 38 insertions(+), 4 deletions(-) create mode 100644 src/include/utils/resowner_private.h diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index c22ccec789d..55ff1fdbfaf 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -73,7 +73,6 @@ static int _bt_binsrch_array_skey(FmgrInfo *orderproc, Datum tupdatum, bool tupnull, BTArrayKeyInfo *array, ScanKey cur, int32 *set_elem_result); -static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir); static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir); static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, IndexTuple tuple, TupleDesc tupdesc, int tupnatts, @@ -1377,7 +1376,7 @@ _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir) * On false result, the scankeys stay the same, and the array keys are not * advanced (every array remains at its final element for scan direction). 
*/ -static bool +bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; diff --git a/src/backend/utils/cache/catcache.c b/src/backend/utils/cache/catcache.c index 6f15161e426..1b2ad91424c 100644 --- a/src/backend/utils/cache/catcache.c +++ b/src/backend/utils/cache/catcache.c @@ -38,6 +38,7 @@ #include "utils/memutils.h" #include "utils/rel.h" #include "utils/resowner.h" +#include "utils/resowner_private.h" #include "utils/syscache.h" @@ -141,7 +142,7 @@ static const ResourceOwnerDesc catlistref_resowner_desc = }; /* Convenience wrappers over ResourceOwnerRemember/Forget */ -static inline void +void ResourceOwnerRememberCatCacheRef(ResourceOwner owner, HeapTuple tuple) { ResourceOwnerRemember(owner, PointerGetDatum(tuple), &catcache_resowner_desc); @@ -151,7 +152,7 @@ ResourceOwnerForgetCatCacheRef(ResourceOwner owner, HeapTuple tuple) { ResourceOwnerForget(owner, PointerGetDatum(tuple), &catcache_resowner_desc); } -static inline void +void ResourceOwnerRememberCatCacheListRef(ResourceOwner owner, CatCList *list) { ResourceOwnerRemember(owner, PointerGetDatum(list), &catlistref_resowner_desc); diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 049ebf72b7b..9ba149aa47d 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1288,6 +1288,7 @@ extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup); extern void _bt_freestack(BTStack stack); extern bool _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir); extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir); +extern bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir); extern void _bt_preprocess_keys(IndexScanDesc scan); extern bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, IndexTuple tuple, int tupnatts); diff --git a/src/include/utils/resowner_private.h b/src/include/utils/resowner_private.h new file mode 
100644 index 00000000000..d32a3a42ef0 --- /dev/null +++ b/src/include/utils/resowner_private.h @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * resowner_private.h + * POSTGRES resource owner private definitions. + * + * See utils/resowner/README for more info. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/utils/resowner_private.h + * + *------------------------------------------------------------------------- + */ +#ifndef RESOWNER_PRIVATE_H +#define RESOWNER_PRIVATE_H + +#include "storage/dsm.h" +#include "storage/fd.h" +#include "storage/lock.h" +#include "utils/catcache.h" +#include "utils/plancache.h" +#include "utils/resowner.h" +#include "utils/snapshot.h" + + +extern void ResourceOwnerRememberCatCacheRef(ResourceOwner owner, + HeapTuple tuple); +extern void ResourceOwnerRememberCatCacheListRef(ResourceOwner owner, + CatCList *list); + +#endif /* RESOWNER_PRIVATE_H */ From 7f152e12a4c6eb7dbe88de8db3232c67f2568cc2 Mon Sep 17 00:00:00 2001 From: Ilya Kobets Date: Fri, 27 Sep 2024 14:26:40 +0200 Subject: [PATCH 41/56] New CSN snapshot format * Add xlogptr and xmin to determine right order of transactions when decoding on replica. * Add CSN snapshot data to snapshot builder. * Record CSN to the running xids and restore it during logical decoding to the snapshot builder. * Add function to update CSN snapshot data in snapshot builder. * Update CSN snapshot LSN in snapshot building after each transaction commit. * Restore CSN snapshot data in SnapBuildBuildSnapshot(). 
--- src/backend/replication/logical/snapbuild.c | 16 ++++++++++++++++ src/backend/storage/ipc/procarray.c | 1 + src/backend/storage/ipc/standby.c | 1 + src/include/replication/snapbuild.h | 2 ++ src/include/storage/standby.h | 1 + src/include/storage/standbydefs.h | 1 + src/include/utils/snapshot.h | 3 ++- 7 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index f4a81de2f4e..0f1dc7d278e 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -219,6 +219,8 @@ struct SnapBuild */ TransactionId next_phase_at; + CSNSnapshotData csnSnapshotData; + /* * Array of transactions which could have catalog changes that committed * between xmin and xmax. @@ -576,6 +578,8 @@ SnapBuildBuildSnapshot(SnapBuild *builder) snapshot->regd_count = 0; snapshot->snapXactCompletionCount = 0; + snapshot->csnSnapshotData = builder->csnSnapshotData; + return snapshot; } @@ -1053,6 +1057,8 @@ SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid, TransactionId xmax = xid; + builder->csnSnapshotData.xlogptr = lsn; + /* * Transactions preceding BUILDING_SNAPSHOT will neither be decoded, nor * will they be part of a snapshot. So we don't need to record anything. @@ -1267,6 +1273,9 @@ SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xact * we hit fast paths in heapam_visibility.c. 
*/ builder->xmin = running->oldestRunningXid; + builder->csnSnapshotData.snapshotcsn = running->csn; + builder->csnSnapshotData.xmin = 0; + builder->csnSnapshotData.xlogptr = lsn; /* Remove transactions we don't need to keep track off anymore */ SnapBuildPurgeOlderTxn(builder); @@ -2185,3 +2194,10 @@ SnapBuildSnapshotExists(XLogRecPtr lsn) return ret == 0; } + +void +SnapBuildUpdateCSNSnaphot(SnapBuild *builder, + CSNSnapshotData *csnSnapshotData) +{ + builder->csnSnapshotData = *csnSnapshotData; +} diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 8da12c98346..e7b083ee7a7 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -2858,6 +2858,7 @@ GetRunningTransactionData(void) CurrentRunningXacts->oldestRunningXid = oldestRunningXid; CurrentRunningXacts->oldestDatabaseRunningXid = oldestDatabaseRunningXid; CurrentRunningXacts->latestCompletedXid = latestCompletedXid; + CurrentRunningXacts->csn = pg_atomic_read_u64(&TransamVariables->nextCommitSeqNo); Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid)); Assert(TransactionIdIsValid(CurrentRunningXacts->oldestRunningXid)); diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index 872679ca447..17ddeb893c6 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -1353,6 +1353,7 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) xlrec.nextXid = CurrRunningXacts->nextXid; xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid; xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid; + xlrec.csn = CurrRunningXacts->csn; /* Header */ XLogBeginInsert(); diff --git a/src/include/replication/snapbuild.h b/src/include/replication/snapbuild.h index 6eee98557ad..4a74c89c358 100644 --- a/src/include/replication/snapbuild.h +++ b/src/include/replication/snapbuild.h @@ -92,6 +92,8 @@ extern void SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid, 
extern void SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, struct xl_running_xacts *running); extern void SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn); +extern void SnapBuildUpdateCSNSnaphot(SnapBuild *builder, + CSNSnapshotData *csnSnapshotData); extern bool SnapBuildSnapshotExists(XLogRecPtr lsn); diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h index cce0bc521e7..4a42f9a767b 100644 --- a/src/include/storage/standby.h +++ b/src/include/storage/standby.h @@ -93,6 +93,7 @@ typedef struct RunningTransactionsData TransactionId oldestDatabaseRunningXid; /* same as above, but within the * current database */ TransactionId latestCompletedXid; /* so we can set xmax */ + CommitSeqNo csn; /* current csn */ TransactionId *xids; /* array of (sub)xids still running */ } RunningTransactionsData; diff --git a/src/include/storage/standbydefs.h b/src/include/storage/standbydefs.h index fe12f463a86..394bc42052f 100644 --- a/src/include/storage/standbydefs.h +++ b/src/include/storage/standbydefs.h @@ -52,6 +52,7 @@ typedef struct xl_running_xacts TransactionId nextXid; /* xid from TransamVariables->nextXid */ TransactionId oldestRunningXid; /* *not* oldestXmin */ TransactionId latestCompletedXid; /* so we can set xmax */ + CommitSeqNo csn; /* current csn */ TransactionId xids[FLEXIBLE_ARRAY_MEMBER]; } xl_running_xacts; diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index 6052c760056..9eec035622d 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -129,7 +129,8 @@ typedef struct pairingheap_node ph_node; } RetainUndoLocationPHNode; -typedef struct CSNSnapshotData { +typedef struct CSNSnapshotData +{ uint64 xmin; CommitSeqNo snapshotcsn; XLogRecPtr xlogptr; From 231143137f29fb52e75911eaf16370ee73edb580 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Thu, 3 Oct 2024 13:12:01 +0300 Subject: [PATCH 42/56] Restart archiver during PM_SHUTDOWN postmaster stage That 
allows S3 mode to finish WAL archiving if needed. --- src/backend/postmaster/postmaster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 82b23791f31..16fde3e8ec4 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -436,7 +436,7 @@ static void MaybeStartSlotSyncWorker(void); * even during recovery. */ #define PgArchStartupAllowed() \ - (((XLogArchivingActive() && pmState == PM_RUN) || \ + (((XLogArchivingActive() && (pmState == PM_RUN || pmState == PM_SHUTDOWN)) || \ (XLogArchivingAlways() && \ (pmState == PM_RECOVERY || pmState == PM_HOT_STANDBY))) && \ PgArchCanRestart()) From 25b8e2f7270fb3e58f7dfed904a15f2a6fee0a95 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Tue, 8 Oct 2024 21:31:33 +0300 Subject: [PATCH 43/56] Add handling of CSN snapshot in some places of snapbuild.c --- src/backend/replication/logical/snapbuild.c | 5 +++++ src/backend/utils/time/snapmgr.c | 1 + 2 files changed, 6 insertions(+) diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index 0f1dc7d278e..12ebfdb4702 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -677,6 +677,7 @@ SnapBuildInitialSnapshot(SnapBuild *builder) snap->snapshot_type = SNAPSHOT_MVCC; snap->xcnt = newxcnt; snap->xip = newxip; + snap->csnSnapshotData = builder->csnSnapshotData; return snap; } @@ -1246,6 +1247,10 @@ SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xact ReorderBufferTXN *txn; TransactionId xmin; + builder->csnSnapshotData.snapshotcsn = running->csn; + builder->csnSnapshotData.xmin = 0; + builder->csnSnapshotData.xlogptr = lsn; + /* * If we're not consistent yet, inspect the record to see whether it * allows to get closer to being consistent. 
If we are consistent, dump diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index ea14754a418..df9f4394f07 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -516,6 +516,7 @@ SetTransactionSnapshot(Snapshot sourcesnap, VirtualTransactionId *sourcevxid, CurrentSnapshot->xmin = sourcesnap->xmin; CurrentSnapshot->xmax = sourcesnap->xmax; CurrentSnapshot->xcnt = sourcesnap->xcnt; + CurrentSnapshot->csnSnapshotData = sourcesnap->csnSnapshotData; Assert(sourcesnap->xcnt <= GetMaxSnapshotXidCount()); if (sourcesnap->xcnt > 0) memcpy(CurrentSnapshot->xip, sourcesnap->xip, From 7563680371fac3acc22c021bf19ae7dbe887d8dc Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 14 Oct 2024 16:22:14 +0300 Subject: [PATCH 44/56] Move CheckPoint_hook() call after CheckPointBuffers() That allows to process flushed buffers in CheckPoint_hook(). --- src/backend/access/transam/xlog.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 42312c75830..6e12db59c9c 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7491,8 +7491,6 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags) { - if (CheckPoint_hook) - CheckPoint_hook(checkPointRedo, flags); CheckPointRelationMap(); CheckPointReplicationSlots(flags & CHECKPOINT_IS_SHUTDOWN); CheckPointSnapBuild(); @@ -7509,6 +7507,9 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags) CheckPointPredicate(); CheckPointBuffers(flags); + if (CheckPoint_hook) + CheckPoint_hook(checkPointRedo, flags); + /* Perform all queued up fsyncs */ TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START(); CheckpointStats.ckpt_sync_t = GetCurrentTimestamp(); From 313aa6cb41be03378ed1dba24ec1041aba4cab12 Mon Sep 17 00:00:00 2001 From: Pavel Borisov Date: Tue, 15 Oct 2024 22:39:08 
+0400 Subject: [PATCH 45/56] Restore GetIndexAmRoutine signature for compatibility with other callers Use GetIndexAmRoutineExtended instead for all Orioledb extensibility. --- src/backend/access/index/amapi.c | 11 ++++++++--- src/backend/commands/indexcmds.c | 2 +- src/backend/utils/adt/ruleutils.c | 2 +- src/backend/utils/cache/relcache.c | 2 +- src/include/access/amapi.h | 3 ++- 5 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/backend/access/index/amapi.c b/src/backend/access/index/amapi.c index 40fb78e71d2..a8f1c580acd 100644 --- a/src/backend/access/index/amapi.c +++ b/src/backend/access/index/amapi.c @@ -47,7 +47,6 @@ GetIndexAmRoutineWithTableAM(Oid tamoid, Oid amhandler) return routine; } - /* * GetIndexAmRoutine - call the specified access method handler routine to get * its IndexAmRoutine struct, which will be palloc'd in the caller's context. @@ -57,7 +56,13 @@ GetIndexAmRoutineWithTableAM(Oid tamoid, Oid amhandler) * indexes for the system catalogs. relcache.c relies on that. */ IndexAmRoutine * -GetIndexAmRoutine(Oid indoid, Oid amhandler) +GetIndexAmRoutine(Oid amhandler) +{ + return GetIndexAmRoutineExtended(InvalidOid, amhandler); +} + +IndexAmRoutine * +GetIndexAmRoutineExtended(Oid indoid, Oid amhandler) { HeapTuple ht_idx; HeapTuple ht_tblrel; @@ -146,7 +151,7 @@ GetIndexAmRoutineByAmId(Oid indoid, Oid amoid, bool noerror) ReleaseSysCache(tuple); /* And finally, call the handler function to get the API struct. 
*/ - return GetIndexAmRoutine(indoid, amhandler); + return GetIndexAmRoutineExtended(indoid, amhandler); } diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 89a9fecb30e..c8a926c0463 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -220,7 +220,7 @@ CheckIndexCompatible(Oid oldId, accessMethodName))); accessMethodForm = (Form_pg_am) GETSTRUCT(tuple); accessMethodId = accessMethodForm->oid; - amRoutine = GetIndexAmRoutine(oldId, accessMethodForm->amhandler); + amRoutine = GetIndexAmRoutineExtended(oldId, accessMethodForm->amhandler); ReleaseSysCache(tuple); amcanorder = amRoutine->amcanorder; diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 16aebc1cb90..216db91f335 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -1323,7 +1323,7 @@ pg_get_indexdef_worker(Oid indexrelid, int colno, amrec = (Form_pg_am) GETSTRUCT(ht_am); /* Fetch the index AM's API struct */ - amroutine = GetIndexAmRoutine(indexrelid, amrec->amhandler); + amroutine = GetIndexAmRoutineExtended(indexrelid, amrec->amhandler); /* * Get the index expressions, if any. (NOTE: we do not use the relcache diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 43884c43e80..33bd7bcda8f 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -1413,7 +1413,7 @@ InitIndexAmRoutine(Relation relation) * Call the amhandler in current, short-lived memory context, just in case * it leaks anything (it probably won't, but let's be paranoid). */ - tmp = GetIndexAmRoutine(relation->rd_id, relation->rd_amhandler); + tmp = GetIndexAmRoutineExtended(relation->rd_id, relation->rd_amhandler); /* OK, now transfer the data into relation's rd_indexcxt. 
*/ cached = (IndexAmRoutine *) MemoryContextAlloc(relation->rd_indexcxt, diff --git a/src/include/access/amapi.h index 3d7682be540..d98cd56fcb7 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -317,7 +317,8 @@ typedef struct IndexAmRoutine /* Functions in access/index/amapi.c */ extern IndexAmRoutine *GetIndexAmRoutineWithTableAM(Oid tamoid, Oid amhandler); -extern IndexAmRoutine *GetIndexAmRoutine(Oid indoid, Oid amhandler); +extern IndexAmRoutine *GetIndexAmRoutine(Oid amhandler); +extern IndexAmRoutine *GetIndexAmRoutineExtended(Oid indoid, Oid amhandler); extern IndexAmRoutine *GetIndexAmRoutineByAmId(Oid indoid, Oid amoid, bool noerror); typedef IndexAmRoutine *(*IndexAMRoutineHookType) (Oid tamoid, Oid amhandler); From 333f7e3fd3e8ce12ab1d2270bb2bac61c2384342 Mon Sep 17 00:00:00 2001 From: Pavel Borisov Date: Wed, 16 Oct 2024 19:23:07 +0400 Subject: [PATCH 46/56] Make index insert compatible with outside callers We split the aminsert method into aminsert and aminsertextended. aminsert is the method for indexes implemented in other extensions; it accepts ItemPointer tupleid. aminsertextended is for internal Postgres indexes and OrioleDB; it accepts Datum tupleid. Internal AMs are not supposed to use the aminsert method, so it is set to NULL for them. We cannot rely on extensions being aware of aminsertextended, so index_insert() preferentially calls aminsert when it is not NULL. The signature of index_insert() is reverted so that it can be called by other extensions. The Datum tupleid is confined inside the index_insert method. 
--- contrib/bloom/blutils.c | 3 ++- doc/src/sgml/indexam.sgml | 1 + src/backend/access/brin/brin.c | 3 ++- src/backend/access/common/toast_internals.c | 2 +- src/backend/access/gin/ginutil.c | 3 ++- src/backend/access/gist/gist.c | 3 ++- src/backend/access/hash/hash.c | 3 ++- src/backend/access/heap/heapam_handler.c | 2 +- src/backend/access/index/indexam.c | 21 ++++++++++++++++--- src/backend/access/nbtree/nbtree.c | 3 ++- src/backend/access/spgist/spgutils.c | 3 ++- src/backend/catalog/indexing.c | 2 +- src/backend/commands/constraint.c | 2 +- src/backend/executor/execIndexing.c | 20 ++++++++---------- src/include/access/amapi.h | 12 +++++++++++ src/include/access/genam.h | 2 +- .../modules/dummy_index_am/dummy_index_am.c | 3 ++- src/tools/pgindent/typedefs.list | 1 + 18 files changed, 62 insertions(+), 27 deletions(-) diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c index 6836129c90d..9b72303c895 100644 --- a/contrib/bloom/blutils.c +++ b/contrib/bloom/blutils.c @@ -131,7 +131,8 @@ blhandler(PG_FUNCTION_ARGS) amroutine->ambuild = blbuild; amroutine->ambuildempty = blbuildempty; - amroutine->aminsert = blinsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = blinsert; amroutine->aminsertcleanup = NULL; amroutine->ambulkdelete = blbulkdelete; amroutine->amvacuumcleanup = blvacuumcleanup; diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml index e3c1539a1e3..a33faf4f004 100644 --- a/doc/src/sgml/indexam.sgml +++ b/doc/src/sgml/indexam.sgml @@ -141,6 +141,7 @@ typedef struct IndexAmRoutine ambuild_function ambuild; ambuildempty_function ambuildempty; aminsert_function aminsert; + aminsert_extended_function aminsertextended; aminsertcleanup_function aminsertcleanup; ambulkdelete_function ambulkdelete; amvacuumcleanup_function amvacuumcleanup; diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index c33c3636801..c1ccef71937 100644 --- a/src/backend/access/brin/brin.c +++ 
b/src/backend/access/brin/brin.c @@ -273,7 +273,8 @@ brinhandler(PG_FUNCTION_ARGS) amroutine->ambuild = brinbuild; amroutine->ambuildempty = brinbuildempty; - amroutine->aminsert = brininsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = brininsert; amroutine->aminsertcleanup = brininsertcleanup; amroutine->ambulkdelete = brinbulkdelete; amroutine->amvacuumcleanup = brinvacuumcleanup; diff --git a/src/backend/access/common/toast_internals.c b/src/backend/access/common/toast_internals.c index 2b4fa1fb25a..538a554c917 100644 --- a/src/backend/access/common/toast_internals.c +++ b/src/backend/access/common/toast_internals.c @@ -338,7 +338,7 @@ toast_save_datum(Relation rel, Datum value, /* Only index relations marked as ready can be updated */ if (toastidxs[i]->rd_index->indisready) index_insert(toastidxs[i], t_values, t_isnull, - ItemPointerGetDatum(&(toasttup->t_self)), + &(toasttup->t_self), toastrel, toastidxs[i]->rd_index->indisunique ? UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index 5747ae6a4ca..68ce032f150 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -63,7 +63,8 @@ ginhandler(PG_FUNCTION_ARGS) amroutine->ambuild = ginbuild; amroutine->ambuildempty = ginbuildempty; - amroutine->aminsert = gininsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = gininsert; amroutine->aminsertcleanup = NULL; amroutine->ambulkdelete = ginbulkdelete; amroutine->amvacuumcleanup = ginvacuumcleanup; diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 0da8ab31046..66b086ee4c7 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -85,7 +85,8 @@ gisthandler(PG_FUNCTION_ARGS) amroutine->ambuild = gistbuild; amroutine->ambuildempty = gistbuildempty; - amroutine->aminsert = gistinsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = gistinsert; 
amroutine->aminsertcleanup = NULL; amroutine->ambulkdelete = gistbulkdelete; amroutine->amvacuumcleanup = gistvacuumcleanup; diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 1dc15d2a53b..557c7a3f316 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -83,7 +83,8 @@ hashhandler(PG_FUNCTION_ARGS) amroutine->ambuild = hashbuild; amroutine->ambuildempty = hashbuildempty; - amroutine->aminsert = hashinsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = hashinsert; amroutine->aminsertcleanup = NULL; amroutine->ambulkdelete = hashbulkdelete; amroutine->amvacuumcleanup = hashvacuumcleanup; diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 6f0464896c0..7d6828db403 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -2313,7 +2313,7 @@ heapam_index_validate_scan(Relation heapRelation, index_insert(indexRelation, values, isnull, - ItemPointerGetDatum(&rootTuple), + &rootTuple, heapRelation, indexInfo->ii_Unique ? 
UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index fe1efe283c2..4668d7159ae 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -213,24 +213,39 @@ bool index_insert(Relation indexRelation, Datum *values, bool *isnull, - Datum tupleid, + ItemPointer tupleid, Relation heapRelation, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) { RELATION_CHECKS; - CHECK_REL_PROCEDURE(aminsert); + + if (indexRelation->rd_indam->aminsertextended == NULL && indexRelation->rd_indam->aminsert == NULL ) + elog(ERROR, "at least one function aminsert or aminsertextended should be defined for index \"%s\"", \ + RelationGetRelationName(indexRelation)); if (!(indexRelation->rd_indam->ampredlocks)) CheckForSerializableConflictIn(indexRelation, (ItemPointer) NULL, InvalidBlockNumber); - return indexRelation->rd_indam->aminsert(indexRelation, values, isnull, + if (indexRelation->rd_indam->aminsert) + { + /* compatibility method for extension AM's not aware of aminsertextended */ + return indexRelation->rd_indam->aminsert(indexRelation, values, isnull, tupleid, heapRelation, checkUnique, indexUnchanged, indexInfo); + } + else + { + /* index insert method for internal AM's and Orioledb that are aware of aminsertextended */ + return indexRelation->rd_indam->aminsertextended(indexRelation, values, isnull, + ItemPointerGetDatum(tupleid), heapRelation, + checkUnique, indexUnchanged, + indexInfo); + } } /* ------------------------- diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 4acb3c73089..b661adb689e 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -127,7 +127,8 @@ bthandler(PG_FUNCTION_ARGS) amroutine->ambuild = btbuild; amroutine->ambuildempty = btbuildempty; - amroutine->aminsert = btinsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = btinsert; 
amroutine->aminsertcleanup = NULL; amroutine->ambulkdelete = btbulkdelete; amroutine->amvacuumcleanup = btvacuumcleanup; diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index 76b80146ff0..c1228ed2c01 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -70,7 +70,8 @@ spghandler(PG_FUNCTION_ARGS) amroutine->ambuild = spgbuild; amroutine->ambuildempty = spgbuildempty; - amroutine->aminsert = spginsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = spginsert; amroutine->aminsertcleanup = NULL; amroutine->ambulkdelete = spgbulkdelete; amroutine->amvacuumcleanup = spgvacuumcleanup; diff --git a/src/backend/catalog/indexing.c b/src/backend/catalog/indexing.c index cd78b1ea55e..d0d1abda58a 100644 --- a/src/backend/catalog/indexing.c +++ b/src/backend/catalog/indexing.c @@ -170,7 +170,7 @@ CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple, index_insert(index, /* index relation */ values, /* array of index Datums */ isnull, /* is-null flags */ - ItemPointerGetDatum(&(heapTuple->t_self)), /* tid of heap tuple */ + &(heapTuple->t_self), /* tid of heap tuple */ heapRelation, index->rd_index->indisunique ? UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, diff --git a/src/backend/commands/constraint.c b/src/backend/commands/constraint.c index 43618646861..ea5a1f365b1 100644 --- a/src/backend/commands/constraint.c +++ b/src/backend/commands/constraint.c @@ -171,7 +171,7 @@ unique_key_recheck(PG_FUNCTION_ARGS) * the row is now dead, because that is the TID the index will know * about. 
*/ - index_insert(indexRel, values, isnull, ItemPointerGetDatum(&checktid), + index_insert(indexRel, values, isnull, &checktid, trigdata->tg_relation, UNIQUE_CHECK_EXISTING, false, indexInfo); diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index b2a8412205a..9e09ef1cf1f 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -313,19 +313,19 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, ExprContext *econtext; Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; - Datum tupleid; + ItemPointer tupleid; if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) { bool isnull; - tupleid = slot_getsysattr(slot, RowIdAttributeNumber, &isnull); + tupleid = DatumGetItemPointer(slot_getsysattr(slot, RowIdAttributeNumber, &isnull)); Assert(!isnull); } else { Assert(ItemPointerIsValid(&slot->tts_tid)); - tupleid = PointerGetDatum(&slot->tts_tid); + tupleid = &slot->tts_tid; } /* @@ -473,7 +473,6 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, { bool violationOK; CEOUC_WAIT_MODE waitMode; - ItemPointer raw_tupleid = DatumGetItemPointer(tupleid); if (applyNoDupErr) { @@ -494,7 +493,7 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, satisfiesConstraint = check_exclusion_or_unique_constraint(heapRelation, indexRelation, indexInfo, - raw_tupleid, values, isnull, + tupleid, values, isnull, estate, false, waitMode, violationOK, NULL); } @@ -537,18 +536,18 @@ ExecUpdateIndexTuples(ResultRelInfo *resultRelInfo, ExprContext *econtext; Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; - Datum tupleid; + ItemPointer tupleid; if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) { bool isnull; - tupleid = slot_getsysattr(slot, RowIdAttributeNumber, &isnull); + tupleid = DatumGetItemPointer(slot_getsysattr(slot, RowIdAttributeNumber, &isnull)); Assert(!isnull); } else { Assert(ItemPointerIsValid(&slot->tts_tid)); - tupleid = 
PointerGetDatum(&slot->tts_tid); + tupleid = &slot->tts_tid; } /* @@ -717,7 +716,7 @@ ExecUpdateIndexTuples(ResultRelInfo *resultRelInfo, old_valid, values, /* array of index Datums */ isnull, /* null flags */ - tupleid, /* tid of heap tuple */ + ItemPointerGetDatum(tupleid), /* tid of heap tuple */ valuesOld, isnullOld, oldTupleid, @@ -768,7 +767,6 @@ ExecUpdateIndexTuples(ResultRelInfo *resultRelInfo, { bool violationOK; CEOUC_WAIT_MODE waitMode; - ItemPointer raw_tupleid = DatumGetItemPointer(tupleid); if (applyNoDupErr) { @@ -789,7 +787,7 @@ ExecUpdateIndexTuples(ResultRelInfo *resultRelInfo, satisfiesConstraint = check_exclusion_or_unique_constraint(heapRelation, indexRelation, indexInfo, - raw_tupleid, values, isnull, + tupleid, values, isnull, estate, false, waitMode, violationOK, NULL); } diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index d98cd56fcb7..c6f57f7d192 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -105,6 +105,16 @@ typedef void (*ambuildempty_function) (Relation indexRelation); /* insert this tuple */ typedef bool (*aminsert_function) (Relation indexRelation, + Datum *values, + bool *isnull, + ItemPointer tupleid, + Relation heapRelation, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + struct IndexInfo *indexInfo); + +/* extended version of aminsert taking Datum tupleid */ +typedef bool (*aminsert_extended_function) (Relation indexRelation, Datum *values, bool *isnull, Datum tupleid, @@ -112,6 +122,7 @@ typedef bool (*aminsert_function) (Relation indexRelation, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); + /* update this tuple */ typedef bool (*amupdate_function) (Relation indexRelation, bool new_valid, @@ -288,6 +299,7 @@ typedef struct IndexAmRoutine ambuild_function ambuild; ambuildempty_function ambuildempty; aminsert_function aminsert; + aminsert_extended_function aminsertextended; aminsertcleanup_function aminsertcleanup; amupdate_function 
amupdate; amdelete_function amdelete; diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 7cb6d2aa3a2..5752a3cf1ef 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -144,7 +144,7 @@ extern void index_close(Relation relation, LOCKMODE lockmode); extern bool index_insert(Relation indexRelation, Datum *values, bool *isnull, - Datum tupleid, + ItemPointer tupleid, Relation heapRelation, IndexUniqueCheck checkUnique, bool indexUnchanged, diff --git a/src/test/modules/dummy_index_am/dummy_index_am.c b/src/test/modules/dummy_index_am/dummy_index_am.c index 80c6668666a..1c6825f391a 100644 --- a/src/test/modules/dummy_index_am/dummy_index_am.c +++ b/src/test/modules/dummy_index_am/dummy_index_am.c @@ -303,7 +303,8 @@ dihandler(PG_FUNCTION_ARGS) amroutine->ambuild = dibuild; amroutine->ambuildempty = dibuildempty; - amroutine->aminsert = diinsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = diinsert; amroutine->ambulkdelete = dibulkdelete; amroutine->amvacuumcleanup = divacuumcleanup; amroutine->amcanreturn = NULL; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index d4e9515e9f4..9b3c2334949 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3266,6 +3266,7 @@ amgetbitmap_function amgettuple_function aminitparallelscan_function aminsert_function +aminsert_extended_function aminsertcleanup_function ammarkpos_function amoptions_function From 20cea792be399ad68b2b74bd1cfb21cb7fb37a33 Mon Sep 17 00:00:00 2001 From: Pavel Borisov Date: Fri, 18 Oct 2024 14:31:16 +0400 Subject: [PATCH 47/56] Fix warning in pg_rewind --- src/bin/pg_rewind/pg_rewind.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index 9b0a340d14b..50873c5d2cb 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -229,6 +229,7 @@ main(int argc, char **argv) case 6: if 
(!parse_sync_method(optarg, &sync_method)) exit(1); + break; case 'e': /* -e or --extension */ simple_string_list_append(&extensions, optarg); From 83d892d8d921924aa4158e85336adff573fd1953 Mon Sep 17 00:00:00 2001 From: Pavel Borisov Date: Mon, 21 Oct 2024 14:58:36 +0400 Subject: [PATCH 48/56] Revert pre 5bf748b86bc67 SAOP behavior in PG17 Commit 5bf748b86bc67 allows generation of unsafe SAOP path keys on a multicolumn index that were disabled previously by 807a40c5. --- src/backend/optimizer/path/indxpath.c | 42 +++++++++++++++++++--- src/test/regress/expected/create_index.out | 24 +++++++------ 2 files changed, 52 insertions(+), 14 deletions(-) diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 7c043c53133..6dbba5b1ca9 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -98,7 +98,8 @@ static List *build_index_paths(PlannerInfo *root, RelOptInfo *rel, IndexOptInfo *index, IndexClauseSet *clauses, bool useful_predicate, ScanTypeControl scantype, - bool *skip_nonnative_saop); + bool *skip_nonnative_saop, + bool *skip_lower_saop); static List *build_paths_for_OR(PlannerInfo *root, RelOptInfo *rel, List *clauses, List *other_clauses); static List *generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel, @@ -702,6 +703,7 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel, { List *indexpaths; bool skip_nonnative_saop = false; + bool skip_lower_saop = false; ListCell *lc; /* @@ -712,8 +714,24 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel, index, clauses, index->predOK, ST_ANYSCAN, - &skip_nonnative_saop); - + &skip_nonnative_saop, + &skip_lower_saop); + + /* + * If we skipped any lower-order ScalarArrayOpExprs on an index with an AM + * that supports them, then try again including those clauses. This will + * produce paths with more selectivity but no ordering. 
+ */ + if (skip_lower_saop) + { + indexpaths = list_concat(indexpaths, + build_index_paths(root, rel, + index, clauses, + index->predOK, + ST_ANYSCAN, + &skip_nonnative_saop, + NULL)); + } /* * Submit all the ones that can form plain IndexScan plans to add_path. (A * plain IndexPath can represent either a plain IndexScan or an @@ -750,6 +768,7 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel, index, clauses, false, ST_BITMAPSCAN, + NULL, NULL); *bitindexpaths = list_concat(*bitindexpaths, indexpaths); } @@ -794,7 +813,8 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, IndexOptInfo *index, IndexClauseSet *clauses, bool useful_predicate, ScanTypeControl scantype, - bool *skip_nonnative_saop) + bool *skip_nonnative_saop, + bool *skip_lower_saop) { List *result = NIL; IndexPath *ipath; @@ -805,6 +825,7 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, List *orderbyclausecols; List *index_pathkeys; List *useful_pathkeys; + bool found_lower_saop_clause; bool pathkeys_possibly_useful; bool index_is_ordered; bool index_only_scan; @@ -843,6 +864,7 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, * otherwise accounted for. */ index_clauses = NIL; + found_lower_saop_clause = false; outer_relids = bms_copy(rel->lateral_relids); for (indexcol = 0; indexcol < index->nkeycolumns; indexcol++) { @@ -866,6 +888,16 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, *skip_nonnative_saop = true; continue; } + if (skip_nonnative_saop && IsA(rinfo->clause, ScalarArrayOpExpr) && indexcol > 0) + { + if (skip_lower_saop) + { + /* Caller doesn't want to lose index ordering */ + *skip_lower_saop = true; + continue; + } + found_lower_saop_clause = true; + } /* OK to include this clause */ index_clauses = lappend(index_clauses, iclause); @@ -897,6 +929,7 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, * if we are only trying to build bitmap indexscans. 
*/ pathkeys_possibly_useful = (scantype != ST_BITMAPSCAN && + !found_lower_saop_clause && has_useful_pathkeys(root, rel)); index_is_ordered = (index->sortopfamily != NULL); if (index_is_ordered && pathkeys_possibly_useful) @@ -1148,6 +1181,7 @@ build_paths_for_OR(PlannerInfo *root, RelOptInfo *rel, index, &clauseset, useful_predicate, ST_BITMAPSCAN, + NULL, NULL); result = list_concat(result, indexpaths); } diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index cf6eac57349..f23c60f94bc 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -1943,11 +1943,13 @@ explain (costs off) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) ORDER BY thousand; - QUERY PLAN --------------------------------------------------------------------------------- - Index Only Scan using tenk1_thous_tenthous on tenk1 - Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[]))) -(2 rows) + QUERY PLAN +-------------------------------------------------------------------------------------- + Sort + Sort Key: thousand + -> Index Only Scan using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[]))) +(4 rows) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) @@ -1963,11 +1965,13 @@ explain (costs off) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) ORDER BY thousand DESC, tenthous DESC; - QUERY PLAN --------------------------------------------------------------------------------- - Index Only Scan Backward using tenk1_thous_tenthous on tenk1 - Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[]))) -(2 rows) + QUERY PLAN +-------------------------------------------------------------------------------------- + Sort + Sort Key: thousand DESC, tenthous DESC + -> Index Only Scan using 
tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[]))) +(4 rows) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) From 94d572f12a41e8622b2b853db8265829a7f64a4c Mon Sep 17 00:00:00 2001 From: Pavel Borisov Date: Mon, 21 Oct 2024 15:37:02 +0400 Subject: [PATCH 49/56] Revert "Revert pre 5bf748b86bc67 SAOP behavior in PG17" This reverts commit 83d892d8d921924aa4158e85336adff573fd1953. --- src/backend/optimizer/path/indxpath.c | 42 +++------------------- src/test/regress/expected/create_index.out | 24 ++++++------- 2 files changed, 14 insertions(+), 52 deletions(-) diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 6dbba5b1ca9..7c043c53133 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -98,8 +98,7 @@ static List *build_index_paths(PlannerInfo *root, RelOptInfo *rel, IndexOptInfo *index, IndexClauseSet *clauses, bool useful_predicate, ScanTypeControl scantype, - bool *skip_nonnative_saop, - bool *skip_lower_saop); + bool *skip_nonnative_saop); static List *build_paths_for_OR(PlannerInfo *root, RelOptInfo *rel, List *clauses, List *other_clauses); static List *generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel, @@ -703,7 +702,6 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel, { List *indexpaths; bool skip_nonnative_saop = false; - bool skip_lower_saop = false; ListCell *lc; /* @@ -714,24 +712,8 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel, index, clauses, index->predOK, ST_ANYSCAN, - &skip_nonnative_saop, - &skip_lower_saop); - - /* - * If we skipped any lower-order ScalarArrayOpExprs on an index with an AM - * that supports them, then try again including those clauses. This will - * produce paths with more selectivity but no ordering. 
- */ - if (skip_lower_saop) - { - indexpaths = list_concat(indexpaths, - build_index_paths(root, rel, - index, clauses, - index->predOK, - ST_ANYSCAN, - &skip_nonnative_saop, - NULL)); - } + &skip_nonnative_saop); + /* * Submit all the ones that can form plain IndexScan plans to add_path. (A * plain IndexPath can represent either a plain IndexScan or an @@ -768,7 +750,6 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel, index, clauses, false, ST_BITMAPSCAN, - NULL, NULL); *bitindexpaths = list_concat(*bitindexpaths, indexpaths); } @@ -813,8 +794,7 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, IndexOptInfo *index, IndexClauseSet *clauses, bool useful_predicate, ScanTypeControl scantype, - bool *skip_nonnative_saop, - bool *skip_lower_saop) + bool *skip_nonnative_saop) { List *result = NIL; IndexPath *ipath; @@ -825,7 +805,6 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, List *orderbyclausecols; List *index_pathkeys; List *useful_pathkeys; - bool found_lower_saop_clause; bool pathkeys_possibly_useful; bool index_is_ordered; bool index_only_scan; @@ -864,7 +843,6 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, * otherwise accounted for. */ index_clauses = NIL; - found_lower_saop_clause = false; outer_relids = bms_copy(rel->lateral_relids); for (indexcol = 0; indexcol < index->nkeycolumns; indexcol++) { @@ -888,16 +866,6 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, *skip_nonnative_saop = true; continue; } - if (skip_nonnative_saop && IsA(rinfo->clause, ScalarArrayOpExpr) && indexcol > 0) - { - if (skip_lower_saop) - { - /* Caller doesn't want to lose index ordering */ - *skip_lower_saop = true; - continue; - } - found_lower_saop_clause = true; - } /* OK to include this clause */ index_clauses = lappend(index_clauses, iclause); @@ -929,7 +897,6 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, * if we are only trying to build bitmap indexscans. 
*/ pathkeys_possibly_useful = (scantype != ST_BITMAPSCAN && - !found_lower_saop_clause && has_useful_pathkeys(root, rel)); index_is_ordered = (index->sortopfamily != NULL); if (index_is_ordered && pathkeys_possibly_useful) @@ -1181,7 +1148,6 @@ build_paths_for_OR(PlannerInfo *root, RelOptInfo *rel, index, &clauseset, useful_predicate, ST_BITMAPSCAN, - NULL, NULL); result = list_concat(result, indexpaths); } diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index f23c60f94bc..cf6eac57349 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -1943,13 +1943,11 @@ explain (costs off) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) ORDER BY thousand; - QUERY PLAN --------------------------------------------------------------------------------------- - Sort - Sort Key: thousand - -> Index Only Scan using tenk1_thous_tenthous on tenk1 - Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[]))) -(4 rows) + QUERY PLAN +-------------------------------------------------------------------------------- + Index Only Scan using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[]))) +(2 rows) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) @@ -1965,13 +1963,11 @@ explain (costs off) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) ORDER BY thousand DESC, tenthous DESC; - QUERY PLAN --------------------------------------------------------------------------------------- - Sort - Sort Key: thousand DESC, tenthous DESC - -> Index Only Scan using tenk1_thous_tenthous on tenk1 - Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[]))) -(4 rows) + QUERY PLAN +-------------------------------------------------------------------------------- + Index Only Scan Backward using 
tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[]))) +(2 rows) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) From b8f54509c59804ca3c3fc5ab8258fb3062f97bbc Mon Sep 17 00:00:00 2001 From: Pavel Borisov Date: Tue, 22 Oct 2024 12:32:59 +0400 Subject: [PATCH 50/56] Fix active snapshot checks --- src/backend/executor/execMain.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 26b3974b9fb..f0dfccd9fab 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -150,7 +150,7 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags) Assert(queryDesc->estate == NULL); /* caller must ensure the query's snapshot is active */ - Assert(GetActiveSnapshot() == queryDesc->snapshot); + Assert((ActiveSnapshotSet() ? GetActiveSnapshot() : InvalidSnapshot) == queryDesc->snapshot); /* * If the transaction is read-only, we need to check if any writes are @@ -325,7 +325,7 @@ standard_ExecutorRun(QueryDesc *queryDesc, Assert(!(estate->es_top_eflags & EXEC_FLAG_EXPLAIN_ONLY)); /* caller must ensure the query's snapshot is active */ - Assert(GetActiveSnapshot() == estate->es_snapshot); + Assert((ActiveSnapshotSet() ? GetActiveSnapshot() : InvalidSnapshot) == estate->es_snapshot); /* * Switch into per-query memory context From ef396875638fd8fd76786b148ac97247288e980b Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 23 Sep 2024 14:03:54 +0300 Subject: [PATCH 51/56] Transform OR-clauses to SAOP's during index matching Replace "(indexkey op C1) OR (indexkey op C2) ... (indexkey op CN)" with "indexkey op ANY(ARRAY[C1, C2, ...])" (ScalarArrayOpExpr node) during matching a clause to index. 
Here Ci is an i-th constant or parameters expression, 'expr' is non-constant expression, 'op' is an operator which returns boolean result and has a commuter (for the case of reverse order of constant and non-constant parts of the expression, like 'Cn op expr'). This transformation allows handling long OR-clauses with single IndexScan avoiding slower bitmap scans. Discussion: https://postgr.es/m/567ED6CA.2040504%40sigaev.ru Author: Alena Rybakina Author: Andrey Lepikhov Reviewed-by: Peter Geoghegan Reviewed-by: Ranier Vilela Reviewed-by: Alexander Korotkov Reviewed-by: Robert Haas Reviewed-by: Jian He Reviewed-by: Tom Lane Reviewed-by: Nikolay Shaplov --- src/backend/optimizer/path/indxpath.c | 281 ++++++++++++++++++++- src/test/regress/expected/create_index.out | 270 ++++++++++++++++++-- src/test/regress/expected/join.out | 57 ++++- src/test/regress/expected/rowsecurity.out | 7 + src/test/regress/expected/stats_ext.out | 12 + src/test/regress/expected/uuid.out | 31 +++ src/test/regress/sql/create_index.sql | 69 +++++ src/test/regress/sql/join.sql | 9 + src/test/regress/sql/rowsecurity.sql | 1 + src/test/regress/sql/stats_ext.sql | 3 + src/test/regress/sql/uuid.sql | 12 + 11 files changed, 729 insertions(+), 23 deletions(-) diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 7c043c53133..d5ba0c5cd68 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -20,6 +20,7 @@ #include "access/stratnum.h" #include "access/sysattr.h" #include "catalog/pg_am.h" +#include "catalog/pg_amop.h" #include "catalog/pg_operator.h" #include "catalog/pg_opfamily.h" #include "catalog/pg_type.h" @@ -32,8 +33,10 @@ #include "optimizer/paths.h" #include "optimizer/prep.h" #include "optimizer/restrictinfo.h" +#include "utils/array.h" #include "utils/lsyscache.h" #include "utils/selfuncs.h" +#include "utils/syscache.h" /* XXX see PartCollMatchesExprColl */ @@ -166,6 +169,10 @@ static IndexClause 
*match_rowcompare_to_indexcol(PlannerInfo *root, RestrictInfo *rinfo, int indexcol, IndexOptInfo *index); +static IndexClause *match_orclause_to_indexcol(PlannerInfo *root, + RestrictInfo *rinfo, + int indexcol, + IndexOptInfo *index); static IndexClause *expand_indexqual_rowcompare(PlannerInfo *root, RestrictInfo *rinfo, int indexcol, @@ -2138,7 +2145,10 @@ match_clause_to_index(PlannerInfo *root, * (3) must match the collation of the index, if collation is relevant. * * Our definition of "const" is exceedingly liberal: we allow anything that - * doesn't involve a volatile function or a Var of the index's relation. + * doesn't involve a volatile function or a Var of the index's relation + * except for a boolean OR expression input: due to a trade-off between the + * expected execution speedup and planning complexity, we limit or->saop + * transformation by obvious cases when an index scan can get a profit. * In particular, Vars belonging to other relations of the query are * accepted here, since a clause of that form can be used in a * parameterized indexscan. It's the responsibility of higher code levels @@ -2168,6 +2178,10 @@ match_clause_to_index(PlannerInfo *root, * It is also possible to match ScalarArrayOpExpr clauses to indexes, when * the clause is of the form "indexkey op ANY (arrayconst)". * + * It is also possible to match a list of OR clauses if it might be + * transformed into a single ScalarArrayOpExpr clause. On success, + * the returning index clause will contain a trasformed clause. + * * For boolean indexes, it is also possible to match the clause directly * to the indexkey; or perhaps the clause is (NOT indexkey). * @@ -2217,9 +2231,9 @@ match_clause_to_indexcol(PlannerInfo *root, } /* - * Clause must be an opclause, funcclause, ScalarArrayOpExpr, or - * RowCompareExpr. Or, if the index supports it, we can handle IS - * NULL/NOT NULL clauses. 
+ * Clause must be an opclause, funcclause, ScalarArrayOpExpr, + * RowCompareExpr, or OR-clause that could be converted to SAOP. Or, if + * the index supports it, we can handle IS NULL/NOT NULL clauses. */ if (IsA(clause, OpExpr)) { @@ -2237,6 +2251,10 @@ match_clause_to_indexcol(PlannerInfo *root, { return match_rowcompare_to_indexcol(root, rinfo, indexcol, index); } + else if (restriction_is_or_clause(rinfo)) + { + return match_orclause_to_indexcol(root, rinfo, indexcol, index); + } else if (index->amsearchnulls && IsA(clause, NullTest)) { NullTest *nt = (NullTest *) clause; @@ -2760,6 +2778,261 @@ match_rowcompare_to_indexcol(PlannerInfo *root, return NULL; } +/* + * match_orclause_to_indexcol() + * Handles the OR-expr case for match_clause_to_indexcol() in the case + * when it could be transformed to ScalarArrayOpExpr. + * + * Given a list of OR-clause args, attempts to transform this BoolExpr into + * a single SAOP expression. On success, returns an IndexClause, containing + * the transformed expression or NULL, if failed. + */ +static IndexClause * +match_orclause_to_indexcol(PlannerInfo *root, + RestrictInfo *rinfo, + int indexcol, + IndexOptInfo *index) +{ + ListCell *lc; + BoolExpr *orclause = (BoolExpr *) rinfo->orclause; + Node *indexExpr = NULL; + List *consts = NIL; + Node *arrayNode = NULL; + ScalarArrayOpExpr *saopexpr = NULL; + Oid matchOpno = InvalidOid; + IndexClause *iclause; + Oid consttype = InvalidOid; + Oid arraytype = InvalidOid; + Oid inputcollid = InvalidOid; + bool firstTime = true; + bool have_param = false; + + Assert(IsA(orclause, BoolExpr)); + Assert(orclause->boolop == OR_EXPR); + + /* + * Try to convert a list of OR-clauses to a single SAOP expression. Each + * OR entry must be in the form: (indexkey operator constant) or (constant + * operator indexkey). Operators of all the entries must match. Constant + * might be either Const or Param. To be effective, give up on the first + * non-matching entry. 
Exit is implemented as a break from the loop, which + * is catched afterwards. + */ + foreach(lc, orclause->args) + { + RestrictInfo *subRinfo; + OpExpr *subClause; + Oid opno; + Node *leftop, + *rightop; + Node *constExpr; + + if (!IsA(lfirst(lc), RestrictInfo)) + break; + + subRinfo = (RestrictInfo *) lfirst(lc); + + /* Only operator clauses can match */ + if (!IsA(subRinfo->clause, OpExpr)) + break; + + subClause = (OpExpr *) subRinfo->clause; + opno = subClause->opno; + + /* Only binary operators can match */ + if (list_length(subClause->args) != 2) + break; + + /* + * The parameters below must match between sub-rinfo and its parent as + * make_restrictinfo() fills them with the same values, and further + * modifications are also the same for the whole subtree. However, + * still make a sanity check. + */ + Assert(subRinfo->is_pushed_down == rinfo->is_pushed_down); + Assert(subRinfo->is_clone == rinfo->is_clone); + Assert(subRinfo->security_level == rinfo->security_level); + Assert(bms_equal(subRinfo->incompatible_relids, rinfo->incompatible_relids)); + Assert(bms_equal(subRinfo->outer_relids, rinfo->outer_relids)); + + /* + * Also, check that required_relids in sub-rinfo is subset of parent's + * required_relids. + */ + Assert(bms_is_subset(subRinfo->required_relids, rinfo->required_relids)); + + /* Only operator returning boolean suits the transformation */ + if (get_op_rettype(opno) != BOOLOID) + break; + + /* + * Check for clauses of the form: (indexkey operator constant) or + * (constant operator indexkey). Determine indexkey side first, check + * the constant later. 
+ */ + leftop = (Node *) linitial(subClause->args); + rightop = (Node *) lsecond(subClause->args); + if (match_index_to_operand(leftop, indexcol, index)) + { + indexExpr = leftop; + constExpr = rightop; + } + else if (match_index_to_operand(rightop, indexcol, index)) + { + opno = get_commutator(opno); + if (!OidIsValid(opno)) + { + /* commutator doesn't exist, we can't reverse the order */ + break; + } + indexExpr = rightop; + constExpr = leftop; + } + else + { + break; + } + + /* + * Ignore any RelabelType node above the operands. This is needed to + * be able to apply indexscanning in binary-compatible-operator cases. + * Note: we can assume there is at most one RelabelType node; + * eval_const_expressions() will have simplified if more than one. + */ + if (IsA(constExpr, RelabelType)) + constExpr = (Node *) ((RelabelType *) constExpr)->arg; + if (IsA(indexExpr, RelabelType)) + indexExpr = (Node *) ((RelabelType *) indexExpr)->arg; + + /* We allow constant to be Const or Param */ + if (!IsA(constExpr, Const) && !IsA(constExpr, Param)) + break; + + /* Forbid transformation for composite types, records. */ + if (type_is_rowtype(exprType(constExpr)) || + type_is_rowtype(exprType(indexExpr))) + break; + + /* + * Save information about the operator, type, and collation for the + * first matching qual. Then, check that subsequent quals match the + * first. + */ + if (firstTime) + { + matchOpno = opno; + consttype = exprType(constExpr); + arraytype = get_array_type(consttype); + inputcollid = subClause->inputcollid; + + /* + * Check that the operator is presented in the opfamily and that + * the expression collation matches the index collation. Also, + * there must be an array type to construct an array later. 
+ */ + if (!IndexCollMatchesExprColl(index->indexcollations[indexcol], inputcollid) || + !op_in_opfamily(matchOpno, index->opfamily[indexcol]) || + !OidIsValid(arraytype)) + break; + firstTime = false; + } + else + { + if (opno != matchOpno || + inputcollid != subClause->inputcollid || + consttype != exprType(constExpr)) + break; + } + + if (IsA(constExpr, Param)) + have_param = true; + consts = lappend(consts, constExpr); + } + + /* + * Catch the break from the loop above. Normally, a foreach() loop ends + * up with a NULL list cell. A non-NULL list cell indicates a break from + * the foreach() loop. Free the consts list and return NULL then. + */ + if (lc != NULL) + { + list_free(consts); + return NULL; + } + + /* + * Assemble an array from the list of constants. It seems more profitable + * to build a const array. But in the presence of parameters, we don't + * have a specific value here and must employ an ArrayExpr instead. + */ + + if (have_param) + { + ArrayExpr *arrayExpr = makeNode(ArrayExpr); + + /* array_collid will be set by parse_collate.c */ + arrayExpr->element_typeid = consttype; + arrayExpr->array_typeid = arraytype; + arrayExpr->multidims = false; + arrayExpr->elements = consts; + arrayExpr->location = -1; + + arrayNode = (Node *) arrayExpr; + } + else + { + int16 typlen; + bool typbyval; + char typalign; + Datum *elems; + int i = 0; + ArrayType *arrayConst; + + get_typlenbyvalalign(consttype, &typlen, &typbyval, &typalign); + + elems = (Datum *) palloc(sizeof(Datum) * list_length(consts)); + foreach(lc, consts) + elems[i++] = ((Const *) lfirst(lc))->constvalue; + + arrayConst = construct_array(elems, i, consttype, + typlen, typbyval, typalign); + arrayNode = (Node *) makeConst(arraytype, -1, inputcollid, + -1, PointerGetDatum(arrayConst), + false, false); + + pfree(elems); + list_free(consts); + } + + /* Build the SAOP expression node */ + saopexpr = makeNode(ScalarArrayOpExpr); + saopexpr->opno = matchOpno; + saopexpr->opfuncid = 
get_opcode(matchOpno); + saopexpr->hashfuncid = InvalidOid; + saopexpr->negfuncid = InvalidOid; + saopexpr->useOr = true; + saopexpr->inputcollid = inputcollid; + saopexpr->args = list_make2(indexExpr, arrayNode); + saopexpr->location = -1; + + /* + * Finally, build an IndexClause based on the SAOP node. Use + * make_simple_restrictinfo() to get RestrictInfo with clean selectivity + * estimations because it may differ from the estimation made for an OR + * clause. Although it is not a lossy expression, keep the old version of + * rinfo in iclause->rinfo to detect duplicates and recheck the original + * clause. + */ + iclause = makeNode(IndexClause); + iclause->rinfo = rinfo; + iclause->indexquals = list_make1(make_simple_restrictinfo(root, + &saopexpr->xpr)); + iclause->lossy = false; + iclause->indexcol = indexcol; + iclause->indexcols = NIL; + return iclause; +} + /* * expand_indexqual_rowcompare --- expand a single indexqual condition * that is a RowCompareExpr diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index cf6eac57349..1324d6927c4 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -1844,18 +1844,67 @@ DROP TABLE onek_with_null; EXPLAIN (COSTS OFF) SELECT * FROM tenk1 WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------------ - Bitmap Heap Scan on tenk1 - Recheck Cond: (((thousand = 42) AND (tenthous = 1)) OR ((thousand = 42) AND (tenthous = 3)) OR ((thousand = 42) AND (tenthous = 42))) - -> BitmapOr - -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: ((thousand = 42) AND (tenthous = 1)) - -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: ((thousand = 42) AND (tenthous = 3)) - -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: ((thousand = 42) AND (tenthous = 
42)) -(9 rows) + QUERY PLAN +------------------------------------------------------------------------------ + Index Scan using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand = 42) AND (tenthous = ANY ('{1,3,42}'::integer[]))) +(2 rows) + +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); + unique1 | unique2 | two | four | ten | twenty | hundred | thousand | twothousand | fivethous | tenthous | odd | even | stringu1 | stringu2 | string4 +---------+---------+-----+------+-----+--------+---------+----------+-------------+-----------+----------+-----+------+----------+----------+--------- + 42 | 5530 | 0 | 2 | 2 | 2 | 42 | 42 | 42 | 42 | 42 | 84 | 85 | QBAAAA | SEIAAA | OOOOxx +(1 row) + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = (SELECT 1 + 2) OR tenthous = 42); + QUERY PLAN +---------------------------------------------------------------------------------------- + Index Scan using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand = 42) AND (tenthous = ANY (ARRAY[1, (InitPlan 1).col1, 42]))) + InitPlan 1 + -> Result +(4 rows) + +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = (SELECT 1 + 2) OR tenthous = 42); + unique1 | unique2 | two | four | ten | twenty | hundred | thousand | twothousand | fivethous | tenthous | odd | even | stringu1 | stringu2 | string4 +---------+---------+-----+------+-----+--------+---------+----------+-------------+-----------+----------+-----+------+----------+----------+--------- + 42 | 5530 | 0 | 2 | 2 | 2 | 42 | 42 | 42 | 42 | 42 | 84 | 85 | QBAAAA | SEIAAA | OOOOxx +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); + QUERY PLAN +--------------------------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tenk1 + Recheck Cond: ((hundred = 42) AND ((thousand = 42) OR (thousand = 99))) + -> 
BitmapAnd + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred = 42) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = ANY ('{42,99}'::integer[])) +(8 rows) + +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); + count +------- + 10 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); + QUERY PLAN +------------------------------------------------------------------------------ + Index Scan using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand = 42) AND (tenthous = ANY ('{1,3,42}'::integer[]))) +(2 rows) SELECT * FROM tenk1 WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); @@ -1864,6 +1913,27 @@ SELECT * FROM tenk1 42 | 5530 | 0 | 2 | 2 | 2 | 42 | 42 | 42 | 42 | 42 | 84 | 85 | QBAAAA | SEIAAA | OOOOxx (1 row) +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1::numeric OR tenthous = 3::int4 OR tenthous = 42::numeric); + QUERY PLAN +------------------------------------------------------------------------------------------------------------- + Bitmap Heap Scan on tenk1 + Recheck Cond: (thousand = 42) + Filter: (((tenthous)::numeric = '1'::numeric) OR (tenthous = 3) OR ((tenthous)::numeric = '42'::numeric)) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = 42) +(5 rows) + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE tenthous = 1::numeric OR tenthous = 3::int4 OR tenthous = 42::numeric; + QUERY PLAN +------------------------------------------------------------------------------------------------------------- + Seq Scan on tenk1 + Filter: (((tenthous)::numeric = '1'::numeric) OR (tenthous = 3) OR ((tenthous)::numeric = '42'::numeric)) +(2 rows) + EXPLAIN (COSTS OFF) SELECT count(*) FROM tenk1 WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); @@ -1872,6 +1942,102 @@ SELECT count(*) FROM tenk1 Aggregate -> Bitmap Heap Scan on tenk1 Recheck 
Cond: ((hundred = 42) AND ((thousand = 42) OR (thousand = 99))) + -> BitmapAnd + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred = 42) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = ANY ('{42,99}'::integer[])) +(8 rows) + +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); + count +------- + 10 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand < 42 OR thousand < 99 OR 43 > thousand OR 42 > thousand); + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tenk1 + Recheck Cond: ((hundred = 42) AND ((thousand < 42) OR (thousand < 99) OR (43 > thousand) OR (42 > thousand))) + -> BitmapAnd + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred = 42) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand < ANY ('{42,99,43,42}'::integer[])) +(8 rows) + +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand < 42 OR thousand < 99 OR 43 > thousand OR 42 > thousand); + count +------- + 10 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3) OR thousand = 41; + QUERY PLAN +----------------------------------------------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tenk1 + Recheck Cond: (((thousand = 42) AND ((tenthous = 1) OR (tenthous = 3))) OR (thousand = 41)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: ((thousand = 42) AND (tenthous = ANY ('{1,3}'::integer[]))) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = 41) +(8 rows) + +SELECT count(*) FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3) OR thousand = 41; + count +------- + 10 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR 
thousand = 99 OR tenthous < 2) OR thousand = 41; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------ + Aggregate + -> Bitmap Heap Scan on tenk1 + Recheck Cond: (((hundred = 42) AND ((thousand = 42) OR (thousand = 99) OR (tenthous < 2))) OR (thousand = 41)) + -> BitmapOr + -> BitmapAnd + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred = 42) + -> BitmapOr + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = 42) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = 99) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (tenthous < 2) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = 41) +(16 rows) + +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 99 OR tenthous < 2) OR thousand = 41; + count +------- + 20 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 41 OR thousand = 99 AND tenthous = 2); + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tenk1 + Recheck Cond: ((hundred = 42) AND ((thousand = 42) OR (thousand = 41) OR ((thousand = 99) AND (tenthous = 2)))) -> BitmapAnd -> Bitmap Index Scan on tenk1_hundred Index Cond: (hundred = 42) @@ -1879,16 +2045,90 @@ SELECT count(*) FROM tenk1 -> Bitmap Index Scan on tenk1_thous_tenthous Index Cond: (thousand = 42) -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand = 99) -(11 rows) + Index Cond: (thousand = 41) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: ((thousand = 99) AND (tenthous = 2)) +(13 rows) SELECT count(*) FROM tenk1 - WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); + WHERE hundred = 42 AND (thousand = 42 OR thousand = 41 OR thousand = 99 AND tenthous = 2); count ------- 10 (1 row) +EXPLAIN 
(COSTS OFF) +SELECT count(*) FROM tenk1, tenk2 + WHERE tenk1.hundred = 42 AND (tenk2.thousand = 42 OR tenk1.thousand = 41 OR tenk2.tenthous = 2) AND + tenk2.hundred = tenk1.hundred; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Aggregate + -> Nested Loop + Join Filter: ((tenk2.thousand = 42) OR (tenk1.thousand = 41) OR (tenk2.tenthous = 2)) + -> Bitmap Heap Scan on tenk1 + Recheck Cond: (hundred = 42) + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred = 42) + -> Materialize + -> Bitmap Heap Scan on tenk2 + Recheck Cond: (hundred = 42) + -> Bitmap Index Scan on tenk2_hundred + Index Cond: (hundred = 42) +(12 rows) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1, tenk2 + WHERE tenk1.hundred = 42 AND (tenk2.thousand = 42 OR tenk2.thousand = 41 OR tenk2.tenthous = 2) AND + tenk2.hundred = tenk1.hundred; + QUERY PLAN +------------------------------------------------------------------------------ + Aggregate + -> Nested Loop + -> Bitmap Heap Scan on tenk2 + Recheck Cond: (hundred = 42) + Filter: ((thousand = 42) OR (thousand = 41) OR (tenthous = 2)) + -> Bitmap Index Scan on tenk2_hundred + Index Cond: (hundred = 42) + -> Index Only Scan using tenk1_hundred on tenk1 + Index Cond: (hundred = 42) +(9 rows) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 JOIN tenk2 ON + tenk1.hundred = 42 AND (tenk2.thousand = 42 OR tenk2.thousand = 41 OR tenk2.tenthous = 2) AND + tenk2.hundred = tenk1.hundred; + QUERY PLAN +------------------------------------------------------------------------------ + Aggregate + -> Nested Loop + -> Bitmap Heap Scan on tenk2 + Recheck Cond: (hundred = 42) + Filter: ((thousand = 42) OR (thousand = 41) OR (tenthous = 2)) + -> Bitmap Index Scan on tenk2_hundred + Index Cond: (hundred = 42) + -> Index Only Scan using tenk1_hundred on tenk1 + Index Cond: (hundred = 42) +(9 rows) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 LEFT JOIN tenk2 ON + tenk1.hundred = 42 AND 
(tenk2.thousand = 42 OR tenk2.thousand = 41 OR tenk2.tenthous = 2) AND + tenk2.hundred = tenk1.hundred; + QUERY PLAN +------------------------------------------------------------------------------------ + Aggregate + -> Nested Loop Left Join + Join Filter: (tenk1.hundred = 42) + -> Index Only Scan using tenk1_hundred on tenk1 + -> Memoize + Cache Key: tenk1.hundred + Cache Mode: logical + -> Index Scan using tenk2_hundred on tenk2 + Index Cond: (hundred = tenk1.hundred) + Filter: ((thousand = 42) OR (thousand = 41) OR (tenthous = 2)) +(10 rows) + -- -- Check behavior with duplicate index column contents -- diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index 8d1d3ec1dcf..00e88ed6abb 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -4277,15 +4277,64 @@ select * from tenk1 a join tenk1 b on Index Cond: (hundred = 4) -> Materialize -> Bitmap Heap Scan on tenk1 a - Recheck Cond: ((unique1 = 1) OR (unique2 = 3) OR (unique2 = 7)) + Recheck Cond: ((unique1 = 1) OR ((unique2 = 3) OR (unique2 = 7))) + Filter: ((unique1 = 1) OR (unique2 = 3) OR (unique2 = 7)) -> BitmapOr -> Bitmap Index Scan on tenk1_unique1 Index Cond: (unique1 = 1) -> Bitmap Index Scan on tenk1_unique2 - Index Cond: (unique2 = 3) + Index Cond: (unique2 = ANY ('{3,7}'::integer[])) +(18 rows) + +explain (costs off) +select * from tenk1 a join tenk1 b on + (a.unique1 = 1 and b.unique1 = 2) or + ((a.unique2 = 3 or a.unique2 = 7) and b.hundred = 4); + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------- + Nested Loop + Join Filter: (((a.unique1 = 1) AND (b.unique1 = 2)) OR (((a.unique2 = 3) OR (a.unique2 = 7)) AND (b.hundred = 4))) + -> Bitmap Heap Scan on tenk1 b + Recheck Cond: ((unique1 = 2) OR (hundred = 4)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 2) + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred 
= 4) + -> Materialize + -> Bitmap Heap Scan on tenk1 a + Recheck Cond: ((unique1 = 1) OR ((unique2 = 3) OR (unique2 = 7))) + Filter: ((unique1 = 1) OR (unique2 = 3) OR (unique2 = 7)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 1) -> Bitmap Index Scan on tenk1_unique2 - Index Cond: (unique2 = 7) -(19 rows) + Index Cond: (unique2 = ANY ('{3,7}'::integer[])) +(18 rows) + +explain (costs off) +select * from tenk1 a join tenk1 b on + (a.unique1 < 20 or a.unique1 = 3 or a.unique1 = 1 and b.unique1 = 2) or + ((a.unique2 = 3 or a.unique2 = 7) and b.hundred = 4); + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------- + Nested Loop + Join Filter: ((a.unique1 < 20) OR (a.unique1 = 3) OR ((a.unique1 = 1) AND (b.unique1 = 2)) OR (((a.unique2 = 3) OR (a.unique2 = 7)) AND (b.hundred = 4))) + -> Seq Scan on tenk1 b + -> Materialize + -> Bitmap Heap Scan on tenk1 a + Recheck Cond: ((unique1 < 20) OR (unique1 = 3) OR (unique1 = 1) OR ((unique2 = 3) OR (unique2 = 7))) + Filter: ((unique1 < 20) OR (unique1 = 3) OR (unique1 = 1) OR (unique2 = 3) OR (unique2 = 7)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 < 20) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 3) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 1) + -> Bitmap Index Scan on tenk1_unique2 + Index Cond: (unique2 = ANY ('{3,7}'::integer[])) +(16 rows) -- -- test placement of movable quals in a parameterized join tree diff --git a/src/test/regress/expected/rowsecurity.out b/src/test/regress/expected/rowsecurity.out index 319190855bd..ef890b96cc6 100644 --- a/src/test/regress/expected/rowsecurity.out +++ b/src/test/regress/expected/rowsecurity.out @@ -4492,6 +4492,13 @@ SELECT * FROM rls_tbl WHERE a <<< 1000; --- (0 rows) +EXPLAIN (COSTS OFF) SELECT * FROM rls_tbl WHERE a <<< 1000 or a <<< 900; + QUERY PLAN 
+-------------------------- + Result + One-Time Filter: false +(2 rows) + DROP OPERATOR <<< (int, int); DROP FUNCTION op_leak(int, int); RESET SESSION AUTHORIZATION; diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out index 8c4da955084..a4c7be487ef 100644 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -3254,6 +3254,8 @@ CREATE OPERATOR <<< (procedure = op_leak, leftarg = int, rightarg = int, restrict = scalarltsel); SELECT * FROM tststats.priv_test_tbl WHERE a <<< 0 AND b <<< 0; -- Permission denied ERROR: permission denied for table priv_test_tbl +SELECT * FROM tststats.priv_test_tbl WHERE a <<< 0 OR b <<< 0; +ERROR: permission denied for table priv_test_tbl DELETE FROM tststats.priv_test_tbl WHERE a <<< 0 AND b <<< 0; -- Permission denied ERROR: permission denied for table priv_test_tbl -- Grant access via a security barrier view, but hide all data @@ -3268,6 +3270,11 @@ SELECT * FROM tststats.priv_test_view WHERE a <<< 0 AND b <<< 0; -- Should not l ---+--- (0 rows) +SELECT * FROM tststats.priv_test_view WHERE a <<< 0 OR b <<< 0; -- Should not leak + a | b +---+--- +(0 rows) + DELETE FROM tststats.priv_test_view WHERE a <<< 0 AND b <<< 0; -- Should not leak -- Grant table access, but hide all data with RLS RESET SESSION AUTHORIZATION; @@ -3280,6 +3287,11 @@ SELECT * FROM tststats.priv_test_tbl WHERE a <<< 0 AND b <<< 0; -- Should not le ---+--- (0 rows) +SELECT * FROM tststats.priv_test_tbl WHERE a <<< 0 OR b <<< 0; + a | b +---+--- +(0 rows) + DELETE FROM tststats.priv_test_tbl WHERE a <<< 0 AND b <<< 0; -- Should not leak -- privilege checks for pg_stats_ext and pg_stats_ext_exprs RESET SESSION AUTHORIZATION; diff --git a/src/test/regress/expected/uuid.out b/src/test/regress/expected/uuid.out index 6026e15ed31..8f4ef0d7a6a 100644 --- a/src/test/regress/expected/uuid.out +++ b/src/test/regress/expected/uuid.out @@ -129,6 +129,37 @@ CREATE INDEX guid1_btree ON guid1 USING 
BTREE (guid_field); CREATE INDEX guid1_hash ON guid1 USING HASH (guid_field); -- unique index test CREATE UNIQUE INDEX guid1_unique_BTREE ON guid1 USING BTREE (guid_field); +EXPLAIN (COSTS OFF) +SELECT COUNT(*) FROM guid1 WHERE guid_field <> '11111111111111111111111111111111' OR + guid_field <> '3f3e3c3b-3a30-3938-3736-353433a2313e'; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------ + Aggregate + -> Seq Scan on guid1 + Filter: ((guid_field <> '11111111-1111-1111-1111-111111111111'::uuid) OR (guid_field <> '3f3e3c3b-3a30-3938-3736-353433a2313e'::uuid)) +(3 rows) + +EXPLAIN (COSTS OFF) +SELECT COUNT(*) FROM guid1 WHERE guid_field <= '22222222-2222-2222-2222-222222222222' OR + guid_field <= '11111111111111111111111111111111' OR + guid_field <= '3f3e3c3b-3a30-3938-3736-353433a2313e'; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Aggregate + -> Seq Scan on guid1 + Filter: ((guid_field <= '22222222-2222-2222-2222-222222222222'::uuid) OR (guid_field <= '11111111-1111-1111-1111-111111111111'::uuid) OR (guid_field <= '3f3e3c3b-3a30-3938-3736-353433a2313e'::uuid)) +(3 rows) + +EXPLAIN (COSTS OFF) +SELECT COUNT(*) FROM guid1 WHERE guid_field = '3f3e3c3b-3a30-3938-3736-353433a2313e' OR + guid_field = '11111111111111111111111111111111'; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------------------- + Aggregate + -> Seq Scan on guid1 + Filter: ((guid_field = '3f3e3c3b-3a30-3938-3736-353433a2313e'::uuid) OR (guid_field = '11111111-1111-1111-1111-111111111111'::uuid)) +(3 rows) + -- should fail INSERT INTO guid1(guid_field) VALUES('11111111-1111-1111-1111-111111111111'); ERROR: duplicate key value violates 
unique constraint "guid1_unique_btree" diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index e296891cab8..7e108f9b283 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -732,12 +732,81 @@ SELECT * FROM tenk1 SELECT * FROM tenk1 WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = (SELECT 1 + 2) OR tenthous = 42); +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = (SELECT 1 + 2) OR tenthous = 42); + EXPLAIN (COSTS OFF) SELECT count(*) FROM tenk1 WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); SELECT count(*) FROM tenk1 WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1::numeric OR tenthous = 3::int4 OR tenthous = 42::numeric); + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE tenthous = 1::numeric OR tenthous = 3::int4 OR tenthous = 42::numeric; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand < 42 OR thousand < 99 OR 43 > thousand OR 42 > thousand); +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand < 42 OR thousand < 99 OR 43 > thousand OR 42 > thousand); + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3) OR thousand = 41; +SELECT count(*) FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3) OR thousand = 41; + 
+EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 99 OR tenthous < 2) OR thousand = 41; +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 99 OR tenthous < 2) OR thousand = 41; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 41 OR thousand = 99 AND tenthous = 2); +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 41 OR thousand = 99 AND tenthous = 2); + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1, tenk2 + WHERE tenk1.hundred = 42 AND (tenk2.thousand = 42 OR tenk1.thousand = 41 OR tenk2.tenthous = 2) AND + tenk2.hundred = tenk1.hundred; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1, tenk2 + WHERE tenk1.hundred = 42 AND (tenk2.thousand = 42 OR tenk2.thousand = 41 OR tenk2.tenthous = 2) AND + tenk2.hundred = tenk1.hundred; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 JOIN tenk2 ON + tenk1.hundred = 42 AND (tenk2.thousand = 42 OR tenk2.thousand = 41 OR tenk2.tenthous = 2) AND + tenk2.hundred = tenk1.hundred; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 LEFT JOIN tenk2 ON + tenk1.hundred = 42 AND (tenk2.thousand = 42 OR tenk2.thousand = 41 OR tenk2.tenthous = 2) AND + tenk2.hundred = tenk1.hundred; -- -- Check behavior with duplicate index column contents -- diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index 8281bbd8ef8..b67b4caef23 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -1433,6 +1433,15 @@ select * from tenk1 a join tenk1 b on (a.unique1 = 1 and b.unique1 = 2) or ((a.unique2 = 3 or a.unique2 = 7) and b.hundred = 4); +explain (costs off) +select * from tenk1 a join tenk1 b on + (a.unique1 = 1 and b.unique1 = 2) or + ((a.unique2 = 3 or a.unique2 = 7) and b.hundred = 4); +explain (costs off) +select * from tenk1 a join tenk1 b on + (a.unique1 < 20 or a.unique1 = 3 or a.unique1 = 1 and b.unique1 = 2) or + 
((a.unique2 = 3 or a.unique2 = 7) and b.hundred = 4); + -- -- test placement of movable quals in a parameterized join tree -- diff --git a/src/test/regress/sql/rowsecurity.sql b/src/test/regress/sql/rowsecurity.sql index 3011d71b12b..6d2414b6044 100644 --- a/src/test/regress/sql/rowsecurity.sql +++ b/src/test/regress/sql/rowsecurity.sql @@ -2177,6 +2177,7 @@ CREATE FUNCTION op_leak(int, int) RETURNS bool CREATE OPERATOR <<< (procedure = op_leak, leftarg = int, rightarg = int, restrict = scalarltsel); SELECT * FROM rls_tbl WHERE a <<< 1000; +EXPLAIN (COSTS OFF) SELECT * FROM rls_tbl WHERE a <<< 1000 or a <<< 900; DROP OPERATOR <<< (int, int); DROP FUNCTION op_leak(int, int); RESET SESSION AUTHORIZATION; diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql index 0c08a6cc42e..5c786b16c6f 100644 --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -1634,6 +1634,7 @@ CREATE FUNCTION op_leak(int, int) RETURNS bool CREATE OPERATOR <<< (procedure = op_leak, leftarg = int, rightarg = int, restrict = scalarltsel); SELECT * FROM tststats.priv_test_tbl WHERE a <<< 0 AND b <<< 0; -- Permission denied +SELECT * FROM tststats.priv_test_tbl WHERE a <<< 0 OR b <<< 0; DELETE FROM tststats.priv_test_tbl WHERE a <<< 0 AND b <<< 0; -- Permission denied -- Grant access via a security barrier view, but hide all data @@ -1645,6 +1646,7 @@ GRANT SELECT, DELETE ON tststats.priv_test_view TO regress_stats_user1; -- Should now have access via the view, but see nothing and leak nothing SET SESSION AUTHORIZATION regress_stats_user1; SELECT * FROM tststats.priv_test_view WHERE a <<< 0 AND b <<< 0; -- Should not leak +SELECT * FROM tststats.priv_test_view WHERE a <<< 0 OR b <<< 0; -- Should not leak DELETE FROM tststats.priv_test_view WHERE a <<< 0 AND b <<< 0; -- Should not leak -- Grant table access, but hide all data with RLS @@ -1655,6 +1657,7 @@ GRANT SELECT, DELETE ON tststats.priv_test_tbl TO regress_stats_user1; -- Should now 
have direct table access, but see nothing and leak nothing SET SESSION AUTHORIZATION regress_stats_user1; SELECT * FROM tststats.priv_test_tbl WHERE a <<< 0 AND b <<< 0; -- Should not leak +SELECT * FROM tststats.priv_test_tbl WHERE a <<< 0 OR b <<< 0; DELETE FROM tststats.priv_test_tbl WHERE a <<< 0 AND b <<< 0; -- Should not leak -- privilege checks for pg_stats_ext and pg_stats_ext_exprs diff --git a/src/test/regress/sql/uuid.sql b/src/test/regress/sql/uuid.sql index c88f6d087a7..75ee966ded0 100644 --- a/src/test/regress/sql/uuid.sql +++ b/src/test/regress/sql/uuid.sql @@ -63,6 +63,18 @@ CREATE INDEX guid1_hash ON guid1 USING HASH (guid_field); -- unique index test CREATE UNIQUE INDEX guid1_unique_BTREE ON guid1 USING BTREE (guid_field); + +EXPLAIN (COSTS OFF) +SELECT COUNT(*) FROM guid1 WHERE guid_field <> '11111111111111111111111111111111' OR + guid_field <> '3f3e3c3b-3a30-3938-3736-353433a2313e'; +EXPLAIN (COSTS OFF) +SELECT COUNT(*) FROM guid1 WHERE guid_field <= '22222222-2222-2222-2222-222222222222' OR + guid_field <= '11111111111111111111111111111111' OR + guid_field <= '3f3e3c3b-3a30-3938-3736-353433a2313e'; +EXPLAIN (COSTS OFF) +SELECT COUNT(*) FROM guid1 WHERE guid_field = '3f3e3c3b-3a30-3938-3736-353433a2313e' OR + guid_field = '11111111111111111111111111111111'; + -- should fail INSERT INTO guid1(guid_field) VALUES('11111111-1111-1111-1111-111111111111'); From cd8124d33607f0941d4ab674bf942b245f0be847 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 23 Sep 2024 14:04:03 +0300 Subject: [PATCH 52/56] Teach bitmap path generation about transforming OR-clauses to SAOP's When optimizer generates bitmap paths, it considers breaking OR-clause arguments one-by-one. But now, a group of similar OR-clauses can be transformed into SAOP during index matching. So, bitmap paths should keep up. This commit teaches bitmap paths generation machinery to group similar OR-clauses into dedicated RestrictInfos. 
Those RestrictInfos are considered both to match the index as a whole (as SAOP), or to match as a set of individual OR-clause arguments one-by-one (the old way). Therefore, bitmap path generation will take advantage of the OR-clauses to SAOP's transformation. The old way of handling them is also considered. So, there shouldn't be a planning regression. Discussion: https://postgr.es/m/567ED6CA.2040504%40sigaev.ru Reviewed-by: Alexander Korotkov --- src/backend/optimizer/path/indxpath.c | 439 ++++++++++++++++++++- src/backend/optimizer/util/restrictinfo.c | 107 +++-- src/include/optimizer/restrictinfo.h | 11 + src/test/regress/expected/create_index.out | 125 +++++- src/test/regress/expected/join.out | 56 ++- src/test/regress/sql/create_index.sql | 38 ++ src/tools/pgindent/typedefs.list | 1 + 7 files changed, 670 insertions(+), 107 deletions(-) diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index d5ba0c5cd68..cae5f3d7e63 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -1162,6 +1162,383 @@ build_paths_for_OR(PlannerInfo *root, RelOptInfo *rel, return result; } +/* + * Utility structure used to group similar OR-clause arguments in + * group_similar_or_args(). It represents information about the OR-clause + * argument and its matching index key. + */ +typedef struct +{ + int indexnum; /* index of the matching index, or -1 if no + * matching index */ + int colnum; /* index of the matching column, or -1 if no + * matching index */ + Oid opno; /* OID of the OpClause operator, or InvalidOid + * if not an OpExpr */ + Oid inputcollid; /* OID of the OpClause input collation */ + int argindex; /* index of the clause in the list of + * arguments */ +} OrArgIndexMatch; + +/* + * Comparison function for OrArgIndexMatch which provides sort order placing + * similar OR-clause arguments together.
+ */ +static int +or_arg_index_match_cmp(const void *a, const void *b) +{ + const OrArgIndexMatch *match_a = (const OrArgIndexMatch *) a; + const OrArgIndexMatch *match_b = (const OrArgIndexMatch *) b; + + if (match_a->indexnum < match_b->indexnum) + return -1; + else if (match_a->indexnum > match_b->indexnum) + return 1; + + if (match_a->colnum < match_b->colnum) + return -1; + else if (match_a->colnum > match_b->colnum) + return 1; + + if (match_a->opno < match_b->opno) + return -1; + else if (match_a->opno > match_b->opno) + return 1; + + if (match_a->inputcollid < match_b->inputcollid) + return -1; + else if (match_a->inputcollid > match_b->inputcollid) + return 1; + + if (match_a->argindex < match_b->argindex) + return -1; + else if (match_a->argindex > match_b->argindex) + return 1; + + return 0; +} + +/* + * group_similar_or_args + * Transform incoming OR-restrictinfo into a list of sub-restrictinfos, + * each of them containing a subset of OR-clauses from the source rinfo + * matching the same index column with the same operator and collation, + * It may be employed later, during the match_clause_to_indexcol() to + * transform whole OR-sub-rinfo to an SAOP clause. + * + * Similar arguments clauses of form "indexkey op constant" having same + * indexkey, operator, and collation. Constant may comprise either Const + * or Param. + * + * Returns the processed list of arguments. + */ +static List * +group_similar_or_args(PlannerInfo *root, RelOptInfo *rel, RestrictInfo *rinfo) +{ + int n; + int i; + int group_start; + OrArgIndexMatch *matches; + bool matched = false; + ListCell *lc; + ListCell *lc2; + List *orargs; + List *result = NIL; + + Assert(IsA(rinfo->orclause, BoolExpr)); + orargs = ((BoolExpr *) rinfo->orclause)->args; + n = list_length(orargs); + + /* + * To avoid N^2 behavior, take utility pass along the list of OR-clause + * arguments. 
For each argument, fill the OrArgIndexMatch structure, + * which will be used to sort these arguments at the next step. + */ + i = -1; + matches = (OrArgIndexMatch *) palloc(sizeof(OrArgIndexMatch) * n); + foreach(lc, orargs) + { + Node *arg = lfirst(lc); + RestrictInfo *argrinfo; + OpExpr *clause; + Oid opno; + Node *leftop, + *rightop; + Node *nonConstExpr; + int indexnum; + int colnum; + + i++; + matches[i].argindex = i; + matches[i].indexnum = -1; + matches[i].colnum = -1; + matches[i].opno = InvalidOid; + matches[i].inputcollid = InvalidOid; + + if (!IsA(arg, RestrictInfo)) + continue; + + argrinfo = castNode(RestrictInfo, arg); + + /* Only operator clauses can match */ + if (!IsA(argrinfo->clause, OpExpr)) + continue; + + clause = (OpExpr *) argrinfo->clause; + opno = clause->opno; + + /* Only binary operators can match */ + if (list_length(clause->args) != 2) + continue; + + /* + * Ignore any RelabelType node above the operands. This is needed to + * be able to apply indexscanning in binary-compatible-operator cases. + * Note: we can assume there is at most one RelabelType node; + * eval_const_expressions() will have simplified if more than one. + */ + leftop = get_leftop(clause); + if (IsA(leftop, RelabelType)) + leftop = (Node *) ((RelabelType *) leftop)->arg; + + rightop = get_rightop(clause); + if (IsA(rightop, RelabelType)) + rightop = (Node *) ((RelabelType *) rightop)->arg; + + /* + * Check for clauses of the form: (indexkey operator constant) or + * (constant operator indexkey). But we don't know a particular index + * yet. First check for a constant, which must be Const or Param. + * That's cheaper than search for an index key among all indexes. 
+ */ + if (IsA(leftop, Const) || IsA(leftop, Param)) + { + opno = get_commutator(opno); + + if (!OidIsValid(opno)) + { + /* commutator doesn't exist, we can't reverse the order */ + continue; + } + nonConstExpr = rightop; + } + else if (IsA(rightop, Const) || IsA(rightop, Param)) + { + nonConstExpr = leftop; + } + else + { + continue; + } + + /* + * Match non-constant part to the index key. It's possible that a + * single non-constant part matches multiple index keys. It's OK, we + * just stop with first matching index key. Given that this choice is + * determined the same for every clause, we will group similar clauses + * together anyway. + */ + indexnum = 0; + foreach(lc2, rel->indexlist) + { + IndexOptInfo *index = (IndexOptInfo *) lfirst(lc2); + + /* Ignore index if it doesn't support bitmap scans */ + if (!index->amhasgetbitmap) + continue; + + for (colnum = 0; colnum < index->nkeycolumns; colnum++) + { + if (match_index_to_operand(nonConstExpr, colnum, index)) + { + matches[i].indexnum = indexnum; + matches[i].colnum = colnum; + matches[i].opno = opno; + matches[i].inputcollid = clause->inputcollid; + matched = true; + break; + } + } + + /* + * Stop looping through the indexes, if we managed to match + * nonConstExpr to any index column. + */ + if (matches[i].indexnum >= 0) + break; + indexnum++; + } + } + + /* + * Fast-path check: if no clause is matching to the index column, we can + * just give up at this stage and return the clause list as-is. + */ + if (!matched) + { + pfree(matches); + return orargs; + } + + /* Sort clauses to make similar clauses go together */ + qsort(matches, n, sizeof(OrArgIndexMatch), or_arg_index_match_cmp); + + /* + * Group similar clauses into single sub-restrictinfo. Side effect: the + * resulting list of restrictions will be sorted by indexnum and colnum. 
+ */ + group_start = 0; + for (i = 1; i <= n; i++) + { + /* Check if it's a group boundary */ + if (group_start >= 0 && + (i == n || + matches[i].indexnum != matches[group_start].indexnum || + matches[i].colnum != matches[group_start].colnum || + matches[i].opno != matches[group_start].opno || + matches[i].inputcollid != matches[group_start].inputcollid || + matches[i].indexnum == -1)) + { + /* + * One clause in group: add it "as is" to the upper-level OR. + */ + if (i - group_start == 1) + { + result = lappend(result, + list_nth(orargs, + matches[group_start].argindex)); + } + else + { + /* + * Two or more clauses in a group: create a nested OR. + */ + List *args = NIL; + List *rargs = NIL; + RestrictInfo *subrinfo; + int j; + + Assert(i - group_start >= 2); + + /* Construct the list of nested OR arguments */ + for (j = group_start; j < i; j++) + { + Node *arg = list_nth(orargs, matches[j].argindex); + + rargs = lappend(rargs, arg); + if (IsA(arg, RestrictInfo)) + args = lappend(args, ((RestrictInfo *) arg)->clause); + else + args = lappend(args, arg); + } + + /* Construct the nested OR and wrap it with RestrictInfo */ + subrinfo = make_plain_restrictinfo(root, + make_orclause(args), + make_orclause(rargs), + rinfo->is_pushed_down, + rinfo->has_clone, + rinfo->is_clone, + rinfo->pseudoconstant, + rinfo->security_level, + rinfo->required_relids, + rinfo->incompatible_relids, + rinfo->outer_relids); + result = lappend(result, subrinfo); + } + + group_start = i; + } + } + pfree(matches); + return result; +} + +/* + * make_bitmap_paths_for_or_group + * Generate bitmap paths for a group of similar OR-clause arguments + * produced by group_similar_or_args(). + * + * This function considers two cases: (1) matching a group of clauses to + * the index as a whole, and (2) matching the individual clauses one-by-one. + * (1) typically comprises an optimal solution. If not, (2) typically + * comprises fair alternative. 
+ * + * Ideally, we could consider all arbitrary splits of arguments into + * subgroups, but that could lead to unacceptable computational complexity. + * This is why we only consider two cases of above. + */ +static List * +make_bitmap_paths_for_or_group(PlannerInfo *root, RelOptInfo *rel, + RestrictInfo *ri, List *other_clauses) +{ + List *jointlist = NIL; + List *splitlist = NIL; + ListCell *lc; + List *orargs; + List *args = ((BoolExpr *) ri->orclause)->args; + Cost jointcost = 0.0, + splitcost = 0.0; + Path *bitmapqual; + List *indlist; + + /* + * First, try to match the whole group to the one index. + */ + orargs = list_make1(ri); + indlist = build_paths_for_OR(root, rel, + orargs, + other_clauses); + if (indlist != NIL) + { + bitmapqual = choose_bitmap_and(root, rel, indlist); + jointcost = bitmapqual->total_cost; + jointlist = list_make1(bitmapqual); + } + + /* + * If we manage to find a bitmap scan, which uses the group of OR-clause + * arguments as a whole, we can skip matching OR-clause arguments + * one-by-one as long as there are no other clauses, which can bring more + * efficiency to one-by-one case. + */ + if (jointlist != NIL && other_clauses == NIL) + return jointlist; + + /* + * Also try to match all containing clauses one-by-one. + */ + foreach(lc, args) + { + orargs = list_make1(lfirst(lc)); + + indlist = build_paths_for_OR(root, rel, + orargs, + other_clauses); + + if (indlist == NIL) + { + splitlist = NIL; + break; + } + + bitmapqual = choose_bitmap_and(root, rel, indlist); + splitcost += bitmapqual->total_cost; + splitlist = lappend(splitlist, bitmapqual); + } + + /* + * Pick the best option. + */ + if (splitlist == NIL) + return jointlist; + else if (jointlist == NIL) + return splitlist; + else + return (jointcost < splitcost) ? 
jointlist : splitlist; +} + + /* * generate_bitmap_or_paths * Look through the list of clauses to find OR clauses, and generate @@ -1192,6 +1569,8 @@ generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel, List *pathlist; Path *bitmapqual; ListCell *j; + List *groupedArgs; + List *inner_other_clauses = NIL; /* Ignore RestrictInfos that aren't ORs */ if (!restriction_is_or_clause(rinfo)) @@ -1202,7 +1581,28 @@ generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel, * the OR, else we can't use it. */ pathlist = NIL; - foreach(j, ((BoolExpr *) rinfo->orclause)->args) + + /* + * Group the similar OR-clause argument into dedicated RestrictInfos, + * because those RestrictInfos might match to the index as a whole. + */ + groupedArgs = group_similar_or_args(root, rel, rinfo); + + if (groupedArgs != ((BoolExpr *) rinfo->orclause)->args) + { + /* + * Some parts of the rinfo were grouped. In this case, we have a + * set of sub-rinfos that together are an exact duplicate of + * rinfo. Thus, we need to remove the rinfo from other clauses. + * match_clauses_to_index detects duplicated iclauses by comparing + * pointers to original rinfos that would be different. So, we + * must delete rinfo to avoid de-facto duplicated clauses in the + * index clauses list. + */ + inner_other_clauses = list_delete(list_copy(all_clauses), rinfo); + } + + foreach(j, groupedArgs) { Node *orarg = (Node *) lfirst(j); List *indlist; @@ -1222,12 +1622,34 @@ generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel, andargs, all_clauses)); } + else if (restriction_is_or_clause(castNode(RestrictInfo, orarg))) + { + RestrictInfo *ri = castNode(RestrictInfo, orarg); + + /* + * Generate bitmap paths for the group of similar OR-clause + * arguments. 
+ */ + indlist = make_bitmap_paths_for_or_group(root, + rel, ri, + inner_other_clauses); + + if (indlist == NIL) + { + pathlist = NIL; + break; + } + else + { + pathlist = list_concat(pathlist, indlist); + continue; + } + } else { RestrictInfo *ri = castNode(RestrictInfo, orarg); List *orargs; - Assert(!restriction_is_or_clause(ri)); orargs = list_make1(ri); indlist = build_paths_for_OR(root, rel, @@ -1253,6 +1675,9 @@ generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel, pathlist = lappend(pathlist, bitmapqual); } + if (inner_other_clauses != NIL) + list_free(inner_other_clauses); + /* * If we have a match for every arm, then turn them into a * BitmapOrPath, and add to result list. @@ -2430,7 +2855,7 @@ match_opclause_to_indexcol(PlannerInfo *root, /* * Check for clauses of the form: (indexkey operator constant) or - * (constant operator indexkey). See match_clause_to_indexcol's notes + * (constant operator indexkey). See match_clause_to_indexcol()'s notes * about const-ness. * * Note that we don't ask the support function about clauses that don't @@ -2991,8 +3416,12 @@ match_orclause_to_indexcol(PlannerInfo *root, get_typlenbyvalalign(consttype, &typlen, &typbyval, &typalign); elems = (Datum *) palloc(sizeof(Datum) * list_length(consts)); - foreach(lc, consts) - elems[i++] = ((Const *) lfirst(lc))->constvalue; + foreach_node(Const, value, consts) + { + Assert(!value->constisnull && value->constvalue); + + elems[i++] = value->constvalue; + } arrayConst = construct_array(elems, i, consttype, typlen, typbyval, typalign); diff --git a/src/backend/optimizer/util/restrictinfo.c b/src/backend/optimizer/util/restrictinfo.c index 0b406e93342..9e1458401c2 100644 --- a/src/backend/optimizer/util/restrictinfo.c +++ b/src/backend/optimizer/util/restrictinfo.c @@ -21,17 +21,6 @@ #include "optimizer/restrictinfo.h" -static RestrictInfo *make_restrictinfo_internal(PlannerInfo *root, - Expr *clause, - Expr *orclause, - bool is_pushed_down, - bool has_clone, - bool 
is_clone, - bool pseudoconstant, - Index security_level, - Relids required_relids, - Relids incompatible_relids, - Relids outer_relids); static Expr *make_sub_restrictinfos(PlannerInfo *root, Expr *clause, bool is_pushed_down, @@ -90,36 +79,38 @@ make_restrictinfo(PlannerInfo *root, /* Shouldn't be an AND clause, else AND/OR flattening messed up */ Assert(!is_andclause(clause)); - return make_restrictinfo_internal(root, - clause, - NULL, - is_pushed_down, - has_clone, - is_clone, - pseudoconstant, - security_level, - required_relids, - incompatible_relids, - outer_relids); + return make_plain_restrictinfo(root, + clause, + NULL, + is_pushed_down, + has_clone, + is_clone, + pseudoconstant, + security_level, + required_relids, + incompatible_relids, + outer_relids); } /* - * make_restrictinfo_internal + * make_plain_restrictinfo * - * Common code for the main entry points and the recursive cases. + * Common code for the main entry points and the recursive cases. Also, + * useful while constructing RestrictInfos above an OR clause, which already has + * RestrictInfos above its subclauses.
*/ -static RestrictInfo * -make_restrictinfo_internal(PlannerInfo *root, - Expr *clause, - Expr *orclause, - bool is_pushed_down, - bool has_clone, - bool is_clone, - bool pseudoconstant, - Index security_level, - Relids required_relids, - Relids incompatible_relids, - Relids outer_relids) +RestrictInfo * +make_plain_restrictinfo(PlannerInfo *root, + Expr *clause, + Expr *orclause, + bool is_pushed_down, + bool has_clone, + bool is_clone, + bool pseudoconstant, + Index security_level, + Relids required_relids, + Relids incompatible_relids, + Relids outer_relids) { RestrictInfo *restrictinfo = makeNode(RestrictInfo); Relids baserels; @@ -296,17 +287,17 @@ make_sub_restrictinfos(PlannerInfo *root, NULL, incompatible_relids, outer_relids)); - return (Expr *) make_restrictinfo_internal(root, - clause, - make_orclause(orlist), - is_pushed_down, - has_clone, - is_clone, - pseudoconstant, - security_level, - required_relids, - incompatible_relids, - outer_relids); + return (Expr *) make_plain_restrictinfo(root, + clause, + make_orclause(orlist), + is_pushed_down, + has_clone, + is_clone, + pseudoconstant, + security_level, + required_relids, + incompatible_relids, + outer_relids); } else if (is_andclause(clause)) { @@ -328,17 +319,17 @@ make_sub_restrictinfos(PlannerInfo *root, return make_andclause(andlist); } else - return (Expr *) make_restrictinfo_internal(root, - clause, - NULL, - is_pushed_down, - has_clone, - is_clone, - pseudoconstant, - security_level, - required_relids, - incompatible_relids, - outer_relids); + return (Expr *) make_plain_restrictinfo(root, + clause, + NULL, + is_pushed_down, + has_clone, + is_clone, + pseudoconstant, + security_level, + required_relids, + incompatible_relids, + outer_relids); } /* diff --git a/src/include/optimizer/restrictinfo.h b/src/include/optimizer/restrictinfo.h index 1b42c832c59..b77bf7ddfe9 100644 --- a/src/include/optimizer/restrictinfo.h +++ b/src/include/optimizer/restrictinfo.h @@ -22,6 +22,17 @@ 
make_restrictinfo(root, clause, true, false, false, false, 0, \ NULL, NULL, NULL) +extern RestrictInfo *make_plain_restrictinfo(PlannerInfo *root, + Expr *clause, + Expr *orclause, + bool is_pushed_down, + bool has_clone, + bool is_clone, + bool pseudoconstant, + Index security_level, + Relids required_relids, + Relids incompatible_relids, + Relids outer_relids); extern RestrictInfo *make_restrictinfo(PlannerInfo *root, Expr *clause, bool is_pushed_down, diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index 1324d6927c4..913c9712afd 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -1875,6 +1875,60 @@ SELECT * FROM tenk1 42 | 5530 | 0 | 2 | 2 | 2 | 42 | 42 | 42 | 42 | 42 | 84 | 85 | QBAAAA | SEIAAA | OOOOxx (1 row) +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42 OR tenthous IS NULL); + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------- + Bitmap Heap Scan on tenk1 + Recheck Cond: (((thousand = 42) AND (tenthous IS NULL)) OR ((thousand = 42) AND ((tenthous = 1) OR (tenthous = 3) OR (tenthous = 42)))) + Filter: ((tenthous = 1) OR (tenthous = 3) OR (tenthous = 42) OR (tenthous IS NULL)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: ((thousand = 42) AND (tenthous IS NULL)) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: ((thousand = 42) AND (tenthous = ANY ('{1,3,42}'::integer[]))) +(8 rows) + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1::int2 OR tenthous::int2 = 3::int8 OR tenthous = 42::int8); + QUERY PLAN +------------------------------------------------------------------------------------------------------------- + Bitmap Heap Scan on tenk1 + Recheck Cond: (thousand = 42) + Filter: ((tenthous = '1'::smallint) OR 
((tenthous)::smallint = '3'::bigint) OR (tenthous = '42'::bigint)) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = 42) +(5 rows) + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1::int2 OR tenthous::int2 = 3::int8 OR tenthous::int2 = 42::int8); + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------- + Bitmap Heap Scan on tenk1 + Recheck Cond: (thousand = 42) + Filter: ((tenthous = '1'::smallint) OR ((tenthous)::smallint = '3'::bigint) OR ((tenthous)::smallint = '42'::bigint)) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = 42) +(5 rows) + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1::int2 OR tenthous = 3::int8 OR tenthous = 42::int8); + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------------------------------------- + Bitmap Heap Scan on tenk1 + Recheck Cond: (((thousand = 42) AND ((tenthous = '3'::bigint) OR (tenthous = '42'::bigint))) OR ((thousand = 42) AND (tenthous = '1'::smallint))) + Filter: ((tenthous = '1'::smallint) OR (tenthous = '3'::bigint) OR (tenthous = '42'::bigint)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: ((thousand = 42) AND (tenthous = ANY ('{3,42}'::bigint[]))) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: ((thousand = 42) AND (tenthous = '1'::smallint)) +(8 rows) + EXPLAIN (COSTS OFF) SELECT count(*) FROM tenk1 WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); @@ -2003,25 +2057,24 @@ SELECT count(*) FROM tenk1 EXPLAIN (COSTS OFF) SELECT count(*) FROM tenk1 WHERE hundred = 42 AND (thousand = 42 OR thousand = 99 OR tenthous < 2) OR thousand = 41; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN 
+-------------------------------------------------------------------------------------------------------------------------- Aggregate -> Bitmap Heap Scan on tenk1 - Recheck Cond: (((hundred = 42) AND ((thousand = 42) OR (thousand = 99) OR (tenthous < 2))) OR (thousand = 41)) + Recheck Cond: (((hundred = 42) AND (((thousand = 42) OR (thousand = 99)) OR (tenthous < 2))) OR (thousand = 41)) + Filter: (((hundred = 42) AND ((thousand = 42) OR (thousand = 99) OR (tenthous < 2))) OR (thousand = 41)) -> BitmapOr -> BitmapAnd -> Bitmap Index Scan on tenk1_hundred Index Cond: (hundred = 42) -> BitmapOr -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand = 42) - -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand = 99) + Index Cond: (thousand = ANY ('{42,99}'::integer[])) -> Bitmap Index Scan on tenk1_thous_tenthous Index Cond: (tenthous < 2) -> Bitmap Index Scan on tenk1_thous_tenthous Index Cond: (thousand = 41) -(16 rows) +(15 rows) SELECT count(*) FROM tenk1 WHERE hundred = 42 AND (thousand = 42 OR thousand = 99 OR tenthous < 2) OR thousand = 41; @@ -2033,22 +2086,21 @@ SELECT count(*) FROM tenk1 EXPLAIN (COSTS OFF) SELECT count(*) FROM tenk1 WHERE hundred = 42 AND (thousand = 42 OR thousand = 41 OR thousand = 99 AND tenthous = 2); - QUERY PLAN -------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------- Aggregate -> Bitmap Heap Scan on tenk1 - Recheck Cond: ((hundred = 42) AND ((thousand = 42) OR (thousand = 41) OR ((thousand = 99) AND (tenthous = 2)))) + Recheck Cond: ((hundred = 42) AND (((thousand = 99) AND (tenthous = 2)) OR ((thousand = 42) OR (thousand = 41)))) + Filter: ((thousand = 42) OR (thousand = 41) OR ((thousand = 99) AND (tenthous = 2))) -> BitmapAnd -> Bitmap Index Scan on tenk1_hundred Index Cond: (hundred = 42) -> BitmapOr 
- -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand = 42) - -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand = 41) -> Bitmap Index Scan on tenk1_thous_tenthous Index Cond: ((thousand = 99) AND (tenthous = 2)) -(13 rows) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = ANY ('{42,41}'::integer[])) +(12 rows) SELECT count(*) FROM tenk1 WHERE hundred = 42 AND (thousand = 42 OR thousand = 41 OR thousand = 99 AND tenthous = 2); @@ -3144,6 +3196,49 @@ SELECT b.relname, (2 rows) DROP TABLE concur_temp_tab_1, concur_temp_tab_2, reindex_temp_before; +-- Check bitmap scan can consider similar OR arguments separately without +-- grouping them into SAOP. +CREATE TABLE bitmap_split_or (a int NOT NULL, b int NOT NULL, c int NOT NULL); +INSERT INTO bitmap_split_or (SELECT 1, 1, i FROM generate_series(1, 1000) i); +INSERT INTO bitmap_split_or (select i, 2, 2 FROM generate_series(1, 1000) i); +VACUUM ANALYZE bitmap_split_or; +CREATE INDEX t_b_partial_1_idx ON bitmap_split_or (b) WHERE a = 1; +CREATE INDEX t_b_partial_2_idx ON bitmap_split_or (b) WHERE a = 2; +EXPLAIN (COSTS OFF) +SELECT * FROM bitmap_split_or WHERE (a = 1 OR a = 2) AND b = 2; + QUERY PLAN +------------------------------------------------------------------ + Bitmap Heap Scan on bitmap_split_or + Recheck Cond: (((b = 2) AND (a = 1)) OR ((b = 2) AND (a = 2))) + -> BitmapOr + -> Bitmap Index Scan on t_b_partial_1_idx + Index Cond: (b = 2) + -> Bitmap Index Scan on t_b_partial_2_idx + Index Cond: (b = 2) +(7 rows) + +DROP INDEX t_b_partial_1_idx; +DROP INDEX t_b_partial_2_idx; +CREATE INDEX t_a_b_idx ON bitmap_split_or (a, b); +CREATE INDEX t_b_c_idx ON bitmap_split_or (b, c); +CREATE STATISTICS t_a_b_stat (mcv) ON a, b FROM bitmap_split_or; +CREATE STATISTICS t_b_c_stat (mcv) ON b, c FROM bitmap_split_or; +ANALYZE bitmap_split_or; +EXPLAIN (COSTS OFF) +SELECT * FROM bitmap_split_or WHERE a = 1 AND (b = 1 OR b = 2) AND c = 2; + QUERY PLAN 
+------------------------------------------------------------------ + Bitmap Heap Scan on bitmap_split_or + Recheck Cond: (((b = 1) AND (c = 2)) OR ((a = 1) AND (b = 2))) + Filter: ((a = 1) AND (c = 2)) + -> BitmapOr + -> Bitmap Index Scan on t_b_c_idx + Index Cond: ((b = 1) AND (c = 2)) + -> Bitmap Index Scan on t_a_b_idx + Index Cond: ((a = 1) AND (b = 2)) +(8 rows) + +DROP TABLE bitmap_split_or; -- -- REINDEX SCHEMA -- diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index 00e88ed6abb..f1664516bf7 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -4225,20 +4225,20 @@ select * from tenk1 a join tenk1 b on Nested Loop Join Filter: (((a.unique1 = 1) AND (b.unique1 = 2)) OR ((a.unique2 = 3) AND (b.hundred = 4))) -> Bitmap Heap Scan on tenk1 b - Recheck Cond: ((unique1 = 2) OR (hundred = 4)) + Recheck Cond: ((hundred = 4) OR (unique1 = 2)) -> BitmapOr - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 2) -> Bitmap Index Scan on tenk1_hundred Index Cond: (hundred = 4) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 2) -> Materialize -> Bitmap Heap Scan on tenk1 a - Recheck Cond: ((unique1 = 1) OR (unique2 = 3)) + Recheck Cond: ((unique2 = 3) OR (unique1 = 1)) -> BitmapOr - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 1) -> Bitmap Index Scan on tenk1_unique2 Index Cond: (unique2 = 3) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 1) (17 rows) explain (costs off) @@ -4252,12 +4252,12 @@ select * from tenk1 a join tenk1 b on Filter: ((unique1 = 2) OR (ten = 4)) -> Materialize -> Bitmap Heap Scan on tenk1 a - Recheck Cond: ((unique1 = 1) OR (unique2 = 3)) + Recheck Cond: ((unique2 = 3) OR (unique1 = 1)) -> BitmapOr - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 1) -> Bitmap Index Scan on tenk1_unique2 Index Cond: (unique2 = 3) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 1) (12 rows) explain 
(costs off) @@ -4269,21 +4269,21 @@ select * from tenk1 a join tenk1 b on Nested Loop Join Filter: (((a.unique1 = 1) AND (b.unique1 = 2)) OR (((a.unique2 = 3) OR (a.unique2 = 7)) AND (b.hundred = 4))) -> Bitmap Heap Scan on tenk1 b - Recheck Cond: ((unique1 = 2) OR (hundred = 4)) + Recheck Cond: ((hundred = 4) OR (unique1 = 2)) -> BitmapOr - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 2) -> Bitmap Index Scan on tenk1_hundred Index Cond: (hundred = 4) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 2) -> Materialize -> Bitmap Heap Scan on tenk1 a - Recheck Cond: ((unique1 = 1) OR ((unique2 = 3) OR (unique2 = 7))) + Recheck Cond: (((unique2 = 3) OR (unique2 = 7)) OR (unique1 = 1)) Filter: ((unique1 = 1) OR (unique2 = 3) OR (unique2 = 7)) -> BitmapOr - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 1) -> Bitmap Index Scan on tenk1_unique2 Index Cond: (unique2 = ANY ('{3,7}'::integer[])) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 1) (18 rows) explain (costs off) @@ -4295,21 +4295,21 @@ select * from tenk1 a join tenk1 b on Nested Loop Join Filter: (((a.unique1 = 1) AND (b.unique1 = 2)) OR (((a.unique2 = 3) OR (a.unique2 = 7)) AND (b.hundred = 4))) -> Bitmap Heap Scan on tenk1 b - Recheck Cond: ((unique1 = 2) OR (hundred = 4)) + Recheck Cond: ((hundred = 4) OR (unique1 = 2)) -> BitmapOr - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 2) -> Bitmap Index Scan on tenk1_hundred Index Cond: (hundred = 4) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 2) -> Materialize -> Bitmap Heap Scan on tenk1 a - Recheck Cond: ((unique1 = 1) OR ((unique2 = 3) OR (unique2 = 7))) + Recheck Cond: (((unique2 = 3) OR (unique2 = 7)) OR (unique1 = 1)) Filter: ((unique1 = 1) OR (unique2 = 3) OR (unique2 = 7)) -> BitmapOr - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 1) -> Bitmap Index Scan on tenk1_unique2 Index Cond: (unique2 = ANY ('{3,7}'::integer[])) + -> Bitmap 
Index Scan on tenk1_unique1 + Index Cond: (unique1 = 1) (18 rows) explain (costs off) @@ -4323,18 +4323,16 @@ select * from tenk1 a join tenk1 b on -> Seq Scan on tenk1 b -> Materialize -> Bitmap Heap Scan on tenk1 a - Recheck Cond: ((unique1 < 20) OR (unique1 = 3) OR (unique1 = 1) OR ((unique2 = 3) OR (unique2 = 7))) + Recheck Cond: (((unique2 = 3) OR (unique2 = 7)) OR ((unique1 = 3) OR (unique1 = 1)) OR (unique1 < 20)) Filter: ((unique1 < 20) OR (unique1 = 3) OR (unique1 = 1) OR (unique2 = 3) OR (unique2 = 7)) -> BitmapOr - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 < 20) - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 3) - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 1) -> Bitmap Index Scan on tenk1_unique2 Index Cond: (unique2 = ANY ('{3,7}'::integer[])) -(16 rows) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = ANY ('{3,1}'::integer[])) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 < 20) +(14 rows) -- -- test placement of movable quals in a parameterized join tree diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index 7e108f9b283..eb34bc80a85 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -738,6 +738,23 @@ SELECT * FROM tenk1 SELECT * FROM tenk1 WHERE thousand = 42 AND (tenthous = 1 OR tenthous = (SELECT 1 + 2) OR tenthous = 42); +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42 OR tenthous IS NULL); + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1::int2 OR tenthous::int2 = 3::int8 OR tenthous = 42::int8); + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1::int2 OR tenthous::int2 = 3::int8 OR tenthous::int2 = 42::int8); + + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1::int2 OR tenthous = 3::int8 OR tenthous = 
42::int8); + EXPLAIN (COSTS OFF) SELECT count(*) FROM tenk1 WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); @@ -1321,6 +1338,27 @@ SELECT b.relname, ORDER BY 1; DROP TABLE concur_temp_tab_1, concur_temp_tab_2, reindex_temp_before; +-- Check bitmap scan can consider similar OR arguments separately without +-- grouping them into SAOP. +CREATE TABLE bitmap_split_or (a int NOT NULL, b int NOT NULL, c int NOT NULL); +INSERT INTO bitmap_split_or (SELECT 1, 1, i FROM generate_series(1, 1000) i); +INSERT INTO bitmap_split_or (select i, 2, 2 FROM generate_series(1, 1000) i); +VACUUM ANALYZE bitmap_split_or; +CREATE INDEX t_b_partial_1_idx ON bitmap_split_or (b) WHERE a = 1; +CREATE INDEX t_b_partial_2_idx ON bitmap_split_or (b) WHERE a = 2; +EXPLAIN (COSTS OFF) +SELECT * FROM bitmap_split_or WHERE (a = 1 OR a = 2) AND b = 2; +DROP INDEX t_b_partial_1_idx; +DROP INDEX t_b_partial_2_idx; +CREATE INDEX t_a_b_idx ON bitmap_split_or (a, b); +CREATE INDEX t_b_c_idx ON bitmap_split_or (b, c); +CREATE STATISTICS t_a_b_stat (mcv) ON a, b FROM bitmap_split_or; +CREATE STATISTICS t_b_c_stat (mcv) ON b, c FROM bitmap_split_or; +ANALYZE bitmap_split_or; +EXPLAIN (COSTS OFF) +SELECT * FROM bitmap_split_or WHERE a = 1 AND (b = 1 OR b = 2) AND c = 2; +DROP TABLE bitmap_split_or; + -- -- REINDEX SCHEMA -- diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 9b3c2334949..0131f9a8d43 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1761,6 +1761,7 @@ OprCacheKey OprInfo OprProofCacheEntry OprProofCacheKey +OrArgIndexMatch OuterJoinClauseInfo OutputPluginCallbacks OutputPluginOptions From 70b1ec3cac353c5fcde58d5ea7778941945fc6ab Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 30 Oct 2024 11:42:28 -0400 Subject: [PATCH 53/56] Stabilize jsonb_path_query test case. An operation like '12:34:56'::time_tz takes the UTC offset from the prevailing time zone, which means that the results change across DST transitions. 
One of the test cases added in ed055d249 failed to consider this. Per report from Bernhard Wiedemann. Back-patch to v17, as the test case was. Discussion: https://postgr.es/m/ba8e1bc0-8a99-45b7-8397-3f2e94415e03@suse.de --- src/test/regress/expected/jsonb_jsonpath.out | 6 +++++- src/test/regress/sql/jsonb_jsonpath.sql | 4 ++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/test/regress/expected/jsonb_jsonpath.out b/src/test/regress/expected/jsonb_jsonpath.out index 57c117ea580..8cf6ecfc7f8 100644 --- a/src/test/regress/expected/jsonb_jsonpath.out +++ b/src/test/regress/expected/jsonb_jsonpath.out @@ -2634,12 +2634,16 @@ select jsonb_path_query('"12:34:56 +5:30"', '$.time_tz().string()'); "12:34:56+05:30" (1 row) +-- this timetz usage will absorb the UTC offset of the current timezone setting +begin; +set local timezone = 'UTC-10'; select jsonb_path_query_tz('"12:34:56"', '$.time_tz().string()'); jsonb_path_query_tz --------------------- - "12:34:56-07:00" + "12:34:56+10:00" (1 row) +rollback; select jsonb_path_query('"12:34:56"', '$.time().string()'); jsonb_path_query ------------------ diff --git a/src/test/regress/sql/jsonb_jsonpath.sql b/src/test/regress/sql/jsonb_jsonpath.sql index c647af55e94..acb508c0dd2 100644 --- a/src/test/regress/sql/jsonb_jsonpath.sql +++ b/src/test/regress/sql/jsonb_jsonpath.sql @@ -596,7 +596,11 @@ select jsonb_path_query_tz('"2023-08-15 12:34:56"', '$.timestamp_tz().string()') select jsonb_path_query('"2023-08-15 12:34:56 +5:30"', '$.timestamp_tz().string()'); select jsonb_path_query('"2023-08-15 12:34:56"', '$.timestamp().string()'); select jsonb_path_query('"12:34:56 +5:30"', '$.time_tz().string()'); +-- this timetz usage will absorb the UTC offset of the current timezone setting +begin; +set local timezone = 'UTC-10'; select jsonb_path_query_tz('"12:34:56"', '$.time_tz().string()'); +rollback; select jsonb_path_query('"12:34:56"', '$.time().string()'); select jsonb_path_query('"2023-08-15"', '$.date().string()'); 
From d5996f333ebbc35d5c328786530f30448cfc5183 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Mon, 25 Nov 2024 09:05:26 +0200 Subject: [PATCH 54/56] Remove the wrong assertion from match_orclause_to_indexcol() Obviously, the constant could be zero. Also, add the relevant check to regression tests. Reported-by: Richard Guo Discussion: https://postgr.es/m/CAMbWs4-siKJdtWhcbqk4Y-xG12do2Ckm1qw672GNsSnDqL9FQg%40mail.gmail.com --- src/backend/optimizer/path/indxpath.c | 2 +- src/test/regress/expected/create_index.out | 10 +++++----- src/test/regress/sql/create_index.sql | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index cae5f3d7e63..31315f28ef1 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -3418,7 +3418,7 @@ match_orclause_to_indexcol(PlannerInfo *root, elems = (Datum *) palloc(sizeof(Datum) * list_length(consts)); foreach_node(Const, value, consts) { - Assert(!value->constisnull && value->constvalue); + Assert(!value->constisnull); elems[i++] = value->constvalue; } diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index 913c9712afd..79a113d1838 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -1843,15 +1843,15 @@ DROP TABLE onek_with_null; -- EXPLAIN (COSTS OFF) SELECT * FROM tenk1 - WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); - QUERY PLAN ------------------------------------------------------------------------------- + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42 OR tenthous = 0); + QUERY PLAN +-------------------------------------------------------------------------------- Index Scan using tenk1_thous_tenthous on tenk1 - Index Cond: ((thousand = 42) AND (tenthous = ANY ('{1,3,42}'::integer[]))) + Index Cond: ((thousand = 42) AND (tenthous = ANY 
('{1,3,42,0}'::integer[]))) (2 rows) SELECT * FROM tenk1 - WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42 OR tenthous = 0); unique1 | unique2 | two | four | ten | twenty | hundred | thousand | twothousand | fivethous | tenthous | odd | even | stringu1 | stringu2 | string4 ---------+---------+-----+------+-----+--------+---------+----------+-------------+-----------+----------+-----+------+----------+----------+--------- 42 | 5530 | 0 | 2 | 2 | 2 | 42 | 42 | 42 | 42 | 42 | 84 | 85 | QBAAAA | SEIAAA | OOOOxx diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index eb34bc80a85..f0d7cbfd822 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -728,9 +728,9 @@ DROP TABLE onek_with_null; EXPLAIN (COSTS OFF) SELECT * FROM tenk1 - WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42 OR tenthous = 0); SELECT * FROM tenk1 - WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42 OR tenthous = 0); EXPLAIN (COSTS OFF) SELECT * FROM tenk1 From ba8e624f594f2ad99e05e301b2e4a551908dfec8 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Fri, 29 Nov 2024 01:46:43 +0200 Subject: [PATCH 55/56] Skip not SAOP-supported indexes while transforming an OR clause into SAOP There is no point in transforming OR-clauses into SAOP's if the target index doesn't support SAOP scans anyway. This commit adds corresponding checks to match_orclause_to_indexcol() and group_similar_or_args(). The first check fixes the actual bug, while the second just saves some cycles.
Reported-by: Alexander Lakhin Discussion: https://postgr.es/m/8174de69-9e1a-0827-0e81-ef97f56a5939%40gmail.com Author: Alena Rybakina Reviewed-by: Ranier Vilela, Alexander Korotkov --- src/backend/optimizer/path/indxpath.c | 11 +++++++++-- src/test/regress/expected/create_index.out | 18 ++++++++++++++++++ src/test/regress/sql/create_index.sql | 6 ++++++ 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 31315f28ef1..a698f888d71 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -1343,8 +1343,11 @@ group_similar_or_args(PlannerInfo *root, RelOptInfo *rel, RestrictInfo *rinfo) { IndexOptInfo *index = (IndexOptInfo *) lfirst(lc2); - /* Ignore index if it doesn't support bitmap scans */ - if (!index->amhasgetbitmap) + /* + * Ignore index if it doesn't support bitmap scans or SAOP + * clauses. + */ + if (!index->amhasgetbitmap || !index->amsearcharray) continue; for (colnum = 0; colnum < index->nkeycolumns; colnum++) @@ -3235,6 +3238,10 @@ match_orclause_to_indexcol(PlannerInfo *root, Assert(IsA(orclause, BoolExpr)); Assert(orclause->boolop == OR_EXPR); + /* Ignore index if it doesn't support SAOP clauses */ + if(!index->amsearcharray) + return NULL; + /* * Try to convert a list of OR-clauses to a single SAOP expression. Each * OR entry must be in the form: (indexkey operator constant) or (constant diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index 79a113d1838..1f74afeca8f 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -1233,6 +1233,24 @@ SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA'; 14 (1 row) +-- OR-clauses shouldn't be transformed into SAOP because hash indexes don't +-- support SAOP scans. 
+SET enable_seqscan = off; +EXPLAIN (COSTS OFF) +SELECT COUNT(*) FROM tenk1 WHERE stringu1 = 'TVAAAA' OR stringu1 = 'TVAAAB'; + QUERY PLAN +------------------------------------------------------------------------------------ + Aggregate + -> Bitmap Heap Scan on tenk1 + Recheck Cond: ((stringu1 = 'TVAAAA'::name) OR (stringu1 = 'TVAAAB'::name)) + -> BitmapOr + -> Bitmap Index Scan on hash_tuplesort_idx + Index Cond: (stringu1 = 'TVAAAA'::name) + -> Bitmap Index Scan on hash_tuplesort_idx + Index Cond: (stringu1 = 'TVAAAB'::name) +(8 rows) + +RESET enable_seqscan; DROP INDEX hash_tuplesort_idx; RESET maintenance_work_mem; -- diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index f0d7cbfd822..6b683da30f9 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -372,6 +372,12 @@ CREATE INDEX hash_tuplesort_idx ON tenk1 USING hash (stringu1 name_ops) WITH (fi EXPLAIN (COSTS OFF) SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA'; SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA'; +-- OR-clauses shouldn't be transformed into SAOP because hash indexes don't +-- support SAOP scans. +SET enable_seqscan = off; +EXPLAIN (COSTS OFF) +SELECT COUNT(*) FROM tenk1 WHERE stringu1 = 'TVAAAA' OR stringu1 = 'TVAAAB'; +RESET enable_seqscan; DROP INDEX hash_tuplesort_idx; RESET maintenance_work_mem; From 142e73479eccf7cbf9d471c9942a3a98e8bce5a3 Mon Sep 17 00:00:00 2001 From: Angus Dippenaar Date: Thu, 19 Dec 2024 00:30:47 +0100 Subject: [PATCH 56/56] fix pg_rewind docs --- doc/src/sgml/ref/pg_rewind.sgml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/src/sgml/ref/pg_rewind.sgml b/doc/src/sgml/ref/pg_rewind.sgml index 0c8e7dd2cc3..063364f9702 100644 --- a/doc/src/sgml/ref/pg_rewind.sgml +++ b/doc/src/sgml/ref/pg_rewind.sgml @@ -302,11 +302,19 @@ PostgreSQL documentation This option has no effect when is used. 
+ + + + - Load shared library that performs custom rewind for postgres extension. The path may be full or relative to PKGLIBDIR. File extension is optional. Multiple extensions can be selected by multiple switches. + Load a shared library that performs a custom rewind for a postgres extension. + The path may be absolute or + relative to PKGLIBDIR. The file extension is optional. Multiple extensions + can be selected by specifying multiple switches.