diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 00000000000..c6f1bef64aa --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,31 @@ +name: build + +on: + push: + pull_request: + +jobs: + test: + runs-on: + - ubuntu-20.04 + strategy: + fail-fast: false + matrix: + compiler: [clang, gcc] + check_type: [normal, debug] + env: + LLVM_VER: 10 + COMPILER: ${{ matrix.compiler }} + CHECK_TYPE: ${{ matrix.check_type }} + steps: + - name: Checkout code into workspace directory + uses: actions/checkout@v2 + - name: Setup prerequisites + run: bash ./ci/prerequisites.sh + - name: Build + run: bash ./ci/build.sh + - name: Check + run: bash ./ci/check.sh + - name: Check output + run: bash ./ci/check_output.sh + if: ${{ success() || failure() }} diff --git a/ci/build.sh b/ci/build.sh new file mode 100644 index 00000000000..f541929e69c --- /dev/null +++ b/ci/build.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -eu + +if [ $COMPILER = "clang" ]; then + export CC=clang-$LLVM_VER +else + export CC=gcc +fi + +# configure & build +if [ $CHECK_TYPE = "debug" ]; then + CFLAGS="-O0" ./configure --enable-debug --enable-cassert --enable-tap-tests --with-icu +else + ./configure --disable-debug --disable-cassert --enable-tap-tests --with-icu +fi + +make -sj4 +cd contrib +make -sj4 +cd .. diff --git a/ci/check.sh b/ci/check.sh new file mode 100644 index 00000000000..faa8c25e84a --- /dev/null +++ b/ci/check.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +set -eu + +# unsets limit for coredumps size +ulimit -c unlimited -S +# sets a coredump file pattern +mkdir -p /tmp/cores-$GITHUB_SHA-$TIMESTAMP +sudo sh -c "echo \"/tmp/cores-$GITHUB_SHA-$TIMESTAMP/%t_%p_%s.core\" > /proc/sys/kernel/core_pattern" + +make check-world -j4 diff --git a/ci/check_output.sh b/ci/check_output.sh new file mode 100644 index 00000000000..ae26cf63d68 --- /dev/null +++ b/ci/check_output.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +set -eu + +status=0 + +# show diff if it exists +for f in ` find . 
-name regression.diffs ` ; do + echo "========= Contents of $f" + cat $f + status=1 +done + +# check core dumps if any +cores=$(find /tmp/cores-$GITHUB_SHA-$TIMESTAMP/ -name '*.core' 2>/dev/null) + +if [ -n "$cores" ]; then + for corefile in $cores ; do + if [[ $corefile != *_3.core ]]; then + binary=$(gdb -quiet -core $corefile -batch -ex 'info auxv' | grep AT_EXECFN | perl -pe "s/^.*\"(.*)\"\$/\$1/g") + echo dumping $corefile for $binary + gdb --batch --quiet -ex "thread apply all bt full" -ex "quit" $binary $corefile + status=1 + fi + done +fi + +rm -rf /tmp/cores-$GITHUB_SHA-$TIMESTAMP + +exit $status diff --git a/ci/prerequisites.sh b/ci/prerequisites.sh new file mode 100644 index 00000000000..b26251b711c --- /dev/null +++ b/ci/prerequisites.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +set -eu + +# print the hostname to be able to identify runner by logs +echo "HOSTNAME=`hostname`" +TIMESTAMP=$(date +%s) +echo "TIMESTAMP=$TIMESTAMP" >> $GITHUB_ENV +echo "TIMESTAMP=$TIMESTAMP" + +sudo apt-get -y install -qq wget ca-certificates + +sudo apt-get update -qq + +apt_packages="build-essential flex bison pkg-config libreadline-dev make gdb libipc-run-perl libicu-dev python3 python3-dev python3-pip python3-setuptools python3-testresources" + +if [ $COMPILER = "clang" ]; then + apt_packages="$apt_packages llvm-$LLVM_VER clang-$LLVM_VER clang-tools-$LLVM_VER" +fi + +# install required packages +sudo apt-get -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" -y install -qq $apt_packages diff --git a/configure b/configure index 6db03e4a228..9da3f6a9af1 100755 --- a/configure +++ b/configure @@ -628,6 +628,7 @@ ac_includes_default="\ ac_subst_vars='LTLIBOBJS vpath_build PG_SYSROOT +ORIOLEDB_PATCHSET_VERSION PG_VERSION_NUM LDFLAGS_EX_BE PROVE @@ -6663,6 +6664,99 @@ fi if test -n "$NOT_THE_CFLAGS"; then CFLAGS="$CFLAGS -Wno-cast-function-type-strict" fi + if test x"$host_cpu" == x"aarch64"; then + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CC} 
supports -moutline-atomics, for CFLAGS" >&5 +$as_echo_n "checking whether ${CC} supports -moutline-atomics, for CFLAGS... " >&6; } +if ${pgac_cv_prog_CC_cflags__moutline_atomics+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +pgac_save_CC=$CC +CC=${CC} +CFLAGS="${CFLAGS} -moutline-atomics" +ac_save_c_werror_flag=$ac_c_werror_flag +ac_c_werror_flag=yes +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + pgac_cv_prog_CC_cflags__moutline_atomics=yes +else + pgac_cv_prog_CC_cflags__moutline_atomics=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_c_werror_flag=$ac_save_c_werror_flag +CFLAGS="$pgac_save_CFLAGS" +CC="$pgac_save_CC" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_prog_CC_cflags__moutline_atomics" >&5 +$as_echo "$pgac_cv_prog_CC_cflags__moutline_atomics" >&6; } +if test x"$pgac_cv_prog_CC_cflags__moutline_atomics" = x"yes"; then + CFLAGS="${CFLAGS} -moutline-atomics" +fi + + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CXX} supports -moutline-atomics, for CXXFLAGS" >&5 +$as_echo_n "checking whether ${CXX} supports -moutline-atomics, for CXXFLAGS... " >&6; } +if ${pgac_cv_prog_CXX_cxxflags__moutline_atomics+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CXXFLAGS=$CXXFLAGS +pgac_save_CXX=$CXX +CXX=${CXX} +CXXFLAGS="${CXXFLAGS} -moutline-atomics" +ac_save_cxx_werror_flag=$ac_cxx_werror_flag +ac_cxx_werror_flag=yes +ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + pgac_cv_prog_CXX_cxxflags__moutline_atomics=yes +else + pgac_cv_prog_CXX_cxxflags__moutline_atomics=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + +ac_cxx_werror_flag=$ac_save_cxx_werror_flag +CXXFLAGS="$pgac_save_CXXFLAGS" +CXX="$pgac_save_CXX" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_prog_CXX_cxxflags__moutline_atomics" >&5 +$as_echo "$pgac_cv_prog_CXX_cxxflags__moutline_atomics" >&6; } +if test x"$pgac_cv_prog_CXX_cxxflags__moutline_atomics" = x"yes"; then + CXXFLAGS="${CXXFLAGS} -moutline-atomics" +fi + + + fi elif test "$ICC" = yes; then # Intel's compiler has a bug/misoptimization in checking for # division by NAN (NaN == 0), -mp1 fixes it, so add it to the CFLAGS. 
@@ -15263,7 +15357,7 @@ fi LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -for ac_func in backtrace_symbols copyfile copy_file_range getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll pthread_is_threaded_np setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range uselocale wcstombs_l +for ac_func in backtrace_symbols copyfile copy_file_range getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range uselocale wcstombs_l do : as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" @@ -19203,6 +19297,10 @@ _ACEOF +# Needed to check postgresql patches git tag during orioledb extension build +ORIOLEDB_PATCHSET_VERSION=`git describe --tags | cut -d'_' -f2` + + # If we are inserting PG_SYSROOT into CPPFLAGS, do so symbolically not # literally, so that it's possible to override it at build time using # a command like "make ... PG_SYSROOT=path". This has to be done after diff --git a/configure.ac b/configure.ac index 7531366b758..c1531abdd38 100644 --- a/configure.ac +++ b/configure.ac @@ -580,6 +580,10 @@ if test "$GCC" = yes -a "$ICC" = no; then if test -n "$NOT_THE_CFLAGS"; then CFLAGS="$CFLAGS -Wno-cast-function-type-strict" fi + if test x"$host_cpu" == x"aarch64"; then + PGAC_PROG_CC_CFLAGS_OPT([-moutline-atomics]) + PGAC_PROG_CXX_CFLAGS_OPT([-moutline-atomics]) + fi elif test "$ICC" = yes; then # Intel's compiler has a bug/misoptimization in checking for # division by NAN (NaN == 0), -mp1 fixes it, so add it to the CFLAGS. 
@@ -1758,7 +1762,6 @@ AC_CHECK_FUNCS(m4_normalize([ memset_s posix_fallocate ppoll - pthread_is_threaded_np setproctitle setproctitle_fast strchrnul @@ -2447,6 +2450,10 @@ $AWK '{printf "%d%04d", $1, $2}'`"] AC_DEFINE_UNQUOTED(PG_VERSION_NUM, $PG_VERSION_NUM, [PostgreSQL version as a number]) AC_SUBST(PG_VERSION_NUM) +# Needed to check postgresql patches git tag during orioledb extension build +[ORIOLEDB_PATCHSET_VERSION=`git describe --tags | cut -d'_' -f2`] +AC_SUBST(ORIOLEDB_PATCHSET_VERSION) + # If we are inserting PG_SYSROOT into CPPFLAGS, do so symbolically not # literally, so that it's possible to override it at build time using # a command like "make ... PG_SYSROOT=path". This has to be done after diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 34990c5cea3..ed4497f9620 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -994,7 +994,7 @@ heap_entry_is_visible(BtreeCheckState *state, ItemPointer tid) TupleTableSlot *slot = table_slot_create(state->heaprel, NULL); tid_visible = table_tuple_fetch_row_version(state->heaprel, - tid, state->snapshot, slot); + PointerGetDatum(tid), state->snapshot, slot); if (slot != NULL) ExecDropSingleTupleTableSlot(slot); diff --git a/contrib/bloom/blinsert.c b/contrib/bloom/blinsert.c index f8a1061abb9..7873118d112 100644 --- a/contrib/bloom/blinsert.c +++ b/contrib/bloom/blinsert.c @@ -172,7 +172,7 @@ blbuildempty(Relation index) */ bool blinsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) @@ -189,6 +189,7 @@ blinsert(Relation index, Datum *values, bool *isnull, BlockNumber blkno = InvalidBlockNumber; OffsetNumber nStart; GenericXLogState *state; + ItemPointer ht_ctid = DatumGetItemPointer(tupleid); insertCtx = AllocSetContextCreate(CurrentMemoryContext, "Bloom insert temporary context", diff --git 
a/contrib/bloom/bloom.h b/contrib/bloom/bloom.h index fba3ba77711..b9aaca16fa2 100644 --- a/contrib/bloom/bloom.h +++ b/contrib/bloom/bloom.h @@ -189,7 +189,7 @@ extern bool blvalidate(Oid opclassoid); /* index access method interface functions */ extern bool blinsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c index 6836129c90d..9b72303c895 100644 --- a/contrib/bloom/blutils.c +++ b/contrib/bloom/blutils.c @@ -131,7 +131,8 @@ blhandler(PG_FUNCTION_ARGS) amroutine->ambuild = blbuild; amroutine->ambuildempty = blbuildempty; - amroutine->aminsert = blinsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = blinsert; amroutine->aminsertcleanup = NULL; amroutine->ambulkdelete = blbulkdelete; amroutine->amvacuumcleanup = blvacuumcleanup; diff --git a/contrib/pageinspect/heapfuncs.c b/contrib/pageinspect/heapfuncs.c index 38a539dad1b..cff8b945297 100644 --- a/contrib/pageinspect/heapfuncs.c +++ b/contrib/pageinspect/heapfuncs.c @@ -368,6 +368,7 @@ tuple_data_split_internal(Oid relid, char *tupdata, */ if (VARATT_IS_EXTERNAL(tupdata + off) && !VARATT_IS_EXTERNAL_ONDISK(tupdata + off) && + !VARATT_IS_EXTERNAL_ORIOLEDB(tupdata + off) && !VARATT_IS_EXTERNAL_INDIRECT(tupdata + off)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), diff --git a/contrib/test_decoding/test_decoding.c b/contrib/test_decoding/test_decoding.c index 7c50d139698..02d5c2e07da 100644 --- a/contrib/test_decoding/test_decoding.c +++ b/contrib/test_decoding/test_decoding.c @@ -578,7 +578,7 @@ tuple_to_stringinfo(StringInfo s, TupleDesc tupdesc, HeapTuple tuple, bool skip_ /* print data */ if (isnull) appendStringInfoString(s, "null"); - else if (typisvarlena && VARATT_IS_EXTERNAL_ONDISK(origval)) + else if (typisvarlena && (VARATT_IS_EXTERNAL_ONDISK(origval) || 
VARATT_IS_EXTERNAL_ORIOLEDB(origval))) appendStringInfoString(s, "unchanged-toast-datum"); else if (!typisvarlena) print_literal(s, typid, diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml index e3c1539a1e3..a33faf4f004 100644 --- a/doc/src/sgml/indexam.sgml +++ b/doc/src/sgml/indexam.sgml @@ -141,6 +141,7 @@ typedef struct IndexAmRoutine ambuild_function ambuild; ambuildempty_function ambuildempty; aminsert_function aminsert; + aminsert_extended_function aminsertextended; aminsertcleanup_function aminsertcleanup; ambulkdelete_function ambulkdelete; amvacuumcleanup_function amvacuumcleanup; diff --git a/doc/src/sgml/ref/pg_rewind.sgml b/doc/src/sgml/ref/pg_rewind.sgml index dc039d87566..063364f9702 100644 --- a/doc/src/sgml/ref/pg_rewind.sgml +++ b/doc/src/sgml/ref/pg_rewind.sgml @@ -306,6 +306,19 @@ PostgreSQL documentation + + + + + + Load shared library that performs custom rewind for postgres extension. + The path may be full or + relative to PKGLIBDIR. File extension is optional. Multiple extensions + can be selected by multiple switches. 
+ + + + diff --git a/meson.build b/meson.build index 4c2769dee0a..8ded179b23e 100644 --- a/meson.build +++ b/meson.build @@ -153,6 +153,7 @@ cdata.set('PG_VERSION_NUM', pg_version_num) # PG_VERSION_STR is built later, it depends on compiler test results cdata.set_quoted('CONFIGURE_ARGS', '') +orioledb_patchset_version = '22' ############################################################### @@ -2689,7 +2690,6 @@ func_checks = [ ['posix_fallocate'], ['ppoll'], ['pthread_barrier_wait', {'dependencies': [thread_dep]}], - ['pthread_is_threaded_np', {'dependencies': [thread_dep]}], ['sem_init', {'dependencies': [rt_dep, thread_dep], 'skip': sema_kind != 'unnamed_posix', 'define': false}], ['setproctitle', {'dependencies': [util_dep]}], ['setproctitle_fast'], diff --git a/src/Makefile.global.in b/src/Makefile.global.in index a00c909681e..8c7ee1c7217 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -44,6 +44,9 @@ VERSION_NUM = @PG_VERSION_NUM@ PACKAGE_URL = @PACKAGE_URL@ +# OrioleDB patchset git tag number +ORIOLEDB_PATCHSET_VERSION = @ORIOLEDB_PATCHSET_VERSION@ + # Set top_srcdir, srcdir, and VPATH. 
ifdef PGXS top_srcdir = $(top_builddir) diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 6467bed604a..c1ccef71937 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -273,7 +273,8 @@ brinhandler(PG_FUNCTION_ARGS) amroutine->ambuild = brinbuild; amroutine->ambuildempty = brinbuildempty; - amroutine->aminsert = brininsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = brininsert; amroutine->aminsertcleanup = brininsertcleanup; amroutine->ambulkdelete = brinbulkdelete; amroutine->amvacuumcleanup = brinvacuumcleanup; @@ -333,7 +334,7 @@ initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo) */ bool brininsert(Relation idxRel, Datum *values, bool *nulls, - ItemPointer heaptid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) @@ -348,6 +349,7 @@ brininsert(Relation idxRel, Datum *values, bool *nulls, MemoryContext tupcxt = NULL; MemoryContext oldcxt = CurrentMemoryContext; bool autosummarize = BrinGetAutoSummarize(idxRel); + ItemPointer heaptid = DatumGetItemPointer(tupleid); /* * If first time through in this statement, initialize the insert state diff --git a/src/backend/access/common/detoast.c b/src/backend/access/common/detoast.c index 3547cdba56e..27d0e37607a 100644 --- a/src/backend/access/common/detoast.c +++ b/src/backend/access/common/detoast.c @@ -26,9 +26,10 @@ static struct varlena *toast_fetch_datum(struct varlena *attr); static struct varlena *toast_fetch_datum_slice(struct varlena *attr, int32 sliceoffset, int32 slicelength); -static struct varlena *toast_decompress_datum(struct varlena *attr); static struct varlena *toast_decompress_datum_slice(struct varlena *attr, int32 slicelength); +static ToastFunc o_detoast_func = NULL; + /* ---------- * detoast_external_attr - * @@ -46,7 +47,7 @@ detoast_external_attr(struct varlena *attr) { struct varlena *result; - if 
(VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ONDISK(attr) || VARATT_IS_EXTERNAL_ORIOLEDB(attr)) { /* * This is an external stored plain value @@ -115,7 +116,7 @@ detoast_external_attr(struct varlena *attr) struct varlena * detoast_attr(struct varlena *attr) { - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ONDISK(attr) || VARATT_IS_EXTERNAL_ORIOLEDB(attr)) { /* * This is an externally stored datum --- fetch it back from there @@ -223,7 +224,14 @@ detoast_attr_slice(struct varlena *attr, else if (pg_add_s32_overflow(sliceoffset, slicelength, &slicelimit)) slicelength = slicelimit = -1; - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ORIOLEDB(attr)) + { + Assert(o_detoast_func != NULL); + preslice = o_detoast_func(attr); + if (preslice == NULL) + elog(ERROR, "unexpected NULL detoast result"); + } + else if (VARATT_IS_EXTERNAL_ONDISK(attr)) { struct varatt_external toast_pointer; @@ -332,6 +340,18 @@ detoast_attr_slice(struct varlena *attr, return result; } +void +register_o_detoast_func(ToastFunc func) +{ + o_detoast_func = func; +} + +void +deregister_o_detoast_func() +{ + o_detoast_func = NULL; +} + /* ---------- * toast_fetch_datum - * @@ -347,6 +367,17 @@ toast_fetch_datum(struct varlena *attr) struct varatt_external toast_pointer; int32 attrsize; + if (VARATT_IS_EXTERNAL_ORIOLEDB(attr)) + { + if (o_detoast_func != NULL) + { + result = o_detoast_func(attr); + if (result == NULL) + elog(ERROR, "unexpected NULL detoast result"); + return result; + } + } + if (!VARATT_IS_EXTERNAL_ONDISK(attr)) elog(ERROR, "toast_fetch_datum shouldn't be called for non-ondisk datums"); @@ -467,7 +498,7 @@ toast_fetch_datum_slice(struct varlena *attr, int32 sliceoffset, * * Decompress a compressed version of a varlena datum */ -static struct varlena * +struct varlena * toast_decompress_datum(struct varlena *attr) { ToastCompressionId cmid; @@ -547,11 +578,17 @@ toast_raw_datum_size(Datum value) struct varlena *attr = (struct varlena *) 
DatumGetPointer(value); Size result; - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ORIOLEDB(attr)) + { + OToastExternal *toasted = (OToastExternal*) VARDATA_EXTERNAL(attr); + result = toasted->raw_size + VARHDRSZ; + } + else if (VARATT_IS_EXTERNAL_ONDISK(attr)) { - /* va_rawsize is the size of the original datum -- including header */ struct varatt_external toast_pointer; + /* va_rawsize is the size of the original datum -- including header */ + VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); result = toast_pointer.va_rawsize; } @@ -603,7 +640,12 @@ toast_datum_size(Datum value) struct varlena *attr = (struct varlena *) DatumGetPointer(value); Size result; - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ORIOLEDB(attr)) + { + OToastExternal *toasted = (OToastExternal*) VARDATA_EXTERNAL(attr); + result = toasted->toasted_size - VARHDRSZ; + } + else if (VARATT_IS_EXTERNAL_ONDISK(attr)) { /* * Attribute is stored externally - return the extsize whether diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c index 9e3407bf987..a1b8a99b739 100644 --- a/src/backend/access/common/heaptuple.c +++ b/src/backend/access/common/heaptuple.c @@ -755,6 +755,10 @@ heap_getsysattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull) case TableOidAttributeNumber: result = ObjectIdGetDatum(tup->t_tableOid); break; + case RowIdAttributeNumber: + *isnull = true; + result = 0; + break; default: elog(ERROR, "invalid attnum: %d", attnum); result = 0; /* keep compiler quiet */ diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index d6eb5d85599..963995388bb 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -24,6 +24,7 @@ #include "access/nbtree.h" #include "access/reloptions.h" #include "access/spgist_private.h" +#include "access/tableam.h" #include "catalog/pg_type.h" #include "commands/defrem.h" #include 
"commands/tablespace.h" @@ -1377,7 +1378,7 @@ untransformRelOptions(Datum options) */ bytea * extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, - amoptions_function amoptions) + const TableAmRoutine *tableam, amoptions_function amoptions) { bytea *options; bool isnull; @@ -1399,7 +1400,8 @@ extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, case RELKIND_RELATION: case RELKIND_TOASTVALUE: case RELKIND_MATVIEW: - options = heap_reloptions(classForm->relkind, datum, false); + options = tableam_reloptions(tableam, classForm->relkind, + datum, false); break; case RELKIND_PARTITIONED_TABLE: options = partitioned_table_reloptions(datum, false); diff --git a/src/backend/access/common/toast_compression.c b/src/backend/access/common/toast_compression.c index 52230f31c68..0717947d689 100644 --- a/src/backend/access/common/toast_compression.c +++ b/src/backend/access/common/toast_compression.c @@ -260,7 +260,12 @@ toast_get_compression_id(struct varlena *attr) * the external toast pointer. If compressed inline, fetch it from the * toast compression header. 
*/ - if (VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ORIOLEDB(attr)) + { + OToastExternal *toasted = (OToastExternal*) VARDATA_EXTERNAL(attr); + cmid = toasted->formatFlags >> ORIOLEDB_EXT_FORMAT_FLAGS_BITS; + } + else if (VARATT_IS_EXTERNAL_ONDISK(attr)) { struct varatt_external toast_pointer; diff --git a/src/backend/access/common/toast_internals.c b/src/backend/access/common/toast_internals.c index 90d0654e629..538a554c917 100644 --- a/src/backend/access/common/toast_internals.c +++ b/src/backend/access/common/toast_internals.c @@ -239,7 +239,7 @@ toast_save_datum(Relation rel, Datum value, { struct varatt_external old_toast_pointer; - Assert(VARATT_IS_EXTERNAL_ONDISK(oldexternal)); + Assert(VARATT_IS_EXTERNAL_ONDISK(oldexternal) || VARATT_IS_EXTERNAL_ORIOLEDB(oldexternal)); /* Must copy to access aligned fields */ VARATT_EXTERNAL_GET_POINTER(old_toast_pointer, oldexternal); if (old_toast_pointer.va_toastrelid == rel->rd_toastoid) @@ -395,7 +395,7 @@ toast_delete_datum(Relation rel, Datum value, bool is_speculative) int validIndex; SnapshotData SnapshotToast; - if (!VARATT_IS_EXTERNAL_ONDISK(attr)) + if (!VARATT_IS_EXTERNAL_ONDISK(attr) && !VARATT_IS_EXTERNAL_ORIOLEDB(attr)) return; /* Must copy to access aligned fields */ diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 71f38be90c3..690c744d9a9 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -481,7 +481,7 @@ ginHeapTupleInsert(GinState *ginstate, OffsetNumber attnum, bool gininsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) @@ -490,6 +490,7 @@ gininsert(Relation index, Datum *values, bool *isnull, MemoryContext oldCtx; MemoryContext insertCtx; int i; + ItemPointer ht_ctid = DatumGetItemPointer(tupleid); /* Initialize GinState cache if first call in this statement 
*/ if (ginstate == NULL) diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index 5747ae6a4ca..68ce032f150 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -63,7 +63,8 @@ ginhandler(PG_FUNCTION_ARGS) amroutine->ambuild = ginbuild; amroutine->ambuildempty = ginbuildempty; - amroutine->aminsert = gininsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = gininsert; amroutine->aminsertcleanup = NULL; amroutine->ambulkdelete = ginbulkdelete; amroutine->amvacuumcleanup = ginvacuumcleanup; diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index ed4ffa63a77..66b086ee4c7 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -85,7 +85,8 @@ gisthandler(PG_FUNCTION_ARGS) amroutine->ambuild = gistbuild; amroutine->ambuildempty = gistbuildempty; - amroutine->aminsert = gistinsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = gistinsert; amroutine->aminsertcleanup = NULL; amroutine->ambulkdelete = gistbulkdelete; amroutine->amvacuumcleanup = gistvacuumcleanup; @@ -157,7 +158,7 @@ gistbuildempty(Relation index) */ bool gistinsert(Relation r, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) @@ -165,6 +166,7 @@ gistinsert(Relation r, Datum *values, bool *isnull, GISTSTATE *giststate = (GISTSTATE *) indexInfo->ii_AmCache; IndexTuple itup; MemoryContext oldCxt; + ItemPointer ht_ctid = DatumGetItemPointer(tupleid); /* Initialize GISTSTATE cache if first call in this statement */ if (giststate == NULL) diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 01d06b7c328..557c7a3f316 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -83,7 +83,8 @@ hashhandler(PG_FUNCTION_ARGS) amroutine->ambuild = hashbuild; amroutine->ambuildempty = 
hashbuildempty; - amroutine->aminsert = hashinsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = hashinsert; amroutine->aminsertcleanup = NULL; amroutine->ambulkdelete = hashbulkdelete; amroutine->amvacuumcleanup = hashvacuumcleanup; @@ -249,7 +250,7 @@ hashbuildCallback(Relation index, */ bool hashinsert(Relation rel, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) @@ -257,6 +258,7 @@ hashinsert(Relation rel, Datum *values, bool *isnull, Datum index_values[1]; bool index_isnull[1]; IndexTuple itup; + ItemPointer ht_ctid = DatumGetItemPointer(tupleid); /* convert data to a hash key; on failure, do not insert anything */ if (!_hash_convert_tuple(rel, diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 91b20147a00..9d6b0ad10ae 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -2669,10 +2669,11 @@ xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask) } /* - * heap_delete - delete a tuple + * heap_delete - delete a tuple, optionally fetching it into a slot * * See table_tuple_delete() for an explanation of the parameters, except that - * this routine directly takes a tuple rather than a slot. + * this routine directly takes a tuple rather than a slot. Also, we don't + * place a lock on the tuple in this function, just fetch the existing version. 
* * In the failure cases, the routine fills *tmfd with the tuple's t_ctid, * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last @@ -2681,8 +2682,9 @@ xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask) */ TM_Result heap_delete(Relation relation, ItemPointer tid, - CommandId cid, Snapshot crosscheck, bool wait, - TM_FailureData *tmfd, bool changingPart) + CommandId cid, Snapshot crosscheck, int options, + TM_FailureData *tmfd, bool changingPart, + TupleTableSlot *oldSlot) { TM_Result result; TransactionId xid = GetCurrentTransactionId(); @@ -2760,7 +2762,7 @@ heap_delete(Relation relation, ItemPointer tid, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("attempted to delete invisible tuple"))); } - else if (result == TM_BeingModified && wait) + else if (result == TM_BeingModified && (options & TABLE_MODIFY_WAIT)) { TransactionId xwait; uint16 infomask; @@ -2901,7 +2903,30 @@ heap_delete(Relation relation, ItemPointer tid, tmfd->cmax = HeapTupleHeaderGetCmax(tp.t_data); else tmfd->cmax = InvalidCommandId; - UnlockReleaseBuffer(buffer); + + /* + * If we're asked to lock the updated tuple, we just fetch the + * existing tuple. That let's the caller save some resources on + * placing the lock. 
+ */ + if (result == TM_Updated && + (options & TABLE_MODIFY_LOCK_UPDATED)) + { + BufferHeapTupleTableSlot *bslot; + + Assert(TTS_IS_BUFFERTUPLE(oldSlot)); + bslot = (BufferHeapTupleTableSlot *) oldSlot; + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + bslot->base.tupdata = tp; + ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, + oldSlot, + buffer); + } + else + { + UnlockReleaseBuffer(buffer); + } if (have_tuple_lock) UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive); if (vmbuffer != InvalidBuffer) @@ -3075,8 +3100,24 @@ heap_delete(Relation relation, ItemPointer tid, */ CacheInvalidateHeapTuple(relation, &tp, NULL); - /* Now we can release the buffer */ - ReleaseBuffer(buffer); + /* Fetch the old tuple version if we're asked for that. */ + if (options & TABLE_MODIFY_FETCH_OLD_TUPLE) + { + BufferHeapTupleTableSlot *bslot; + + Assert(TTS_IS_BUFFERTUPLE(oldSlot)); + bslot = (BufferHeapTupleTableSlot *) oldSlot; + + bslot->base.tupdata = tp; + ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, + oldSlot, + buffer); + } + else + { + /* Now we can release the buffer */ + ReleaseBuffer(buffer); + } /* * Release the lmgr tuple lock, if we had it. @@ -3108,8 +3149,8 @@ simple_heap_delete(Relation relation, ItemPointer tid) result = heap_delete(relation, tid, GetCurrentCommandId(true), InvalidSnapshot, - true /* wait for commit */ , - &tmfd, false /* changingPart */ ); + TABLE_MODIFY_WAIT /* wait for commit */ , + &tmfd, false /* changingPart */ , NULL); switch (result) { case TM_SelfModified: @@ -3136,10 +3177,11 @@ simple_heap_delete(Relation relation, ItemPointer tid) } /* - * heap_update - replace a tuple + * heap_update - replace a tuple, optionally fetching it into a slot * * See table_tuple_update() for an explanation of the parameters, except that - * this routine directly takes a tuple rather than a slot. + * this routine directly takes a tuple rather than a slot. 
Also, we don't + * place a lock on the tuple in this function, just fetch the existing version. * * In the failure cases, the routine fills *tmfd with the tuple's t_ctid, * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last @@ -3148,9 +3190,9 @@ simple_heap_delete(Relation relation, ItemPointer tid) */ TM_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, - CommandId cid, Snapshot crosscheck, bool wait, + CommandId cid, Snapshot crosscheck, int options, TM_FailureData *tmfd, LockTupleMode *lockmode, - TU_UpdateIndexes *update_indexes) + TU_UpdateIndexes *update_indexes, TupleTableSlot *oldSlot) { TM_Result result; TransactionId xid = GetCurrentTransactionId(); @@ -3327,7 +3369,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer); /* see below about the "no wait" case */ - Assert(result != TM_BeingModified || wait); + Assert(result != TM_BeingModified || (options & TABLE_MODIFY_WAIT)); if (result == TM_Invisible) { @@ -3336,7 +3378,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("attempted to update invisible tuple"))); } - else if (result == TM_BeingModified && wait) + else if (result == TM_BeingModified && (options & TABLE_MODIFY_WAIT)) { TransactionId xwait; uint16 infomask; @@ -3540,7 +3582,30 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data); else tmfd->cmax = InvalidCommandId; - UnlockReleaseBuffer(buffer); + + /* + * If we're asked to lock the updated tuple, we just fetch the + * existing tuple. That let's the caller save some resouces on + * placing the lock. 
+ */ + if (result == TM_Updated && + (options & TABLE_MODIFY_LOCK_UPDATED)) + { + BufferHeapTupleTableSlot *bslot; + + Assert(TTS_IS_BUFFERTUPLE(oldSlot)); + bslot = (BufferHeapTupleTableSlot *) oldSlot; + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + bslot->base.tupdata = oldtup; + ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, + oldSlot, + buffer); + } + else + { + UnlockReleaseBuffer(buffer); + } if (have_tuple_lock) UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode); if (vmbuffer != InvalidBuffer) @@ -4019,7 +4084,26 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, /* Now we can release the buffer(s) */ if (newbuf != buffer) ReleaseBuffer(newbuf); - ReleaseBuffer(buffer); + + /* Fetch the old tuple version if we're asked for that. */ + if (options & TABLE_MODIFY_FETCH_OLD_TUPLE) + { + BufferHeapTupleTableSlot *bslot; + + Assert(TTS_IS_BUFFERTUPLE(oldSlot)); + bslot = (BufferHeapTupleTableSlot *) oldSlot; + + bslot->base.tupdata = oldtup; + ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, + oldSlot, + buffer); + } + else + { + /* Now we can release the buffer */ + ReleaseBuffer(buffer); + } + if (BufferIsValid(vmbuffer_new)) ReleaseBuffer(vmbuffer_new); if (BufferIsValid(vmbuffer)) @@ -4227,8 +4311,8 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup, result = heap_update(relation, otid, tup, GetCurrentCommandId(true), InvalidSnapshot, - true /* wait for commit */ , - &tmfd, &lockmode, update_indexes); + TABLE_MODIFY_WAIT /* wait for commit */ , + &tmfd, &lockmode, update_indexes, NULL); switch (result) { case TM_SelfModified: @@ -4291,12 +4375,14 @@ get_mxact_status_for_lock(LockTupleMode mode, bool is_update) * tuples. 
* * Output parameters: - * *tuple: all fields filled in - * *buffer: set to buffer holding tuple (pinned but not locked at exit) + * *slot: BufferHeapTupleTableSlot filled with tuple * *tmfd: filled in failure cases (see below) * * Function results are the same as the ones for table_tuple_lock(). * + * If *slot already contains the target tuple, it takes advantage on that by + * skipping the ReadBuffer() call. + * * In the failure cases other than TM_Invisible, the routine fills * *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact, * if necessary), and t_cmax (the last only for TM_SelfModified, @@ -4307,15 +4393,14 @@ get_mxact_status_for_lock(LockTupleMode mode, bool is_update) * See README.tuplock for a thorough explanation of this mechanism. */ TM_Result -heap_lock_tuple(Relation relation, HeapTuple tuple, +heap_lock_tuple(Relation relation, ItemPointer tid, TupleTableSlot *slot, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, - bool follow_updates, - Buffer *buffer, TM_FailureData *tmfd) + bool follow_updates, TM_FailureData *tmfd) { TM_Result result; - ItemPointer tid = &(tuple->t_self); ItemId lp; Page page; + Buffer buffer; Buffer vmbuffer = InvalidBuffer; BlockNumber block; TransactionId xid, @@ -4327,8 +4412,24 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, bool skip_tuple_lock = false; bool have_tuple_lock = false; bool cleared_all_frozen = false; + BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; + HeapTuple tuple = &bslot->base.tupdata; + + Assert(TTS_IS_BUFFERTUPLE(slot)); - *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); + /* Take advantage if slot already contains the relevant tuple */ + if (!TTS_EMPTY(slot) && + slot->tts_tableOid == relation->rd_id && + ItemPointerCompare(&slot->tts_tid, tid) == 0 && + BufferIsValid(bslot->buffer)) + { + buffer = bslot->buffer; + IncrBufferRefCount(buffer); + } + else + { + buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); + 
} block = ItemPointerGetBlockNumber(tid); /* @@ -4337,21 +4438,22 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, * in the middle of changing this, so we'll need to recheck after we have * the lock. */ - if (PageIsAllVisible(BufferGetPage(*buffer))) + if (PageIsAllVisible(BufferGetPage(buffer))) visibilitymap_pin(relation, block, &vmbuffer); - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - page = BufferGetPage(*buffer); + page = BufferGetPage(buffer); lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid)); Assert(ItemIdIsNormal(lp)); + tuple->t_self = *tid; tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp); tuple->t_len = ItemIdGetLength(lp); tuple->t_tableOid = RelationGetRelid(relation); l3: - result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer); + result = HeapTupleSatisfiesUpdate(tuple, cid, buffer); if (result == TM_Invisible) { @@ -4380,7 +4482,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, infomask2 = tuple->t_data->t_infomask2; ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid); - LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); /* * If any subtransaction of the current top transaction already holds @@ -4532,12 +4634,12 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, { result = res; /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto failed; } } - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* * Make sure it's still an appropriate lock, else start over. @@ -4572,7 +4674,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) && !HEAP_XMAX_IS_EXCL_LOCKED(infomask)) { - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* * Make sure it's still an appropriate lock, else start over. 
@@ -4600,7 +4702,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, * No conflict, but if the xmax changed under us in the * meantime, start over. */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), xwait)) @@ -4612,7 +4714,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, } else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask)) { - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* if the xmax changed in the meantime, start over */ if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || @@ -4640,7 +4742,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, TransactionIdIsCurrentTransactionId(xwait)) { /* ... but if the xmax changed in the meantime, start over */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), xwait)) @@ -4662,7 +4764,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, */ if (require_sleep && (result == TM_Updated || result == TM_Deleted)) { - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto failed; } else if (require_sleep) @@ -4687,7 +4789,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, */ result = TM_WouldBlock; /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto failed; } @@ -4713,7 +4815,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, { result = TM_WouldBlock; /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto failed; } break; @@ -4753,7 +4855,7 @@ heap_lock_tuple(Relation relation, 
HeapTuple tuple, { result = TM_WouldBlock; /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto failed; } break; @@ -4779,12 +4881,12 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, { result = res; /* recovery code expects to have buffer lock held */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto failed; } } - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* * xwait is done, but if xwait had just locked the tuple then some @@ -4806,7 +4908,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, * don't check for this in the multixact case, because some * locker transactions might still be running. */ - UpdateXmaxHintBits(tuple->t_data, *buffer, xwait); + UpdateXmaxHintBits(tuple->t_data, buffer, xwait); } } @@ -4865,9 +4967,9 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, */ if (vmbuffer == InvalidBuffer && PageIsAllVisible(page)) { - LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); visibilitymap_pin(relation, block, &vmbuffer); - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); goto l3; } @@ -4930,7 +5032,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, cleared_all_frozen = true; - MarkBufferDirty(*buffer); + MarkBufferDirty(buffer); /* * XLOG stuff. 
You might think that we don't need an XLOG record because @@ -4950,7 +5052,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, XLogRecPtr recptr; XLogBeginInsert(); - XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD); + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self); xlrec.xmax = xid; @@ -4971,7 +5073,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, result = TM_Ok; out_locked: - LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); out_unlocked: if (BufferIsValid(vmbuffer)) @@ -4989,6 +5091,9 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, if (have_tuple_lock) UnlockTupleTuplock(relation, tid, mode); + /* Put the target tuple to the slot */ + ExecStorePinnedBufferHeapTuple(tuple, slot, buffer); + return result; } diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 6f8b1b79298..7d6828db403 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -23,6 +23,7 @@ #include "access/heapam.h" #include "access/heaptoast.h" #include "access/multixact.h" +#include "access/reloptions.h" #include "access/rewriteheap.h" #include "access/syncscan.h" #include "access/tableam.h" @@ -46,6 +47,12 @@ #include "utils/builtins.h" #include "utils/rel.h" +static TM_Result heapam_tuple_lock(Relation relation, Datum tid, + Snapshot snapshot, TupleTableSlot *slot, + CommandId cid, LockTupleMode mode, + LockWaitPolicy wait_policy, uint8 flags, + TM_FailureData *tmfd); + static void reform_and_rewrite_tuple(HeapTuple tuple, Relation OldHeap, Relation NewHeap, Datum *values, bool *isnull, RewriteState rwstate); @@ -70,6 +77,20 @@ heapam_slot_callbacks(Relation relation) return &TTSOpsBufferHeapTuple; } +static RowRefType +heapam_get_row_ref_type(Relation rel) +{ + return ROW_REF_TID; +} + +static void +heapam_free_rd_amcache(Relation rel) +{ + if (rel->rd_amcache) + pfree(rel->rd_amcache); + 
rel->rd_amcache = NULL; +} + /* ------------------------------------------------------------------------ * Index Scan Callbacks for heap AM @@ -111,7 +132,7 @@ heapam_index_fetch_end(IndexFetchTableData *scan) static bool heapam_index_fetch_tuple(struct IndexFetchTableData *scan, - ItemPointer tid, + Datum tupleid, Snapshot snapshot, TupleTableSlot *slot, bool *call_again, bool *all_dead) @@ -119,6 +140,7 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan; BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; bool got_heap_tuple; + ItemPointer tid = DatumGetItemPointer(tupleid); Assert(TTS_IS_BUFFERTUPLE(slot)); @@ -179,7 +201,7 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, static bool heapam_fetch_row_version(Relation relation, - ItemPointer tid, + Datum tupleid, Snapshot snapshot, TupleTableSlot *slot) { @@ -188,7 +210,7 @@ heapam_fetch_row_version(Relation relation, Assert(TTS_IS_BUFFERTUPLE(slot)); - bslot->base.tupdata.t_self = *tid; + bslot->base.tupdata.t_self = *DatumGetItemPointer(tupleid); if (heap_fetch(relation, snapshot, &bslot->base.tupdata, &buffer, false)) { /* store in slot, transferring existing pin */ @@ -238,7 +260,7 @@ heapam_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot, * ---------------------------------------------------------------------------- */ -static void +static TupleTableSlot * heapam_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid, int options, BulkInsertState bistate) { @@ -255,6 +277,8 @@ heapam_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid, if (shouldFree) pfree(tuple); + + return slot; } static void @@ -297,36 +321,341 @@ heapam_tuple_complete_speculative(Relation relation, TupleTableSlot *slot, pfree(tuple); } +/* + * ExecCheckTupleVisible -- verify tuple is visible + * + * It would not be consistent with guarantees of the higher isolation levels to + * proceed with avoiding 
insertion (taking speculative insertion's alternative + * path) on the basis of another tuple that is not visible to MVCC snapshot. + * Check for the need to raise a serialization failure, and do so as necessary. + */ +static void +ExecCheckTupleVisible(EState *estate, + Relation rel, + TupleTableSlot *slot) +{ + if (!IsolationUsesXactSnapshot()) + return; + + if (!table_tuple_satisfies_snapshot(rel, slot, estate->es_snapshot)) + { + Datum xminDatum; + TransactionId xmin; + bool isnull; + + xminDatum = slot_getsysattr(slot, MinTransactionIdAttributeNumber, &isnull); + Assert(!isnull); + xmin = DatumGetTransactionId(xminDatum); + + /* + * We should not raise a serialization failure if the conflict is + * against a tuple inserted by our own transaction, even if it's not + * visible to our snapshot. (This would happen, for example, if + * conflicting keys are proposed for insertion in a single command.) + */ + if (!TransactionIdIsCurrentTransactionId(xmin)) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent update"))); + } +} + +/* + * ExecCheckTIDVisible -- convenience variant of ExecCheckTupleVisible() + */ +static void +ExecCheckTIDVisible(EState *estate, + Relation rel, + ItemPointer tid, + TupleTableSlot *tempSlot) +{ + /* Redundantly check isolation level */ + if (!IsolationUsesXactSnapshot()) + return; + + if (!table_tuple_fetch_row_version(rel, PointerGetDatum(tid), + SnapshotAny, tempSlot)) + elog(ERROR, "failed to fetch conflicting tuple for ON CONFLICT"); + ExecCheckTupleVisible(estate, rel, tempSlot); + ExecClearTuple(tempSlot); +} + +static inline TupleTableSlot * +heapam_tuple_insert_with_arbiter(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + CommandId cid, int options, + struct BulkInsertStateData *bistate, + List *arbiterIndexes, + EState *estate, + LockTupleMode lockmode, + TupleTableSlot *lockedSlot, + TupleTableSlot *tempSlot) +{ + Relation rel = 
resultRelInfo->ri_RelationDesc; + uint32 specToken; + ItemPointerData conflictTid; + bool specConflict; + List *recheckIndexes = NIL; + + while (true) + { + specConflict = false; + if (!ExecCheckIndexConstraints(resultRelInfo, slot, estate, &conflictTid, + arbiterIndexes)) + { + if (lockedSlot) + { + TM_Result test; + TM_FailureData tmfd; + Datum xminDatum; + TransactionId xmin; + bool isnull; + + /* Determine lock mode to use */ + lockmode = ExecUpdateLockMode(estate, resultRelInfo); + + /* + * Lock tuple for update. Don't follow updates when tuple cannot be + * locked without doing so. A row locking conflict here means our + * previous conclusion that the tuple is conclusively committed is not + * true anymore. + */ + test = table_tuple_lock(rel, PointerGetDatum(&conflictTid), + estate->es_snapshot, + lockedSlot, estate->es_output_cid, + lockmode, LockWaitBlock, 0, + &tmfd); + switch (test) + { + case TM_Ok: + /* success! */ + break; + + case TM_Invisible: + + /* + * This can occur when a just inserted tuple is updated again in + * the same command. E.g. because multiple rows with the same + * conflicting key values are inserted. + * + * This is somewhat similar to the ExecUpdate() TM_SelfModified + * case. We do not want to proceed because it would lead to the + * same row being updated a second time in some unspecified order, + * and in contrast to plain UPDATEs there's no historical behavior + * to break. + * + * It is the user's responsibility to prevent this situation from + * occurring. These problems are why the SQL standard similarly + * specifies that for SQL MERGE, an exception must be raised in + * the event of an attempt to update the same row twice. 
+ */ + xminDatum = slot_getsysattr(lockedSlot, + MinTransactionIdAttributeNumber, + &isnull); + Assert(!isnull); + xmin = DatumGetTransactionId(xminDatum); + + if (TransactionIdIsCurrentTransactionId(xmin)) + ereport(ERROR, + (errcode(ERRCODE_CARDINALITY_VIOLATION), + /* translator: %s is a SQL command name */ + errmsg("%s command cannot affect row a second time", + "ON CONFLICT DO UPDATE"), + errhint("Ensure that no rows proposed for insertion within the same command have duplicate constrained values."))); + + /* This shouldn't happen */ + elog(ERROR, "attempted to lock invisible tuple"); + break; + + case TM_SelfModified: + + /* + * This state should never be reached. As a dirty snapshot is used + * to find conflicting tuples, speculative insertion wouldn't have + * seen this row to conflict with. + */ + elog(ERROR, "unexpected self-updated tuple"); + break; + + case TM_Updated: + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent update"))); + + /* + * As long as we don't support an UPDATE of INSERT ON CONFLICT for + * a partitioned table we shouldn't reach to a case where tuple to + * be lock is moved to another partition due to concurrent update + * of the partition key. + */ + Assert(!ItemPointerIndicatesMovedPartitions(&tmfd.ctid)); + + /* + * Tell caller to try again from the very start. + * + * It does not make sense to use the usual EvalPlanQual() style + * loop here, as the new version of the row might not conflict + * anymore, or the conflicting tuple has actually been deleted. 
+ */ + ExecClearTuple(lockedSlot); + return false; + + case TM_Deleted: + if (IsolationUsesXactSnapshot()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to concurrent delete"))); + + /* see TM_Updated case */ + Assert(!ItemPointerIndicatesMovedPartitions(&tmfd.ctid)); + ExecClearTuple(lockedSlot); + return false; + + default: + elog(ERROR, "unrecognized table_tuple_lock status: %u", test); + } + + /* Success, the tuple is locked. */ + + /* + * Verify that the tuple is visible to our MVCC snapshot if the current + * isolation level mandates that. + * + * It's not sufficient to rely on the check within ExecUpdate() as e.g. + * CONFLICT ... WHERE clause may prevent us from reaching that. + * + * This means we only ever continue when a new command in the current + * transaction could see the row, even though in READ COMMITTED mode the + * tuple will not be visible according to the current statement's + * snapshot. This is in line with the way UPDATE deals with newer tuple + * versions. + */ + ExecCheckTupleVisible(estate, rel, lockedSlot); + return NULL; + } + else + { + ExecCheckTIDVisible(estate, rel, &conflictTid, tempSlot); + return NULL; + } + } + + /* + * Before we start insertion proper, acquire our "speculative + * insertion lock". Others can use that to wait for us to decide + * if we're going to go ahead with the insertion, instead of + * waiting for the whole transaction to complete. 
+ */ + specToken = SpeculativeInsertionLockAcquire(GetCurrentTransactionId()); + + /* insert the tuple, with the speculative token */ + heapam_tuple_insert_speculative(rel, slot, + estate->es_output_cid, + 0, + NULL, + specToken); + + /* insert index entries for tuple */ + recheckIndexes = ExecInsertIndexTuples(resultRelInfo, + slot, estate, false, true, + &specConflict, + arbiterIndexes, + false); + + /* adjust the tuple's state accordingly */ + heapam_tuple_complete_speculative(rel, slot, + specToken, !specConflict); + + /* + * Wake up anyone waiting for our decision. They will re-check + * the tuple, see that it's no longer speculative, and wait on our + * XID as if this was a regularly inserted tuple all along. Or if + * we killed the tuple, they will see it's dead, and proceed as if + * the tuple never existed. + */ + SpeculativeInsertionLockRelease(GetCurrentTransactionId()); + + /* + * If there was a conflict, start from the beginning. We'll do + * the pre-check again, which will now find the conflicting tuple + * (unless it aborts before we get there). + */ + if (specConflict) + { + list_free(recheckIndexes); + CHECK_FOR_INTERRUPTS(); + continue; + } + + return slot; + } +} + static TM_Result -heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid, - Snapshot snapshot, Snapshot crosscheck, bool wait, - TM_FailureData *tmfd, bool changingPart) +heapam_tuple_delete(Relation relation, Datum tupleid, CommandId cid, + Snapshot snapshot, Snapshot crosscheck, int options, + TM_FailureData *tmfd, bool changingPart, + TupleTableSlot *oldSlot) { + TM_Result result; + ItemPointer tid = DatumGetItemPointer(tupleid); + /* * Currently Deleting of index tuples are handled at vacuum, in case if * the storage itself is cleaning the dead tuples by itself, it is the * time to call the index tuple deletion also. 
*/ - return heap_delete(relation, tid, cid, crosscheck, wait, tmfd, changingPart); + result = heap_delete(relation, tid, cid, crosscheck, options, + tmfd, changingPart, oldSlot); + + /* + * If the tuple has been concurrently updated, then get the lock on it. + * (Do only if caller asked for this by setting the + * TABLE_MODIFY_LOCK_UPDATED option) With the lock held retry of the + * delete should succeed even if there are more concurrent update + * attempts. + */ + if (result == TM_Updated && (options & TABLE_MODIFY_LOCK_UPDATED)) + { + /* + * heapam_tuple_lock() will take advantage of tuple loaded into + * oldSlot by heap_delete(). + */ + result = heapam_tuple_lock(relation, tupleid, snapshot, + oldSlot, cid, LockTupleExclusive, + (options & TABLE_MODIFY_WAIT) ? + LockWaitBlock : + LockWaitSkip, + TUPLE_LOCK_FLAG_FIND_LAST_VERSION, + tmfd); + + if (result == TM_Ok) + return TM_Updated; + } + + return result; } static TM_Result -heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, +heapam_tuple_update(Relation relation, Datum tupleid, TupleTableSlot *slot, CommandId cid, Snapshot snapshot, Snapshot crosscheck, - bool wait, TM_FailureData *tmfd, - LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes) + int options, TM_FailureData *tmfd, + LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes, + TupleTableSlot *oldSlot) { bool shouldFree = true; HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree); TM_Result result; + ItemPointer otid = DatumGetItemPointer(tupleid); /* Update the tuple with table oid */ slot->tts_tableOid = RelationGetRelid(relation); tuple->t_tableOid = slot->tts_tableOid; - result = heap_update(relation, otid, tuple, cid, crosscheck, wait, - tmfd, lockmode, update_indexes); + result = heap_update(relation, otid, tuple, cid, crosscheck, options, + tmfd, lockmode, update_indexes, oldSlot); ItemPointerCopy(&tuple->t_self, &slot->tts_tid); /* @@ -353,19 +682,44 @@ heapam_tuple_update(Relation relation, 
ItemPointer otid, TupleTableSlot *slot, if (shouldFree) pfree(tuple); + /* + * If the tuple has been concurrently updated, then get the lock on it. + * (Do only if caller asked for this by setting the + * TABLE_MODIFY_LOCK_UPDATED option) With the lock held retry of the + * update should succeed even if there are more concurrent update + * attempts. + */ + if (result == TM_Updated && (options & TABLE_MODIFY_LOCK_UPDATED)) + { + /* + * heapam_tuple_lock() will take advantage of tuple loaded into + * oldSlot by heap_update(). + */ + result = heapam_tuple_lock(relation, tupleid, snapshot, + oldSlot, cid, *lockmode, + (options & TABLE_MODIFY_WAIT) ? + LockWaitBlock : + LockWaitSkip, + TUPLE_LOCK_FLAG_FIND_LAST_VERSION, + tmfd); + + if (result == TM_Ok) + return TM_Updated; + } + return result; } static TM_Result -heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, +heapam_tuple_lock(Relation relation, Datum tupleid, Snapshot snapshot, TupleTableSlot *slot, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, uint8 flags, TM_FailureData *tmfd) { BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot; TM_Result result; - Buffer buffer; HeapTuple tuple = &bslot->base.tupdata; + ItemPointer tid = DatumGetItemPointer(tupleid); bool follow_updates; follow_updates = (flags & TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS) != 0; @@ -374,9 +728,8 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, Assert(TTS_IS_BUFFERTUPLE(slot)); tuple_lock_retry: - tuple->t_self = *tid; - result = heap_lock_tuple(relation, tuple, cid, mode, wait_policy, - follow_updates, &buffer, tmfd); + result = heap_lock_tuple(relation, tid, slot, cid, mode, wait_policy, + follow_updates, tmfd); if (result == TM_Updated && (flags & TUPLE_LOCK_FLAG_FIND_LAST_VERSION)) @@ -384,8 +737,6 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, /* Should not encounter speculative tuple on recheck */ 
Assert(!HeapTupleHeaderIsSpeculative(tuple->t_data)); - ReleaseBuffer(buffer); - if (!ItemPointerEquals(&tmfd->ctid, &tuple->t_self)) { SnapshotData SnapshotDirty; @@ -407,6 +758,8 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, InitDirtySnapshot(SnapshotDirty); for (;;) { + Buffer buffer = InvalidBuffer; + if (ItemPointerIndicatesMovedPartitions(tid)) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), @@ -501,7 +854,7 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, /* * This is a live tuple, so try to lock it again. */ - ReleaseBuffer(buffer); + ExecStorePinnedBufferHeapTuple(tuple, slot, buffer); goto tuple_lock_retry; } @@ -512,7 +865,7 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, */ if (tuple->t_data == NULL) { - Assert(!BufferIsValid(buffer)); + ReleaseBuffer(buffer); return TM_Deleted; } @@ -565,9 +918,6 @@ heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, slot->tts_tableOid = RelationGetRelid(relation); tuple->t_tableOid = slot->tts_tableOid; - /* store in slot, transferring existing pin */ - ExecStorePinnedBufferHeapTuple(tuple, slot, buffer); - return result; } @@ -2583,6 +2933,29 @@ SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer, } } +static bool +heapam_tuple_is_current(Relation rel, TupleTableSlot *slot) +{ + Datum xminDatum; + TransactionId xmin; + bool isnull; + + xminDatum = slot_getsysattr(slot, MinTransactionIdAttributeNumber, &isnull); + Assert(!isnull); + xmin = DatumGetTransactionId(xminDatum); + return TransactionIdIsCurrentTransactionId(xmin); +} + +static bytea * +heapam_reloptions(char relkind, Datum reloptions, bool validate) +{ + if (relkind == RELKIND_RELATION || + relkind == RELKIND_TOASTVALUE || + relkind == RELKIND_MATVIEW) + return heap_reloptions(relkind, reloptions, validate); + + return NULL; +} /* ------------------------------------------------------------------------ * Definition of the heap table 
access method. @@ -2593,6 +2966,8 @@ static const TableAmRoutine heapam_methods = { .type = T_TableAmRoutine, .slot_callbacks = heapam_slot_callbacks, + .get_row_ref_type = heapam_get_row_ref_type, + .free_rd_amcache = heapam_free_rd_amcache, .scan_begin = heap_beginscan, .scan_end = heap_endscan, @@ -2612,8 +2987,7 @@ static const TableAmRoutine heapam_methods = { .index_fetch_tuple = heapam_index_fetch_tuple, .tuple_insert = heapam_tuple_insert, - .tuple_insert_speculative = heapam_tuple_insert_speculative, - .tuple_complete_speculative = heapam_tuple_complete_speculative, + .tuple_insert_with_arbiter = heapam_tuple_insert_with_arbiter, .multi_insert = heap_multi_insert, .tuple_delete = heapam_tuple_delete, .tuple_update = heapam_tuple_update, @@ -2645,7 +3019,11 @@ static const TableAmRoutine heapam_methods = { .scan_bitmap_next_block = heapam_scan_bitmap_next_block, .scan_bitmap_next_tuple = heapam_scan_bitmap_next_tuple, .scan_sample_next_block = heapam_scan_sample_next_block, - .scan_sample_next_tuple = heapam_scan_sample_next_tuple + .scan_sample_next_tuple = heapam_scan_sample_next_tuple, + + .tuple_is_current = heapam_tuple_is_current, + + .reloptions = heapam_reloptions }; diff --git a/src/backend/access/index/amapi.c b/src/backend/access/index/amapi.c index 079fb7cba65..a8f1c580acd 100644 --- a/src/backend/access/index/amapi.c +++ b/src/backend/access/index/amapi.c @@ -16,25 +16,27 @@ #include "access/amapi.h" #include "access/htup_details.h" #include "catalog/pg_am.h" +#include "catalog/pg_class.h" +#include "catalog/pg_index.h" #include "catalog/pg_opclass.h" #include "utils/fmgrprotos.h" #include "utils/syscache.h" +IndexAMRoutineHookType IndexAMRoutineHook = NULL; -/* - * GetIndexAmRoutine - call the specified access method handler routine to get - * its IndexAmRoutine struct, which will be palloc'd in the caller's context. - * - * Note that if the amhandler function is built-in, this will not involve - * any catalog access. 
It's therefore safe to use this while bootstrapping - * indexes for the system catalogs. relcache.c relies on that. - */ IndexAmRoutine * -GetIndexAmRoutine(Oid amhandler) +GetIndexAmRoutineWithTableAM(Oid tamoid, Oid amhandler) { Datum datum; IndexAmRoutine *routine; + if (IndexAMRoutineHook != NULL) + { + routine = IndexAMRoutineHook(tamoid, amhandler); + if (routine) + return routine; + } + datum = OidFunctionCall0(amhandler); routine = (IndexAmRoutine *) DatumGetPointer(datum); @@ -45,6 +47,52 @@ GetIndexAmRoutine(Oid amhandler) return routine; } +/* + * GetIndexAmRoutine - call the specified access method handler routine to get + * its IndexAmRoutine struct, which will be palloc'd in the caller's context. + * + * Note that if the amhandler function is built-in, this will not involve + * any catalog access. It's therefore safe to use this while bootstrapping + * indexes for the system catalogs. relcache.c relies on that. + */ +IndexAmRoutine * +GetIndexAmRoutine(Oid amhandler) +{ + return GetIndexAmRoutineExtended(InvalidOid, amhandler); +} + +IndexAmRoutine * +GetIndexAmRoutineExtended(Oid indoid, Oid amhandler) +{ + HeapTuple ht_idx; + HeapTuple ht_tblrel; + Form_pg_index idxrec; + Form_pg_class tblrelrec; + Oid indrelid; + Oid tamoid; + + if (!OidIsValid((indoid)) || indoid < FirstNormalObjectId) + return GetIndexAmRoutineWithTableAM(HEAP_TABLE_AM_OID, amhandler); + + ht_idx = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indoid)); + if (!HeapTupleIsValid(ht_idx)) + elog(ERROR, "cache lookup failed for index %u", indoid); + idxrec = (Form_pg_index) GETSTRUCT(ht_idx); + Assert(indoid == idxrec->indexrelid); + indrelid = idxrec->indrelid; + + ht_tblrel = SearchSysCache1(RELOID, ObjectIdGetDatum(indrelid)); + if (!HeapTupleIsValid(ht_tblrel)) + elog(ERROR, "cache lookup failed for relation %u", indrelid); + tblrelrec = (Form_pg_class) GETSTRUCT(ht_tblrel); + tamoid = tblrelrec->relam; + + ReleaseSysCache(ht_tblrel); + ReleaseSysCache(ht_idx); + + return 
GetIndexAmRoutineWithTableAM(tamoid, amhandler); +} + /* * GetIndexAmRoutineByAmId - look up the handler of the index access method * with the given OID, and get its IndexAmRoutine struct. @@ -53,7 +101,7 @@ GetIndexAmRoutine(Oid amhandler) * noerror is true, else throws error. */ IndexAmRoutine * -GetIndexAmRoutineByAmId(Oid amoid, bool noerror) +GetIndexAmRoutineByAmId(Oid indoid, Oid amoid, bool noerror) { HeapTuple tuple; Form_pg_am amform; @@ -103,7 +151,7 @@ GetIndexAmRoutineByAmId(Oid amoid, bool noerror) ReleaseSysCache(tuple); /* And finally, call the handler function to get the API struct. */ - return GetIndexAmRoutine(amhandler); + return GetIndexAmRoutineExtended(indoid, amhandler); } @@ -129,7 +177,7 @@ amvalidate(PG_FUNCTION_ARGS) ReleaseSysCache(classtup); - amroutine = GetIndexAmRoutineByAmId(amoid, false); + amroutine = GetIndexAmRoutineByAmId(InvalidOid, amoid, false); if (amroutine->amvalidate == NULL) elog(ERROR, "function amvalidate is not defined for index access method %u", diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index de751e8e4a3..e162df6dfd1 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -101,6 +101,7 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->orderByData = NULL; scan->xs_want_itup = false; /* may be set later */ + scan->xs_want_rowid = false; /* may be set later */ /* * During recovery we ignore killed tuples and don't bother to kill them @@ -122,6 +123,7 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->xs_itupdesc = NULL; scan->xs_hitup = NULL; scan->xs_hitupdesc = NULL; + scan->xs_rowid.isnull = true; return scan; } diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index dcd04b813d8..4668d7159ae 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -213,24 +213,39 @@ bool index_insert(Relation indexRelation, Datum 
*values, bool *isnull, - ItemPointer heap_t_ctid, + ItemPointer tupleid, Relation heapRelation, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) { RELATION_CHECKS; - CHECK_REL_PROCEDURE(aminsert); + + if (indexRelation->rd_indam->aminsertextended == NULL && indexRelation->rd_indam->aminsert == NULL ) + elog(ERROR, "at least one function aminsert or aminsertextended should be defined for index \"%s\"", \ + RelationGetRelationName(indexRelation)); if (!(indexRelation->rd_indam->ampredlocks)) CheckForSerializableConflictIn(indexRelation, (ItemPointer) NULL, InvalidBlockNumber); - return indexRelation->rd_indam->aminsert(indexRelation, values, isnull, - heap_t_ctid, heapRelation, + if (indexRelation->rd_indam->aminsert) + { + /* compatibility method for extension AM's not aware of aminsertextended */ + return indexRelation->rd_indam->aminsert(indexRelation, values, isnull, + tupleid, heapRelation, + checkUnique, indexUnchanged, + indexInfo); + } + else + { + /* index insert method for internal AM's and Orioledb that are aware of aminsertextended */ + return indexRelation->rd_indam->aminsertextended(indexRelation, values, isnull, + ItemPointerGetDatum(tupleid), heapRelation, checkUnique, indexUnchanged, indexInfo); + } } /* ------------------------- @@ -247,6 +262,66 @@ index_insert_cleanup(Relation indexRelation, indexRelation->rd_indam->aminsertcleanup(indexRelation, indexInfo); } +/* ---------------- + * index_update - update an index tuple in a relation + * ---------------- + */ +bool +index_update(Relation indexRelation, + bool new_valid, + bool old_valid, + Datum *values, + bool *isnull, + Datum tupleid, + Datum *valuesOld, + bool *isnullOld, + Datum oldTupleid, + Relation heapRelation, + IndexUniqueCheck checkUnique, + IndexInfo *indexInfo) +{ + RELATION_CHECKS; + CHECK_REL_PROCEDURE(amupdate); + + if (!(indexRelation->rd_indam->ampredlocks)) + CheckForSerializableConflictIn(indexRelation, + (ItemPointer) NULL, + InvalidBlockNumber); + + 
return indexRelation->rd_indam->amupdate(indexRelation, + new_valid, old_valid, + values, isnull, tupleid, + valuesOld, isnullOld, oldTupleid, + heapRelation, + checkUnique, + indexInfo); +} + + +/* ---------------- + * index_delete - delete an index tuple from a relation + * ---------------- + */ +bool +index_delete(Relation indexRelation, + Datum *values, bool *isnull, Datum tupleid, + Relation heapRelation, + IndexInfo *indexInfo) +{ + RELATION_CHECKS; + CHECK_REL_PROCEDURE(amdelete); + + if (!(indexRelation->rd_indam->ampredlocks)) + CheckForSerializableConflictIn(indexRelation, + (ItemPointer) NULL, + InvalidBlockNumber); + + return indexRelation->rd_indam->amdelete(indexRelation, + values, isnull, tupleid, + heapRelation, + indexInfo); +} + /* * index_beginscan - start a scan of an index with amgettuple * @@ -610,6 +685,55 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) return &scan->xs_heaptid; } +/* ---------------- + * index_getnext_rowid - get the next ROWID from a scan + * + * The result is the next ROWID satisfying the scan keys, + * or isnull if no more matching tuples exist. + * ---------------- + */ +NullableDatum +index_getnext_rowid(IndexScanDesc scan, ScanDirection direction) +{ + NullableDatum result; + bool found; + + SCAN_CHECKS; + CHECK_SCAN_PROCEDURE(amgettuple); + + /* XXX: we should assert that a snapshot is pushed or registered */ + Assert(TransactionIdIsValid(RecentXmin)); + + /* + * The AM's amgettuple proc finds the next index entry matching the scan + * keys, and puts the TID into scan->xs_heaptid. It should also set + * scan->xs_recheck and possibly scan->xs_itup/scan->xs_hitup, though we + * pay no attention to those fields here. 
+ */ + found = scan->indexRelation->rd_indam->amgettuple(scan, direction); + + /* Reset kill flag immediately for safety */ + scan->kill_prior_tuple = false; + scan->xs_heap_continue = false; + + /* If we're out of index entries, we're done */ + if (!found) + { + /* release resources (like buffer pins) from table accesses */ + if (scan->xs_heapfetch) + table_index_fetch_reset(scan->xs_heapfetch); + + result.isnull = true; + return result; + } + /* Assert(RowidIsValid(&scan->xs_rowid)); */ + + pgstat_count_index_tuples(scan->indexRelation, 1); + + /* Return the ROWID of the tuple we found. */ + return scan->xs_rowid; +} + /* ---------------- * index_fetch_heap - get the scan's next heap tuple * @@ -633,8 +757,17 @@ index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot) { bool all_dead = false; bool found; + Datum tupleid; + + if (scan->xs_want_rowid) + { + Assert(!scan->xs_rowid.isnull); + tupleid = scan->xs_rowid.value; + } + else + tupleid = PointerGetDatum(&scan->xs_heaptid); - found = table_index_fetch_tuple(scan->xs_heapfetch, &scan->xs_heaptid, + found = table_index_fetch_tuple(scan->xs_heapfetch, tupleid, scan->xs_snapshot, slot, &scan->xs_heap_continue, &all_dead); @@ -676,16 +809,30 @@ index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot * { if (!scan->xs_heap_continue) { - ItemPointer tid; + if (scan->xs_want_rowid) + { + NullableDatum rowid; + /* Time to fetch the next TID from the index */ + rowid = index_getnext_rowid(scan, direction); - /* Time to fetch the next TID from the index */ - tid = index_getnext_tid(scan, direction); + /* If we're out of index entries, we're done */ + if (rowid.isnull) + break; - /* If we're out of index entries, we're done */ - if (tid == NULL) - break; + /* Assert(RowidEquals(rowid, &scan->xs_rowid)); */ + } + else + { + ItemPointer tid; + /* Time to fetch the next TID from the index */ + tid = index_getnext_tid(scan, direction); - Assert(ItemPointerEquals(tid, &scan->xs_heaptid)); + /* If 
we're out of index entries, we're done */ + if (tid == NULL) + break; + + Assert(ItemPointerEquals(tid, &scan->xs_heaptid)); + } } /* @@ -693,7 +840,8 @@ index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot * * If we don't find anything, loop around and grab the next TID from * the index. */ - Assert(ItemPointerIsValid(&scan->xs_heaptid)); + if (!scan->xs_want_rowid) + Assert(ItemPointerIsValid(&scan->xs_heaptid)); if (index_fetch_heap(scan, slot)) return true; } diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 59155a7bea6..b661adb689e 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -127,7 +127,8 @@ bthandler(PG_FUNCTION_ARGS) amroutine->ambuild = btbuild; amroutine->ambuildempty = btbuildempty; - amroutine->aminsert = btinsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = btinsert; amroutine->aminsertcleanup = NULL; amroutine->ambulkdelete = btbulkdelete; amroutine->amvacuumcleanup = btvacuumcleanup; @@ -180,13 +181,14 @@ btbuildempty(Relation index) */ bool btinsert(Relation rel, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) { bool result; IndexTuple itup; + ItemPointer ht_ctid = DatumGetItemPointer(tupleid); /* generate an index tuple */ itup = index_form_tuple(RelationGetDescr(rel), values, isnull); diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index c22ccec789d..55ff1fdbfaf 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -73,7 +73,6 @@ static int _bt_binsrch_array_skey(FmgrInfo *orderproc, Datum tupdatum, bool tupnull, BTArrayKeyInfo *array, ScanKey cur, int32 *set_elem_result); -static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir); static void 
_bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir); static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, IndexTuple tuple, TupleDesc tupdesc, int tupnatts, @@ -1377,7 +1376,7 @@ _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir) * On false result, the scankeys stay the same, and the array keys are not * advanced (every array remains at its final element for scan direction). */ -static bool +bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 1bec19c2b88..57004e79f54 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -181,7 +181,7 @@ spgbuildempty(Relation index) */ bool spginsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) @@ -189,6 +189,7 @@ spginsert(Relation index, Datum *values, bool *isnull, SpGistState spgstate; MemoryContext oldCtx; MemoryContext insertCtx; + ItemPointer ht_ctid = DatumGetItemPointer(tupleid); insertCtx = AllocSetContextCreate(CurrentMemoryContext, "SP-GiST insert temporary context", diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index 76b80146ff0..c1228ed2c01 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -70,7 +70,8 @@ spghandler(PG_FUNCTION_ARGS) amroutine->ambuild = spgbuild; amroutine->ambuildempty = spgbuildempty; - amroutine->aminsert = spginsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = spginsert; amroutine->aminsertcleanup = NULL; amroutine->ambulkdelete = spgbulkdelete; amroutine->amvacuumcleanup = spgvacuumcleanup; diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 
e57a0b7ea31..8168bb78021 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -217,7 +217,7 @@ table_index_fetch_tuple_check(Relation rel, slot = table_slot_create(rel, NULL); scan = table_index_fetch_begin(rel); - found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again, + found = table_index_fetch_tuple(scan, PointerGetDatum(tid), snapshot, slot, &call_again, all_dead); table_index_fetch_end(scan); ExecDropSingleTupleTableSlot(slot); @@ -287,16 +287,23 @@ simple_table_tuple_insert(Relation rel, TupleTableSlot *slot) * via ereport(). */ void -simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot) +simple_table_tuple_delete(Relation rel, Datum tupleid, Snapshot snapshot, + TupleTableSlot *oldSlot) { TM_Result result; TM_FailureData tmfd; + int options = TABLE_MODIFY_WAIT; /* wait for commit */ - result = table_tuple_delete(rel, tid, + /* Fetch old tuple if the relevant slot is provided */ + if (oldSlot) + options |= TABLE_MODIFY_FETCH_OLD_TUPLE; + + result = table_tuple_delete(rel, tupleid, GetCurrentCommandId(true), snapshot, InvalidSnapshot, - true /* wait for commit */ , - &tmfd, false /* changingPart */ ); + options, + &tmfd, false /* changingPart */ , + oldSlot); switch (result) { @@ -332,20 +339,27 @@ simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot) * via ereport(). 
*/ void -simple_table_tuple_update(Relation rel, ItemPointer otid, +simple_table_tuple_update(Relation rel, Datum tupleid, TupleTableSlot *slot, Snapshot snapshot, - TU_UpdateIndexes *update_indexes) + TU_UpdateIndexes *update_indexes, + TupleTableSlot *oldSlot) { TM_Result result; TM_FailureData tmfd; LockTupleMode lockmode; + int options = TABLE_MODIFY_WAIT; /* wait for commit */ + + /* Fetch old tuple if the relevant slot is provided */ + if (oldSlot) + options |= TABLE_MODIFY_FETCH_OLD_TUPLE; - result = table_tuple_update(rel, otid, slot, + result = table_tuple_update(rel, tupleid, slot, GetCurrentCommandId(true), snapshot, InvalidSnapshot, - true /* wait for commit */ , - &tmfd, &lockmode, update_indexes); + options, + &tmfd, &lockmode, update_indexes, + oldSlot); switch (result) { diff --git a/src/backend/access/table/tableamapi.c b/src/backend/access/table/tableamapi.c index e9b598256fb..cd01bd9934f 100644 --- a/src/backend/access/table/tableamapi.c +++ b/src/backend/access/table/tableamapi.c @@ -13,10 +13,11 @@ #include "access/tableam.h" #include "access/xact.h" +#include "catalog/pg_am.h" #include "commands/defrem.h" #include "miscadmin.h" #include "utils/guc_hooks.h" - +#include "utils/syscache.h" /* * GetTableAmRoutine @@ -68,8 +69,7 @@ GetTableAmRoutine(Oid amhandler) * Could be made optional, but would require throwing error during * parse-analysis. 
*/ - Assert(routine->tuple_insert_speculative != NULL); - Assert(routine->tuple_complete_speculative != NULL); + Assert(routine->tuple_insert_with_arbiter != NULL); Assert(routine->multi_insert != NULL); Assert(routine->tuple_delete != NULL); @@ -97,9 +97,29 @@ GetTableAmRoutine(Oid amhandler) Assert(routine->scan_sample_next_block != NULL); Assert(routine->scan_sample_next_tuple != NULL); + Assert(routine->tuple_is_current != NULL); + return routine; } +const TableAmRoutine * +GetTableAmRoutineByAmOid(Oid amoid) +{ + HeapTuple ht_am; + Form_pg_am amrec; + const TableAmRoutine *tableam = NULL; + + ht_am = SearchSysCache1(AMOID, ObjectIdGetDatum(amoid)); + if (!HeapTupleIsValid(ht_am)) + elog(ERROR, "cache lookup failed for access method %u", + amoid); + amrec = (Form_pg_am)GETSTRUCT(ht_am); + + tableam = GetTableAmRoutine(amrec->amhandler); + ReleaseSysCache(ht_am); + return tableam; +} + /* check_hook: validate new default_table_access_method */ bool check_default_table_access_method(char **newval, void **extra, GucSource source) diff --git a/src/backend/access/table/toast_helper.c b/src/backend/access/table/toast_helper.c index 53224932f0d..a0738622657 100644 --- a/src/backend/access/table/toast_helper.c +++ b/src/backend/access/table/toast_helper.c @@ -71,10 +71,10 @@ toast_tuple_init(ToastTupleContext *ttc) * we have to delete it later. 
*/ if (att->attlen == -1 && !ttc->ttc_oldisnull[i] && - VARATT_IS_EXTERNAL_ONDISK(old_value)) + (VARATT_IS_EXTERNAL_ONDISK(old_value) || VARATT_IS_EXTERNAL_ORIOLEDB(old_value))) { if (ttc->ttc_isnull[i] || - !VARATT_IS_EXTERNAL_ONDISK(new_value) || + !(VARATT_IS_EXTERNAL_ONDISK(new_value) || VARATT_IS_EXTERNAL_ORIOLEDB(new_value)) || memcmp((char *) old_value, (char *) new_value, VARSIZE_EXTERNAL(old_value)) != 0) { @@ -330,7 +330,7 @@ toast_delete_external(Relation rel, const Datum *values, const bool *isnull, if (isnull[i]) continue; - else if (VARATT_IS_EXTERNAL_ONDISK(value)) + else if (VARATT_IS_EXTERNAL_ONDISK(value) || VARATT_IS_EXTERNAL_ORIOLEDB(value)) toast_delete_datum(rel, value, is_speculative); } } diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c index 75b5325df8b..95647a357ea 100644 --- a/src/backend/access/transam/transam.c +++ b/src/backend/access/transam/transam.c @@ -22,6 +22,7 @@ #include "access/clog.h" #include "access/subtrans.h" #include "access/transam.h" +#include "storage/proc.h" #include "utils/snapmgr.h" /* diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 4cecf630060..198f1b403c5 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -212,6 +212,7 @@ typedef struct TransactionStateData bool parallelChildXact; /* is any parent transaction parallel? */ bool chain; /* start a new block after this one */ bool topXidLogged; /* for a subxact: is top-level XID logged? 
*/ + CommitSeqNo csn; struct TransactionStateData *parent; /* back link to parent */ } TransactionStateData; @@ -245,6 +246,7 @@ static TransactionStateData TopTransactionStateData = { .state = TRANS_DEFAULT, .blockState = TBLOCK_DEFAULT, .topXidLogged = false, + .csn = COMMITSEQNO_INPROGRESS }; /* @@ -323,6 +325,7 @@ typedef struct SubXactCallbackItem static SubXactCallbackItem *SubXact_callbacks = NULL; +xact_redo_hook_type xact_redo_hook = NULL; /* local function prototypes */ static void AssignTransactionId(TransactionState s); @@ -2035,6 +2038,7 @@ StartTransaction(void) */ s->state = TRANS_START; s->fullTransactionId = InvalidFullTransactionId; /* until assigned */ + s->csn = COMMITSEQNO_INPROGRESS; /* Determine if statements are logged in this transaction */ xact_is_sampled = log_xact_sample_rate != 0 && @@ -2336,7 +2340,9 @@ CommitTransaction(void) * must be done _before_ releasing locks we hold and _after_ * RecordTransactionCommit. */ + MyProc->lastCommittedCSN = s->csn; ProcArrayEndTransaction(MyProc, latestXid); + s->csn = MyProc->lastCommittedCSN; /* * This is all post-commit cleanup. Note that if an error is raised here, @@ -2770,6 +2776,7 @@ AbortTransaction(void) * while cleaning up! */ LWLockReleaseAll(); + CustomErrorCleanup(); /* Clear wait information and command progress indicator */ pgstat_report_wait_end(); @@ -5180,6 +5187,7 @@ AbortSubTransaction(void) * Buffer locks, for example? I don't think so but I'm not sure. 
*/ LWLockReleaseAll(); + CustomErrorCleanup(); pgstat_report_wait_end(); pgstat_progress_end_command(); @@ -6073,6 +6081,9 @@ xact_redo_commit(xl_xact_parsed_commit *parsed, TransactionId max_xid; TimestampTz commit_time; + if (xact_redo_hook) + xact_redo_hook(xid, lsn); + Assert(TransactionIdIsValid(xid)); max_xid = TransactionIdLatest(xid, parsed->nsubxacts, parsed->subxacts); @@ -6382,3 +6393,9 @@ xact_redo(XLogReaderState *record) else elog(PANIC, "xact_redo: unknown op code %u", info); } + +CommitSeqNo +GetCurrentCSN(void) +{ + return TopTransactionStateData.csn; +} diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 7f136026277..6e12db59c9c 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -135,6 +135,7 @@ int wal_retrieve_retry_interval = 5000; int max_slot_wal_keep_size_mb = -1; int wal_decode_buffer_size = 512 * 1024; bool track_wal_io_timing = false; +CommitSeqNo startupCommitSeqNo = COMMITSEQNO_FIRST_NORMAL + 1; #ifdef WAL_DEBUG bool XLOG_DEBUG = false; @@ -142,6 +143,11 @@ bool XLOG_DEBUG = false; int wal_segment_size = DEFAULT_XLOG_SEG_SIZE; +/* Hook for plugins to get control in CheckPointGuts() */ +CheckPoint_hook_type CheckPoint_hook = NULL; +double CheckPointProgress; +after_checkpoint_cleanup_hook_type after_checkpoint_cleanup_hook = NULL; + /* * Number of WAL insertion locks to use. 
A higher value allows more insertions * to happen concurrently, but adds some CPU overhead to flushing the WAL, @@ -5068,6 +5074,7 @@ BootStrapXLOG(void) TransamVariables->nextXid = checkPoint.nextXid; TransamVariables->nextOid = checkPoint.nextOid; TransamVariables->oidCount = 0; + pg_atomic_write_u64(&TransamVariables->nextCommitSeqNo, COMMITSEQNO_FIRST_NORMAL + 1); MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); AdvanceOldestClogXid(checkPoint.oldestXid); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); @@ -5415,6 +5422,7 @@ StartupXLOG(void) XLogRecPtr missingContrecPtr; TransactionId oldestActiveXID; bool promoted = false; + bool wasInRecovery; /* * We should have an aux process resource owner to use, and we should not @@ -5544,6 +5552,7 @@ StartupXLOG(void) TransamVariables->nextXid = checkPoint.nextXid; TransamVariables->nextOid = checkPoint.nextOid; TransamVariables->oidCount = 0; + pg_atomic_write_u64(&TransamVariables->nextCommitSeqNo, startupCommitSeqNo); MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); AdvanceOldestClogXid(checkPoint.oldestXid); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); @@ -6042,6 +6051,8 @@ StartupXLOG(void) */ PreallocXlogFiles(EndOfLog, newTLI); + wasInRecovery = InRecovery; + /* * Okay, we're officially UP. */ @@ -6120,6 +6131,9 @@ StartupXLOG(void) */ CompleteCommitTsInitialization(); + if (wasInRecovery && after_checkpoint_cleanup_hook) + after_checkpoint_cleanup_hook(EndOfLog, 0); + /* * All done with end-of-recovery actions. * @@ -7315,6 +7329,9 @@ CreateCheckPoint(int flags) if (!RecoveryInProgress()) TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning()); + if (after_checkpoint_cleanup_hook) + after_checkpoint_cleanup_hook(ProcLastRecPtr, flags); + /* Real work is done; log and update stats. 
*/ LogCheckpointEnd(false); @@ -7490,6 +7507,9 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags) CheckPointPredicate(); CheckPointBuffers(flags); + if (CheckPoint_hook) + CheckPoint_hook(checkPointRedo, flags); + /* Perform all queued up fsyncs */ TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START(); CheckpointStats.ckpt_sync_t = GetCurrentTimestamp(); @@ -9068,6 +9088,19 @@ get_backup_status(void) return sessionBackupState; } +/* + * Check if there is a backup in progress. + * + * We do this check without lock assuming 32-bit reads are atomic. In fact, + * the false result means that there was at least a moment of time when there + * were no backups. + */ +bool +have_backup_in_progress(void) +{ + return (XLogCtl->Insert.runningBackups > 0); +} + /* * do_pg_backup_stop * @@ -9475,3 +9508,5 @@ SetWalWriterSleeping(bool sleeping) XLogCtl->WalWriterSleeping = sleeping; SpinLockRelease(&XLogCtl->info_lck); } + +void (*RedoShutdownHook) (void) = NULL; diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index b45b8331720..a3e7fa810f8 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -1856,6 +1856,8 @@ PerformWalRecovery(void) * exit with special return code to request shutdown of * postmaster. Log messages issued from postmaster. 
*/ + if (RedoShutdownHook != NULL) + RedoShutdownHook(); proc_exit(3); case RECOVERY_TARGET_ACTION_PAUSE: diff --git a/src/backend/catalog/aclchk.c b/src/backend/catalog/aclchk.c index a44ccee3b68..043303bc2e3 100644 --- a/src/backend/catalog/aclchk.c +++ b/src/backend/catalog/aclchk.c @@ -1638,7 +1638,7 @@ expand_all_col_privileges(Oid table_oid, Form_pg_class classForm, AttrNumber curr_att; Assert(classForm->relnatts - FirstLowInvalidHeapAttributeNumber < num_col_privileges); - for (curr_att = FirstLowInvalidHeapAttributeNumber + 1; + for (curr_att = FirstLowInvalidHeapAttributeNumber + 2; curr_att <= classForm->relnatts; curr_att++) { diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index 0489cbabcb8..b3873fbd2ac 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -186,6 +186,7 @@ deleteObjectsInList(ObjectAddresses *targetObjects, Relation *depRel, int flags) { int i; + bool *depends_on_relation; /* * Keep track of objects for event triggers, if necessary. @@ -213,6 +214,33 @@ deleteObjectsInList(ObjectAddresses *targetObjects, Relation *depRel, } } + depends_on_relation = palloc0(sizeof(bool) * targetObjects->numrefs); + + for (i = targetObjects->numrefs - 1; i >= 0; i--) + { + ObjectAddressExtra *thisextra = targetObjects->extras + i; + int j; + + if (thisextra->dependee.classId == RelationRelationId && + thisextra->dependee.objectSubId == 0) + { + depends_on_relation[i] = true; + continue; + } + + for (j = i + 1; j < targetObjects->numrefs; j++) + { + ObjectAddress *depobj = targetObjects->refs + j; + if (depobj->classId == thisextra->dependee.classId && + depobj->objectId == thisextra->dependee.objectId && + depobj->objectSubId == thisextra->dependee.objectSubId) + { + depends_on_relation[i] = depends_on_relation[j]; + break; + } + } + } + /* * Delete all the objects in the proper order, except that if told to, we * should skip the original object(s). 
@@ -221,13 +249,19 @@ deleteObjectsInList(ObjectAddresses *targetObjects, Relation *depRel, { ObjectAddress *thisobj = targetObjects->refs + i; ObjectAddressExtra *thisextra = targetObjects->extras + i; + int temp_flags = flags; if ((flags & PERFORM_DELETION_SKIP_ORIGINAL) && (thisextra->flags & DEPFLAG_ORIGINAL)) continue; - deleteOneObject(thisobj, depRel, flags); + if (depends_on_relation[i]) + temp_flags |= PERFORM_DELETION_OF_RELATION; + + deleteOneObject(thisobj, depRel, temp_flags); } + + pfree(depends_on_relation); } /* diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index a819b4197ce..92211c04d57 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -119,9 +119,6 @@ static void UpdateIndexRelation(Oid indexoid, Oid heapoid, bool immediate, bool isvalid, bool isready); -static void index_update_stats(Relation rel, - bool hasindex, - double reltuples); static void IndexCheckExclusion(Relation heapRelation, Relation indexRelation, IndexInfo *indexInfo); @@ -295,7 +292,7 @@ ConstructTupleDescriptor(Relation heapRelation, int i; /* We need access to the index AM's API struct */ - amroutine = GetIndexAmRoutineByAmId(accessMethodId, false); + amroutine = GetIndexAmRoutineByAmId(InvalidOid, accessMethodId, false); /* ... and to the table's tuple descriptor */ heapTupDesc = RelationGetDescr(heapRelation); @@ -2651,9 +2648,6 @@ BuildSpeculativeIndexInfo(Relation index, IndexInfo *ii) */ Assert(ii->ii_Unique); - if (index->rd_rel->relam != BTREE_AM_OID) - elog(ERROR, "unexpected non-btree speculative unique index"); - ii->ii_UniqueOps = (Oid *) palloc(sizeof(Oid) * indnkeyatts); ii->ii_UniqueProcs = (Oid *) palloc(sizeof(Oid) * indnkeyatts); ii->ii_UniqueStrats = (uint16 *) palloc(sizeof(uint16) * indnkeyatts); @@ -2777,7 +2771,7 @@ FormIndexDatum(IndexInfo *indexInfo, * index. When updating an index, it's important because some index AMs * expect a relcache flush to occur after REINDEX. 
*/ -static void +void index_update_stats(Relation rel, bool hasindex, double reltuples) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index c590a2adc35..f63faedfcfb 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -87,9 +87,6 @@ static void compute_index_stats(Relation onerel, double totalrows, MemoryContext col_context); static VacAttrStats *examine_attribute(Relation onerel, int attnum, Node *index_expr); -static int acquire_sample_rows(Relation onerel, int elevel, - HeapTuple *rows, int targrows, - double *totalrows, double *totaldeadrows); static int compare_rows(const void *a, const void *b, void *arg); static int acquire_inherited_sample_rows(Relation onerel, int elevel, HeapTuple *rows, int targrows, @@ -190,10 +187,7 @@ analyze_rel(Oid relid, RangeVar *relation, if (onerel->rd_rel->relkind == RELKIND_RELATION || onerel->rd_rel->relkind == RELKIND_MATVIEW) { - /* Regular table, so we'll use the regular row acquisition function */ - acquirefunc = acquire_sample_rows; - /* Also get regular table's size */ - relpages = RelationGetNumberOfBlocks(onerel); + table_analyze(onerel, &acquirefunc, &relpages); } else if (onerel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) { @@ -1154,7 +1148,7 @@ block_sampling_read_stream_next(ReadStream *stream, * block. The previous sampling method put too much credence in the row * density near the start of the table. 
*/ -static int +int acquire_sample_rows(Relation onerel, int elevel, HeapTuple *rows, int targrows, double *totalrows, double *totaldeadrows) @@ -1421,9 +1415,7 @@ acquire_inherited_sample_rows(Relation onerel, int elevel, if (childrel->rd_rel->relkind == RELKIND_RELATION || childrel->rd_rel->relkind == RELKIND_MATVIEW) { - /* Regular table, so use the regular row acquisition function */ - acquirefunc = acquire_sample_rows; - relpages = RelationGetNumberOfBlocks(childrel); + table_analyze(childrel, &acquirefunc, &relpages); } else if (childrel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) { diff --git a/src/backend/commands/constraint.c b/src/backend/commands/constraint.c index f7dc42f7452..ea5a1f365b1 100644 --- a/src/backend/commands/constraint.c +++ b/src/backend/commands/constraint.c @@ -109,7 +109,7 @@ unique_key_recheck(PG_FUNCTION_ARGS) IndexFetchTableData *scan = table_index_fetch_begin(trigdata->tg_relation); bool call_again = false; - if (!table_index_fetch_tuple(scan, &tmptid, SnapshotSelf, slot, + if (!table_index_fetch_tuple(scan, PointerGetDatum(&tmptid), SnapshotSelf, slot, &call_again, NULL)) { /* diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 8086607710e..e6c989aea19 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -81,9 +81,6 @@ static void report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es); static double elapsed_time(instr_time *starttime); static bool ExplainPreScanNode(PlanState *planstate, Bitmapset **rels_used); -static void ExplainNode(PlanState *planstate, List *ancestors, - const char *relationship, const char *plan_name, - ExplainState *es); static void show_plan_tlist(PlanState *planstate, List *ancestors, ExplainState *es); static void show_expression(Node *node, const char *qlabel, @@ -92,9 +89,6 @@ static void show_expression(Node *node, const char *qlabel, static void show_qual(List *qual, const char *qlabel, PlanState *planstate, List 
*ancestors, bool useprefix, ExplainState *es); -static void show_scan_qual(List *qual, const char *qlabel, - PlanState *planstate, List *ancestors, - ExplainState *es); static void show_upper_qual(List *qual, const char *qlabel, PlanState *planstate, List *ancestors, ExplainState *es); @@ -131,8 +125,6 @@ static void show_memoize_info(MemoizeState *mstate, List *ancestors, static void show_hashagg_info(AggState *aggstate, ExplainState *es); static void show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es); -static void show_instrumentation_count(const char *qlabel, int which, - PlanState *planstate, ExplainState *es); static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es); static const char *explain_get_index_name(Oid indexId); static bool peek_buffer_usage(ExplainState *es, const BufferUsage *usage); @@ -1363,7 +1355,7 @@ ExplainPreScanNode(PlanState *planstate, Bitmapset **rels_used) * to the nesting depth of logical output groups, and therefore is controlled * by ExplainOpenGroup/ExplainCloseGroup. 
*/ -static void +void ExplainNode(PlanState *planstate, List *ancestors, const char *relationship, const char *plan_name, ExplainState *es) @@ -2527,7 +2519,7 @@ show_qual(List *qual, const char *qlabel, /* * Show a qualifier expression for a scan plan node */ -static void +void show_scan_qual(List *qual, const char *qlabel, PlanState *planstate, List *ancestors, ExplainState *es) @@ -3618,7 +3610,7 @@ show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es) * * "which" identifies which instrumentation counter to print */ -static void +void show_instrumentation_count(const char *qlabel, int which, PlanState *planstate, ExplainState *es) { diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index b987e023849..c8a926c0463 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -71,6 +71,7 @@ #include "utils/snapmgr.h" #include "utils/syscache.h" +GetDefaultOpClass_hook_type GetDefaultOpClass_hook = NULL; /* non-export function prototypes */ static bool CompareOpclassOptions(const Datum *opts1, const Datum *opts2, int natts); @@ -91,11 +92,7 @@ static void ComputeIndexAttrs(IndexInfo *indexInfo, Oid ddl_userid, int ddl_sec_context, int *ddl_save_nestlevel); -static char *ChooseIndexName(const char *tabname, Oid namespaceId, - const List *colnames, const List *exclusionOpNames, - bool primary, bool isconstraint); static char *ChooseIndexNameAddition(const List *colnames); -static List *ChooseIndexColumnNames(const List *indexElems); static void ReindexIndex(const ReindexStmt *stmt, const ReindexParams *params, bool isTopLevel); static void RangeVarCallbackForReindexIndex(const RangeVar *relation, @@ -223,7 +220,7 @@ CheckIndexCompatible(Oid oldId, accessMethodName))); accessMethodForm = (Form_pg_am) GETSTRUCT(tuple); accessMethodId = accessMethodForm->oid; - amRoutine = GetIndexAmRoutine(accessMethodForm->amhandler); + amRoutine = GetIndexAmRoutineExtended(oldId, accessMethodForm->amhandler); 
ReleaseSysCache(tuple); amcanorder = amRoutine->amcanorder; @@ -844,7 +841,7 @@ DefineIndex(Oid tableId, } accessMethodForm = (Form_pg_am) GETSTRUCT(tuple); accessMethodId = accessMethodForm->oid; - amRoutine = GetIndexAmRoutine(accessMethodForm->amhandler); + amRoutine = GetIndexAmRoutineWithTableAM(rel->rd_rel->relam, accessMethodForm->amhandler); pgstat_progress_update_param(PROGRESS_CREATEIDX_ACCESS_METHOD_OID, accessMethodId); @@ -2317,6 +2314,9 @@ GetDefaultOpClass(Oid type_id, Oid am_id) /* If it's a domain, look at the base type instead */ type_id = getBaseType(type_id); + if (GetDefaultOpClass_hook) + return GetDefaultOpClass_hook(type_id, am_id); + tcategory = TypeCategory(type_id); /* @@ -2532,7 +2532,7 @@ ChooseRelationName(const char *name1, const char *name2, * * The argument list is pretty ad-hoc :-( */ -static char * +char * ChooseIndexName(const char *tabname, Oid namespaceId, const List *colnames, const List *exclusionOpNames, bool primary, bool isconstraint) @@ -2621,7 +2621,7 @@ ChooseIndexNameAddition(const List *colnames) * * Returns a List of plain strings (char *, not String nodes). 
*/ -static List * +List * ChooseIndexColumnNames(const List *indexElems) { List *result = NIL; diff --git a/src/backend/commands/opclasscmds.c b/src/backend/commands/opclasscmds.c index b8b5c147c5d..fe91b816c32 100644 --- a/src/backend/commands/opclasscmds.c +++ b/src/backend/commands/opclasscmds.c @@ -42,6 +42,7 @@ #include "parser/parse_oper.h" #include "parser/parse_type.h" #include "utils/acl.h" +#include "postgres_ext.h" #include "utils/builtins.h" #include "utils/fmgroids.h" #include "utils/lsyscache.h" @@ -376,7 +377,7 @@ DefineOpClass(CreateOpClassStmt *stmt) amform = (Form_pg_am) GETSTRUCT(tup); amoid = amform->oid; - amroutine = GetIndexAmRoutineByAmId(amoid, false); + amroutine = GetIndexAmRoutineByAmId(InvalidOid, amoid, false); ReleaseSysCache(tup); maxOpNumber = amroutine->amstrategies; @@ -834,7 +835,7 @@ AlterOpFamily(AlterOpFamilyStmt *stmt) amform = (Form_pg_am) GETSTRUCT(tup); amoid = amform->oid; - amroutine = GetIndexAmRoutineByAmId(amoid, false); + amroutine = GetIndexAmRoutineByAmId(InvalidOid, amoid, false); ReleaseSysCache(tup); maxOpNumber = amroutine->amstrategies; @@ -881,7 +882,7 @@ AlterOpFamilyAdd(AlterOpFamilyStmt *stmt, Oid amoid, Oid opfamilyoid, int maxOpNumber, int maxProcNumber, int optsProcNumber, List *items) { - IndexAmRoutine *amroutine = GetIndexAmRoutineByAmId(amoid, false); + IndexAmRoutine *amroutine = GetIndexAmRoutineByAmId(InvalidOid, amoid, false); List *operators; /* OpFamilyMember list for operators */ List *procedures; /* OpFamilyMember list for support procs */ ListCell *l; @@ -1164,7 +1165,7 @@ assignOperTypes(OpFamilyMember *member, Oid amoid, Oid typeoid) * the family has been created but not yet populated with the required * operators.) 
*/ - IndexAmRoutine *amroutine = GetIndexAmRoutineByAmId(amoid, false); + IndexAmRoutine *amroutine = GetIndexAmRoutineByAmId(InvalidOid, amoid, false); if (!amroutine->amcanorderbyop) ereport(ERROR, diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 0ecdecc2564..80e9048d6de 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -700,6 +700,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, ObjectAddress address; LOCKMODE parentLockmode; Oid accessMethodId = InvalidOid; + const TableAmRoutine *tableam = NULL; /* * Truncate relname to appropriate length (probably a waste of time, as @@ -835,6 +836,29 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, if (!OidIsValid(ownerId)) ownerId = GetUserId(); + + /* + * For relations with table AM and partitioned tables, select access + * method to use: an explicitly indicated one, or (in the case of a + * partitioned table) the parent's, if it has one. + */ + if (stmt->accessMethod != NULL) + { + Assert(RELKIND_HAS_TABLE_AM(relkind) || relkind == RELKIND_PARTITIONED_TABLE); + accessMethodId = get_table_am_oid(stmt->accessMethod, false); + } + else if (RELKIND_HAS_TABLE_AM(relkind) || relkind == RELKIND_PARTITIONED_TABLE) + { + if (stmt->partbound) + { + Assert(list_length(inheritOids) == 1); + accessMethodId = get_rel_relam(linitial_oid(inheritOids)); + } + + if (RELKIND_HAS_TABLE_AM(relkind) && !OidIsValid(accessMethodId)) + accessMethodId = get_table_am_oid(default_table_access_method, false); + } + /* * Parse and validate reloptions, if any. 
*/ @@ -843,6 +867,12 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, switch (relkind) { + case RELKIND_RELATION: + case RELKIND_TOASTVALUE: + case RELKIND_MATVIEW: + tableam = GetTableAmRoutineByAmOid(accessMethodId); + (void) tableam_reloptions(tableam, relkind, reloptions, true); + break; case RELKIND_VIEW: (void) view_reloptions(reloptions, true); break; @@ -851,6 +881,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, break; default: (void) heap_reloptions(relkind, reloptions, true); + break; } if (stmt->ofTypename) @@ -941,28 +972,6 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, } } - /* - * For relations with table AM and partitioned tables, select access - * method to use: an explicitly indicated one, or (in the case of a - * partitioned table) the parent's, if it has one. - */ - if (stmt->accessMethod != NULL) - { - Assert(RELKIND_HAS_TABLE_AM(relkind) || relkind == RELKIND_PARTITIONED_TABLE); - accessMethodId = get_table_am_oid(stmt->accessMethod, false); - } - else if (RELKIND_HAS_TABLE_AM(relkind) || relkind == RELKIND_PARTITIONED_TABLE) - { - if (stmt->partbound) - { - Assert(list_length(inheritOids) == 1); - accessMethodId = get_rel_relam(linitial_oid(inheritOids)); - } - - if (RELKIND_HAS_TABLE_AM(relkind) && !OidIsValid(accessMethodId)) - accessMethodId = get_table_am_oid(default_table_access_method, false); - } - /* * Create the relation. 
Inherited defaults and constraints are passed in * for immediate handling --- since they don't need parsing, they can be @@ -6304,8 +6313,10 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode) /* Write the tuple out to the new relation */ if (newrel) + { table_tuple_insert(newrel, insertslot, mycid, ti_options, bistate); + } ResetExprContext(econtext); @@ -14933,7 +14944,8 @@ ATExecSetRelOptions(Relation rel, List *defList, AlterTableType operation, case RELKIND_RELATION: case RELKIND_TOASTVALUE: case RELKIND_MATVIEW: - (void) heap_reloptions(rel->rd_rel->relkind, newOptions, true); + (void) table_reloptions(rel, rel->rd_rel->relkind, + newOptions, true); break; case RELKIND_PARTITIONED_TABLE: (void) partitioned_table_reloptions(newOptions, true); @@ -18629,12 +18641,14 @@ static void AttachPartitionEnsureIndexes(List **wqueue, Relation rel, Relation attachrel) { List *idxes; + List *buildIdxes = NIL; List *attachRelIdxs; Relation *attachrelIdxRels; IndexInfo **attachInfos; ListCell *cell; MemoryContext cxt; MemoryContext oldcxt; + AttrMap *attmap; cxt = AllocSetContextCreate(CurrentMemoryContext, "AttachPartitionEnsureIndexes", @@ -18683,6 +18697,10 @@ AttachPartitionEnsureIndexes(List **wqueue, Relation rel, Relation attachrel) goto out; } + attmap = build_attrmap_by_name(RelationGetDescr(attachrel), + RelationGetDescr(rel), + false); + /* * For each index on the partitioned table, find a matching one in the * partition-to-be; if one is not found, create one. 
@@ -18692,7 +18710,6 @@ AttachPartitionEnsureIndexes(List **wqueue, Relation rel, Relation attachrel) Oid idx = lfirst_oid(cell); Relation idxRel = index_open(idx, AccessShareLock); IndexInfo *info; - AttrMap *attmap; bool found = false; Oid constraintOid; @@ -18708,9 +18725,6 @@ AttachPartitionEnsureIndexes(List **wqueue, Relation rel, Relation attachrel) /* construct an indexinfo to compare existing indexes against */ info = BuildIndexInfo(idxRel); - attmap = build_attrmap_by_name(RelationGetDescr(attachrel), - RelationGetDescr(rel), - false); constraintOid = get_relation_idx_constraint_oid(RelationGetRelid(rel), idx); /* @@ -18776,19 +18790,7 @@ AttachPartitionEnsureIndexes(List **wqueue, Relation rel, Relation attachrel) * now. */ if (!found) - { - IndexStmt *stmt; - Oid conOid; - - stmt = generateClonedIndexStmt(NULL, - idxRel, attmap, - &conOid); - DefineIndex(RelationGetRelid(attachrel), stmt, InvalidOid, - RelationGetRelid(idxRel), - conOid, - -1, - true, false, false, false, false); - } + buildIdxes = lappend_oid(buildIdxes, RelationGetRelid(idxRel)); index_close(idxRel, AccessShareLock); } @@ -18797,6 +18799,25 @@ AttachPartitionEnsureIndexes(List **wqueue, Relation rel, Relation attachrel) /* Clean up. 
*/ for (int i = 0; i < list_length(attachRelIdxs); i++) index_close(attachrelIdxRels[i], AccessShareLock); + + foreach(cell, buildIdxes) + { + Oid idx = lfirst_oid(cell); + Relation idxRel = index_open(idx, AccessShareLock); + IndexStmt *stmt; + Oid conOid; + + stmt = generateClonedIndexStmt(NULL, + idxRel, attmap, + &conOid); + DefineIndex(RelationGetRelid(attachrel), stmt, InvalidOid, + RelationGetRelid(idxRel), + conOid, + -1, + true, false, false, false, false); + index_close(idxRel, AccessShareLock); + } + MemoryContextSwitchTo(oldcxt); MemoryContextDelete(cxt); } diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 58b7fc5bbd5..1aee9d64212 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -76,7 +76,7 @@ static void SetTriggerFlags(TriggerDesc *trigdesc, Trigger *trigger); static bool GetTupleForTrigger(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tid, + Datum tupleid, LockTupleMode lockmode, TupleTableSlot *oldslot, TupleTableSlot **epqslot, @@ -2681,7 +2681,7 @@ ExecASDeleteTriggers(EState *estate, ResultRelInfo *relinfo, bool ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple fdw_trigtuple, TupleTableSlot **epqslot, TM_Result *tmresult, @@ -2695,7 +2695,7 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, bool should_free = false; int i; - Assert(HeapTupleIsValid(fdw_trigtuple) ^ ItemPointerIsValid(tupleid)); + Assert(HeapTupleIsValid(fdw_trigtuple) ^ (DatumGetPointer(tupleid) != NULL)); if (fdw_trigtuple == NULL) { TupleTableSlot *epqslot_candidate = NULL; @@ -2772,8 +2772,8 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, void ExecARDeleteTriggers(EState *estate, ResultRelInfo *relinfo, - ItemPointer tupleid, HeapTuple fdw_trigtuple, + TupleTableSlot *slot, TransitionCaptureState *transition_capture, bool is_crosspart_update) { @@ -2782,20 +2782,11 @@ 
ExecARDeleteTriggers(EState *estate, if ((trigdesc && trigdesc->trig_delete_after_row) || (transition_capture && transition_capture->tcs_delete_old_table)) { - TupleTableSlot *slot = ExecGetTriggerOldSlot(estate, relinfo); - - Assert(HeapTupleIsValid(fdw_trigtuple) ^ ItemPointerIsValid(tupleid)); - if (fdw_trigtuple == NULL) - GetTupleForTrigger(estate, - NULL, - relinfo, - tupleid, - LockTupleExclusive, - slot, - NULL, - NULL, - NULL); - else + /* + * Put the FDW old tuple to the slot. Otherwise, caller is expected + * to have old tuple already fetched to the slot. + */ + if (fdw_trigtuple != NULL) ExecForceStoreHeapTuple(fdw_trigtuple, slot, false); AfterTriggerSaveEvent(estate, relinfo, NULL, NULL, @@ -2932,7 +2923,7 @@ ExecASUpdateTriggers(EState *estate, ResultRelInfo *relinfo, bool ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple fdw_trigtuple, TupleTableSlot *newslot, TM_Result *tmresult, @@ -2952,7 +2943,7 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, /* Determine lock mode to use */ lockmode = ExecUpdateLockMode(estate, relinfo); - Assert(HeapTupleIsValid(fdw_trigtuple) ^ ItemPointerIsValid(tupleid)); + Assert(HeapTupleIsValid(fdw_trigtuple) ^ (DatumGetPointer(tupleid) != NULL)); if (fdw_trigtuple == NULL) { TupleTableSlot *epqslot_candidate = NULL; @@ -3086,18 +3077,17 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, * Note: 'src_partinfo' and 'dst_partinfo', when non-NULL, refer to the source * and destination partitions, respectively, of a cross-partition update of * the root partitioned table mentioned in the query, given by 'relinfo'. - 'tupleid' in that case refers to the ctid of the "old" tuple in the source - partition, and 'newslot' contains the "new" tuple in the destination - partition. This interface allows to support the requirements of - ExecCrossPartitionUpdateForeignKey(); is_crosspart_update must be true in - that case.
+ * 'oldslot' contains the "old" tuple in the source partition, and 'newslot' + * contains the "new" tuple in the destination partition. This interface + * allows to support the requirements of ExecCrossPartitionUpdateForeignKey(); + * is_crosspart_update must be true in that case. */ void ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, ResultRelInfo *src_partinfo, ResultRelInfo *dst_partinfo, - ItemPointer tupleid, HeapTuple fdw_trigtuple, + TupleTableSlot *oldslot, TupleTableSlot *newslot, List *recheckIndexes, TransitionCaptureState *transition_capture, @@ -3116,29 +3106,14 @@ ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, * separately for DELETE and INSERT to capture transition table rows. * In such case, either old tuple or new tuple can be NULL. */ - TupleTableSlot *oldslot; - ResultRelInfo *tupsrc; - Assert((src_partinfo != NULL && dst_partinfo != NULL) || !is_crosspart_update); - tupsrc = src_partinfo ? src_partinfo : relinfo; - oldslot = ExecGetTriggerOldSlot(estate, tupsrc); - - if (fdw_trigtuple == NULL && ItemPointerIsValid(tupleid)) - GetTupleForTrigger(estate, - NULL, - tupsrc, - tupleid, - LockTupleExclusive, - oldslot, - NULL, - NULL, - NULL); - else if (fdw_trigtuple != NULL) + if (fdw_trigtuple != NULL) + { + Assert(oldslot); ExecForceStoreHeapTuple(fdw_trigtuple, oldslot, false); - else - ExecClearTuple(oldslot); + } AfterTriggerSaveEvent(estate, relinfo, src_partinfo, dst_partinfo, @@ -3285,7 +3260,7 @@ static bool GetTupleForTrigger(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tid, + Datum tupleid, LockTupleMode lockmode, TupleTableSlot *oldslot, TupleTableSlot **epqslot, @@ -3310,7 +3285,9 @@ GetTupleForTrigger(EState *estate, */ if (!IsolationUsesXactSnapshot()) lockflags |= TUPLE_LOCK_FLAG_FIND_LAST_VERSION; - test = table_tuple_lock(relation, tid, estate->es_snapshot, oldslot, + + test = table_tuple_lock(relation, tupleid, + estate->es_snapshot, oldslot, estate->es_output_cid, 
lockmode, LockWaitBlock, lockflags, @@ -3406,8 +3383,8 @@ GetTupleForTrigger(EState *estate, * We expect the tuple to be present, thus very simple error handling * suffices. */ - if (!table_tuple_fetch_row_version(relation, tid, SnapshotAny, - oldslot)) + if (!table_tuple_fetch_row_version(relation, tupleid, + SnapshotAny, oldslot)) elog(ERROR, "failed to fetch tuple for trigger"); } @@ -3613,18 +3590,22 @@ typedef SetConstraintStateData *SetConstraintState; * cycles. So we need only ensure that ats_firing_id is zero when attaching * a new event to an existing AfterTriggerSharedData record. */ -typedef uint32 TriggerFlags; +typedef uint64 TriggerFlags; -#define AFTER_TRIGGER_OFFSET 0x07FFFFFF /* must be low-order bits */ -#define AFTER_TRIGGER_DONE 0x80000000 -#define AFTER_TRIGGER_IN_PROGRESS 0x40000000 +#define AFTER_TRIGGER_SIZE UINT64CONST(0xFFFF000000000) /* must be low-order bits */ +#define AFTER_TRIGGER_SIZE_SHIFT (36) +#define AFTER_TRIGGER_OFFSET UINT64CONST(0x000000FFFFFFF) /* must be low-order bits */ +#define AFTER_TRIGGER_DONE UINT64CONST(0x0000800000000) +#define AFTER_TRIGGER_IN_PROGRESS UINT64CONST(0x0000400000000) /* bits describing the size and tuple sources of this event */ -#define AFTER_TRIGGER_FDW_REUSE 0x00000000 -#define AFTER_TRIGGER_FDW_FETCH 0x20000000 -#define AFTER_TRIGGER_1CTID 0x10000000 -#define AFTER_TRIGGER_2CTID 0x30000000 -#define AFTER_TRIGGER_CP_UPDATE 0x08000000 -#define AFTER_TRIGGER_TUP_BITS 0x38000000 +#define AFTER_TRIGGER_FDW_REUSE UINT64CONST(0x0000000000000) +#define AFTER_TRIGGER_FDW_FETCH UINT64CONST(0x0000200000000) +#define AFTER_TRIGGER_1CTID UINT64CONST(0x0000100000000) +#define AFTER_TRIGGER_ROWID1 UINT64CONST(0x0000010000000) +#define AFTER_TRIGGER_2CTID UINT64CONST(0x0000300000000) +#define AFTER_TRIGGER_ROWID2 UINT64CONST(0x0000020000000) +#define AFTER_TRIGGER_CP_UPDATE UINT64CONST(0x0000080000000) +#define AFTER_TRIGGER_TUP_BITS UINT64CONST(0x0000380000000) typedef struct AfterTriggerSharedData 
*AfterTriggerShared; typedef struct AfterTriggerSharedData @@ -3676,6 +3657,9 @@ typedef struct AfterTriggerEventDataZeroCtids } AfterTriggerEventDataZeroCtids; #define SizeofTriggerEvent(evt) \ + (((evt)->ate_flags & AFTER_TRIGGER_SIZE) >> AFTER_TRIGGER_SIZE_SHIFT) + +#define BasicSizeofTriggerEvent(evt) \ (((evt)->ate_flags & AFTER_TRIGGER_TUP_BITS) == AFTER_TRIGGER_CP_UPDATE ? \ sizeof(AfterTriggerEventData) : \ (((evt)->ate_flags & AFTER_TRIGGER_TUP_BITS) == AFTER_TRIGGER_2CTID ? \ @@ -4028,14 +4012,34 @@ afterTriggerCopyBitmap(Bitmapset *src) */ static void afterTriggerAddEvent(AfterTriggerEventList *events, - AfterTriggerEvent event, AfterTriggerShared evtshared) + AfterTriggerEvent event, AfterTriggerShared evtshared, + bytea *rowid1, bytea *rowid2) { - Size eventsize = SizeofTriggerEvent(event); - Size needed = eventsize + sizeof(AfterTriggerSharedData); + Size basiceventsize = MAXALIGN(BasicSizeofTriggerEvent(event)); + Size eventsize; + Size needed; AfterTriggerEventChunk *chunk; AfterTriggerShared newshared; AfterTriggerEvent newevent; + if (SizeofTriggerEvent(event) == 0) + { + eventsize = basiceventsize; + if (event->ate_flags & AFTER_TRIGGER_ROWID1) + eventsize += MAXALIGN(VARSIZE(rowid1)); + + if (event->ate_flags & AFTER_TRIGGER_ROWID2) + eventsize += MAXALIGN(VARSIZE(rowid2)); + + event->ate_flags |= eventsize << AFTER_TRIGGER_SIZE_SHIFT; + } + else + { + eventsize = SizeofTriggerEvent(event); + } + + needed = eventsize + sizeof(AfterTriggerSharedData); + /* * If empty list or not enough room in the tail chunk, make a new chunk. * We assume here that a new shared record will always be needed. @@ -4068,7 +4072,7 @@ afterTriggerAddEvent(AfterTriggerEventList *events, * sizes used should be MAXALIGN multiples, to ensure that the shared * records will be aligned safely. 
*/ -#define MIN_CHUNK_SIZE 1024 +#define MIN_CHUNK_SIZE (1024*4) #define MAX_CHUNK_SIZE (1024*1024) #if MAX_CHUNK_SIZE > (AFTER_TRIGGER_OFFSET+1) @@ -4087,6 +4091,7 @@ afterTriggerAddEvent(AfterTriggerEventList *events, chunksize *= 2; /* okay, double it */ else chunksize /= 2; /* too many shared records */ + chunksize = Max(chunksize, MIN_CHUNK_SIZE); chunksize = Min(chunksize, MAX_CHUNK_SIZE); } chunk = MemoryContextAlloc(afterTriggers.event_cxt, chunksize); @@ -4127,7 +4132,26 @@ afterTriggerAddEvent(AfterTriggerEventList *events, /* Insert the data */ newevent = (AfterTriggerEvent) chunk->freeptr; - memcpy(newevent, event, eventsize); + if (!rowid1 && !rowid2) + { + memcpy(newevent, event, eventsize); + } + else + { + Pointer ptr = chunk->freeptr; + + memcpy(newevent, event, basiceventsize); + ptr += basiceventsize; + + if (event->ate_flags & AFTER_TRIGGER_ROWID1) + { + memcpy(ptr, rowid1, MAXALIGN(VARSIZE(rowid1))); + ptr += MAXALIGN(VARSIZE(rowid1)); + } + + if (event->ate_flags & AFTER_TRIGGER_ROWID2) + memcpy(ptr, rowid2, MAXALIGN(VARSIZE(rowid2))); + } /* ... and link the new event to its shared record */ newevent->ate_flags &= ~AFTER_TRIGGER_OFFSET; newevent->ate_flags |= (char *) newshared - (char *) newevent; @@ -4287,6 +4311,7 @@ AfterTriggerExecute(EState *estate, int tgindx; bool should_free_trig = false; bool should_free_new = false; + Pointer ptr; /* * Locate trigger in trigdesc. 
It might not be present, and in fact the @@ -4322,15 +4347,17 @@ AfterTriggerExecute(EState *estate, { Tuplestorestate *fdw_tuplestore = GetCurrentFDWTuplestore(); - if (!tuplestore_gettupleslot(fdw_tuplestore, true, false, - trig_tuple_slot1)) + if (!tuplestore_force_gettupleslot(fdw_tuplestore, true, false, + trig_tuple_slot1)) elog(ERROR, "failed to fetch tuple1 for AFTER trigger"); if ((evtshared->ats_event & TRIGGER_EVENT_OPMASK) == TRIGGER_EVENT_UPDATE && - !tuplestore_gettupleslot(fdw_tuplestore, true, false, - trig_tuple_slot2)) + !tuplestore_force_gettupleslot(fdw_tuplestore, true, false, + trig_tuple_slot2)) elog(ERROR, "failed to fetch tuple2 for AFTER trigger"); + trig_tuple_slot1->tts_tid = event->ate_ctid1; + trig_tuple_slot2->tts_tid = event->ate_ctid2; } /* fall through */ case AFTER_TRIGGER_FDW_REUSE: @@ -4362,13 +4389,26 @@ AfterTriggerExecute(EState *estate, break; default: - if (ItemPointerIsValid(&(event->ate_ctid1))) + ptr = (Pointer) event + MAXALIGN(BasicSizeofTriggerEvent(event)); + if (ItemPointerIsValid(&(event->ate_ctid1)) || + (event->ate_flags & AFTER_TRIGGER_ROWID1)) { + Datum tupleid; + TupleTableSlot *src_slot = ExecGetTriggerOldSlot(estate, src_relInfo); - if (!table_tuple_fetch_row_version(src_rel, - &(event->ate_ctid1), + if (event->ate_flags & AFTER_TRIGGER_ROWID1) + { + tupleid = PointerGetDatum(ptr); + ptr += MAXALIGN(VARSIZE(ptr)); + } + else + { + tupleid = PointerGetDatum(&(event->ate_ctid1)); + } + + if (!table_tuple_fetch_row_version(src_rel, tupleid, SnapshotAny, src_slot)) elog(ERROR, "failed to fetch tuple1 for AFTER trigger"); @@ -4404,13 +4444,23 @@ AfterTriggerExecute(EState *estate, /* don't touch ctid2 if not there */ if (((event->ate_flags & AFTER_TRIGGER_TUP_BITS) == AFTER_TRIGGER_2CTID || (event->ate_flags & AFTER_TRIGGER_CP_UPDATE)) && - ItemPointerIsValid(&(event->ate_ctid2))) + (ItemPointerIsValid(&(event->ate_ctid2)) || + (event->ate_flags & AFTER_TRIGGER_ROWID2))) { + Datum tupleid; + TupleTableSlot 
*dst_slot = ExecGetTriggerNewSlot(estate, dst_relInfo); - if (!table_tuple_fetch_row_version(dst_rel, - &(event->ate_ctid2), + if (event->ate_flags & AFTER_TRIGGER_ROWID2) + { + tupleid = PointerGetDatum(ptr); + } + else + { + tupleid = PointerGetDatum(&(event->ate_ctid2)); + } + if (!table_tuple_fetch_row_version(dst_rel, tupleid, SnapshotAny, dst_slot)) elog(ERROR, "failed to fetch tuple2 for AFTER trigger"); @@ -4584,7 +4634,7 @@ afterTriggerMarkEvents(AfterTriggerEventList *events, { deferred_found = true; /* add it to move_list */ - afterTriggerAddEvent(move_list, event, evtshared); + afterTriggerAddEvent(move_list, event, evtshared, NULL, NULL); /* mark original copy "done" so we don't do it again */ event->ate_flags |= AFTER_TRIGGER_DONE; } @@ -4688,6 +4738,7 @@ afterTriggerInvokeEvents(AfterTriggerEventList *events, /* caution: trigdesc could be NULL here */ finfo = rInfo->ri_TrigFunctions; instr = rInfo->ri_TrigInstrument; + if (slot1 != NULL) { ExecDropSingleTupleTableSlot(slot1); @@ -6077,6 +6128,8 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, int tgtype_level; int i; Tuplestorestate *fdw_tuplestore = NULL; + bytea *rowId1 = NULL; + bytea *rowId2 = NULL; /* * Check state. We use a normal test not Assert because it is possible to @@ -6170,6 +6223,21 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, * if so. This preserves the behavior that statement-level triggers fire * just once per statement and fire after row-level triggers. 
*/ + + /* Determine flags */ + if (!(relkind == RELKIND_FOREIGN_TABLE && row_trigger)) + { + if (row_trigger && event == TRIGGER_EVENT_UPDATE) + { + if (relkind == RELKIND_PARTITIONED_TABLE) + new_event.ate_flags = AFTER_TRIGGER_CP_UPDATE; + else + new_event.ate_flags = AFTER_TRIGGER_2CTID; + } + else + new_event.ate_flags = AFTER_TRIGGER_1CTID; + } + switch (event) { case TRIGGER_EVENT_INSERT: @@ -6180,6 +6248,13 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, Assert(newslot != NULL); ItemPointerCopy(&(newslot->tts_tid), &(new_event.ate_ctid1)); ItemPointerSetInvalid(&(new_event.ate_ctid2)); + if (table_get_row_ref_type(rel) == ROW_REF_ROWID) + { + bool isnull; + rowId1 = DatumGetByteaP(slot_getsysattr(newslot, RowIdAttributeNumber, &isnull)); + new_event.ate_flags |= AFTER_TRIGGER_ROWID1; + Assert(!isnull); + } } else { @@ -6199,6 +6274,13 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, Assert(newslot == NULL); ItemPointerCopy(&(oldslot->tts_tid), &(new_event.ate_ctid1)); ItemPointerSetInvalid(&(new_event.ate_ctid2)); + if (table_get_row_ref_type(rel) == ROW_REF_ROWID) + { + bool isnull; + rowId1 = DatumGetByteaP(slot_getsysattr(oldslot, RowIdAttributeNumber, &isnull)); + new_event.ate_flags |= AFTER_TRIGGER_ROWID1; + Assert(!isnull); + } } else { @@ -6214,10 +6296,54 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, tgtype_event = TRIGGER_TYPE_UPDATE; if (row_trigger) { + bool src_rowid = false, + dst_rowid = false; Assert(oldslot != NULL); Assert(newslot != NULL); ItemPointerCopy(&(oldslot->tts_tid), &(new_event.ate_ctid1)); ItemPointerCopy(&(newslot->tts_tid), &(new_event.ate_ctid2)); + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + Relation src_rel = src_partinfo->ri_RelationDesc; + Relation dst_rel = dst_partinfo->ri_RelationDesc; + + src_rowid = table_get_row_ref_type(src_rel) == + ROW_REF_ROWID; + dst_rowid = table_get_row_ref_type(dst_rel) == + ROW_REF_ROWID; + } + else + { + if 
(table_get_row_ref_type(rel) == ROW_REF_ROWID) + { + src_rowid = true; + dst_rowid = true; + } + } + + if (src_rowid) + { + Datum val; + bool isnull; + val = slot_getsysattr(oldslot, + RowIdAttributeNumber, + &isnull); + rowId1 = DatumGetByteaP(val); + Assert(!isnull); + new_event.ate_flags |= AFTER_TRIGGER_ROWID1; + } + + if (dst_rowid) + { + Datum val; + bool isnull; + val = slot_getsysattr(newslot, + RowIdAttributeNumber, + &isnull); + rowId2 = DatumGetByteaP(val); + Assert(!isnull); + new_event.ate_flags |= AFTER_TRIGGER_ROWID2; + } /* * Also remember the OIDs of partitions to fetch these tuples @@ -6255,20 +6381,6 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, break; } - /* Determine flags */ - if (!(relkind == RELKIND_FOREIGN_TABLE && row_trigger)) - { - if (row_trigger && event == TRIGGER_EVENT_UPDATE) - { - if (relkind == RELKIND_PARTITIONED_TABLE) - new_event.ate_flags = AFTER_TRIGGER_CP_UPDATE; - else - new_event.ate_flags = AFTER_TRIGGER_2CTID; - } - else - new_event.ate_flags = AFTER_TRIGGER_1CTID; - } - /* else, we'll initialize ate_flags for each trigger */ tgtype_level = (row_trigger ? 
TRIGGER_TYPE_ROW : TRIGGER_TYPE_STATEMENT); @@ -6434,7 +6546,7 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, new_shared.ats_modifiedcols = afterTriggerCopyBitmap(modifiedCols); afterTriggerAddEvent(&afterTriggers.query_stack[afterTriggers.query_depth].events, - &new_event, &new_shared); + &new_event, &new_shared, rowId1, rowId2); } /* diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c index 3289e3e0219..1a7f6ae2c9b 100644 --- a/src/backend/executor/execAmi.c +++ b/src/backend/executor/execAmi.c @@ -613,7 +613,7 @@ IndexSupportsBackwardScan(Oid indexid) idxrelrec = (Form_pg_class) GETSTRUCT(ht_idxrel); /* Fetch the index AM's API struct */ - amroutine = GetIndexAmRoutineByAmId(idxrelrec->relam, false); + amroutine = GetIndexAmRoutineByAmId(indexid, idxrelrec->relam, false); result = amroutine->amcanbackward; diff --git a/src/backend/executor/execExpr.c b/src/backend/executor/execExpr.c index a5395536a13..6913e3b7a6d 100644 --- a/src/backend/executor/execExpr.c +++ b/src/backend/executor/execExpr.c @@ -48,6 +48,8 @@ #include "utils/array.h" #include "utils/builtins.h" #include "utils/jsonfuncs.h" +#include "utils/json.h" +#include "utils/jsonb.h" #include "utils/jsonpath.h" #include "utils/lsyscache.h" #include "utils/typcache.h" diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c index aa68c115ba9..d830006d61b 100644 --- a/src/backend/executor/execExprInterp.c +++ b/src/backend/executor/execExprInterp.c @@ -4994,7 +4994,9 @@ ExecEvalSysVar(ExprState *state, ExprEvalStep *op, ExprContext *econtext, op->resnull); *op->resvalue = d; /* this ought to be unreachable, but it's cheap enough to check */ - if (unlikely(*op->resnull)) + if (op->d.var.attnum != RowIdAttributeNumber && + op->d.var.attnum != SelfItemPointerAttributeNumber && + unlikely(*op->resnull)) elog(ERROR, "failed to fetch attribute from slot"); } diff --git a/src/backend/executor/execIndexing.c 
b/src/backend/executor/execIndexing.c index 9f05b3654c1..9e09ef1cf1f 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -304,7 +304,6 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, List *arbiterIndexes, bool onlySummarizing) { - ItemPointer tupleid = &slot->tts_tid; List *result = NIL; int i; int numIndices; @@ -314,8 +313,20 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, ExprContext *econtext; Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; + ItemPointer tupleid; - Assert(ItemPointerIsValid(tupleid)); + + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) + { + bool isnull; + tupleid = DatumGetItemPointer(slot_getsysattr(slot, RowIdAttributeNumber, &isnull)); + Assert(!isnull); + } + else + { + Assert(ItemPointerIsValid(&slot->tts_tid)); + tupleid = &slot->tts_tid; + } /* * Get information from the result relation info structure. @@ -506,6 +517,406 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, return result; } +List * +ExecUpdateIndexTuples(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + TupleTableSlot *oldSlot, + EState *estate, + bool noDupErr, + bool *specConflict, + List *arbiterIndexes, + bool onlySummarizing) +{ + List *result = NIL; + int i; + int numIndices; + RelationPtr relationDescs; + Relation heapRelation; + IndexInfo **indexInfoArray; + ExprContext *econtext; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + ItemPointer tupleid; + + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) + { + bool isnull; + tupleid = DatumGetItemPointer(slot_getsysattr(slot, RowIdAttributeNumber, &isnull)); + Assert(!isnull); + } + else + { + Assert(ItemPointerIsValid(&slot->tts_tid)); + tupleid = &slot->tts_tid; + } + + /* + * Get information from the result relation info structure. 
+ */ + numIndices = resultRelInfo->ri_NumIndices; + relationDescs = resultRelInfo->ri_IndexRelationDescs; + indexInfoArray = resultRelInfo->ri_IndexRelationInfo; + heapRelation = resultRelInfo->ri_RelationDesc; + + /* Sanity check: slot must belong to the same rel as the resultRelInfo. */ + Assert(slot->tts_tableOid == RelationGetRelid(heapRelation)); + + /* + * for each index, form and insert the index tuple + */ + for (i = 0; i < numIndices; i++) + { + Relation indexRelation = relationDescs[i]; + IndexInfo *indexInfo; + bool applyNoDupErr; + IndexUniqueCheck checkUnique; + bool satisfiesConstraint; + bool new_valid = true; + + if (indexRelation == NULL) + continue; + + indexInfo = indexInfoArray[i]; + + /* If the index is marked as read-only, ignore it */ + if (!indexInfo->ii_ReadyForInserts) + continue; + + /* + * Skip processing of non-summarizing indexes if we only update + * summarizing indexes + */ + if (onlySummarizing && !indexInfo->ii_Summarizing) + continue; + + /* + * We will use the EState's per-tuple context for evaluating predicates + * and index expressions (creating it if it's not already there). + */ + econtext = GetPerTupleExprContext(estate); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* Check for partial index */ + if (indexInfo->ii_Predicate != NIL) + { + ExprState *predicate; + + /* + * If predicate state not set up yet, create it (in the estate's + * per-query context) + */ + predicate = indexInfo->ii_PredicateState; + if (predicate == NULL) + { + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + indexInfo->ii_PredicateState = predicate; + } + + /* Skip this index-update if the predicate isn't satisfied */ + if (!ExecQual(predicate, econtext)) + { + if (!indexRelation->rd_indam->ammvccaware) + continue; + new_valid = false; + } + } + + /* + * FormIndexDatum fills in its values and isnull parameters with the + * appropriate values for the column(s) of the index. 
+ */ + FormIndexDatum(indexInfo, + slot, + estate, + values, + isnull); + + /* Check whether to apply noDupErr to this index */ + applyNoDupErr = noDupErr && + (arbiterIndexes == NIL || + list_member_oid(arbiterIndexes, + indexRelation->rd_index->indexrelid)); + + /* + * The index AM does the actual insertion, plus uniqueness checking. + * + * For an immediate-mode unique index, we just tell the index AM to + * throw error if not unique. + * + * For a deferrable unique index, we tell the index AM to just detect + * possible non-uniqueness, and we add the index OID to the result + * list if further checking is needed. + * + * For a speculative insertion (used by INSERT ... ON CONFLICT), do + * the same as for a deferrable unique index. + */ + if (!indexRelation->rd_index->indisunique) + checkUnique = UNIQUE_CHECK_NO; + else if (applyNoDupErr) + checkUnique = UNIQUE_CHECK_PARTIAL; + else if (indexRelation->rd_index->indimmediate) + checkUnique = UNIQUE_CHECK_YES; + else + checkUnique = UNIQUE_CHECK_PARTIAL; + + if (indexRelation->rd_indam->ammvccaware) + { + Datum valuesOld[INDEX_MAX_KEYS]; + bool isnullOld[INDEX_MAX_KEYS]; + Datum oldTupleid; + bool old_valid = true; + + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) + { + bool isnull; + oldTupleid = slot_getsysattr(oldSlot, RowIdAttributeNumber, &isnull); + Assert(!isnull); + } + else + { + Assert(ItemPointerIsValid(&oldSlot->tts_tid)); + oldTupleid = PointerGetDatum(&oldSlot->tts_tid); + } + + econtext = GetPerTupleExprContext(estate); + econtext->ecxt_scantuple = oldSlot; + + /* Check for partial index */ + if (indexInfo->ii_Predicate != NIL) + { + ExprState *predicate; + + /* + * If predicate state not set up yet, create it (in the estate's + * per-query context) + */ + predicate = indexInfo->ii_PredicateState; + if (predicate == NULL) + { + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + indexInfo->ii_PredicateState = predicate; + } + + /* Skip this index-update if 
the predicate isn't satisfied */ + if (!ExecQual(predicate, econtext)) + old_valid = false; + } + + FormIndexDatum(indexInfo, + oldSlot, + estate, + valuesOld, + isnullOld); + + satisfiesConstraint = + index_update(indexRelation, /* index relation */ + new_valid, + old_valid, + values, /* array of index Datums */ + isnull, /* null flags */ + ItemPointerGetDatum(tupleid), /* tid of heap tuple */ + valuesOld, + isnullOld, + oldTupleid, + heapRelation, /* heap relation */ + checkUnique, /* type of uniqueness check to do */ + indexInfo); /* index AM may need this */ + + } + else + { + bool indexUnchanged; + /* + * There's definitely going to be an index_insert() call for this + * index. If we're being called as part of an UPDATE statement, + * consider if the 'indexUnchanged' = true hint should be passed. + */ + indexUnchanged = index_unchanged_by_update(resultRelInfo, + estate, + indexInfo, + indexRelation); + + satisfiesConstraint = + index_insert(indexRelation, /* index relation */ + values, /* array of index Datums */ + isnull, /* null flags */ + tupleid, /* tid of heap tuple */ + heapRelation, /* heap relation */ + checkUnique, /* type of uniqueness check to do */ + indexUnchanged, /* UPDATE without logical change? */ + indexInfo); /* index AM may need this */ + } + + /* + * If the index has an associated exclusion constraint, check that. + * This is simpler than the process for uniqueness checks since we + * always insert first and then check. If the constraint is deferred, + * we check now anyway, but don't throw error on violation or wait for + * a conclusive outcome from a concurrent insertion; instead we'll + * queue a recheck event. Similarly, noDupErr callers (speculative + * inserters) will recheck later, and wait for a conclusive outcome + * then. + * + * An index for an exclusion constraint can't also be UNIQUE (not an + * essential property, we just don't allow it in the grammar), so no + * need to preserve the prior state of satisfiesConstraint. 
+ */ + if (indexInfo->ii_ExclusionOps != NULL) + { + bool violationOK; + CEOUC_WAIT_MODE waitMode; + + if (applyNoDupErr) + { + violationOK = true; + waitMode = CEOUC_LIVELOCK_PREVENTING_WAIT; + } + else if (!indexRelation->rd_index->indimmediate) + { + violationOK = true; + waitMode = CEOUC_NOWAIT; + } + else + { + violationOK = false; + waitMode = CEOUC_WAIT; + } + + satisfiesConstraint = + check_exclusion_or_unique_constraint(heapRelation, + indexRelation, indexInfo, + tupleid, values, isnull, + estate, false, + waitMode, violationOK, NULL); + } + + if ((checkUnique == UNIQUE_CHECK_PARTIAL || + indexInfo->ii_ExclusionOps != NULL) && + !satisfiesConstraint) + { + /* + * The tuple potentially violates the uniqueness or exclusion + * constraint, so make a note of the index so that we can re-check + * it later. Speculative inserters are told if there was a + * speculative conflict, since that always requires a restart. + */ + result = lappend_oid(result, RelationGetRelid(indexRelation)); + if (indexRelation->rd_index->indimmediate && specConflict) + *specConflict = true; + } + } + + return result; +} + +void +ExecDeleteIndexTuples(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, + EState *estate) +{ + int i; + int numIndices; + RelationPtr relationDescs; + Relation heapRelation; + IndexInfo **indexInfoArray; + ExprContext *econtext; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + Datum tupleid; + + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) + { + bool isnull; + tupleid = slot_getsysattr(slot, RowIdAttributeNumber, &isnull); + Assert(!isnull); + } + else + { + Assert(ItemPointerIsValid(&slot->tts_tid)); + tupleid = PointerGetDatum(&slot->tts_tid); + } + + /* + * Get information from the result relation info structure. 
+ */ + numIndices = resultRelInfo->ri_NumIndices; + relationDescs = resultRelInfo->ri_IndexRelationDescs; + indexInfoArray = resultRelInfo->ri_IndexRelationInfo; + heapRelation = resultRelInfo->ri_RelationDesc; + + /* Sanity check: slot must belong to the same rel as the resultRelInfo. */ + Assert(slot->tts_tableOid == RelationGetRelid(heapRelation)); + + /* + * for each index, form and insert the index tuple + */ + for (i = 0; i < numIndices; i++) + { + Relation indexRelation = relationDescs[i]; + IndexInfo *indexInfo; + + if (indexRelation == NULL) + continue; + + indexInfo = indexInfoArray[i]; + + /* If the index is marked as read-only, ignore it */ + if (!indexInfo->ii_ReadyForInserts) + continue; + + if (!indexRelation->rd_indam->ammvccaware) + continue; + + /* + * We will use the EState's per-tuple context for evaluating predicates + * and index expressions (creating it if it's not already there). + */ + econtext = GetPerTupleExprContext(estate); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* Check for partial index */ + if (indexInfo->ii_Predicate != NIL) + { + ExprState *predicate; + + /* + * If predicate state not set up yet, create it (in the estate's + * per-query context) + */ + predicate = indexInfo->ii_PredicateState; + if (predicate == NULL) + { + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + indexInfo->ii_PredicateState = predicate; + } + + /* Skip this index-update if the predicate isn't satisfied */ + if (!ExecQual(predicate, econtext)) + continue; + } + + /* + * FormIndexDatum fills in its values and isnull parameters with the + * appropriate values for the column(s) of the index. 
+ */ + FormIndexDatum(indexInfo, + slot, + estate, + values, + isnull); + + index_delete(indexRelation, /* index relation */ + values, /* array of index Datums */ + isnull, /* null flags */ + tupleid, /* tid of heap tuple */ + heapRelation, /* heap relation */ + indexInfo); /* index AM may need this */ + } +} + /* ---------------------------------------------------------------- * ExecCheckIndexConstraints * diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 2365c6861be..f0dfccd9fab 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -150,7 +150,7 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags) Assert(queryDesc->estate == NULL); /* caller must ensure the query's snapshot is active */ - Assert(GetActiveSnapshot() == queryDesc->snapshot); + Assert((ActiveSnapshotSet() ? GetActiveSnapshot() : InvalidSnapshot) == queryDesc->snapshot); /* * If the transaction is read-only, we need to check if any writes are @@ -325,7 +325,7 @@ standard_ExecutorRun(QueryDesc *queryDesc, Assert(!(estate->es_top_eflags & EXEC_FLAG_EXPLAIN_ONLY)); /* caller must ensure the query's snapshot is active */ - Assert(GetActiveSnapshot() == estate->es_snapshot); + Assert((ActiveSnapshotSet() ? 
GetActiveSnapshot() : InvalidSnapshot) == estate->es_snapshot); /* * Switch into per-query memory context @@ -869,13 +869,15 @@ InitPlan(QueryDesc *queryDesc, int eflags) Oid relid; Relation relation; ExecRowMark *erm; + RangeTblEntry *rangeEntry; /* ignore "parent" rowmarks; they are irrelevant at runtime */ if (rc->isParent) continue; /* get relation's OID (will produce InvalidOid if subquery) */ - relid = exec_rt_fetch(rc->rti, estate)->relid; + rangeEntry = exec_rt_fetch(rc->rti, estate); + relid = rangeEntry->relid; /* open relation, if we need to access it for this mark type */ switch (rc->markType) @@ -908,6 +910,10 @@ InitPlan(QueryDesc *queryDesc, int eflags) erm->prti = rc->prti; erm->rowmarkId = rc->rowmarkId; erm->markType = rc->markType; + if (erm->markType == ROW_MARK_COPY) + erm->refType = ROW_REF_COPY; + else + erm->refType = rangeEntry->reftype; erm->strength = rc->strength; erm->waitPolicy = rc->waitPolicy; erm->ermActive = false; @@ -1273,6 +1279,8 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, resultRelInfo->ri_ChildToRootMap = NULL; resultRelInfo->ri_ChildToRootMapValid = false; resultRelInfo->ri_CopyMultiInsertBuffer = NULL; + + resultRelInfo->ri_RowRefType = table_get_row_ref_type(resultRelationDesc); } /* @@ -2407,17 +2415,28 @@ ExecBuildAuxRowMark(ExecRowMark *erm, List *targetlist) aerm->rowmark = erm; /* Look up the resjunk columns associated with this rowmark */ - if (erm->markType != ROW_MARK_COPY) + if (erm->refType == ROW_REF_TID) { + Assert(erm->markType != ROW_MARK_COPY); /* need ctid for all methods other than COPY */ snprintf(resname, sizeof(resname), "ctid%u", erm->rowmarkId); aerm->ctidAttNo = ExecFindJunkAttributeInTlist(targetlist, resname); if (!AttributeNumberIsValid(aerm->ctidAttNo)) elog(ERROR, "could not find junk %s column", resname); + } else if (erm->refType == ROW_REF_ROWID) + { + Assert(erm->markType != ROW_MARK_COPY); + /* need rowid for all methods other than COPY */ + snprintf(resname, sizeof(resname), 
"rowid%u", erm->rowmarkId); + aerm->ctidAttNo = ExecFindJunkAttributeInTlist(targetlist, + resname); + if (!AttributeNumberIsValid(aerm->ctidAttNo)) + elog(ERROR, "could not find junk %s column", resname); } else { + Assert(erm->markType == ROW_MARK_COPY); /* need wholerow if COPY */ snprintf(resname, sizeof(resname), "wholerow%u", erm->rowmarkId); aerm->wholeAttNo = ExecFindJunkAttributeInTlist(targetlist, @@ -2705,8 +2724,9 @@ EvalPlanQualFetchRowMark(EPQState *epqstate, Index rti, TupleTableSlot *slot) { /* ordinary table, fetch the tuple */ if (!table_tuple_fetch_row_version(erm->relation, - (ItemPointer) DatumGetPointer(datum), - SnapshotAny, slot)) + datum, + SnapshotAny, + slot)) elog(ERROR, "failed to fetch tuple for EvalPlanQual recheck"); return true; } diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index d0a89cd5778..252efe51738 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -166,6 +166,25 @@ build_replindex_scan_key(ScanKey skey, Relation rel, Relation idxrel, return skey_attoff; } +static Datum +slot_get_tupleid(Relation rel, TupleTableSlot *slot) +{ + Datum tupleid; + + if (table_get_row_ref_type(rel) == ROW_REF_ROWID) + { + bool isnull; + tupleid = slot_getsysattr(slot, RowIdAttributeNumber, &isnull); + Assert(!isnull); + } + else + { + tupleid = PointerGetDatum(&slot->tts_tid); + } + + return tupleid; +} + /* * Search the relation 'rel' for tuple using the index. 
* @@ -250,7 +269,8 @@ RelationFindReplTupleByIndex(Relation rel, Oid idxoid, PushActiveSnapshot(GetLatestSnapshot()); - res = table_tuple_lock(rel, &(outslot->tts_tid), GetLatestSnapshot(), + res = table_tuple_lock(rel, slot_get_tupleid(rel, outslot), + GetLatestSnapshot(), outslot, GetCurrentCommandId(false), lockmode, @@ -434,7 +454,8 @@ RelationFindReplTupleSeq(Relation rel, LockTupleMode lockmode, PushActiveSnapshot(GetLatestSnapshot()); - res = table_tuple_lock(rel, &(outslot->tts_tid), GetLatestSnapshot(), + res = table_tuple_lock(rel, slot_get_tupleid(rel, outslot), + GetLatestSnapshot(), outslot, GetCurrentCommandId(false), lockmode, @@ -557,7 +578,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, { bool skip_tuple = false; Relation rel = resultRelInfo->ri_RelationDesc; - ItemPointer tid = &(searchslot->tts_tid); + Datum tupleid = slot_get_tupleid(rel, searchslot); /* For now we support only tables. */ Assert(rel->rd_rel->relkind == RELKIND_RELATION); @@ -569,7 +590,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigDesc->trig_update_before_row) { if (!ExecBRUpdateTriggers(estate, epqstate, resultRelInfo, - tid, NULL, slot, NULL, NULL)) + tupleid, NULL, slot, NULL, NULL)) skip_tuple = true; /* "do nothing" */ } @@ -577,6 +598,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, { List *recheckIndexes = NIL; TU_UpdateIndexes update_indexes; + TupleTableSlot *oldSlot = NULL; /* Compute stored generated columns */ if (rel->rd_att->constr && @@ -590,19 +612,24 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, if (rel->rd_rel->relispartition) ExecPartitionCheck(resultRelInfo, slot, estate, true); - simple_table_tuple_update(rel, tid, slot, estate->es_snapshot, - &update_indexes); + oldSlot = ExecGetTriggerOldSlot(estate, resultRelInfo); + + simple_table_tuple_update(rel, tupleid, slot, estate->es_snapshot, + &update_indexes, oldSlot); if (resultRelInfo->ri_NumIndices > 0 && (update_indexes != TU_None)) - 
recheckIndexes = ExecInsertIndexTuples(resultRelInfo, - slot, estate, true, false, + recheckIndexes = ExecUpdateIndexTuples(resultRelInfo, + slot, + oldSlot, + estate, + false, NULL, NIL, (update_indexes == TU_Summarizing)); /* AFTER ROW UPDATE Triggers */ ExecARUpdateTriggers(estate, resultRelInfo, NULL, NULL, - tid, NULL, slot, + NULL, oldSlot, slot, recheckIndexes, NULL, false); list_free(recheckIndexes); @@ -622,7 +649,7 @@ ExecSimpleRelationDelete(ResultRelInfo *resultRelInfo, { bool skip_tuple = false; Relation rel = resultRelInfo->ri_RelationDesc; - ItemPointer tid = &searchslot->tts_tid; + Datum tupleid = slot_get_tupleid(rel, searchslot); CheckCmdReplicaIdentity(rel, CMD_DELETE); @@ -631,17 +658,25 @@ ExecSimpleRelationDelete(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigDesc->trig_delete_before_row) { skip_tuple = !ExecBRDeleteTriggers(estate, epqstate, resultRelInfo, - tid, NULL, NULL, NULL, NULL); + tupleid, NULL, NULL, NULL, NULL); } if (!skip_tuple) { + TupleTableSlot *oldSlot = NULL; + + oldSlot = ExecGetTriggerOldSlot(estate, resultRelInfo); + /* OK, delete the tuple */ - simple_table_tuple_delete(rel, tid, estate->es_snapshot); + simple_table_tuple_delete(rel, tupleid, estate->es_snapshot, oldSlot); + + /* delete index entries if necessary */ + if (resultRelInfo->ri_NumIndices > 0) + ExecDeleteIndexTuples(resultRelInfo, oldSlot, estate); /* AFTER ROW DELETE Triggers */ ExecARDeleteTriggers(estate, resultRelInfo, - tid, NULL, NULL, false); + NULL, oldSlot, NULL, false); } } diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index 5737f9f4ebd..5cbe3bf46d1 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -1211,9 +1211,19 @@ ExecGetChildToRootMap(ResultRelInfo *resultRelInfo) ResultRelInfo *rootRelInfo = resultRelInfo->ri_RootResultRelInfo; if (rootRelInfo) - resultRelInfo->ri_ChildToRootMap = - convert_tuples_by_name(RelationGetDescr(resultRelInfo->ri_RelationDesc), - 
RelationGetDescr(rootRelInfo->ri_RelationDesc)); + { + TupleDesc indesc = RelationGetDescr(resultRelInfo->ri_RelationDesc); + TupleDesc outdesc = RelationGetDescr(rootRelInfo->ri_RelationDesc); + AttrMap *attrMap; + + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) != ROW_REF_ROWID) + attrMap = build_attrmap_by_name_if_req(indesc, outdesc, false); + else + attrMap = build_attrmap_by_name(indesc, outdesc, false); + if (attrMap) + resultRelInfo->ri_ChildToRootMap = + convert_tuples_by_name_attrmap(indesc, outdesc, attrMap); + } else /* this isn't a child result rel */ resultRelInfo->ri_ChildToRootMap = NULL; @@ -1250,8 +1260,10 @@ ExecGetRootToChildMap(ResultRelInfo *resultRelInfo, EState *estate) * to ignore by passing true for missing_ok. */ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); - attrMap = build_attrmap_by_name_if_req(indesc, outdesc, - !childrel->rd_rel->relispartition); + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) != ROW_REF_ROWID) + attrMap = build_attrmap_by_name_if_req(indesc, outdesc, !childrel->rd_rel->relispartition); + else + attrMap = build_attrmap_by_name(indesc, outdesc, !childrel->rd_rel->relispartition); if (attrMap) resultRelInfo->ri_RootToChildMap = convert_tuples_by_name_attrmap(indesc, outdesc, attrMap); diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index b49194c0167..a8424922ccc 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -65,7 +65,7 @@ IndexOnlyNext(IndexOnlyScanState *node) ScanDirection direction; IndexScanDesc scandesc; TupleTableSlot *slot; - ItemPointer tid; + ItemPointer tid = NULL; /* * extract necessary information from index scan node @@ -117,12 +117,36 @@ IndexOnlyNext(IndexOnlyScanState *node) /* * OK, now that we have what we need, fetch the next tuple. 
*/ - while ((tid = index_getnext_tid(scandesc, direction)) != NULL) + while (true) { bool tuple_from_heap = false; CHECK_FOR_INTERRUPTS(); + if (scandesc->xs_want_rowid) + { + NullableDatum rowid; + /* Time to fetch the next TID from the index */ + rowid = index_getnext_rowid(scandesc, direction); + + /* If we're out of index entries, we're done */ + if (rowid.isnull) + break; + + /* Assert(RowidEquals(rowid, &scan->xs_rowid)); */ + } + else + { + /* Time to fetch the next TID from the index */ + tid = index_getnext_tid(scandesc, direction); + + /* If we're out of index entries, we're done */ + if (tid == NULL) + break; + + Assert(ItemPointerEquals(tid, &scandesc->xs_heaptid)); + } + /* * We can skip the heap fetch if the TID references a heap page on * which all tuples are known visible to everybody. In any case, @@ -157,7 +181,8 @@ IndexOnlyNext(IndexOnlyScanState *node) * It's worth going through this complexity to avoid needing to lock * the VM buffer, which could cause significant contention. */ - if (!VM_ALL_VISIBLE(scandesc->heapRelation, + if (!scandesc->xs_want_rowid && + !VM_ALL_VISIBLE(scandesc->heapRelation, ItemPointerGetBlockNumber(tid), &node->ioss_VMBuffer)) { @@ -242,7 +267,7 @@ IndexOnlyNext(IndexOnlyScanState *node) * If we didn't access the heap, then we'll need to take a predicate * lock explicitly, as if we had. For now we do that at page level. 
 */ - if (!tuple_from_heap) + if (!tuple_from_heap && !scandesc->xs_want_rowid) PredicateLockPage(scandesc->heapRelation, ItemPointerGetBlockNumber(tid), estate->es_snapshot); diff --git a/src/backend/executor/nodeLockRows.c b/src/backend/executor/nodeLockRows.c index 41754ddfea9..ac401d7a470 100644 --- a/src/backend/executor/nodeLockRows.c +++ b/src/backend/executor/nodeLockRows.c @@ -27,6 +27,7 @@ #include "executor/nodeLockRows.h" #include "foreign/fdwapi.h" #include "miscadmin.h" +#include "utils/datum.h" #include "utils/rel.h" @@ -157,7 +158,16 @@ ExecLockRows(PlanState *pstate) } /* okay, try to lock (and fetch) the tuple */ - tid = *((ItemPointer) DatumGetPointer(datum)); + if (erm->refType == ROW_REF_TID) + { + tid = *((ItemPointer) DatumGetPointer(datum)); + datum = PointerGetDatum(&tid); + } + else + { + Assert(erm->refType == ROW_REF_ROWID); + datum = datumCopy(datum, false, -1); + } switch (erm->markType) { case ROW_MARK_EXCLUSIVE: @@ -182,12 +192,15 @@ if (!IsolationUsesXactSnapshot()) lockflags |= TUPLE_LOCK_FLAG_FIND_LAST_VERSION; - test = table_tuple_lock(erm->relation, &tid, estate->es_snapshot, + test = table_tuple_lock(erm->relation, datum, estate->es_snapshot, markSlot, estate->es_output_cid, lockmode, erm->waitPolicy, lockflags, &tmfd); + if (erm->refType == ROW_REF_ROWID) + pfree(DatumGetPointer(datum)); + switch (test) { case TM_WouldBlock: diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 4913e493199..fb0997af2d4 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -135,12 +135,11 @@ static void ExecPendingInserts(EState *estate); static void ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, ResultRelInfo *sourcePartInfo, ResultRelInfo *destPartInfo, - ItemPointer tupleid, - TupleTableSlot *oldslot, + Datum tupleid, + TupleTableSlot *oldSlot, TupleTableSlot *newslot); static bool 
ExecOnConflictUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer conflictTid, TupleTableSlot *excludedSlot, bool canSetTag, TupleTableSlot **returning); @@ -153,13 +152,13 @@ static TupleTableSlot *ExecPrepareTupleRouting(ModifyTableState *mtstate, static TupleTableSlot *ExecMerge(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple oldtuple, bool canSetTag); static void ExecInitMerge(ModifyTableState *mtstate, EState *estate); static TupleTableSlot *ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple oldtuple, bool canSetTag, bool *matched); @@ -167,7 +166,6 @@ static TupleTableSlot *ExecMergeNotMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, bool canSetTag); - /* * Verify that the tuples to be produced by INSERT match the * target relation's rowtype @@ -276,66 +274,6 @@ ExecProcessReturning(ResultRelInfo *resultRelInfo, return ExecProject(projectReturning); } -/* - * ExecCheckTupleVisible -- verify tuple is visible - * - * It would not be consistent with guarantees of the higher isolation levels to - * proceed with avoiding insertion (taking speculative insertion's alternative - * path) on the basis of another tuple that is not visible to MVCC snapshot. - * Check for the need to raise a serialization failure, and do so as necessary. 
- */ -static void -ExecCheckTupleVisible(EState *estate, - Relation rel, - TupleTableSlot *slot) -{ - if (!IsolationUsesXactSnapshot()) - return; - - if (!table_tuple_satisfies_snapshot(rel, slot, estate->es_snapshot)) - { - Datum xminDatum; - TransactionId xmin; - bool isnull; - - xminDatum = slot_getsysattr(slot, MinTransactionIdAttributeNumber, &isnull); - Assert(!isnull); - xmin = DatumGetTransactionId(xminDatum); - - /* - * We should not raise a serialization failure if the conflict is - * against a tuple inserted by our own transaction, even if it's not - * visible to our snapshot. (This would happen, for example, if - * conflicting keys are proposed for insertion in a single command.) - */ - if (!TransactionIdIsCurrentTransactionId(xmin)) - ereport(ERROR, - (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), - errmsg("could not serialize access due to concurrent update"))); - } -} - -/* - * ExecCheckTIDVisible -- convenience variant of ExecCheckTupleVisible() - */ -static void -ExecCheckTIDVisible(EState *estate, - ResultRelInfo *relinfo, - ItemPointer tid, - TupleTableSlot *tempSlot) -{ - Relation rel = relinfo->ri_RelationDesc; - - /* Redundantly check isolation level */ - if (!IsolationUsesXactSnapshot()) - return; - - if (!table_tuple_fetch_row_version(rel, tid, SnapshotAny, tempSlot)) - elog(ERROR, "failed to fetch conflicting tuple for ON CONFLICT"); - ExecCheckTupleVisible(estate, rel, tempSlot); - ExecClearTuple(tempSlot); -} - /* * Initialize to compute stored generated columns for a tuple * @@ -576,6 +514,10 @@ ExecInitInsertProjection(ModifyTableState *mtstate, resultRelInfo->ri_newTupleSlot = table_slot_create(resultRelInfo->ri_RelationDesc, &estate->es_tupleTable); + if (node->onConflictAction == ONCONFLICT_UPDATE) + resultRelInfo->ri_oldTupleSlot = + table_slot_create(resultRelInfo->ri_RelationDesc, + &estate->es_tupleTable); /* Build ProjectionInfo if needed (it probably isn't). 
*/ if (need_projection) @@ -1017,12 +959,19 @@ ExecInsert(ModifyTableContext *context, if (onconflict != ONCONFLICT_NONE && resultRelInfo->ri_NumIndices > 0) { /* Perform a speculative insertion. */ - uint32 specToken; - ItemPointerData conflictTid; - bool specConflict; List *arbiterIndexes; + TupleTableSlot *existing = NULL, + *returningSlot, + *inserted; + LockTupleMode lockmode = LockTupleExclusive; arbiterIndexes = resultRelInfo->ri_onConflictArbiterIndexes; + returningSlot = ExecGetReturningSlot(estate, resultRelInfo); + if (onconflict == ONCONFLICT_UPDATE) + { + lockmode = ExecUpdateLockMode(estate, resultRelInfo); + existing = resultRelInfo->ri_onConflict->oc_Existing; + } /* * Do a non-conclusive check for conflicts first. @@ -1039,23 +988,29 @@ ExecInsert(ModifyTableContext *context, */ vlock: CHECK_FOR_INTERRUPTS(); - specConflict = false; - if (!ExecCheckIndexConstraints(resultRelInfo, slot, estate, - &conflictTid, arbiterIndexes)) + + inserted = table_tuple_insert_with_arbiter(resultRelInfo, + slot, estate->es_output_cid, + 0, NULL, arbiterIndexes, estate, + lockmode, existing, returningSlot); + if (!inserted) { - /* committed conflict tuple found */ if (onconflict == ONCONFLICT_UPDATE) { + TupleTableSlot *returning = NULL; + + if (TTS_EMPTY(existing)) + goto vlock; + /* * In case of ON CONFLICT DO UPDATE, execute the UPDATE * part. Be prepared to retry if the UPDATE fails because * of another concurrent UPDATE/DELETE to the conflict * tuple. */ - TupleTableSlot *returning = NULL; if (ExecOnConflictUpdate(context, resultRelInfo, - &conflictTid, slot, canSetTag, + slot, canSetTag, &returning)) { InstrCountTuples2(&mtstate->ps, 1); @@ -1078,57 +1033,13 @@ ExecInsert(ModifyTableContext *context, * ExecGetReturningSlot() in the DO NOTHING case... 
*/ Assert(onconflict == ONCONFLICT_NOTHING); - ExecCheckTIDVisible(estate, resultRelInfo, &conflictTid, - ExecGetReturningSlot(estate, resultRelInfo)); InstrCountTuples2(&mtstate->ps, 1); return NULL; } } - - /* - * Before we start insertion proper, acquire our "speculative - * insertion lock". Others can use that to wait for us to decide - * if we're going to go ahead with the insertion, instead of - * waiting for the whole transaction to complete. - */ - specToken = SpeculativeInsertionLockAcquire(GetCurrentTransactionId()); - - /* insert the tuple, with the speculative token */ - table_tuple_insert_speculative(resultRelationDesc, slot, - estate->es_output_cid, - 0, - NULL, - specToken); - - /* insert index entries for tuple */ - recheckIndexes = ExecInsertIndexTuples(resultRelInfo, - slot, estate, false, true, - &specConflict, - arbiterIndexes, - false); - - /* adjust the tuple's state accordingly */ - table_tuple_complete_speculative(resultRelationDesc, slot, - specToken, !specConflict); - - /* - * Wake up anyone waiting for our decision. They will re-check - * the tuple, see that it's no longer speculative, and wait on our - * XID as if this was a regularly inserted tuple all along. Or if - * we killed the tuple, they will see it's dead, and proceed as if - * the tuple never existed. - */ - SpeculativeInsertionLockRelease(GetCurrentTransactionId()); - - /* - * If there was a conflict, start from the beginning. We'll do - * the pre-check again, which will now find the conflicting tuple - * (unless it aborts before we get there). 
- */ - if (specConflict) + else { - list_free(recheckIndexes); - goto vlock; + slot = inserted; } /* Since there was no insertion conflict, we're done */ @@ -1136,9 +1047,9 @@ ExecInsert(ModifyTableContext *context, else { /* insert the tuple normally */ - table_tuple_insert(resultRelationDesc, slot, - estate->es_output_cid, - 0, NULL); + slot = table_tuple_insert(resultRelationDesc, slot, + estate->es_output_cid, + 0, NULL); /* insert index entries for tuple */ if (resultRelInfo->ri_NumIndices > 0) @@ -1165,7 +1076,7 @@ ExecInsert(ModifyTableContext *context, ExecARUpdateTriggers(estate, resultRelInfo, NULL, NULL, NULL, - NULL, + resultRelInfo->ri_oldTupleSlot, slot, NULL, mtstate->mt_transition_capture, @@ -1314,12 +1225,20 @@ ExecPendingInserts(EState *estate) */ static bool ExecDeletePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, + Datum tupleid, HeapTuple oldtuple, TupleTableSlot **epqreturnslot, TM_Result *result) { if (result) *result = TM_Ok; + /* + * Open the table's indexes, if we have not done so already, so that we + * can delete index entries. 
+ */ + if (resultRelInfo->ri_RelationDesc->rd_rel->relhasindex && + resultRelInfo->ri_IndexRelationDescs == NULL) + ExecOpenIndices(resultRelInfo, false); + /* BEFORE ROW DELETE triggers */ if (resultRelInfo->ri_TrigDesc && resultRelInfo->ri_TrigDesc->trig_delete_before_row) @@ -1345,7 +1264,8 @@ ExecDeletePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ static TM_Result ExecDeleteAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, bool changingPart) + Datum tupleid, bool changingPart, int options, + TupleTableSlot *oldSlot) { EState *estate = context->estate; @@ -1353,9 +1273,10 @@ ExecDeleteAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, estate->es_output_cid, estate->es_snapshot, estate->es_crosscheck_snapshot, - true /* wait for commit */ , + options /* wait for commit */ , &context->tmfd, - changingPart); + changingPart, + oldSlot); } /* @@ -1367,12 +1288,17 @@ ExecDeleteAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ static void ExecDeleteEpilogue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, bool changingPart) + HeapTuple oldtuple, + TupleTableSlot *slot, bool changingPart) { ModifyTableState *mtstate = context->mtstate; EState *estate = context->estate; TransitionCaptureState *ar_delete_trig_tcs; + /* delete index entries if necessary */ + if (resultRelInfo->ri_NumIndices > 0) + ExecDeleteIndexTuples(resultRelInfo, slot, context->estate); + /* * If this delete is the result of a partition key update that moved the * tuple to a new partition, put this row into the transition OLD TABLE, @@ -1385,8 +1311,8 @@ ExecDeleteEpilogue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, { ExecARUpdateTriggers(estate, resultRelInfo, NULL, NULL, - tupleid, oldtuple, - NULL, NULL, mtstate->mt_transition_capture, + oldtuple, + slot, NULL, NULL, mtstate->mt_transition_capture, false); /* @@ -1397,10 +1323,30 @@ 
ExecDeleteEpilogue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, } /* AFTER ROW DELETE Triggers */ - ExecARDeleteTriggers(estate, resultRelInfo, tupleid, oldtuple, + ExecARDeleteTriggers(estate, resultRelInfo, oldtuple, slot, ar_delete_trig_tcs, changingPart); } +/* + * Initializes the tuple slot in a ResultRelInfo for DELETE action. + * + * We mark 'projectNewInfoValid' even though the projections themselves + * are not initialized here. + */ +static void +ExecInitDeleteTupleSlot(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo) +{ + EState *estate = mtstate->ps.state; + + Assert(!resultRelInfo->ri_projectNewInfoValid); + + resultRelInfo->ri_oldTupleSlot = + table_slot_create(resultRelInfo->ri_RelationDesc, + &estate->es_tupleTable); + resultRelInfo->ri_projectNewInfoValid = true; +} + /* ---------------------------------------------------------------- * ExecDelete * @@ -1426,8 +1372,9 @@ ExecDeleteEpilogue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, static TupleTableSlot * ExecDelete(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple oldtuple, + TupleTableSlot *oldSlot, bool processReturning, bool changingPart, bool canSetTag, @@ -1491,6 +1438,11 @@ ExecDelete(ModifyTableContext *context, } else { + int options = TABLE_MODIFY_WAIT | TABLE_MODIFY_FETCH_OLD_TUPLE; + + if (!IsolationUsesXactSnapshot()) + options |= TABLE_MODIFY_LOCK_UPDATED; + /* * delete the tuple * @@ -1501,7 +1453,8 @@ ExecDelete(ModifyTableContext *context, * transaction-snapshot mode transactions. 
*/ ldelete: - result = ExecDeleteAct(context, resultRelInfo, tupleid, changingPart); + result = ExecDeleteAct(context, resultRelInfo, tupleid, changingPart, + options, oldSlot); if (tmresult) *tmresult = result; @@ -1548,7 +1501,6 @@ ExecDelete(ModifyTableContext *context, case TM_Updated: { - TupleTableSlot *inputslot; TupleTableSlot *epqslot; if (IsolationUsesXactSnapshot()) @@ -1557,87 +1509,29 @@ ExecDelete(ModifyTableContext *context, errmsg("could not serialize access due to concurrent update"))); /* - * Already know that we're going to need to do EPQ, so - * fetch tuple directly into the right slot. + * We need to do EPQ. The latest tuple is already found + * and locked as a result of TABLE_MODIFY_LOCK_UPDATED. */ - EvalPlanQualBegin(context->epqstate); - inputslot = EvalPlanQualSlot(context->epqstate, resultRelationDesc, - resultRelInfo->ri_RangeTableIndex); + Assert(context->tmfd.traversed); + epqslot = EvalPlanQual(context->epqstate, + resultRelationDesc, + resultRelInfo->ri_RangeTableIndex, + oldSlot); + if (TupIsNull(epqslot)) + /* Tuple not passing quals anymore, exiting... */ + return NULL; - result = table_tuple_lock(resultRelationDesc, tupleid, - estate->es_snapshot, - inputslot, estate->es_output_cid, - LockTupleExclusive, LockWaitBlock, - TUPLE_LOCK_FLAG_FIND_LAST_VERSION, - &context->tmfd); - - switch (result) + /* + * If requested, skip delete and pass back the updated + * row. + */ + if (epqreturnslot) { - case TM_Ok: - Assert(context->tmfd.traversed); - epqslot = EvalPlanQual(context->epqstate, - resultRelationDesc, - resultRelInfo->ri_RangeTableIndex, - inputslot); - if (TupIsNull(epqslot)) - /* Tuple not passing quals anymore, exiting... */ - return NULL; - - /* - * If requested, skip delete and pass back the - * updated row. 
- */ - if (epqreturnslot) - { - *epqreturnslot = epqslot; - return NULL; - } - else - goto ldelete; - - case TM_SelfModified: - - /* - * This can be reached when following an update - * chain from a tuple updated by another session, - * reaching a tuple that was already updated in - * this transaction. If previously updated by this - * command, ignore the delete, otherwise error - * out. - * - * See also TM_SelfModified response to - * table_tuple_delete() above. - */ - if (context->tmfd.cmax != estate->es_output_cid) - ereport(ERROR, - (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION), - errmsg("tuple to be deleted was already modified by an operation triggered by the current command"), - errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows."))); - return NULL; - - case TM_Deleted: - /* tuple already deleted; nothing to do */ - return NULL; - - default: - - /* - * TM_Invisible should be impossible because we're - * waiting for updated row versions, and would - * already have errored out if the first version - * is invisible. - * - * TM_Updated should be impossible, because we're - * locking the latest version via - * TUPLE_LOCK_FLAG_FIND_LAST_VERSION. 
- */ - elog(ERROR, "unexpected table_tuple_lock status: %u", - result); - return NULL; + *epqreturnslot = epqslot; + return NULL; } - - Assert(false); - break; + else + goto ldelete; } case TM_Deleted: @@ -1671,7 +1565,8 @@ ExecDelete(ModifyTableContext *context, if (tupleDeleted) *tupleDeleted = true; - ExecDeleteEpilogue(context, resultRelInfo, tupleid, oldtuple, changingPart); + ExecDeleteEpilogue(context, resultRelInfo, oldtuple, + oldSlot, changingPart); /* Process RETURNING if present and if requested */ if (processReturning && resultRelInfo->ri_projectReturning) @@ -1687,19 +1582,15 @@ ExecDelete(ModifyTableContext *context, /* FDW must have provided a slot containing the deleted row */ Assert(!TupIsNull(slot)); } - else + else if (!slot || TupIsNull(slot)) { + /* Copy old tuple to the returning slot */ slot = ExecGetReturningSlot(estate, resultRelInfo); if (oldtuple != NULL) - { ExecForceStoreHeapTuple(oldtuple, slot, false); - } else - { - if (!table_tuple_fetch_row_version(resultRelationDesc, tupleid, - SnapshotAny, slot)) - elog(ERROR, "failed to fetch deleted tuple for DELETE RETURNING"); - } + ExecCopySlot(slot, oldSlot); + Assert(!TupIsNull(slot)); } rslot = ExecProcessReturning(resultRelInfo, slot, context->planSlot); @@ -1740,7 +1631,7 @@ ExecDelete(ModifyTableContext *context, static bool ExecCrossPartitionUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, + Datum tupleid, HeapTuple oldtuple, TupleTableSlot *slot, bool canSetTag, UpdateContext *updateCxt, @@ -1799,12 +1690,16 @@ ExecCrossPartitionUpdate(ModifyTableContext *context, MemoryContextSwitchTo(oldcxt); } + /* Make sure ri_oldTupleSlot is initialized. */ + if (unlikely(!resultRelInfo->ri_projectNewInfoValid)) + ExecInitUpdateProjection(mtstate, resultRelInfo); + /* * Row movement, part 1. Delete the tuple, but skip RETURNING processing. * We want to return rows from INSERT. 
*/ ExecDelete(context, resultRelInfo, - tupleid, oldtuple, + tupleid, oldtuple, resultRelInfo->ri_oldTupleSlot, false, /* processReturning */ true, /* changingPart */ false, /* canSetTag */ @@ -1845,21 +1740,13 @@ ExecCrossPartitionUpdate(ModifyTableContext *context, return true; else { - /* Fetch the most recent version of old tuple. */ - TupleTableSlot *oldSlot; - - /* ... but first, make sure ri_oldTupleSlot is initialized. */ - if (unlikely(!resultRelInfo->ri_projectNewInfoValid)) - ExecInitUpdateProjection(mtstate, resultRelInfo); - oldSlot = resultRelInfo->ri_oldTupleSlot; - if (!table_tuple_fetch_row_version(resultRelInfo->ri_RelationDesc, - tupleid, - SnapshotAny, - oldSlot)) - elog(ERROR, "failed to fetch tuple being updated"); - /* and project the new tuple to retry the UPDATE with */ + /* + * ExecDelete already fetches the most recent version of old tuple + * to resultRelInfo->ri_RelationDesc. So, just project the new + * tuple to retry the UPDATE with. + */ *retry_slot = ExecGetUpdateNewTuple(resultRelInfo, epqslot, - oldSlot); + resultRelInfo->ri_oldTupleSlot); return false; } } @@ -1877,8 +1764,8 @@ ExecCrossPartitionUpdate(ModifyTableContext *context, /* Tuple routing starts from the root table. 
*/ context->cpUpdateReturningSlot = - ExecInsert(context, mtstate->rootResultRelInfo, slot, canSetTag, - inserted_tuple, insert_destrel); + ExecInsert(context, mtstate->rootResultRelInfo, + slot, canSetTag, inserted_tuple, insert_destrel); /* * Reset the transition state that may possibly have been written by @@ -1900,7 +1787,7 @@ ExecCrossPartitionUpdate(ModifyTableContext *context, */ static bool ExecUpdatePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, TupleTableSlot *slot, + Datum tupleid, HeapTuple oldtuple, TupleTableSlot *slot, TM_Result *result) { Relation resultRelationDesc = resultRelInfo->ri_RelationDesc; @@ -1977,8 +1864,9 @@ ExecUpdatePrepareSlot(ResultRelInfo *resultRelInfo, */ static TM_Result ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, TupleTableSlot *slot, - bool canSetTag, UpdateContext *updateCxt) + Datum tupleid, HeapTuple oldtuple, TupleTableSlot *slot, + bool canSetTag, int options, TupleTableSlot *oldSlot, + UpdateContext *updateCxt) { EState *estate = context->estate; Relation resultRelationDesc = resultRelInfo->ri_RelationDesc; @@ -2070,7 +1958,8 @@ ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, ExecCrossPartitionUpdateForeignKey(context, resultRelInfo, insert_destrel, - tupleid, slot, + tupleid, + resultRelInfo->ri_oldTupleSlot, inserted_tuple); return TM_Ok; @@ -2113,10 +2002,10 @@ ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, estate->es_output_cid, estate->es_snapshot, estate->es_crosscheck_snapshot, - true /* wait for commit */ , + options /* wait for commit */ , &context->tmfd, &updateCxt->lockmode, - &updateCxt->updateIndexes); - + &updateCxt->updateIndexes, + oldSlot); return result; } @@ -2128,24 +2017,29 @@ ExecUpdateAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ static void ExecUpdateEpilogue(ModifyTableContext *context, 
UpdateContext *updateCxt, - ResultRelInfo *resultRelInfo, ItemPointer tupleid, - HeapTuple oldtuple, TupleTableSlot *slot) + ResultRelInfo *resultRelInfo, + HeapTuple oldtuple, TupleTableSlot *slot, + TupleTableSlot *oldSlot) { ModifyTableState *mtstate = context->mtstate; List *recheckIndexes = NIL; /* insert index entries for tuple if necessary */ if (resultRelInfo->ri_NumIndices > 0 && (updateCxt->updateIndexes != TU_None)) - recheckIndexes = ExecInsertIndexTuples(resultRelInfo, - slot, context->estate, - true, false, + { + recheckIndexes = ExecUpdateIndexTuples(resultRelInfo, + slot, + oldSlot, + context->estate, + false, NULL, NIL, (updateCxt->updateIndexes == TU_Summarizing)); + } /* AFTER ROW UPDATE Triggers */ ExecARUpdateTriggers(context->estate, resultRelInfo, NULL, NULL, - tupleid, oldtuple, slot, + oldtuple, oldSlot, slot, recheckIndexes, mtstate->operation == CMD_INSERT ? mtstate->mt_oc_transition_capture : @@ -2177,7 +2071,7 @@ static void ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, ResultRelInfo *sourcePartInfo, ResultRelInfo *destPartInfo, - ItemPointer tupleid, + Datum tupleid, TupleTableSlot *oldslot, TupleTableSlot *newslot) { @@ -2234,7 +2128,7 @@ ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, /* Perform the root table's triggers. */ ExecARUpdateTriggers(context->estate, rootRelInfo, sourcePartInfo, destPartInfo, - tupleid, NULL, newslot, NIL, NULL, true); + NULL, oldslot, newslot, NIL, NULL, true); } /* ---------------------------------------------------------------- @@ -2256,6 +2150,7 @@ ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, * NULL when the foreign table has no relevant triggers. * * slot contains the new tuple value to be stored. + * oldSlot is the slot to store the old tuple. * planSlot is the output of the ModifyTable's subplan; we use it * to access values from other input tables (for RETURNING), * row-ID junk columns, etc. 
@@ -2267,8 +2162,8 @@ ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, */ static TupleTableSlot * ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, TupleTableSlot *slot, - bool canSetTag) + Datum tupleid, HeapTuple oldtuple, TupleTableSlot *slot, + TupleTableSlot *oldSlot, bool canSetTag, bool locked) { EState *estate = context->estate; Relation resultRelationDesc = resultRelInfo->ri_RelationDesc; @@ -2321,6 +2216,15 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, } else { + int options = TABLE_MODIFY_WAIT; + + if (!locked) + { + options |= TABLE_MODIFY_FETCH_OLD_TUPLE; + if (!IsolationUsesXactSnapshot()) + options |= TABLE_MODIFY_LOCK_UPDATED; + } + /* * If we generate a new candidate tuple after EvalPlanQual testing, we * must loop back here to try again. (We don't need to redo triggers, @@ -2330,7 +2234,7 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ redo_act: result = ExecUpdateAct(context, resultRelInfo, tupleid, oldtuple, slot, - canSetTag, &updateCxt); + canSetTag, options, oldSlot, &updateCxt); /* * If ExecUpdateAct reports that a cross-partition update was done, @@ -2381,88 +2285,30 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, case TM_Updated: { - TupleTableSlot *inputslot; TupleTableSlot *epqslot; - TupleTableSlot *oldSlot; if (IsolationUsesXactSnapshot()) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); + Assert(!locked); /* - * Already know that we're going to need to do EPQ, so - * fetch tuple directly into the right slot. + * We need to do EPQ. The latest tuple is already found + * and locked as a result of TABLE_MODIFY_LOCK_UPDATED. 
*/ - inputslot = EvalPlanQualSlot(context->epqstate, resultRelationDesc, - resultRelInfo->ri_RangeTableIndex); - - result = table_tuple_lock(resultRelationDesc, tupleid, - estate->es_snapshot, - inputslot, estate->es_output_cid, - updateCxt.lockmode, LockWaitBlock, - TUPLE_LOCK_FLAG_FIND_LAST_VERSION, - &context->tmfd); - - switch (result) - { - case TM_Ok: - Assert(context->tmfd.traversed); - - epqslot = EvalPlanQual(context->epqstate, - resultRelationDesc, - resultRelInfo->ri_RangeTableIndex, - inputslot); - if (TupIsNull(epqslot)) - /* Tuple not passing quals anymore, exiting... */ - return NULL; - - /* Make sure ri_oldTupleSlot is initialized. */ - if (unlikely(!resultRelInfo->ri_projectNewInfoValid)) - ExecInitUpdateProjection(context->mtstate, - resultRelInfo); - - /* Fetch the most recent version of old tuple. */ - oldSlot = resultRelInfo->ri_oldTupleSlot; - if (!table_tuple_fetch_row_version(resultRelationDesc, - tupleid, - SnapshotAny, - oldSlot)) - elog(ERROR, "failed to fetch tuple being updated"); - slot = ExecGetUpdateNewTuple(resultRelInfo, - epqslot, oldSlot); - goto redo_act; - - case TM_Deleted: - /* tuple already deleted; nothing to do */ - return NULL; - - case TM_SelfModified: - - /* - * This can be reached when following an update - * chain from a tuple updated by another session, - * reaching a tuple that was already updated in - * this transaction. If previously modified by - * this command, ignore the redundant update, - * otherwise error out. - * - * See also TM_SelfModified response to - * table_tuple_update() above. 
- */ - if (context->tmfd.cmax != estate->es_output_cid) - ereport(ERROR, - (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION), - errmsg("tuple to be updated was already modified by an operation triggered by the current command"), - errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows."))); - return NULL; - - default: - /* see table_tuple_lock call in ExecDelete() */ - elog(ERROR, "unexpected table_tuple_lock status: %u", - result); - return NULL; - } + Assert(context->tmfd.traversed); + epqslot = EvalPlanQual(context->epqstate, + resultRelationDesc, + resultRelInfo->ri_RangeTableIndex, + oldSlot); + if (TupIsNull(epqslot)) + /* Tuple not passing quals anymore, exiting... */ + return NULL; + slot = ExecGetUpdateNewTuple(resultRelInfo, + epqslot, + oldSlot); + goto redo_act; } break; @@ -2485,8 +2331,8 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, if (canSetTag) (estate->es_processed)++; - ExecUpdateEpilogue(context, &updateCxt, resultRelInfo, tupleid, oldtuple, - slot); + ExecUpdateEpilogue(context, &updateCxt, resultRelInfo, oldtuple, + slot, oldSlot); /* Process RETURNING if present */ if (resultRelInfo->ri_projectReturning) @@ -2509,144 +2355,26 @@ ExecUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, static bool ExecOnConflictUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer conflictTid, TupleTableSlot *excludedSlot, bool canSetTag, TupleTableSlot **returning) { ModifyTableState *mtstate = context->mtstate; ExprContext *econtext = mtstate->ps.ps_ExprContext; - Relation relation = resultRelInfo->ri_RelationDesc; ExprState *onConflictSetWhere = resultRelInfo->ri_onConflict->oc_WhereClause; TupleTableSlot *existing = resultRelInfo->ri_onConflict->oc_Existing; - TM_FailureData tmfd; - LockTupleMode lockmode; - TM_Result test; - Datum xminDatum; - TransactionId xmin; - bool isnull; - - /* Determine lock mode to use */ - lockmode = 
ExecUpdateLockMode(context->estate, resultRelInfo); + Datum tupleid; - /* - * Lock tuple for update. Don't follow updates when tuple cannot be - * locked without doing so. A row locking conflict here means our - * previous conclusion that the tuple is conclusively committed is not - * true anymore. - */ - test = table_tuple_lock(relation, conflictTid, - context->estate->es_snapshot, - existing, context->estate->es_output_cid, - lockmode, LockWaitBlock, 0, - &tmfd); - switch (test) + if (table_get_row_ref_type(resultRelInfo->ri_RelationDesc) == ROW_REF_ROWID) { - case TM_Ok: - /* success! */ - break; - - case TM_Invisible: - - /* - * This can occur when a just inserted tuple is updated again in - * the same command. E.g. because multiple rows with the same - * conflicting key values are inserted. - * - * This is somewhat similar to the ExecUpdate() TM_SelfModified - * case. We do not want to proceed because it would lead to the - * same row being updated a second time in some unspecified order, - * and in contrast to plain UPDATEs there's no historical behavior - * to break. - * - * It is the user's responsibility to prevent this situation from - * occurring. These problems are why the SQL standard similarly - * specifies that for SQL MERGE, an exception must be raised in - * the event of an attempt to update the same row twice. 
- */ - xminDatum = slot_getsysattr(existing, - MinTransactionIdAttributeNumber, - &isnull); - Assert(!isnull); - xmin = DatumGetTransactionId(xminDatum); - - if (TransactionIdIsCurrentTransactionId(xmin)) - ereport(ERROR, - (errcode(ERRCODE_CARDINALITY_VIOLATION), - /* translator: %s is a SQL command name */ - errmsg("%s command cannot affect row a second time", - "ON CONFLICT DO UPDATE"), - errhint("Ensure that no rows proposed for insertion within the same command have duplicate constrained values."))); - - /* This shouldn't happen */ - elog(ERROR, "attempted to lock invisible tuple"); - break; - - case TM_SelfModified: - - /* - * This state should never be reached. As a dirty snapshot is used - * to find conflicting tuples, speculative insertion wouldn't have - * seen this row to conflict with. - */ - elog(ERROR, "unexpected self-updated tuple"); - break; - - case TM_Updated: - if (IsolationUsesXactSnapshot()) - ereport(ERROR, - (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), - errmsg("could not serialize access due to concurrent update"))); - - /* - * As long as we don't support an UPDATE of INSERT ON CONFLICT for - * a partitioned table we shouldn't reach to a case where tuple to - * be lock is moved to another partition due to concurrent update - * of the partition key. - */ - Assert(!ItemPointerIndicatesMovedPartitions(&tmfd.ctid)); - - /* - * Tell caller to try again from the very start. - * - * It does not make sense to use the usual EvalPlanQual() style - * loop here, as the new version of the row might not conflict - * anymore, or the conflicting tuple has actually been deleted. 
- */ - ExecClearTuple(existing); - return false; - - case TM_Deleted: - if (IsolationUsesXactSnapshot()) - ereport(ERROR, - (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), - errmsg("could not serialize access due to concurrent delete"))); - - /* see TM_Updated case */ - Assert(!ItemPointerIndicatesMovedPartitions(&tmfd.ctid)); - ExecClearTuple(existing); - return false; - - default: - elog(ERROR, "unrecognized table_tuple_lock status: %u", test); + bool isnull; + tupleid = slot_getsysattr(existing, RowIdAttributeNumber, &isnull); + Assert(!isnull); + } + else + { + tupleid = PointerGetDatum(&existing->tts_tid); } - - /* Success, the tuple is locked. */ - - /* - * Verify that the tuple is visible to our MVCC snapshot if the current - * isolation level mandates that. - * - * It's not sufficient to rely on the check within ExecUpdate() as e.g. - * CONFLICT ... WHERE clause may prevent us from reaching that. - * - * This means we only ever continue when a new command in the current - * transaction could see the row, even though in READ COMMITTED mode the - * tuple will not be visible according to the current statement's - * snapshot. This is in line with the way UPDATE deals with newer tuple - * versions. 
- */ - ExecCheckTupleVisible(context->estate, relation, existing); /* * Make tuple and any needed join variables available to ExecQual and @@ -2702,9 +2430,10 @@ ExecOnConflictUpdate(ModifyTableContext *context, /* Execute UPDATE with projection */ *returning = ExecUpdate(context, resultRelInfo, - conflictTid, NULL, + tupleid, NULL, resultRelInfo->ri_onConflict->oc_ProjSlot, - canSetTag); + existing, + canSetTag, true); /* * Clear out existing tuple, as there might not be another conflict among @@ -2720,7 +2449,7 @@ ExecOnConflictUpdate(ModifyTableContext *context, */ static TupleTableSlot * ExecMerge(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, bool canSetTag) + Datum tupleid, HeapTuple oldtuple, bool canSetTag) { TupleTableSlot *rslot = NULL; bool matched; @@ -2786,7 +2515,7 @@ ExecMerge(ModifyTableContext *context, ResultRelInfo *resultRelInfo, * update chain and we never switch from ExecMergeNotMatched() to * ExecMergeMatched(), there is no risk of a livelock. */ - matched = tupleid != NULL || oldtuple != NULL; + matched = DatumGetPointer(tupleid) != NULL || oldtuple != NULL; if (matched) rslot = ExecMergeMatched(context, resultRelInfo, tupleid, oldtuple, canSetTag, &matched); @@ -2846,7 +2575,7 @@ ExecMerge(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ static TupleTableSlot * ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple, bool canSetTag, + Datum tupleid, HeapTuple oldtuple, bool canSetTag, bool *matched) { ModifyTableState *mtstate = context->mtstate; @@ -2886,7 +2615,7 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, * either have the tupleid of the target row, or an old tuple from the * target wholerow junk attr. 
*/ - Assert(tupleid != NULL || oldtuple != NULL); + Assert(DatumGetPointer(tupleid) != NULL || oldtuple != NULL); if (oldtuple != NULL) ExecForceStoreHeapTuple(oldtuple, resultRelInfo->ri_oldTupleSlot, false); @@ -2985,7 +2714,7 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, Assert(oldtuple == NULL); result = ExecUpdateAct(context, resultRelInfo, tupleid, - NULL, newslot, canSetTag, + NULL, newslot, canSetTag, TABLE_MODIFY_WAIT, NULL, &updateCxt); /* @@ -3007,7 +2736,8 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, if (result == TM_Ok) { ExecUpdateEpilogue(context, &updateCxt, resultRelInfo, - tupleid, NULL, newslot); + NULL, newslot, + resultRelInfo->ri_oldTupleSlot); mtstate->mt_merge_updated += 1; } break; @@ -3037,13 +2767,13 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, Assert(oldtuple == NULL); result = ExecDeleteAct(context, resultRelInfo, tupleid, - false); + false, TABLE_MODIFY_WAIT, NULL); } if (result == TM_Ok) { - ExecDeleteEpilogue(context, resultRelInfo, tupleid, NULL, - false); + ExecDeleteEpilogue(context, resultRelInfo, NULL, + resultRelInfo->ri_oldTupleSlot, false); mtstate->mt_merge_deleted += 1; } break; @@ -3154,7 +2884,6 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, switch (result) { case TM_Ok: - /* * If the tuple was updated and migrated to * another partition concurrently, the current @@ -3196,9 +2925,13 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, * join quals no longer pass and we switch to * the NOT MATCHED BY SOURCE case. */ - (void) ExecGetJunkAttribute(epqslot, - resultRelInfo->ri_RowIdAttNo, - &isNull); + /* + * Update tupleid to that of the new tuple, for + * the refetch we do at the top. 
+ */ + tupleid = ExecGetJunkAttribute(epqslot, + resultRelInfo->ri_RowIdAttNo, + &isNull); if (isNull) *matched = false; @@ -3207,8 +2940,8 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, * we need to switch to the NOT MATCHED BY * SOURCE case. */ - if (!table_tuple_fetch_row_version(resultRelationDesc, - &context->tmfd.ctid, + if (!isNull && !table_tuple_fetch_row_version(resultRelationDesc, + tupleid, SnapshotAny, resultRelInfo->ri_oldTupleSlot)) elog(ERROR, "failed to fetch the target tuple"); @@ -3225,6 +2958,11 @@ ExecMergeMatched(ModifyTableContext *context, ResultRelInfo *resultRelInfo, /* * Loop back and process the MATCHED or NOT * MATCHED BY SOURCE actions from the start. + * A non-NULL ctid means that we are still dealing + * with MATCHED case. Restart the loop so that we + * apply all the MATCHED rules again, to ensure + * that the first qualifying WHEN MATCHED action + * is executed. */ goto lmerge_matched; @@ -3763,10 +3501,10 @@ ExecModifyTable(PlanState *pstate) PlanState *subplanstate; TupleTableSlot *slot; TupleTableSlot *oldSlot; + Datum tupleid; ItemPointerData tuple_ctid; HeapTupleData oldtupdata; HeapTuple oldtuple; - ItemPointer tupleid; CHECK_FOR_INTERRUPTS(); @@ -3815,6 +3553,8 @@ ExecModifyTable(PlanState *pstate) */ for (;;) { + RowRefType refType; + /* * Reset the per-output-tuple exprcontext. This is needed because * triggers expect to use that context as workspace. It's a bit ugly @@ -3890,7 +3630,7 @@ ExecModifyTable(PlanState *pstate) EvalPlanQualSetSlot(&node->mt_epqstate, context.planSlot); slot = ExecMerge(&context, node->resultRelInfo, - NULL, NULL, node->canSetTag); + PointerGetDatum(NULL), NULL, node->canSetTag); /* * If we got a RETURNING result, return it to the caller. 
@@ -3934,7 +3674,8 @@ ExecModifyTable(PlanState *pstate) EvalPlanQualSetSlot(&node->mt_epqstate, context.planSlot); slot = context.planSlot; - tupleid = NULL; + refType = resultRelInfo->ri_RowRefType; + tupleid = PointerGetDatum(NULL); oldtuple = NULL; /* @@ -3977,7 +3718,7 @@ ExecModifyTable(PlanState *pstate) EvalPlanQualSetSlot(&node->mt_epqstate, context.planSlot); slot = ExecMerge(&context, node->resultRelInfo, - NULL, NULL, node->canSetTag); + PointerGetDatum(NULL), NULL, node->canSetTag); /* * If we got a RETURNING result, return it to the @@ -3992,9 +3733,24 @@ ExecModifyTable(PlanState *pstate) elog(ERROR, "ctid is NULL"); } - tupleid = (ItemPointer) DatumGetPointer(datum); - tuple_ctid = *tupleid; /* be sure we don't free ctid!! */ - tupleid = &tuple_ctid; + if (refType == ROW_REF_TID) + { + /* shouldn't ever get a null result... */ + if (isNull) + elog(ERROR, "ctid is NULL"); + + tuple_ctid = *((ItemPointer) DatumGetPointer(datum)); /* be sure we don't free ctid!! */ + tupleid = PointerGetDatum(&tuple_ctid); + } + else + { + Assert(refType == ROW_REF_ROWID); + /* shouldn't ever get a null result... */ + if (isNull) + elog(ERROR, "rowid is NULL"); + + tupleid = datumCopy(datum, false, -1); + } } /* @@ -4034,7 +3790,7 @@ ExecModifyTable(PlanState *pstate) EvalPlanQualSetSlot(&node->mt_epqstate, context.planSlot); slot = ExecMerge(&context, node->resultRelInfo, - NULL, NULL, node->canSetTag); + PointerGetDatum(NULL), NULL, node->canSetTag); /* * If we got a RETURNING result, return it to the @@ -4098,6 +3854,7 @@ ExecModifyTable(PlanState *pstate) /* Fetch the most recent version of old tuple. */ Relation relation = resultRelInfo->ri_RelationDesc; + Assert(DatumGetPointer(tupleid) != NULL); if (!table_tuple_fetch_row_version(relation, tupleid, SnapshotAny, oldSlot)) @@ -4108,12 +3865,18 @@ ExecModifyTable(PlanState *pstate) /* Now apply the update. 
*/ slot = ExecUpdate(&context, resultRelInfo, tupleid, oldtuple, - slot, node->canSetTag); + slot, resultRelInfo->ri_oldTupleSlot, + node->canSetTag, false); break; case CMD_DELETE: + /* Initialize slot for DELETE to fetch the old tuple */ + if (unlikely(!resultRelInfo->ri_projectNewInfoValid)) + ExecInitDeleteTupleSlot(node, resultRelInfo); + slot = ExecDelete(&context, resultRelInfo, tupleid, oldtuple, - true, false, node->canSetTag, NULL, NULL, NULL); + resultRelInfo->ri_oldTupleSlot, true, false, + node->canSetTag, NULL, NULL, NULL); break; case CMD_MERGE: @@ -4126,6 +3889,9 @@ ExecModifyTable(PlanState *pstate) break; } + if (refType == ROW_REF_ROWID && DatumGetPointer(tupleid) != NULL) + pfree(DatumGetPointer(tupleid)); + /* * If we got a RETURNING result, return it to caller. We'll continue * the work on next call. @@ -4370,10 +4136,20 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) relkind == RELKIND_MATVIEW || relkind == RELKIND_PARTITIONED_TABLE) { - resultRelInfo->ri_RowIdAttNo = - ExecFindJunkAttributeInTlist(subplan->targetlist, "ctid"); - if (!AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo)) - elog(ERROR, "could not find junk ctid column"); + if (resultRelInfo->ri_RowRefType == ROW_REF_TID) + { + resultRelInfo->ri_RowIdAttNo = + ExecFindJunkAttributeInTlist(subplan->targetlist, "ctid"); + if (!AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo)) + elog(ERROR, "could not find junk ctid column"); + } + else + { + resultRelInfo->ri_RowIdAttNo = + ExecFindJunkAttributeInTlist(subplan->targetlist, "rowid"); + if (!AttributeNumberIsValid(resultRelInfo->ri_RowIdAttNo)) + elog(ERROR, "could not find junk rowid column"); + } } else if (relkind == RELKIND_FOREIGN_TABLE) { @@ -4683,6 +4459,8 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) estate->es_auxmodifytables = lcons(mtstate, estate->es_auxmodifytables); + + return mtstate; } diff --git a/src/backend/executor/nodeTidscan.c 
b/src/backend/executor/nodeTidscan.c index 864a9013b62..f4a124ac4eb 100644 --- a/src/backend/executor/nodeTidscan.c +++ b/src/backend/executor/nodeTidscan.c @@ -377,7 +377,7 @@ TidNext(TidScanState *node) if (node->tss_isCurrentOf) table_tuple_get_latest_tid(scan, &tid); - if (table_tuple_fetch_row_version(heapRelation, &tid, snapshot, slot)) + if (table_tuple_fetch_row_version(heapRelation, PointerGetDatum(&tid), snapshot, slot)) return slot; /* Bad TID or failed snapshot qual; try next */ diff --git a/src/backend/nodes/read.c b/src/backend/nodes/read.c index 4eb42445c52..ffa147ee4c8 100644 --- a/src/backend/nodes/read.c +++ b/src/backend/nodes/read.c @@ -205,6 +205,17 @@ pg_strtok(int *length) return ret_str; } +bool +pg_str_hasfield(void) +{ + const char *local_str = pg_strtok_ptr; + + while (*local_str == ' ' || *local_str == '\n' || *local_str == '\t') + local_str++; + + return (*local_str == ':'); +} + /* * debackslash - * create a palloc'd string holding the given token. diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 4895cee9944..7e02b670931 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -83,6 +83,7 @@ int min_parallel_index_scan_size; /* Hook for plugins to get control in set_rel_pathlist() */ set_rel_pathlist_hook_type set_rel_pathlist_hook = NULL; +set_plain_rel_pathlist_hook_type set_plain_rel_pathlist_hook = NULL; /* Hook for plugins to replace standard_join_search() */ join_search_hook_type join_search_hook = NULL; @@ -772,8 +773,10 @@ set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) */ required_outer = rel->lateral_relids; - /* Consider sequential scan */ - add_path(rel, create_seqscan_path(root, rel, required_outer, 0)); + if (!set_plain_rel_pathlist_hook || + set_plain_rel_pathlist_hook(root, rel, rte)) + /* Consider sequential scan */ + add_path(rel, create_seqscan_path(root, rel, required_outer, 0)); /* If appropriate, 
consider parallel sequential scan */ if (rel->consider_parallel && required_outer == NULL) diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index c0fcc7d78df..a698f888d71 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -20,6 +20,7 @@ #include "access/stratnum.h" #include "access/sysattr.h" #include "catalog/pg_am.h" +#include "catalog/pg_amop.h" #include "catalog/pg_operator.h" #include "catalog/pg_opfamily.h" #include "catalog/pg_type.h" @@ -32,8 +33,10 @@ #include "optimizer/paths.h" #include "optimizer/prep.h" #include "optimizer/restrictinfo.h" +#include "utils/array.h" #include "utils/lsyscache.h" #include "utils/selfuncs.h" +#include "utils/syscache.h" /* XXX see PartCollMatchesExprColl */ @@ -48,14 +51,6 @@ typedef enum ST_ANYSCAN, /* either is okay */ } ScanTypeControl; -/* Data structure for collecting qual clauses that match an index */ -typedef struct -{ - bool nonempty; /* True if lists are not all empty */ - /* Lists of IndexClause nodes, one list per index column */ - List *indexclauses[INDEX_MAX_KEYS]; -} IndexClauseSet; - /* Per-path data used within choose_bitmap_and() */ typedef struct { @@ -129,9 +124,6 @@ static double adjust_rowcount_for_semijoins(PlannerInfo *root, Index outer_relid, double rowcount); static double approximate_joinrel_size(PlannerInfo *root, Relids relids); -static void match_restriction_clauses_to_index(PlannerInfo *root, - IndexOptInfo *index, - IndexClauseSet *clauseset); static void match_join_clauses_to_index(PlannerInfo *root, RelOptInfo *rel, IndexOptInfo *index, IndexClauseSet *clauseset, @@ -177,6 +169,10 @@ static IndexClause *match_rowcompare_to_indexcol(PlannerInfo *root, RestrictInfo *rinfo, int indexcol, IndexOptInfo *index); +static IndexClause *match_orclause_to_indexcol(PlannerInfo *root, + RestrictInfo *rinfo, + int indexcol, + IndexOptInfo *index); static IndexClause *expand_indexqual_rowcompare(PlannerInfo *root, 
RestrictInfo *rinfo, int indexcol, @@ -1166,6 +1162,386 @@ build_paths_for_OR(PlannerInfo *root, RelOptInfo *rel, return result; } +/* + * Utility structure used to group similar OR-clause arguments in + * group_similar_or_args(). It represents information about the OR-clause + * argument and its matching index key. + */ +typedef struct +{ + int indexnum; /* index of the matching index, or -1 if no + * matching index */ + int colnum; /* index of the matching column, or -1 if no + * matching index */ + Oid opno; /* OID of the OpClause operator, or InvalidOid + * if not an OpExpr */ + Oid inputcollid; /* OID of the OpClause input collation */ + int argindex; /* index of the clause in the list of + * arguments */ +} OrArgIndexMatch; + +/* + * Comparison function for OrArgIndexMatch which provides sort order placing + * similar OR-clause arguments together. + */ +static int +or_arg_index_match_cmp(const void *a, const void *b) +{ + const OrArgIndexMatch *match_a = (const OrArgIndexMatch *) a; + const OrArgIndexMatch *match_b = (const OrArgIndexMatch *) b; + + if (match_a->indexnum < match_b->indexnum) + return -1; + else if (match_a->indexnum > match_b->indexnum) + return 1; + + if (match_a->colnum < match_b->colnum) + return -1; + else if (match_a->colnum > match_b->colnum) + return 1; + + if (match_a->opno < match_b->opno) + return -1; + else if (match_a->opno > match_b->opno) + return 1; + + if (match_a->inputcollid < match_b->inputcollid) + return -1; + else if (match_a->inputcollid > match_b->inputcollid) + return 1; + + if (match_a->argindex < match_b->argindex) + return -1; + else if (match_a->argindex > match_b->argindex) + return 1; + + return 0; +} + +/* + * group_similar_or_args + * Transform incoming OR-restrictinfo into a list of sub-restrictinfos, + * each of them containing a subset of OR-clauses from the source rinfo + * matching the same index column with the same operator and collation, + * It may be employed later, during the 
match_clause_to_indexcol() to + * transform whole OR-sub-rinfo to an SAOP clause. + * + * Similar arguments clauses of form "indexkey op constant" having same + * indexkey, operator, and collation. Constant may comprise either Const + * or Param. + * + * Returns the processed list of arguments. + */ +static List * +group_similar_or_args(PlannerInfo *root, RelOptInfo *rel, RestrictInfo *rinfo) +{ + int n; + int i; + int group_start; + OrArgIndexMatch *matches; + bool matched = false; + ListCell *lc; + ListCell *lc2; + List *orargs; + List *result = NIL; + + Assert(IsA(rinfo->orclause, BoolExpr)); + orargs = ((BoolExpr *) rinfo->orclause)->args; + n = list_length(orargs); + + /* + * To avoid N^2 behavior, take utility pass along the list of OR-clause + * arguments. For each argument, fill the OrArgIndexMatch structure, + * which will be used to sort these arguments at the next step. + */ + i = -1; + matches = (OrArgIndexMatch *) palloc(sizeof(OrArgIndexMatch) * n); + foreach(lc, orargs) + { + Node *arg = lfirst(lc); + RestrictInfo *argrinfo; + OpExpr *clause; + Oid opno; + Node *leftop, + *rightop; + Node *nonConstExpr; + int indexnum; + int colnum; + + i++; + matches[i].argindex = i; + matches[i].indexnum = -1; + matches[i].colnum = -1; + matches[i].opno = InvalidOid; + matches[i].inputcollid = InvalidOid; + + if (!IsA(arg, RestrictInfo)) + continue; + + argrinfo = castNode(RestrictInfo, arg); + + /* Only operator clauses can match */ + if (!IsA(argrinfo->clause, OpExpr)) + continue; + + clause = (OpExpr *) argrinfo->clause; + opno = clause->opno; + + /* Only binary operators can match */ + if (list_length(clause->args) != 2) + continue; + + /* + * Ignore any RelabelType node above the operands. This is needed to + * be able to apply indexscanning in binary-compatible-operator cases. + * Note: we can assume there is at most one RelabelType node; + * eval_const_expressions() will have simplified if more than one. 
+ */ + leftop = get_leftop(clause); + if (IsA(leftop, RelabelType)) + leftop = (Node *) ((RelabelType *) leftop)->arg; + + rightop = get_rightop(clause); + if (IsA(rightop, RelabelType)) + rightop = (Node *) ((RelabelType *) rightop)->arg; + + /* + * Check for clauses of the form: (indexkey operator constant) or + * (constant operator indexkey). But we don't know a particular index + * yet. First check for a constant, which must be Const or Param. + * That's cheaper than search for an index key among all indexes. + */ + if (IsA(leftop, Const) || IsA(leftop, Param)) + { + opno = get_commutator(opno); + + if (!OidIsValid(opno)) + { + /* commutator doesn't exist, we can't reverse the order */ + continue; + } + nonConstExpr = rightop; + } + else if (IsA(rightop, Const) || IsA(rightop, Param)) + { + nonConstExpr = leftop; + } + else + { + continue; + } + + /* + * Match non-constant part to the index key. It's possible that a + * single non-constant part matches multiple index keys. It's OK, we + * just stop with first matching index key. Given that this choice is + * determined the same for every clause, we will group similar clauses + * together anyway. + */ + indexnum = 0; + foreach(lc2, rel->indexlist) + { + IndexOptInfo *index = (IndexOptInfo *) lfirst(lc2); + + /* + * Ignore index if it doesn't support bitmap scans or SAOP + * clauses. + */ + if (!index->amhasgetbitmap || !index->amsearcharray) + continue; + + for (colnum = 0; colnum < index->nkeycolumns; colnum++) + { + if (match_index_to_operand(nonConstExpr, colnum, index)) + { + matches[i].indexnum = indexnum; + matches[i].colnum = colnum; + matches[i].opno = opno; + matches[i].inputcollid = clause->inputcollid; + matched = true; + break; + } + } + + /* + * Stop looping through the indexes, if we managed to match + * nonConstExpr to any index column. 
+ */ + if (matches[i].indexnum >= 0) + break; + indexnum++; + } + } + + /* + * Fast-path check: if no clause is matching to the index column, we can + * just give up at this stage and return the clause list as-is. + */ + if (!matched) + { + pfree(matches); + return orargs; + } + + /* Sort clauses to make similar clauses go together */ + qsort(matches, n, sizeof(OrArgIndexMatch), or_arg_index_match_cmp); + + /* + * Group similar clauses into single sub-restrictinfo. Side effect: the + * resulting list of restrictions will be sorted by indexnum and colnum. + */ + group_start = 0; + for (i = 1; i <= n; i++) + { + /* Check if it's a group boundary */ + if (group_start >= 0 && + (i == n || + matches[i].indexnum != matches[group_start].indexnum || + matches[i].colnum != matches[group_start].colnum || + matches[i].opno != matches[group_start].opno || + matches[i].inputcollid != matches[group_start].inputcollid || + matches[i].indexnum == -1)) + { + /* + * One clause in group: add it "as is" to the upper-level OR. + */ + if (i - group_start == 1) + { + result = lappend(result, + list_nth(orargs, + matches[group_start].argindex)); + } + else + { + /* + * Two or more clauses in a group: create a nested OR. 
+ */ + List *args = NIL; + List *rargs = NIL; + RestrictInfo *subrinfo; + int j; + + Assert(i - group_start >= 2); + + /* Construct the list of nested OR arguments */ + for (j = group_start; j < i; j++) + { + Node *arg = list_nth(orargs, matches[j].argindex); + + rargs = lappend(rargs, arg); + if (IsA(arg, RestrictInfo)) + args = lappend(args, ((RestrictInfo *) arg)->clause); + else + args = lappend(args, arg); + } + + /* Construct the nested OR and wrap it with RestrictInfo */ + subrinfo = make_plain_restrictinfo(root, + make_orclause(args), + make_orclause(rargs), + rinfo->is_pushed_down, + rinfo->has_clone, + rinfo->is_clone, + rinfo->pseudoconstant, + rinfo->security_level, + rinfo->required_relids, + rinfo->incompatible_relids, + rinfo->outer_relids); + result = lappend(result, subrinfo); + } + + group_start = i; + } + } + pfree(matches); + return result; +} + +/* + * make_bitmap_paths_for_or_group + * Generate bitmap paths for a group of similar OR-clause arguments + * produced by group_similar_or_args(). + * + * This function considers two cases: (1) matching a group of clauses to + * the index as a whole, and (2) matching the individual clauses one-by-one. + * (1) typically comprises an optimal solution. If not, (2) typically + * comprises fair alternative. + * + * Ideally, we could consider all arbitrary splits of arguments into + * subgroups, but that could lead to unacceptable computational complexity. + * This is why we only consider two cases of above. + */ +static List * +make_bitmap_paths_for_or_group(PlannerInfo *root, RelOptInfo *rel, + RestrictInfo *ri, List *other_clauses) +{ + List *jointlist = NIL; + List *splitlist = NIL; + ListCell *lc; + List *orargs; + List *args = ((BoolExpr *) ri->orclause)->args; + Cost jointcost = 0.0, + splitcost = 0.0; + Path *bitmapqual; + List *indlist; + + /* + * First, try to match the whole group to the one index. 
+ */ + orargs = list_make1(ri); + indlist = build_paths_for_OR(root, rel, + orargs, + other_clauses); + if (indlist != NIL) + { + bitmapqual = choose_bitmap_and(root, rel, indlist); + jointcost = bitmapqual->total_cost; + jointlist = list_make1(bitmapqual); + } + + /* + * If we manage to find a bitmap scan, which uses the group of OR-clause + * arguments as a whole, we can skip matching OR-clause arguments + * one-by-one as long as there are no other clauses, which can bring more + * efficiency to one-by-one case. + */ + if (jointlist != NIL && other_clauses == NIL) + return jointlist; + + /* + * Also try to match all containing clauses one-by-one. + */ + foreach(lc, args) + { + orargs = list_make1(lfirst(lc)); + + indlist = build_paths_for_OR(root, rel, + orargs, + other_clauses); + + if (indlist == NIL) + { + splitlist = NIL; + break; + } + + bitmapqual = choose_bitmap_and(root, rel, indlist); + splitcost += bitmapqual->total_cost; + splitlist = lappend(splitlist, bitmapqual); + } + + /* + * Pick the best option. + */ + if (splitlist == NIL) + return jointlist; + else if (jointlist == NIL) + return splitlist; + else + return (jointcost < splitcost) ? jointlist : splitlist; +} + + /* * generate_bitmap_or_paths * Look through the list of clauses to find OR clauses, and generate @@ -1196,6 +1572,8 @@ generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel, List *pathlist; Path *bitmapqual; ListCell *j; + List *groupedArgs; + List *inner_other_clauses = NIL; /* Ignore RestrictInfos that aren't ORs */ if (!restriction_is_or_clause(rinfo)) @@ -1206,7 +1584,28 @@ generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel, * the OR, else we can't use it. */ pathlist = NIL; - foreach(j, ((BoolExpr *) rinfo->orclause)->args) + + /* + * Group the similar OR-clause argument into dedicated RestrictInfos, + * because those RestrictInfos might match to the index as a whole. 
+ */ + groupedArgs = group_similar_or_args(root, rel, rinfo); + + if (groupedArgs != ((BoolExpr *) rinfo->orclause)->args) + { + /* + * Some parts of the rinfo were grouped. In this case, we have a + * set of sub-rinfos that together are an exact duplicate of + * rinfo. Thus, we need to remove the rinfo from other clauses. + * match_clauses_to_index detects duplicated iclauses by comparing + * pointers to original rinfos that would be different. So, we + * must delete rinfo to avoid de-facto duplicated clauses in the + * index clauses list. + */ + inner_other_clauses = list_delete(list_copy(all_clauses), rinfo); + } + + foreach(j, groupedArgs) { Node *orarg = (Node *) lfirst(j); List *indlist; @@ -1226,12 +1625,34 @@ generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel, andargs, all_clauses)); } + else if (restriction_is_or_clause(castNode(RestrictInfo, orarg))) + { + RestrictInfo *ri = castNode(RestrictInfo, orarg); + + /* + * Generate bitmap paths for the group of similar OR-clause + * arguments. + */ + indlist = make_bitmap_paths_for_or_group(root, + rel, ri, + inner_other_clauses); + + if (indlist == NIL) + { + pathlist = NIL; + break; + } + else + { + pathlist = list_concat(pathlist, indlist); + continue; + } + } else { RestrictInfo *ri = castNode(RestrictInfo, orarg); List *orargs; - Assert(!restriction_is_or_clause(ri)); orargs = list_make1(ri); indlist = build_paths_for_OR(root, rel, @@ -1257,6 +1678,9 @@ generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel, pathlist = lappend(pathlist, bitmapqual); } + if (inner_other_clauses != NIL) + list_free(inner_other_clauses); + /* * If we have a match for every arm, then turn them into a * BitmapOrPath, and add to result list. @@ -1964,7 +2388,7 @@ approximate_joinrel_size(PlannerInfo *root, Relids relids) * Identify restriction clauses for the rel that match the index. * Matching clauses are added to *clauseset. 
*/ -static void +void match_restriction_clauses_to_index(PlannerInfo *root, IndexOptInfo *index, IndexClauseSet *clauseset) @@ -2149,7 +2573,10 @@ match_clause_to_index(PlannerInfo *root, * (3) must match the collation of the index, if collation is relevant. * * Our definition of "const" is exceedingly liberal: we allow anything that - * doesn't involve a volatile function or a Var of the index's relation. + * doesn't involve a volatile function or a Var of the index's relation + * except for a boolean OR expression input: due to a trade-off between the + * expected execution speedup and planning complexity, we limit or->saop + * transformation by obvious cases when an index scan can get a profit. * In particular, Vars belonging to other relations of the query are * accepted here, since a clause of that form can be used in a * parameterized indexscan. It's the responsibility of higher code levels @@ -2179,6 +2606,10 @@ match_clause_to_index(PlannerInfo *root, * It is also possible to match ScalarArrayOpExpr clauses to indexes, when * the clause is of the form "indexkey op ANY (arrayconst)". * + * It is also possible to match a list of OR clauses if it might be + * transformed into a single ScalarArrayOpExpr clause. On success, + * the returning index clause will contain a trasformed clause. + * * For boolean indexes, it is also possible to match the clause directly * to the indexkey; or perhaps the clause is (NOT indexkey). * @@ -2228,9 +2659,9 @@ match_clause_to_indexcol(PlannerInfo *root, } /* - * Clause must be an opclause, funcclause, ScalarArrayOpExpr, or - * RowCompareExpr. Or, if the index supports it, we can handle IS - * NULL/NOT NULL clauses. + * Clause must be an opclause, funcclause, ScalarArrayOpExpr, + * RowCompareExpr, or OR-clause that could be converted to SAOP. Or, if + * the index supports it, we can handle IS NULL/NOT NULL clauses. 
*/ if (IsA(clause, OpExpr)) { @@ -2248,6 +2679,10 @@ match_clause_to_indexcol(PlannerInfo *root, { return match_rowcompare_to_indexcol(root, rinfo, indexcol, index); } + else if (restriction_is_or_clause(rinfo)) + { + return match_orclause_to_indexcol(root, rinfo, indexcol, index); + } else if (index->amsearchnulls && IsA(clause, NullTest)) { NullTest *nt = (NullTest *) clause; @@ -2423,7 +2858,7 @@ match_opclause_to_indexcol(PlannerInfo *root, /* * Check for clauses of the form: (indexkey operator constant) or - * (constant operator indexkey). See match_clause_to_indexcol's notes + * (constant operator indexkey). See match_clause_to_indexcol()'s notes * about const-ness. * * Note that we don't ask the support function about clauses that don't @@ -2771,6 +3206,269 @@ match_rowcompare_to_indexcol(PlannerInfo *root, return NULL; } +/* + * match_orclause_to_indexcol() + * Handles the OR-expr case for match_clause_to_indexcol() in the case + * when it could be transformed to ScalarArrayOpExpr. + * + * Given a list of OR-clause args, attempts to transform this BoolExpr into + * a single SAOP expression. On success, returns an IndexClause, containing + * the transformed expression or NULL, if failed. + */ +static IndexClause * +match_orclause_to_indexcol(PlannerInfo *root, + RestrictInfo *rinfo, + int indexcol, + IndexOptInfo *index) +{ + ListCell *lc; + BoolExpr *orclause = (BoolExpr *) rinfo->orclause; + Node *indexExpr = NULL; + List *consts = NIL; + Node *arrayNode = NULL; + ScalarArrayOpExpr *saopexpr = NULL; + Oid matchOpno = InvalidOid; + IndexClause *iclause; + Oid consttype = InvalidOid; + Oid arraytype = InvalidOid; + Oid inputcollid = InvalidOid; + bool firstTime = true; + bool have_param = false; + + Assert(IsA(orclause, BoolExpr)); + Assert(orclause->boolop == OR_EXPR); + + /* Ignore index if it doesn't support SAOP clauses */ + if(!index->amsearcharray) + return NULL; + + /* + * Try to convert a list of OR-clauses to a single SAOP expression. 
Each + * OR entry must be in the form: (indexkey operator constant) or (constant + * operator indexkey). Operators of all the entries must match. Constant + * might be either Const or Param. To be effective, give up on the first + * non-matching entry. Exit is implemented as a break from the loop, which + * is catched afterwards. + */ + foreach(lc, orclause->args) + { + RestrictInfo *subRinfo; + OpExpr *subClause; + Oid opno; + Node *leftop, + *rightop; + Node *constExpr; + + if (!IsA(lfirst(lc), RestrictInfo)) + break; + + subRinfo = (RestrictInfo *) lfirst(lc); + + /* Only operator clauses can match */ + if (!IsA(subRinfo->clause, OpExpr)) + break; + + subClause = (OpExpr *) subRinfo->clause; + opno = subClause->opno; + + /* Only binary operators can match */ + if (list_length(subClause->args) != 2) + break; + + /* + * The parameters below must match between sub-rinfo and its parent as + * make_restrictinfo() fills them with the same values, and further + * modifications are also the same for the whole subtree. However, + * still make a sanity check. + */ + Assert(subRinfo->is_pushed_down == rinfo->is_pushed_down); + Assert(subRinfo->is_clone == rinfo->is_clone); + Assert(subRinfo->security_level == rinfo->security_level); + Assert(bms_equal(subRinfo->incompatible_relids, rinfo->incompatible_relids)); + Assert(bms_equal(subRinfo->outer_relids, rinfo->outer_relids)); + + /* + * Also, check that required_relids in sub-rinfo is subset of parent's + * required_relids. + */ + Assert(bms_is_subset(subRinfo->required_relids, rinfo->required_relids)); + + /* Only operator returning boolean suits the transformation */ + if (get_op_rettype(opno) != BOOLOID) + break; + + /* + * Check for clauses of the form: (indexkey operator constant) or + * (constant operator indexkey). Determine indexkey side first, check + * the constant later. 
+ */ + leftop = (Node *) linitial(subClause->args); + rightop = (Node *) lsecond(subClause->args); + if (match_index_to_operand(leftop, indexcol, index)) + { + indexExpr = leftop; + constExpr = rightop; + } + else if (match_index_to_operand(rightop, indexcol, index)) + { + opno = get_commutator(opno); + if (!OidIsValid(opno)) + { + /* commutator doesn't exist, we can't reverse the order */ + break; + } + indexExpr = rightop; + constExpr = leftop; + } + else + { + break; + } + + /* + * Ignore any RelabelType node above the operands. This is needed to + * be able to apply indexscanning in binary-compatible-operator cases. + * Note: we can assume there is at most one RelabelType node; + * eval_const_expressions() will have simplified if more than one. + */ + if (IsA(constExpr, RelabelType)) + constExpr = (Node *) ((RelabelType *) constExpr)->arg; + if (IsA(indexExpr, RelabelType)) + indexExpr = (Node *) ((RelabelType *) indexExpr)->arg; + + /* We allow constant to be Const or Param */ + if (!IsA(constExpr, Const) && !IsA(constExpr, Param)) + break; + + /* Forbid transformation for composite types, records. */ + if (type_is_rowtype(exprType(constExpr)) || + type_is_rowtype(exprType(indexExpr))) + break; + + /* + * Save information about the operator, type, and collation for the + * first matching qual. Then, check that subsequent quals match the + * first. + */ + if (firstTime) + { + matchOpno = opno; + consttype = exprType(constExpr); + arraytype = get_array_type(consttype); + inputcollid = subClause->inputcollid; + + /* + * Check that the operator is presented in the opfamily and that + * the expression collation matches the index collation. Also, + * there must be an array type to construct an array later. 
+ */ + if (!IndexCollMatchesExprColl(index->indexcollations[indexcol], inputcollid) || + !op_in_opfamily(matchOpno, index->opfamily[indexcol]) || + !OidIsValid(arraytype)) + break; + firstTime = false; + } + else + { + if (opno != matchOpno || + inputcollid != subClause->inputcollid || + consttype != exprType(constExpr)) + break; + } + + if (IsA(constExpr, Param)) + have_param = true; + consts = lappend(consts, constExpr); + } + + /* + * Catch the break from the loop above. Normally, a foreach() loop ends + * up with a NULL list cell. A non-NULL list cell indicates a break from + * the foreach() loop. Free the consts list and return NULL then. + */ + if (lc != NULL) + { + list_free(consts); + return NULL; + } + + /* + * Assemble an array from the list of constants. It seems more profitable + * to build a const array. But in the presence of parameters, we don't + * have a specific value here and must employ an ArrayExpr instead. + */ + + if (have_param) + { + ArrayExpr *arrayExpr = makeNode(ArrayExpr); + + /* array_collid will be set by parse_collate.c */ + arrayExpr->element_typeid = consttype; + arrayExpr->array_typeid = arraytype; + arrayExpr->multidims = false; + arrayExpr->elements = consts; + arrayExpr->location = -1; + + arrayNode = (Node *) arrayExpr; + } + else + { + int16 typlen; + bool typbyval; + char typalign; + Datum *elems; + int i = 0; + ArrayType *arrayConst; + + get_typlenbyvalalign(consttype, &typlen, &typbyval, &typalign); + + elems = (Datum *) palloc(sizeof(Datum) * list_length(consts)); + foreach_node(Const, value, consts) + { + Assert(!value->constisnull); + + elems[i++] = value->constvalue; + } + + arrayConst = construct_array(elems, i, consttype, + typlen, typbyval, typalign); + arrayNode = (Node *) makeConst(arraytype, -1, inputcollid, + -1, PointerGetDatum(arrayConst), + false, false); + + pfree(elems); + list_free(consts); + } + + /* Build the SAOP expression node */ + saopexpr = makeNode(ScalarArrayOpExpr); + saopexpr->opno = matchOpno; 
+ saopexpr->opfuncid = get_opcode(matchOpno); + saopexpr->hashfuncid = InvalidOid; + saopexpr->negfuncid = InvalidOid; + saopexpr->useOr = true; + saopexpr->inputcollid = inputcollid; + saopexpr->args = list_make2(indexExpr, arrayNode); + saopexpr->location = -1; + + /* + * Finally, build an IndexClause based on the SAOP node. Use + * make_simple_restrictinfo() to get RestrictInfo with clean selectivity + * estimations because it may differ from the estimation made for an OR + * clause. Although it is not a lossy expression, keep the old version of + * rinfo in iclause->rinfo to detect duplicates and recheck the original + * clause. + */ + iclause = makeNode(IndexClause); + iclause->rinfo = rinfo; + iclause->indexquals = list_make1(make_simple_restrictinfo(root, + &saopexpr->xpr)); + iclause->lossy = false; + iclause->indexcol = indexcol; + iclause->indexcols = NIL; + return iclause; +} + /* * expand_indexqual_rowcompare --- expand a single indexqual condition * that is a RowCompareExpr diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index c0af10ebd34..4bb56f50c16 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -165,16 +165,12 @@ static MergeJoin *create_mergejoin_plan(PlannerInfo *root, MergePath *best_path) static HashJoin *create_hashjoin_plan(PlannerInfo *root, HashPath *best_path); static Node *replace_nestloop_params(PlannerInfo *root, Node *expr); static Node *replace_nestloop_params_mutator(Node *node, PlannerInfo *root); -static void fix_indexqual_references(PlannerInfo *root, IndexPath *index_path, - List **stripped_indexquals_p, - List **fixed_indexquals_p); static List *fix_indexorderby_references(PlannerInfo *root, IndexPath *index_path); static Node *fix_indexqual_clause(PlannerInfo *root, IndexOptInfo *index, int indexcol, Node *clause, List *indexcolnos); static Node *fix_indexqual_operand(Node *node, IndexOptInfo *index, int indexcol); static List 
*get_switched_clauses(List *clauses, Relids outerrelids); -static List *order_qual_clauses(PlannerInfo *root, List *clauses); static void copy_generic_path_info(Plan *dest, Path *src); static void copy_plan_costsize(Plan *dest, Plan *src); static void label_sort_with_costsize(PlannerInfo *root, Sort *plan, @@ -4939,6 +4935,14 @@ replace_nestloop_params(PlannerInfo *root, Node *expr) return replace_nestloop_params_mutator(expr, root); } +Node * +replace_nestloop_params_compat(PlannerInfo *root, Node *expr) +{ + /* No setup needed for tree walk, so away we go */ + return replace_nestloop_params_mutator(expr, root); +} + + static Node * replace_nestloop_params_mutator(Node *node, PlannerInfo *root) { @@ -5019,7 +5023,7 @@ replace_nestloop_params_mutator(Node *node, PlannerInfo *root) * are subplans in it (we need two separate copies of the subplan tree, or * things will go awry). */ -static void +void fix_indexqual_references(PlannerInfo *root, IndexPath *index_path, List **stripped_indexquals_p, List **fixed_indexquals_p) { @@ -5312,7 +5316,7 @@ get_switched_clauses(List *clauses, Relids outerrelids) * instead of bare clauses. This is another reason why trying to consider * selectivity in the ordering would likely do the wrong thing. 
*/ -static List * +List * order_qual_clauses(PlannerInfo *root, List *clauses) { typedef struct diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 0c7273b9ccd..2d5b3978ca0 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -2314,6 +2314,7 @@ preprocess_rowmarks(PlannerInfo *root) RowMarkClause *rc = lfirst_node(RowMarkClause, l); RangeTblEntry *rte = rt_fetch(rc->rti, parse->rtable); PlanRowMark *newrc; + RowRefType refType; /* * Currently, it is syntactically impossible to have FOR UPDATE et al @@ -2336,8 +2337,8 @@ preprocess_rowmarks(PlannerInfo *root) newrc = makeNode(PlanRowMark); newrc->rti = newrc->prti = rc->rti; newrc->rowmarkId = ++(root->glob->lastRowMarkId); - newrc->markType = select_rowmark_type(rte, rc->strength); - newrc->allMarkTypes = (1 << newrc->markType); + newrc->markType = select_rowmark_type(rte, rc->strength, &refType); + newrc->allRefTypes = (1 << refType); newrc->strength = rc->strength; newrc->waitPolicy = rc->waitPolicy; newrc->isParent = false; @@ -2353,6 +2354,7 @@ preprocess_rowmarks(PlannerInfo *root) { RangeTblEntry *rte = lfirst_node(RangeTblEntry, l); PlanRowMark *newrc; + RowRefType refType = ROW_REF_TID; i++; if (!bms_is_member(i, rels)) @@ -2361,8 +2363,8 @@ preprocess_rowmarks(PlannerInfo *root) newrc = makeNode(PlanRowMark); newrc->rti = newrc->prti = i; newrc->rowmarkId = ++(root->glob->lastRowMarkId); - newrc->markType = select_rowmark_type(rte, LCS_NONE); - newrc->allMarkTypes = (1 << newrc->markType); + newrc->markType = select_rowmark_type(rte, LCS_NONE, &refType); + newrc->allRefTypes = (1 << refType); newrc->strength = LCS_NONE; newrc->waitPolicy = LockWaitBlock; /* doesn't matter */ newrc->isParent = false; @@ -2377,11 +2379,13 @@ preprocess_rowmarks(PlannerInfo *root) * Select RowMarkType to use for a given table */ RowMarkType -select_rowmark_type(RangeTblEntry *rte, LockClauseStrength strength) 
+select_rowmark_type(RangeTblEntry *rte, LockClauseStrength strength, + RowRefType *refType) { if (rte->rtekind != RTE_RELATION) { /* If it's not a table at all, use ROW_MARK_COPY */ + *refType = ROW_REF_COPY; return ROW_MARK_COPY; } else if (rte->relkind == RELKIND_FOREIGN_TABLE) @@ -2392,10 +2396,12 @@ select_rowmark_type(RangeTblEntry *rte, LockClauseStrength strength) if (fdwroutine->GetForeignRowMarkType != NULL) return fdwroutine->GetForeignRowMarkType(rte, strength); /* Otherwise, use ROW_MARK_COPY by default */ + *refType = ROW_REF_COPY; return ROW_MARK_COPY; } else { + *refType = rte->reftype; /* Regular table, apply the appropriate lock type */ switch (strength) { diff --git a/src/backend/optimizer/prep/preptlist.c b/src/backend/optimizer/prep/preptlist.c index 931b9c09bda..9c4671c817e 100644 --- a/src/backend/optimizer/prep/preptlist.c +++ b/src/backend/optimizer/prep/preptlist.c @@ -234,7 +234,7 @@ preprocess_targetlist(PlannerInfo *root) if (rc->rti != rc->prti) continue; - if (rc->allMarkTypes & ~(1 << ROW_MARK_COPY)) + if (rc->allRefTypes & (1 << ROW_REF_TID)) { /* Need to fetch TID */ var = makeVar(rc->rti, @@ -250,7 +250,23 @@ preprocess_targetlist(PlannerInfo *root) true); tlist = lappend(tlist, tle); } - if (rc->allMarkTypes & (1 << ROW_MARK_COPY)) + if (rc->allRefTypes & (1 << ROW_REF_ROWID)) + { + /* Need to fetch TID */ + var = makeVar(rc->rti, + RowIdAttributeNumber, + BYTEAOID, + -1, + InvalidOid, + 0); + snprintf(resname, sizeof(resname), "rowid%u", rc->rowmarkId); + tle = makeTargetEntry((Expr *) var, + list_length(tlist) + 1, + pstrdup(resname), + true); + tlist = lappend(tlist, tle); + } + if (rc->allRefTypes & (1 << ROW_REF_COPY)) { /* Need the whole row as a junk var */ var = makeWholeRowVar(rt_fetch(rc->rti, range_table), diff --git a/src/backend/optimizer/util/appendinfo.c b/src/backend/optimizer/util/appendinfo.c index 6ba4eba224a..ea012b2c164 100644 --- a/src/backend/optimizer/util/appendinfo.c +++ 
b/src/backend/optimizer/util/appendinfo.c @@ -895,17 +895,35 @@ add_row_identity_columns(PlannerInfo *root, Index rtindex, relkind == RELKIND_MATVIEW || relkind == RELKIND_PARTITIONED_TABLE) { + RowRefType refType = ROW_REF_TID; + + refType = table_get_row_ref_type(target_relation); + /* * Emit CTID so that executor can find the row to merge, update or * delete. */ - var = makeVar(rtindex, - SelfItemPointerAttributeNumber, - TIDOID, - -1, - InvalidOid, - 0); - add_row_identity_var(root, var, rtindex, "ctid"); + if (refType == ROW_REF_TID) + { + var = makeVar(rtindex, + SelfItemPointerAttributeNumber, + TIDOID, + -1, + InvalidOid, + 0); + add_row_identity_var(root, var, rtindex, "ctid"); + } + else + { + Assert(refType == ROW_REF_ROWID); + var = makeVar(rtindex, + RowIdAttributeNumber, + BYTEAOID, + -1, + InvalidOid, + 0); + add_row_identity_var(root, var, rtindex, "rowid"); + } } else if (relkind == RELKIND_FOREIGN_TABLE) { diff --git a/src/backend/optimizer/util/inherit.c b/src/backend/optimizer/util/inherit.c index c5b906a9d43..17c36c03202 100644 --- a/src/backend/optimizer/util/inherit.c +++ b/src/backend/optimizer/util/inherit.c @@ -16,6 +16,7 @@ #include "access/sysattr.h" #include "access/table.h" +#include "access/tableam.h" #include "catalog/partition.h" #include "catalog/pg_inherits.h" #include "catalog/pg_type.h" @@ -91,7 +92,7 @@ expand_inherited_rtentry(PlannerInfo *root, RelOptInfo *rel, LOCKMODE lockmode; PlanRowMark *oldrc; bool old_isParent = false; - int old_allMarkTypes = 0; + int old_allRefTypes = 0; Assert(rte->inh); /* else caller error */ @@ -131,8 +132,8 @@ expand_inherited_rtentry(PlannerInfo *root, RelOptInfo *rel, { old_isParent = oldrc->isParent; oldrc->isParent = true; - /* Save initial value of allMarkTypes before children add to it */ - old_allMarkTypes = oldrc->allMarkTypes; + /* Save initial value of allRefTypes before children add to it */ + old_allRefTypes = oldrc->allRefTypes; } /* Scan the inheritance set and expand it */ @@ 
-239,15 +240,15 @@ expand_inherited_rtentry(PlannerInfo *root, RelOptInfo *rel, */ if (oldrc) { - int new_allMarkTypes = oldrc->allMarkTypes; + int new_allRefTypes = oldrc->allRefTypes; Var *var; TargetEntry *tle; char resname[32]; List *newvars = NIL; /* Add TID junk Var if needed, unless we had it already */ - if (new_allMarkTypes & ~(1 << ROW_MARK_COPY) && - !(old_allMarkTypes & ~(1 << ROW_MARK_COPY))) + if (new_allRefTypes & (1 << ROW_REF_TID) && + !(old_allRefTypes & (1 << ROW_REF_TID))) { /* Need to fetch TID */ var = makeVar(oldrc->rti, @@ -266,8 +267,8 @@ expand_inherited_rtentry(PlannerInfo *root, RelOptInfo *rel, } /* Add whole-row junk Var if needed, unless we had it already */ - if ((new_allMarkTypes & (1 << ROW_MARK_COPY)) && - !(old_allMarkTypes & (1 << ROW_MARK_COPY))) + if ((new_allRefTypes & (1 << ROW_REF_COPY)) && + !(old_allRefTypes & (1 << ROW_REF_COPY))) { var = makeWholeRowVar(planner_rt_fetch(oldrc->rti, root), oldrc->rti, @@ -282,6 +283,24 @@ expand_inherited_rtentry(PlannerInfo *root, RelOptInfo *rel, newvars = lappend(newvars, var); } + if ((new_allRefTypes & (1 << ROW_REF_ROWID)) && + !(old_allRefTypes & (1 << ROW_REF_ROWID))) + { + var = makeVar(oldrc->rti, + RowIdAttributeNumber, + BYTEAOID, + -1, + InvalidOid, + 0); + snprintf(resname, sizeof(resname), "rowid%u", oldrc->rowmarkId); + tle = makeTargetEntry((Expr *) var, + list_length(root->processed_tlist) + 1, + pstrdup(resname), + true); + root->processed_tlist = lappend(root->processed_tlist, tle); + newvars = lappend(newvars, var); + } + /* Add tableoid junk Var, unless we had it already */ if (!old_isParent) { @@ -450,7 +469,7 @@ expand_partitioned_rtentry(PlannerInfo *root, RelOptInfo *relinfo, * where the hierarchy is flattened during RTE expansion.) * * PlanRowMarks still carry the top-parent's RTI, and the top-parent's - * allMarkTypes field still accumulates values from all descendents. + * allRefTypes field still accumulates values from all descendents. 
* * "parentrte" and "parentRTindex" are immediate parent's RTE and * RTI. "top_parentrc" is top parent's PlanRowMark. @@ -494,6 +513,7 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, Assert(parentrte->rtekind == RTE_RELATION); /* else this is dubious */ childrte->relid = childOID; childrte->relkind = childrel->rd_rel->relkind; + childrte->reftype = table_get_row_ref_type(childrel); /* A partitioned child will need to be expanded further. */ if (childrte->relkind == RELKIND_PARTITIONED_TABLE) { @@ -583,14 +603,16 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, if (top_parentrc) { PlanRowMark *childrc = makeNode(PlanRowMark); + RowRefType refType; childrc->rti = childRTindex; childrc->prti = top_parentrc->rti; childrc->rowmarkId = top_parentrc->rowmarkId; /* Reselect rowmark type, because relkind might not match parent */ childrc->markType = select_rowmark_type(childrte, - top_parentrc->strength); - childrc->allMarkTypes = (1 << childrc->markType); + top_parentrc->strength, + &refType); + childrc->allRefTypes = (1 << refType); childrc->strength = top_parentrc->strength; childrc->waitPolicy = top_parentrc->waitPolicy; @@ -601,8 +623,8 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, */ childrc->isParent = (childrte->relkind == RELKIND_PARTITIONED_TABLE); - /* Include child's rowmark type in top parent's allMarkTypes */ - top_parentrc->allMarkTypes |= childrc->allMarkTypes; + /* Include child's rowmark type in top parent's allRefTypes */ + top_parentrc->allRefTypes |= childrc->allRefTypes; root->rowMarks = lappend(root->rowMarks, childrc); } diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 86655f05dc8..6a45058cbbd 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -58,6 +58,7 @@ int constraint_exclusion = CONSTRAINT_EXCLUSION_PARTITION; /* Hook for plugins to get control in 
get_relation_info() */ get_relation_info_hook_type get_relation_info_hook = NULL; +skip_tree_height_hook_type skip_tree_height_hook = NULL; static void get_relation_foreign_keys(PlannerInfo *root, RelOptInfo *rel, @@ -485,7 +486,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, info->tuples = rel->tuples; } - if (info->relam == BTREE_AM_OID) + if (info->relam == BTREE_AM_OID && (!skip_tree_height_hook || !skip_tree_height_hook(indexRelation))) { /* * For btrees, get tree height while we have the index diff --git a/src/backend/optimizer/util/restrictinfo.c b/src/backend/optimizer/util/restrictinfo.c index 0b406e93342..9e1458401c2 100644 --- a/src/backend/optimizer/util/restrictinfo.c +++ b/src/backend/optimizer/util/restrictinfo.c @@ -21,17 +21,6 @@ #include "optimizer/restrictinfo.h" -static RestrictInfo *make_restrictinfo_internal(PlannerInfo *root, - Expr *clause, - Expr *orclause, - bool is_pushed_down, - bool has_clone, - bool is_clone, - bool pseudoconstant, - Index security_level, - Relids required_relids, - Relids incompatible_relids, - Relids outer_relids); static Expr *make_sub_restrictinfos(PlannerInfo *root, Expr *clause, bool is_pushed_down, @@ -90,36 +79,38 @@ make_restrictinfo(PlannerInfo *root, /* Shouldn't be an AND clause, else AND/OR flattening messed up */ Assert(!is_andclause(clause)); - return make_restrictinfo_internal(root, - clause, - NULL, - is_pushed_down, - has_clone, - is_clone, - pseudoconstant, - security_level, - required_relids, - incompatible_relids, - outer_relids); + return make_plain_restrictinfo(root, + clause, + NULL, + is_pushed_down, + has_clone, + is_clone, + pseudoconstant, + security_level, + required_relids, + incompatible_relids, + outer_relids); } /* - * make_restrictinfo_internal + * make_plain_restrictinfo * - * Common code for the main entry points and the recursive cases. + * Common code for the main entry points and the recursive cases. 
Also, + * useful while contrucitng RestrictInfos above OR clause, which already has + * RestrictInfos above its subclauses. */ -static RestrictInfo * -make_restrictinfo_internal(PlannerInfo *root, - Expr *clause, - Expr *orclause, - bool is_pushed_down, - bool has_clone, - bool is_clone, - bool pseudoconstant, - Index security_level, - Relids required_relids, - Relids incompatible_relids, - Relids outer_relids) +RestrictInfo * +make_plain_restrictinfo(PlannerInfo *root, + Expr *clause, + Expr *orclause, + bool is_pushed_down, + bool has_clone, + bool is_clone, + bool pseudoconstant, + Index security_level, + Relids required_relids, + Relids incompatible_relids, + Relids outer_relids) { RestrictInfo *restrictinfo = makeNode(RestrictInfo); Relids baserels; @@ -296,17 +287,17 @@ make_sub_restrictinfos(PlannerInfo *root, NULL, incompatible_relids, outer_relids)); - return (Expr *) make_restrictinfo_internal(root, - clause, - make_orclause(orlist), - is_pushed_down, - has_clone, - is_clone, - pseudoconstant, - security_level, - required_relids, - incompatible_relids, - outer_relids); + return (Expr *) make_plain_restrictinfo(root, + clause, + make_orclause(orlist), + is_pushed_down, + has_clone, + is_clone, + pseudoconstant, + security_level, + required_relids, + incompatible_relids, + outer_relids); } else if (is_andclause(clause)) { @@ -328,17 +319,17 @@ make_sub_restrictinfos(PlannerInfo *root, return make_andclause(andlist); } else - return (Expr *) make_restrictinfo_internal(root, - clause, - NULL, - is_pushed_down, - has_clone, - is_clone, - pseudoconstant, - security_level, - required_relids, - incompatible_relids, - outer_relids); + return (Expr *) make_plain_restrictinfo(root, + clause, + NULL, + is_pushed_down, + has_clone, + is_clone, + pseudoconstant, + security_level, + required_relids, + incompatible_relids, + outer_relids); } /* diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index bca627c5463..b437e0f7dff 100644 --- 
a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -373,6 +373,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type OptSchemaEltList parameter_name_list %type am_type +%type opt_for_tableam %type TriggerForSpec TriggerForType %type TriggerActionTime @@ -5870,17 +5871,21 @@ row_security_cmd: /***************************************************************************** * * QUERY: - * CREATE ACCESS METHOD name HANDLER handler_name + * CREATE ACCESS METHOD name TYPE am_type + * [FOR tableam_name] + * HANDLER handler_name * *****************************************************************************/ -CreateAmStmt: CREATE ACCESS METHOD name TYPE_P am_type HANDLER handler_name +CreateAmStmt: CREATE ACCESS METHOD name TYPE_P am_type + opt_for_tableam HANDLER handler_name { CreateAmStmt *n = makeNode(CreateAmStmt); n->amname = $4; - n->handler_name = $8; n->amtype = $6; + n->tableam_name = $7; + n->handler_name = $9; $$ = (Node *) n; } ; @@ -5890,6 +5895,11 @@ am_type: | TABLE { $$ = AMTYPE_TABLE; } ; +opt_for_tableam: + FOR name { $$ = $2; } + | /*EMPTY*/ { $$ = NULL; } + ; + /***************************************************************************** * * QUERIES : diff --git a/src/backend/parser/parse_relation.c b/src/backend/parser/parse_relation.c index 2f64eaf0e37..37d9b072b38 100644 --- a/src/backend/parser/parse_relation.c +++ b/src/backend/parser/parse_relation.c @@ -20,6 +20,7 @@ #include "access/relation.h" #include "access/sysattr.h" #include "access/table.h" +#include "access/tableam.h" #include "catalog/heap.h" #include "catalog/namespace.h" #include "catalog/pg_type.h" @@ -1503,6 +1504,7 @@ addRangeTableEntry(ParseState *pstate, rte->inh = inh; rte->relkind = rel->rd_rel->relkind; rte->rellockmode = lockmode; + rte->reftype = table_get_row_ref_type(rel); /* * Build the list of effective column names using user-supplied aliases @@ -1588,6 +1590,7 @@ addRangeTableEntryForRelation(ParseState *pstate, 
rte->inh = inh; rte->relkind = rel->rd_rel->relkind; rte->rellockmode = lockmode; + rte->reftype = table_get_row_ref_type(rel); /* * Build the list of effective column names using user-supplied aliases @@ -1656,6 +1659,7 @@ addRangeTableEntryForSubquery(ParseState *pstate, rte->rtekind = RTE_SUBQUERY; rte->subquery = subquery; rte->alias = alias; + rte->reftype = ROW_REF_COPY; eref = alias ? copyObject(alias) : makeAlias("unnamed_subquery", NIL); numaliases = list_length(eref->colnames); @@ -1763,6 +1767,7 @@ addRangeTableEntryForFunction(ParseState *pstate, rte->functions = NIL; /* we'll fill this list below */ rte->funcordinality = rangefunc->ordinality; rte->alias = alias; + rte->reftype = ROW_REF_COPY; /* * Choose the RTE alias name. We default to using the first function's @@ -2079,6 +2084,7 @@ addRangeTableEntryForTableFunc(ParseState *pstate, rte->coltypmods = tf->coltypmods; rte->colcollations = tf->colcollations; rte->alias = alias; + rte->reftype = ROW_REF_COPY; refname = alias ? alias->aliasname : pstrdup(tf->functype == TFT_XMLTABLE ? "xmltable" : "json_table"); @@ -2156,6 +2162,7 @@ addRangeTableEntryForValues(ParseState *pstate, rte->coltypmods = coltypmods; rte->colcollations = colcollations; rte->alias = alias; + rte->reftype = ROW_REF_COPY; eref = alias ? copyObject(alias) : makeAlias(refname, NIL); @@ -2252,6 +2259,7 @@ addRangeTableEntryForJoin(ParseState *pstate, rte->joinrightcols = rightcols; rte->join_using_alias = join_using_alias; rte->alias = alias; + rte->reftype = ROW_REF_COPY; eref = alias ? 
copyObject(alias) : makeAlias("unnamed_join", NIL); numaliases = list_length(eref->colnames); @@ -2332,6 +2340,7 @@ addRangeTableEntryForCTE(ParseState *pstate, rte->rtekind = RTE_CTE; rte->ctename = cte->ctename; rte->ctelevelsup = levelsup; + rte->reftype = ROW_REF_COPY; /* Self-reference if and only if CTE's parse analysis isn't completed */ rte->self_reference = !IsA(cte->ctequery, Query); @@ -2494,6 +2503,7 @@ addRangeTableEntryForENR(ParseState *pstate, * if they access transition tables linked to a table that is altered. */ rte->relid = enrmd->reliddesc; + rte->reftype = ROW_REF_COPY; /* * Build the list of effective column names using user-supplied aliases @@ -3262,6 +3272,9 @@ get_rte_attribute_name(RangeTblEntry *rte, AttrNumber attnum) attnum > 0 && attnum <= list_length(rte->alias->colnames)) return strVal(list_nth(rte->alias->colnames, attnum - 1)); + if (attnum == RowIdAttributeNumber) + return "rowid"; + /* * If the RTE is a relation, go to the system catalogs not the * eref->colnames list. This is a little slower but it will give the diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index eaf46ab6871..ad207acae60 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -2310,19 +2310,6 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt) errdetail("Cannot create a non-deferrable constraint using a deferrable index."), parser_errposition(cxt->pstate, constraint->location))); - /* - * Insist on it being a btree. That's the only kind that supports - * uniqueness at the moment anyway; but we must have an index that - * exactly matches what you'd get from plain ADD CONSTRAINT syntax, - * else dump and reload will produce a different index (breaking - * pg_upgrade in particular). 
- */ - if (index_rel->rd_rel->relam != get_index_am_oid(DEFAULT_INDEX_TYPE, false)) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("index \"%s\" is not a btree", index_name), - parser_errposition(cxt->pstate, constraint->location))); - /* Must get indclass the hard way */ indclassDatum = SysCacheGetAttrNotNull(INDEXRELID, index_rel->rd_indextuple, diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 8f27026d193..0f85dc13407 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -458,6 +458,7 @@ AutoVacLauncherMain(char *startup_data, size_t startup_data_len) * transaction. */ LWLockReleaseAll(); + CustomErrorCleanup(); pgstat_report_wait_end(); UnlockBuffers(); /* this is probably dead code, but let's be safe: */ @@ -2680,7 +2681,9 @@ extract_autovac_opts(HeapTuple tup, TupleDesc pg_class_desc) ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_MATVIEW || ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_TOASTVALUE); - relopts = extractRelOptions(tup, pg_class_desc, NULL); + relopts = extractRelOptions(tup, pg_class_desc, + GetTableAmRoutineByAmOid(((Form_pg_class) GETSTRUCT(tup))->relam), + NULL); if (relopts == NULL) return NULL; diff --git a/src/backend/postmaster/auxprocess.c b/src/backend/postmaster/auxprocess.c index 78f4263eeb1..4dae7ce9c3c 100644 --- a/src/backend/postmaster/auxprocess.c +++ b/src/backend/postmaster/auxprocess.c @@ -101,6 +101,7 @@ static void ShutdownAuxiliaryProcess(int code, Datum arg) { LWLockReleaseAll(); + CustomErrorCleanup(); ConditionVariableCancelSleep(); pgstat_report_wait_end(); } diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 0f75548759a..74cc63cc89f 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -167,6 +167,7 @@ BackgroundWriterMain(char *startup_data, size_t startup_data_len) * about in bgwriter, but we do have LWLocks, buffers, and temp 
files. */ LWLockReleaseAll(); + CustomErrorCleanup(); ConditionVariableCancelSleep(); UnlockBuffers(); ReleaseAuxProcessResources(false); diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index 199f008bcda..b6767a39911 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -54,11 +54,20 @@ #include "storage/proc.h" #include "storage/procsignal.h" #include "storage/shmem.h" +#include "storage/sinvaladt.h" #include "storage/smgr.h" #include "storage/spin.h" #include "utils/guc.h" #include "utils/memutils.h" #include "utils/resowner.h" +#include "utils/syscache.h" + +/* + * Included for InitializeTimeouts and RegisterTimeout functions that + * needed for correct working of OrioleDB checkpoint. + * See comment for InitializeTimeouts call in CheckpointerMain for details. + */ +#include "utils/timeout.h" /*---------- @@ -204,6 +213,21 @@ CheckpointerMain(char *startup_data, size_t startup_data_len) */ pqsignal(SIGCHLD, SIG_DFL); + /* + * To use OrioleDB checkpoint, we must initialize the data for the primary + * lock mechanism (lock.h) to work correctly. Because locks of this type are + * needed by the OrioleDB module for debug events and relation locks, but + * they are not used by the postgres checkpointer and are not initialized + * for it. + */ + InitializeTimeouts(); /* establishes SIGALRM handler */ + InitDeadLockChecking(); + RegisterTimeout(DEADLOCK_TIMEOUT, CheckDeadLockAlert); + RelationCacheInitialize(); + InitCatalogCache(); + SharedInvalBackendInit(false); + + /* * Initialize so that first time-driven event happens at the correct time. */ @@ -266,6 +290,7 @@ CheckpointerMain(char *startup_data, size_t startup_data_len) * files. 
*/ LWLockReleaseAll(); + CustomErrorCleanup(); ConditionVariableCancelSleep(); pgstat_report_wait_end(); UnlockBuffers(); diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c index 02f91431f5f..35af55cd678 100644 --- a/src/backend/postmaster/pgarch.c +++ b/src/backend/postmaster/pgarch.c @@ -760,6 +760,22 @@ pgarch_readyXlog(char *xlog) for (int i = 0; i < arch_files->arch_files_size; i++) arch_files->arch_files[i] = DatumGetCString(binaryheap_remove_first(arch_files->arch_heap)); + /* + * Preload the WAL files if the relevant callback is provided. + */ + if (ArchiveCallbacks->archive_preload_file_cb) + { + for (int i = 0; i < arch_files->arch_files_size; i++) + { + char *xlog1 = arch_files->arch_files[i]; + char pathname[MAXPGPATH]; + + snprintf(pathname, MAXPGPATH, XLOGDIR "/%s", xlog1); + ArchiveCallbacks->archive_preload_file_cb(archive_module_state, + xlog1, pathname); + } + } + /* Return the highest priority file. */ arch_files->arch_files_size--; strcpy(xlog, arch_files->arch_files[arch_files->arch_files_size]); diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index bf0241aed0c..16fde3e8ec4 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -85,10 +85,6 @@ #include #endif -#ifdef HAVE_PTHREAD_IS_THREADED_NP -#include -#endif - #include "access/xlog.h" #include "access/xlogrecovery.h" #include "common/file_perm.h" @@ -137,7 +133,8 @@ #define BACKEND_TYPE_AUTOVAC 0x0002 /* autovacuum worker process */ #define BACKEND_TYPE_WALSND 0x0004 /* walsender process */ #define BACKEND_TYPE_BGWORKER 0x0008 /* bgworker process */ -#define BACKEND_TYPE_ALL 0x000F /* OR of all the above */ +#define BACKEND_TYPE_SYSTEM_BGWORKER 0x0010 /* system bgworker process */ +#define BACKEND_TYPE_ALL 0x001F /* OR of all the above */ /* * List of active backends (or child processes anyway; we don't actually @@ -439,7 +436,7 @@ static void MaybeStartSlotSyncWorker(void); * even 
during recovery. */ #define PgArchStartupAllowed() \ - (((XLogArchivingActive() && pmState == PM_RUN) || \ + (((XLogArchivingActive() && (pmState == PM_RUN || pmState == PM_SHUTDOWN)) || \ (XLogArchivingAlways() && \ (pmState == PM_RECOVERY || pmState == PM_HOT_STANDBY))) && \ PgArchCanRestart()) @@ -482,6 +479,12 @@ int postmaster_alive_fds[2] = {-1, -1}; HANDLE PostmasterHandle; #endif +bool +IsFatalError(void) +{ + return FatalError; +} + /* * Postmaster main entry point */ @@ -1323,24 +1326,6 @@ PostmasterMain(int argc, char *argv[]) */ } -#ifdef HAVE_PTHREAD_IS_THREADED_NP - - /* - * On macOS, libintl replaces setlocale() with a version that calls - * CFLocaleCopyCurrent() when its second argument is "" and every relevant - * environment variable is unset or empty. CFLocaleCopyCurrent() makes - * the process multithreaded. The postmaster calls sigprocmask() and - * calls fork() without an immediate exec(), both of which have undefined - * behavior in a multithreaded program. A multithreaded postmaster is the - * normal case on Windows, which offers neither fork() nor sigprocmask(). - */ - if (pthread_is_threaded_np() != 0) - ereport(FATAL, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("postmaster became multithreaded during startup"), - errhint("Set the LC_ALL environment variable to a valid locale."))); -#endif - /* * Remember postmaster startup time */ @@ -1749,15 +1734,6 @@ ServerLoop(void) if (StartWorkerNeeded || HaveCrashedWorker) maybe_start_bgworkers(); -#ifdef HAVE_PTHREAD_IS_THREADED_NP - - /* - * With assertions enabled, check regularly for appearance of - * additional threads. All builds check at start and exit. 
- */ - Assert(pthread_is_threaded_np() == 0); -#endif - /* * Lastly, check to see if it's time to do some things that we don't * want to do every single time through the loop, because they're a @@ -1888,8 +1864,9 @@ processCancelRequest(int backendPID, int32 cancelAuthCode) /* * canAcceptConnections --- check to see if database state allows connections * of the specified type. backend_type can be BACKEND_TYPE_NORMAL, - * BACKEND_TYPE_AUTOVAC, or BACKEND_TYPE_BGWORKER. (Note that we don't yet - * know whether a NORMAL connection might turn into a walsender.) + * BACKEND_TYPE_AUTOVAC, BACKEND_TYPE_BGWORKER or BACKEND_TYPE_SYSTEM_BGWORKER. + * (Note that we don't yet know whether a NORMAL connection might turn into + * a walsender.) */ static CAC_state canAcceptConnections(int backend_type) @@ -1903,7 +1880,8 @@ canAcceptConnections(int backend_type) * bgworker_should_start_now() decided whether the DB state allows them. */ if (pmState != PM_RUN && pmState != PM_HOT_STANDBY && - backend_type != BACKEND_TYPE_BGWORKER) + backend_type != BACKEND_TYPE_BGWORKER && + backend_type != BACKEND_TYPE_SYSTEM_BGWORKER) { if (Shutdown > NoShutdown) return CAC_SHUTDOWN; /* shutdown is pending */ @@ -2534,6 +2512,13 @@ process_pm_child_exit(void) if (PgArchPID != 0) signal_child(PgArchPID, SIGUSR2); + /* + * Terminate system background workers since checpoint is + * complete. + */ + SignalSomeChildren(SIGTERM, + BACKEND_TYPE_SYSTEM_BGWORKER); + /* * Waken walsenders for the last time. No regular backends * should be around anymore. @@ -2965,7 +2950,8 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) * Background workers were already processed above; ignore them * here. 
*/ - if (bp->bkend_type == BACKEND_TYPE_BGWORKER) + if (bp->bkend_type == BACKEND_TYPE_BGWORKER || + bp->bkend_type == BACKEND_TYPE_SYSTEM_BGWORKER) continue; if (take_action) @@ -3156,7 +3142,7 @@ PostmasterStateMachine(void) /* Signal all backend children except walsenders */ SignalSomeChildren(SIGTERM, - BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND); + BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND - BACKEND_TYPE_SYSTEM_BGWORKER); /* and the autovac launcher too */ if (AutoVacPID != 0) signal_child(AutoVacPID, SIGTERM); @@ -3198,7 +3184,7 @@ PostmasterStateMachine(void) * here. Walsenders and archiver are also disregarded, they will be * terminated later after writing the checkpoint record. */ - if (CountChildren(BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND) == 0 && + if (CountChildren(BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND - BACKEND_TYPE_SYSTEM_BGWORKER) == 0 && StartupPID == 0 && WalReceiverPID == 0 && WalSummarizerPID == 0 && @@ -3667,21 +3653,6 @@ report_fork_failure_to_client(ClientSocket *client_sock, int errnum) static void ExitPostmaster(int status) { -#ifdef HAVE_PTHREAD_IS_THREADED_NP - - /* - * There is no known cause for a postmaster to become multithreaded after - * startup. Recheck to account for the possibility of unknown causes. - * This message uses LOG level, because an unclean shutdown at this point - * would usually not look much different from a clean shutdown. - */ - if (pthread_is_threaded_np() != 0) - ereport(LOG, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg_internal("postmaster became multithreaded"), - errdetail("Please report this to <%s>.", PACKAGE_BUGREPORT))); -#endif - /* should cleanup shared memory and kill all backends */ /* @@ -4300,16 +4271,20 @@ do_start_bgworker(RegisteredBgWorker *rw) * specified start_time? 
*/ static bool -bgworker_should_start_now(BgWorkerStartTime start_time) +bgworker_should_start_now(BgWorkerStartTime start_time, int flags) { switch (pmState) { case PM_NO_CHILDREN: case PM_WAIT_DEAD_END: case PM_SHUTDOWN_2: + break; + case PM_SHUTDOWN: case PM_WAIT_BACKENDS: case PM_STOP_BACKENDS: + if (flags & BGWORKER_CLASS_SYSTEM) + return true; break; case PM_RUN: @@ -4384,7 +4359,10 @@ assign_backendlist_entry(RegisteredBgWorker *rw) bn->cancel_key = MyCancelKey; bn->child_slot = MyPMChildSlot = AssignPostmasterChildSlot(); - bn->bkend_type = BACKEND_TYPE_BGWORKER; + if (rw->rw_worker.bgw_flags & BGWORKER_CLASS_SYSTEM) + bn->bkend_type = BACKEND_TYPE_SYSTEM_BGWORKER; + else + bn->bkend_type = BACKEND_TYPE_BGWORKER; bn->dead_end = false; bn->bgworker_notify = false; @@ -4482,7 +4460,8 @@ maybe_start_bgworkers(void) } } - if (bgworker_should_start_now(rw->rw_worker.bgw_start_time)) + if (bgworker_should_start_now(rw->rw_worker.bgw_start_time, + rw->rw_worker.bgw_flags)) { /* reset crash time before trying to start worker */ rw->rw_crashed_at = 0; diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c index ef6f98ebcd7..5cea0f97a30 100644 --- a/src/backend/postmaster/startup.c +++ b/src/backend/postmaster/startup.c @@ -75,6 +75,8 @@ static volatile sig_atomic_t startup_progress_timer_expired = false; */ int log_startup_progress_interval = 10000; /* 10 sec */ +HandleStartupProcInterrupts_hook_type HandleStartupProcInterrupts_hook = NULL; + /* Signal handlers */ static void StartupProcTriggerHandler(SIGNAL_ARGS); static void StartupProcSigHupHandler(SIGNAL_ARGS); @@ -157,6 +159,9 @@ HandleStartupProcInterrupts(void) static uint32 postmaster_poll_count = 0; #endif + if (HandleStartupProcInterrupts_hook) + HandleStartupProcInterrupts_hook(); + /* * Process any requests or signals received recently. 
*/ diff --git a/src/backend/postmaster/walwriter.c b/src/backend/postmaster/walwriter.c index 6e7918a78d4..3cb439d377a 100644 --- a/src/backend/postmaster/walwriter.c +++ b/src/backend/postmaster/walwriter.c @@ -164,6 +164,7 @@ WalWriterMain(char *startup_data, size_t startup_data_len) * about in walwriter, but we do have LWLocks, and perhaps buffers? */ LWLockReleaseAll(); + CustomErrorCleanup(); ConditionVariableCancelSleep(); pgstat_report_wait_end(); UnlockBuffers(); diff --git a/src/backend/replication/logical/proto.c b/src/backend/replication/logical/proto.c index 95c09c95167..db41c955ec1 100644 --- a/src/backend/replication/logical/proto.c +++ b/src/backend/replication/logical/proto.c @@ -814,7 +814,7 @@ logicalrep_write_tuple(StringInfo out, Relation rel, TupleTableSlot *slot, continue; } - if (att->attlen == -1 && VARATT_IS_EXTERNAL_ONDISK(values[i])) + if (att->attlen == -1 && (VARATT_IS_EXTERNAL_ONDISK(values[i]) || VARATT_IS_EXTERNAL_ORIOLEDB(values[i]))) { /* * Unchanged toasted datum. (Note that we don't promise to detect diff --git a/src/backend/replication/logical/relation.c b/src/backend/replication/logical/relation.c index f139e7b01e9..4429127c434 100644 --- a/src/backend/replication/logical/relation.c +++ b/src/backend/replication/logical/relation.c @@ -834,7 +834,7 @@ IsIndexUsableForReplicaIdentityFull(IndexInfo *indexInfo, AttrMap *attrmap) IndexAmRoutine *amroutine; /* The given index access method must implement amgettuple. 
*/ - amroutine = GetIndexAmRoutineByAmId(indexInfo->ii_Am, false); + amroutine = GetIndexAmRoutineByAmId(InvalidOid, indexInfo->ii_Am, false); Assert(amroutine->amgettuple != NULL); } #endif diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index ae676145e60..12ebfdb4702 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -219,6 +219,8 @@ struct SnapBuild */ TransactionId next_phase_at; + CSNSnapshotData csnSnapshotData; + /* * Array of transactions which could have catalog changes that committed * between xmin and xmax. @@ -418,6 +420,17 @@ SnapBuildCurrentState(SnapBuild *builder) return builder->state; } +/* + * An which transaction id the next phase of initial snapshot building will + * happen? + */ +TransactionId +SnapBuildNextPhaseAt(SnapBuild *builder) +{ + return builder->next_phase_at; +} + + /* * Return the LSN at which the two-phase decoding was first enabled. */ @@ -565,6 +578,8 @@ SnapBuildBuildSnapshot(SnapBuild *builder) snapshot->regd_count = 0; snapshot->snapXactCompletionCount = 0; + snapshot->csnSnapshotData = builder->csnSnapshotData; + return snapshot; } @@ -662,6 +677,7 @@ SnapBuildInitialSnapshot(SnapBuild *builder) snap->snapshot_type = SNAPSHOT_MVCC; snap->xcnt = newxcnt; snap->xip = newxip; + snap->csnSnapshotData = builder->csnSnapshotData; return snap; } @@ -1042,6 +1058,8 @@ SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid, TransactionId xmax = xid; + builder->csnSnapshotData.xlogptr = lsn; + /* * Transactions preceding BUILDING_SNAPSHOT will neither be decoded, nor * will they be part of a snapshot. So we don't need to record anything. 
@@ -1229,6 +1247,10 @@ SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xact ReorderBufferTXN *txn; TransactionId xmin; + builder->csnSnapshotData.snapshotcsn = running->csn; + builder->csnSnapshotData.xmin = 0; + builder->csnSnapshotData.xlogptr = lsn; + /* * If we're not consistent yet, inspect the record to see whether it * allows to get closer to being consistent. If we are consistent, dump @@ -1256,6 +1278,9 @@ SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xact * we hit fast paths in heapam_visibility.c. */ builder->xmin = running->oldestRunningXid; + builder->csnSnapshotData.snapshotcsn = running->csn; + builder->csnSnapshotData.xmin = 0; + builder->csnSnapshotData.xlogptr = lsn; /* Remove transactions we don't need to keep track off anymore */ SnapBuildPurgeOlderTxn(builder); @@ -2174,3 +2199,10 @@ SnapBuildSnapshotExists(XLogRecPtr lsn) return ret == 0; } + +void +SnapBuildUpdateCSNSnaphot(SnapBuild *builder, + CSNSnapshotData *csnSnapshotData) +{ + builder->csnSnapshotData = *csnSnapshotData; +} diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index d091a1dd27c..f5a83e52eef 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -2417,9 +2417,8 @@ apply_handle_insert(StringInfo s) /* Initialize the executor state. */ edata = create_edata_for_relation(rel); estate = edata->estate; - remoteslot = ExecInitExtraTupleSlot(estate, - RelationGetDescr(rel->localrel), - &TTSOpsVirtual); + remoteslot = table_slot_create(rel->localrel, + &estate->es_tupleTable); /* Process and store remote tuple in the slot */ oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); @@ -2573,9 +2572,8 @@ apply_handle_update(StringInfo s) /* Initialize the executor state. 
*/ edata = create_edata_for_relation(rel); estate = edata->estate; - remoteslot = ExecInitExtraTupleSlot(estate, - RelationGetDescr(rel->localrel), - &TTSOpsVirtual); + remoteslot = table_slot_create(rel->localrel, + &estate->es_tupleTable); /* * Populate updatedCols so that per-column triggers can fire, and so @@ -2753,9 +2751,8 @@ apply_handle_delete(StringInfo s) /* Initialize the executor state. */ edata = create_edata_for_relation(rel); estate = edata->estate; - remoteslot = ExecInitExtraTupleSlot(estate, - RelationGetDescr(rel->localrel), - &TTSOpsVirtual); + remoteslot = table_slot_create(rel->localrel, + &estate->es_tupleTable); /* Build the search tuple. */ oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); diff --git a/src/backend/replication/pgoutput/pgoutput.c b/src/backend/replication/pgoutput/pgoutput.c index 00e7024563e..e6a4f0063a1 100644 --- a/src/backend/replication/pgoutput/pgoutput.c +++ b/src/backend/replication/pgoutput/pgoutput.c @@ -1320,8 +1320,8 @@ pgoutput_row_filter(Relation relation, TupleTableSlot *old_slot, * VARTAG_INDIRECT. See ReorderBufferToastReplace. 
*/ if (att->attlen == -1 && - VARATT_IS_EXTERNAL_ONDISK(new_slot->tts_values[i]) && - !VARATT_IS_EXTERNAL_ONDISK(old_slot->tts_values[i])) + (VARATT_IS_EXTERNAL_ONDISK(new_slot->tts_values[i]) || VARATT_IS_EXTERNAL_ORIOLEDB(new_slot->tts_values[i])) && + !(VARATT_IS_EXTERNAL_ONDISK(old_slot->tts_values[i]) || VARATT_IS_EXTERNAL_ORIOLEDB(old_slot->tts_values[i])) ) { if (!tmp_new_slot) { diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index c3181e3295e..71be0e15f61 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -327,6 +327,7 @@ void WalSndErrorCleanup(void) { LWLockReleaseAll(); + CustomErrorCleanup(); ConditionVariableCancelSleep(); pgstat_report_wait_end(); diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index 6d59a2bb8dc..e9696b52d9f 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -23,6 +23,7 @@ #include "access/relation.h" #include "access/sysattr.h" #include "access/table.h" +#include "access/tableam.h" #include "catalog/dependency.h" #include "commands/trigger.h" #include "executor/executor.h" diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 61816730955..e179056de9f 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -3086,6 +3086,7 @@ BufferSync(int flags) BufferDesc *bufHdr = NULL; CkptTsStatus *ts_stat = (CkptTsStatus *) DatumGetPointer(binaryheap_first(ts_heap)); + double progress; buf_id = CkptBufferIds[ts_stat->index].buf_id; Assert(buf_id != -1); @@ -3140,7 +3141,10 @@ BufferSync(int flags) * * (This will check for barrier events even if it doesn't sleep.) 
*/ - CheckpointWriteDelay(flags, (double) num_processed / num_to_scan); + progress = (double) num_processed / num_to_scan; + progress = CheckPointProgress + progress * (1 - CheckPointProgress); + + CheckpointWriteDelay(flags, progress); } /* diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 9fc930e98f8..e7b083ee7a7 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -306,6 +306,8 @@ static GlobalVisState GlobalVisTempRels; */ static TransactionId ComputeXidHorizonsResultLastXmin; +snapshot_hook_type snapshot_hook = NULL; + #ifdef XIDCACHE_DEBUG /* counters for XidCache measurement */ @@ -749,6 +751,7 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid) proc->delayChkptFlags = 0; proc->recoveryConflictPending = false; + proc->lastCommittedCSN = pg_atomic_fetch_add_u64(&TransamVariables->nextCommitSeqNo, 1); /* must be cleared with xid/xmin: */ /* avoid unnecessarily dirtying shared cachelines */ @@ -2234,6 +2237,8 @@ GetSnapshotData(Snapshot snapshot) if (GetSnapshotDataReuse(snapshot)) { + if (snapshot_hook) + snapshot_hook(snapshot); LWLockRelease(ProcArrayLock); return snapshot; } @@ -2415,6 +2420,9 @@ GetSnapshotData(Snapshot snapshot) if (!TransactionIdIsValid(MyProc->xmin)) MyProc->xmin = TransactionXmin = xmin; + if (snapshot_hook) + snapshot_hook(snapshot); + LWLockRelease(ProcArrayLock); /* maintain state for GlobalVis* */ @@ -2850,6 +2858,7 @@ GetRunningTransactionData(void) CurrentRunningXacts->oldestRunningXid = oldestRunningXid; CurrentRunningXacts->oldestDatabaseRunningXid = oldestDatabaseRunningXid; CurrentRunningXacts->latestCompletedXid = latestCompletedXid; + CurrentRunningXacts->csn = pg_atomic_read_u64(&TransamVariables->nextCommitSeqNo); Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid)); Assert(TransactionIdIsValid(CurrentRunningXacts->oldestRunningXid)); diff --git a/src/backend/storage/ipc/standby.c 
b/src/backend/storage/ipc/standby.c index 872679ca447..17ddeb893c6 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -1353,6 +1353,7 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) xlrec.nextXid = CurrRunningXacts->nextXid; xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid; xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid; + xlrec.csn = CurrRunningXacts->csn; /* Header */ XLogBeginInsert(); diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 0400a507779..98421b6dda5 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -635,6 +635,27 @@ GetLockMethodLocalHash(void) } #endif +/* + * Returns true if any LOCKMODE lock with given locktag exist in LocalMethodLocalHash. + */ +bool +DoLocalLockExist(const LOCKTAG *locktag) +{ + HASH_SEQ_STATUS scan_status; + LOCALLOCK* locallock; + + hash_seq_init(&scan_status, LockMethodLocalHash); + while ((locallock = (LOCALLOCK *) hash_seq_search(&scan_status)) != NULL) + { + if (memcmp(&locallock->tag.lock, locktag, sizeof(LOCKTAG)) == 0) + { + hash_seq_term(&scan_status); + return true; + } + } + return false; +} + /* * LockHasWaiters -- look up 'locktag' and check if releasing this * lock would wake up other processes waiting for it. 
@@ -784,7 +805,7 @@ LockAcquireExtended(const LOCKTAG *locktag, bool reportMemoryError, LOCALLOCK **locallockp) { - LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid; + LOCKMETHODID lockmethodid; LockMethod lockMethodTable; LOCALLOCKTAG localtag; LOCALLOCK *locallock; @@ -796,6 +817,15 @@ LockAcquireExtended(const LOCKTAG *locktag, LWLock *partitionLock; bool found_conflict; bool log_lock = false; + bool no_log_lock = false; + + if (locktag->locktag_lockmethodid == NO_LOG_LOCKMETHOD) + { + ((LOCKTAG *)locktag)->locktag_lockmethodid = DEFAULT_LOCKMETHOD; + no_log_lock = true; + } + + lockmethodid = locktag->locktag_lockmethodid; if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) elog(ERROR, "unrecognized lock method: %d", lockmethodid); @@ -910,7 +940,8 @@ LockAcquireExtended(const LOCKTAG *locktag, if (lockmode >= AccessExclusiveLock && locktag->locktag_type == LOCKTAG_RELATION && !RecoveryInProgress() && - XLogStandbyInfoActive()) + XLogStandbyInfoActive() && + !no_log_lock) { LogAccessExclusiveLockPrepare(); log_lock = true; @@ -1087,6 +1118,8 @@ LockAcquireExtended(const LOCKTAG *locktag, */ if (!(proclock->holdMask & LOCKBIT_ON(lockmode))) { + int i; + AbortStrongLockAcquire(); if (dontWait) @@ -1136,7 +1169,27 @@ LockAcquireExtended(const LOCKTAG *locktag, PROCLOCK_PRINT("LockAcquire: INCONSISTENT", proclock); LOCK_PRINT("LockAcquire: INCONSISTENT", lock, lockmode); LWLockRelease(partitionLock); - elog(ERROR, "LockAcquire failed"); + /* + * We've been removed from the queue without obtaining a lock. + * That's OK, we're going to return LOCKACQUIRE_NOT_AVAIL, but + * need to release a local lock first. 
+ */ + locallock->nLocks--; + for (i = 0; i < locallock->numLockOwners; i++) + { + if (locallock->lockOwners[i].owner == owner) + { + locallock->lockOwners[i].nLocks--; + if (locallock->lockOwners[i].nLocks == 0) + { + ResourceOwnerForgetLock(owner, locallock); + locallock->lockOwners[i] = locallock->lockOwners[--locallock->numLockOwners]; + } + break; + } + } + + return LOCKACQUIRE_NOT_AVAIL; } } PROCLOCK_PRINT("LockAcquire: granted", proclock); @@ -4646,8 +4699,8 @@ VirtualXactLock(VirtualTransactionId vxid, bool wait) LWLockRelease(&proc->fpInfoLock); /* Time to wait. */ - (void) LockAcquire(&tag, ShareLock, false, false); - + if (LockAcquire(&tag, ShareLock, false, false) == LOCKACQUIRE_NOT_AVAIL) + return false; LockRelease(&tag, ShareLock, false); return XactLockForVirtualXact(vxid, xid, wait); } diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index ce29da90121..bbfafd2a73e 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -861,6 +861,7 @@ ProcKill(int code, Datum arg) * facility by releasing our PGPROC ... */ LWLockReleaseAll(); + CustomErrorCleanup(); /* Cancel any pending condition variable sleep, too */ ConditionVariableCancelSleep(); @@ -982,6 +983,7 @@ AuxiliaryProcKill(int code, Datum arg) /* Release any LW locks I am holding (see notes above) */ LWLockReleaseAll(); + CustomErrorCleanup(); /* Cancel any pending condition variable sleep, too */ ConditionVariableCancelSleep(); @@ -1251,7 +1253,7 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable, bool dontWait) * If InHotStandby we set lock waits slightly later for clarity with other * code. */ - if (!InHotStandby) + if (!InHotStandby && !InRecovery) { if (LockTimeout > 0) { @@ -1611,7 +1613,7 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable, bool dontWait) * already caused QueryCancelPending to become set, we want the cancel to * be reported as a lock timeout, not a user cancel. 
*/ - if (!InHotStandby) + if (!InHotStandby && !InRecovery) { if (LockTimeout > 0) { diff --git a/src/backend/utils/adt/amutils.c b/src/backend/utils/adt/amutils.c index dd39a994c8d..b7ebe6a5f76 100644 --- a/src/backend/utils/adt/amutils.c +++ b/src/backend/utils/adt/amutils.c @@ -195,7 +195,7 @@ indexam_property(FunctionCallInfo fcinfo, /* * Get AM information. If we don't have a valid AM OID, return NULL. */ - routine = GetIndexAmRoutineByAmId(amoid, true); + routine = GetIndexAmRoutineByAmId(index_oid, amoid, true); if (routine == NULL) PG_RETURN_NULL(); @@ -455,7 +455,7 @@ pg_indexam_progress_phasename(PG_FUNCTION_ARGS) IndexAmRoutine *routine; char *name; - routine = GetIndexAmRoutineByAmId(amoid, true); + routine = GetIndexAmRoutineByAmId(InvalidOid, amoid, true); if (routine == NULL || !routine->ambuildphasename) PG_RETURN_NULL(); diff --git a/src/backend/utils/adt/lockfuncs.c b/src/backend/utils/adt/lockfuncs.c index e790f856ab3..b26e51246c1 100644 --- a/src/backend/utils/adt/lockfuncs.c +++ b/src/backend/utils/adt/lockfuncs.c @@ -16,8 +16,11 @@ #include "funcapi.h" #include "miscadmin.h" #include "storage/predicate_internals.h" +#include "storage/proc.h" +#include "storage/procarray.h" #include "utils/array.h" #include "utils/builtins.h" +#include "utils/wait_event.h" /* diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 1c57f12695e..b3b396b6a78 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -133,6 +133,7 @@ typedef struct static HTAB *collation_cache = NULL; +pg_newlocale_from_collation_hook_type pg_newlocale_from_collation_hook = NULL; #if defined(WIN32) && defined(LC_MESSAGES) static char *IsoLocaleName(const char *); @@ -1673,6 +1674,7 @@ pg_newlocale_from_collation(Oid collid) { char *actual_versionstr; char *collversionstr; + int level = WARNING; collversionstr = TextDatumGetCString(datum); @@ -1695,8 +1697,11 @@ pg_newlocale_from_collation(Oid collid) 
NameStr(collform->collname)))); } + if (pg_newlocale_from_collation_hook && pg_newlocale_from_collation_hook()) + level = ERROR; + if (strcmp(actual_versionstr, collversionstr) != 0) - ereport(WARNING, + ereport(level, (errmsg("collation \"%s\" has version mismatch", NameStr(collform->collname)), errdetail("The collation in the database was created using version %s, " diff --git a/src/backend/utils/adt/ri_triggers.c b/src/backend/utils/adt/ri_triggers.c index 62601a6d80c..9760febe7cc 100644 --- a/src/backend/utils/adt/ri_triggers.c +++ b/src/backend/utils/adt/ri_triggers.c @@ -244,6 +244,7 @@ RI_FKey_check(TriggerData *trigdata) TupleTableSlot *newslot; RI_QueryKey qkey; SPIPlanPtr qplan; + Relation rel = trigdata->tg_relation; riinfo = ri_FetchConstraintInfo(trigdata->tg_trigger, trigdata->tg_relation, false); @@ -261,7 +262,7 @@ RI_FKey_check(TriggerData *trigdata) * and lock on the buffer to call HeapTupleSatisfiesVisibility. Caller * should be holding pin, but not lock. */ - if (!table_tuple_satisfies_snapshot(trigdata->tg_relation, newslot, SnapshotSelf)) + if (!table_tuple_satisfies_snapshot(rel, newslot, SnapshotSelf)) return PointerGetDatum(NULL); /* @@ -1327,7 +1328,7 @@ RI_FKey_fk_upd_check_required(Trigger *trigger, Relation fk_rel, * this if we knew the INSERT trigger already fired, but there is no easy * way to know that.) 
*/ - if (slot_is_current_xact_tuple(oldslot)) + if (table_tuple_is_current(fk_rel, oldslot)) return true; /* If all old and new key values are equal, no check is needed */ diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index cd9c3eddd1d..216db91f335 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -1323,7 +1323,7 @@ pg_get_indexdef_worker(Oid indexrelid, int colno, amrec = (Form_pg_am) GETSTRUCT(ht_am); /* Fetch the index AM's API struct */ - amroutine = GetIndexAmRoutine(amrec->amhandler); + amroutine = GetIndexAmRoutineExtended(indexrelid, amrec->amhandler); /* * Get the index expressions, if any. (NOTE: we do not use the relcache diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 5f5d7959d8e..884d12da88c 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -6339,12 +6339,32 @@ get_actual_variable_endpoint(Relation heapRel, index_scan->xs_want_itup = true; index_rescan(index_scan, scankeys, 1, NULL, 0); - /* Fetch first/next tuple in specified direction */ - while ((tid = index_getnext_tid(index_scan, indexscandir)) != NULL) + while (true) { - BlockNumber block = ItemPointerGetBlockNumber(tid); + BlockNumber block = InvalidBlockNumber; - if (!VM_ALL_VISIBLE(heapRel, + /* Fetch first/next tuple in specified direction */ + if (index_scan->xs_want_rowid) + { + NullableDatum rowid; + rowid = index_getnext_rowid(index_scan, indexscandir); + + if (rowid.isnull) + break; + } + else + { + tid = index_getnext_tid(index_scan, indexscandir); + + if (tid == NULL) + break; + + Assert(ItemPointerEquals(tid, &index_scan->xs_heaptid)); + block = ItemPointerGetBlockNumber(tid); + } + + if (!index_scan->xs_want_rowid && + !VM_ALL_VISIBLE(heapRel, block, &vmbuffer)) { diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index d2e2e9bbba0..66625735b21 100644 --- a/src/backend/utils/adt/varlena.c +++ 
b/src/backend/utils/adt/varlena.c @@ -5139,7 +5139,7 @@ pg_column_toast_chunk_id(PG_FUNCTION_ARGS) attr = (struct varlena *) DatumGetPointer(PG_GETARG_DATUM(0)); - if (!VARATT_IS_EXTERNAL_ONDISK(attr)) + if (!(VARATT_IS_EXTERNAL_ONDISK(attr) || VARATT_IS_EXTERNAL_ORIOLEDB(attr))) PG_RETURN_NULL(); VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); diff --git a/src/backend/utils/adt/waitfuncs.c b/src/backend/utils/adt/waitfuncs.c index e135c9e5e45..c68b36121e3 100644 --- a/src/backend/utils/adt/waitfuncs.c +++ b/src/backend/utils/adt/waitfuncs.c @@ -38,6 +38,7 @@ Datum pg_isolation_test_session_is_blocked(PG_FUNCTION_ARGS) { + PGPROC *blocked_proc; int blocked_pid = PG_GETARG_INT32(0); ArrayType *interesting_pids_a = PG_GETARG_ARRAYTYPE_P(1); PGPROC *proc; @@ -109,5 +110,9 @@ pg_isolation_test_session_is_blocked(PG_FUNCTION_ARGS) if (GetSafeSnapshotBlockingPids(blocked_pid, &dummy, 1) > 0) PG_RETURN_BOOL(true); + blocked_proc = BackendPidGetProc(blocked_pid); + if ((blocked_proc->wait_event_info & 0xFF000000) == PG_WAIT_EXTENSION) + PG_RETURN_BOOL(true); + PG_RETURN_BOOL(false); } diff --git a/src/backend/utils/cache/catcache.c b/src/backend/utils/cache/catcache.c index 111d8a280a0..1b2ad91424c 100644 --- a/src/backend/utils/cache/catcache.c +++ b/src/backend/utils/cache/catcache.c @@ -38,6 +38,7 @@ #include "utils/memutils.h" #include "utils/rel.h" #include "utils/resowner.h" +#include "utils/resowner_private.h" #include "utils/syscache.h" @@ -64,6 +65,10 @@ /* Cache management header --- pointer is NULL until created */ static CatCacheHeader *CacheHdr = NULL; +SearchCatCacheInternal_hook_type SearchCatCacheInternal_hook = NULL; +SearchCatCacheList_hook_type SearchCatCacheList_hook = NULL; +GetCatCacheHashValue_hook_type GetCatCacheHashValue_hook = NULL; + static inline HeapTuple SearchCatCacheInternal(CatCache *cache, int nkeys, Datum v1, Datum v2, @@ -137,7 +142,7 @@ static const ResourceOwnerDesc catlistref_resowner_desc = }; /* Convenience wrappers over 
ResourceOwnerRemember/Forget */ -static inline void +void ResourceOwnerRememberCatCacheRef(ResourceOwner owner, HeapTuple tuple) { ResourceOwnerRemember(owner, PointerGetDatum(tuple), &catcache_resowner_desc); @@ -147,7 +152,7 @@ ResourceOwnerForgetCatCacheRef(ResourceOwner owner, HeapTuple tuple) { ResourceOwnerForget(owner, PointerGetDatum(tuple), &catcache_resowner_desc); } -static inline void +void ResourceOwnerRememberCatCacheListRef(ResourceOwner owner, CatCList *list) { ResourceOwnerRemember(owner, PointerGetDatum(list), &catlistref_resowner_desc); @@ -1324,6 +1329,14 @@ SearchCatCacheInternal(CatCache *cache, dlist_head *bucket; CatCTup *ct; + if (SearchCatCacheInternal_hook) + { + ct = SearchCatCacheInternal_hook(cache, nkeys, v1, v2, v3, v4); + + if (ct) + return &ct->tuple; + } + /* Make sure we're in an xact, even if this ends up being a cache hit */ Assert(IsTransactionState()); @@ -1616,6 +1629,11 @@ GetCatCacheHashValue(CatCache *cache, Datum v3, Datum v4) { + if (GetCatCacheHashValue_hook) + { + return GetCatCacheHashValue_hook(cache, cache->cc_nkeys, + v1, v2, v3, v4); + } /* * one-time startup overhead for each cache */ @@ -1666,6 +1684,14 @@ SearchCatCacheList(CatCache *cache, MemoryContext oldcxt; int i; + if (SearchCatCacheList_hook) + { + cl = SearchCatCacheList_hook(cache, nkeys, v1, v2, v3); + + if (cl) + return cl; + } + /* * one-time startup overhead for each cache */ diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c index 603aa4157be..4b779ccd951 100644 --- a/src/backend/utils/cache/inval.c +++ b/src/backend/utils/cache/inval.c @@ -251,6 +251,7 @@ int debug_discard_caches = 0; #define MAX_SYSCACHE_CALLBACKS 64 #define MAX_RELCACHE_CALLBACKS 10 +#define MAX_USERCACHE_CALLBACKS 10 static struct SYSCACHECALLBACK { @@ -272,6 +273,14 @@ static struct RELCACHECALLBACK static int relcache_callback_count = 0; +static struct USERCACHECALLBACK +{ + UsercacheCallbackFunction function; + Datum arg; +} 
usercache_callback_list[MAX_RELCACHE_CALLBACKS]; + +static int usercache_callback_count = 0; + /* ---------------------------------------------------------------- * Invalidation subgroup support functions * ---------------------------------------------------------------- @@ -692,6 +701,16 @@ InvalidateSystemCachesExtended(bool debug_discard) ccitem->function(ccitem->arg, InvalidOid); } + + for (i = 0; i < usercache_callback_count; i++) + { + struct USERCACHECALLBACK *ccitem = usercache_callback_list + i; + + ccitem->function(ccitem->arg, + InvalidOid, + InvalidOid, + InvalidOid); + } } /* @@ -773,6 +792,19 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg) else if (msg->sn.dbId == MyDatabaseId) InvalidateCatalogSnapshot(); } + else if (msg->id == SHAREDINVALUSERCACHE_ID) + { + int i; + for (i = 0; i < usercache_callback_count; i++) + { + struct USERCACHECALLBACK *ccitem = usercache_callback_list + i; + + ccitem->function(ccitem->arg, + msg->usr.arg1, + msg->usr.arg2, + msg->usr.arg3); + } + } else elog(FATAL, "unrecognized SI message ID: %d", msg->id); } @@ -1429,6 +1461,25 @@ CacheInvalidateRelcacheByRelid(Oid relid) ReleaseSysCache(tup); } +/* + * CacheInvalidateRelcacheByDbidRelid + */ +void +CacheInvalidateRelcacheByDbidRelid(Oid dbid, Oid relid) +{ + SharedInvalidationMessage msg; + + PrepareInvalidationState(); + + msg.rc.id = SHAREDINVALRELCACHE_ID; + msg.rc.dbId = dbid; + msg.rc.relId = relid; + /* check AddCatcacheInvalidationMessage() for an explanation */ + VALGRIND_MAKE_MEM_DEFINED(&msg, sizeof(msg)); + + SendSharedInvalidMessages(&msg, 1); +} + /* * CacheInvalidateSmgr @@ -1567,6 +1618,22 @@ CacheRegisterRelcacheCallback(RelcacheCallbackFunction func, ++relcache_callback_count; } +/* + * CacheRegisterUsercacheCallback + */ +void +CacheRegisterUsercacheCallback(UsercacheCallbackFunction func, + Datum arg) +{ + if (usercache_callback_count >= MAX_USERCACHE_CALLBACKS) + elog(FATAL, "out of usercache_callback_list slots"); + + 
usercache_callback_list[usercache_callback_count].function = func; + usercache_callback_list[usercache_callback_count].arg = arg; + + ++usercache_callback_count; +} + /* * CallSyscacheCallbacks * diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 930cc03ee20..33bd7bcda8f 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -33,6 +33,7 @@ #include "access/htup_details.h" #include "access/multixact.h" #include "access/parallel.h" +#include "access/relation.h" #include "access/reloptions.h" #include "access/sysattr.h" #include "access/table.h" @@ -319,6 +320,7 @@ static OpClassCacheEnt *LookupOpclassInfo(Oid operatorClassOid, StrategyNumber numSupport); static void RelationCacheInitFileRemoveInDir(const char *tblspcpath); static void unlink_initfile(const char *initfilename, int elevel); +static void release_rd_amcache(Relation rel); /* @@ -463,8 +465,9 @@ AllocateRelationDesc(Form_pg_class relp) static void RelationParseRelOptions(Relation relation, HeapTuple tuple) { - bytea *options; - amoptions_function amoptsfn; + bytea *options; + amoptions_function amoptsfn; + const TableAmRoutine *tableam = NULL; relation->rd_options = NULL; @@ -476,9 +479,10 @@ RelationParseRelOptions(Relation relation, HeapTuple tuple) { case RELKIND_RELATION: case RELKIND_TOASTVALUE: - case RELKIND_VIEW: case RELKIND_MATVIEW: + case RELKIND_VIEW: case RELKIND_PARTITIONED_TABLE: + tableam = relation->rd_tableam; amoptsfn = NULL; break; case RELKIND_INDEX: @@ -490,11 +494,12 @@ RelationParseRelOptions(Relation relation, HeapTuple tuple) } /* - * Fetch reloptions from tuple; have to use a hardwired descriptor because - * we might not have any other for pg_class yet (consider executing this - * code for pg_class itself) - */ - options = extractRelOptions(tuple, GetPgClassDescriptor(), amoptsfn); + * Fetch reloptions from tuple; have to use a hardwired descriptor because + * we might not have any other for pg_class 
yet (consider executing this + * code for pg_class itself) + */ + options = extractRelOptions(tuple, GetPgClassDescriptor(), + tableam, amoptsfn); /* * Copy parsed data into CacheMemoryContext. To guard against the @@ -1408,7 +1413,7 @@ InitIndexAmRoutine(Relation relation) * Call the amhandler in current, short-lived memory context, just in case * it leaks anything (it probably won't, but let's be paranoid). */ - tmp = GetIndexAmRoutine(relation->rd_amhandler); + tmp = GetIndexAmRoutineExtended(relation->rd_id, relation->rd_amhandler); /* OK, now transfer the data into relation's rd_indexcxt. */ cached = (IndexAmRoutine *) MemoryContextAlloc(relation->rd_indexcxt, @@ -2270,9 +2275,7 @@ RelationReloadIndexInfo(Relation relation) RelationCloseSmgr(relation); /* Must free any AM cached data upon relcache flush */ - if (relation->rd_amcache) - pfree(relation->rd_amcache); - relation->rd_amcache = NULL; + release_rd_amcache(relation); /* * If it's a shared index, we might be called before backend startup has @@ -2492,8 +2495,7 @@ RelationDestroyRelation(Relation relation, bool remember_tupdesc) pfree(relation->rd_options); if (relation->rd_indextuple) pfree(relation->rd_indextuple); - if (relation->rd_amcache) - pfree(relation->rd_amcache); + release_rd_amcache(relation); if (relation->rd_fdwroutine) pfree(relation->rd_fdwroutine); if (relation->rd_indexcxt) @@ -2580,9 +2582,7 @@ RelationClearRelation(Relation relation, bool rebuild) RelationCloseSmgr(relation); /* Free AM cached data, if any */ - if (relation->rd_amcache) - pfree(relation->rd_amcache); - relation->rd_amcache = NULL; + release_rd_amcache(relation); /* * Treat nailed-in system relations separately, they always need to be @@ -6896,3 +6896,9 @@ ResOwnerReleaseRelation(Datum res) RelationCloseCleanup((Relation) res); } + +static void +release_rd_amcache(Relation rel) +{ + table_free_rd_amcache(rel); +} diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c index 
3e03dfc9910..802ec4b218f 100644 --- a/src/backend/utils/cache/syscache.c +++ b/src/backend/utils/cache/syscache.c @@ -94,6 +94,7 @@ static int SysCacheSupportingRelOidSize; static int oid_compare(const void *a, const void *b); +SysCacheGetAttr_hook_type SysCacheGetAttr_hook = NULL; /* * InitCatalogCache - initialize the caches @@ -480,6 +481,7 @@ SysCacheGetAttr(int cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull) { + TupleDesc cc_tupdesc = SysCache[cacheId]->cc_tupdesc; /* * We just need to get the TupleDesc out of the cache entry, and then we * can apply heap_getattr(). Normally the cache control data is already @@ -489,14 +491,18 @@ SysCacheGetAttr(int cacheId, HeapTuple tup, if (cacheId < 0 || cacheId >= SysCacheSize || !PointerIsValid(SysCache[cacheId])) elog(ERROR, "invalid cache ID: %d", cacheId); - if (!PointerIsValid(SysCache[cacheId]->cc_tupdesc)) + + if (!PointerIsValid(cc_tupdesc) && SysCacheGetAttr_hook) + cc_tupdesc = SysCacheGetAttr_hook(SysCache[cacheId]); + if (!PointerIsValid(cc_tupdesc)) { InitCatCachePhase2(SysCache[cacheId], false); Assert(PointerIsValid(SysCache[cacheId]->cc_tupdesc)); + cc_tupdesc = SysCache[cacheId]->cc_tupdesc; } return heap_getattr(tup, attributeNumber, - SysCache[cacheId]->cc_tupdesc, + cc_tupdesc, isNull); } diff --git a/src/backend/utils/cache/typcache.c b/src/backend/utils/cache/typcache.c index aa4720cb598..b18e50df27d 100644 --- a/src/backend/utils/cache/typcache.c +++ b/src/backend/utils/cache/typcache.c @@ -292,6 +292,8 @@ static int32 NextRecordTypmod = 0; /* number of entries used */ * as identifiers, so we start the counter at INVALID_TUPLEDESC_IDENTIFIER. 
*/ static uint64 tupledesc_id_counter = INVALID_TUPLEDESC_IDENTIFIER; +load_typcache_tupdesc_hook_type load_typcache_tupdesc_hook = NULL; +load_enum_cache_data_hook_type load_enum_cache_data_hook = NULL; static void load_typcache_tupdesc(TypeCacheEntry *typentry); static void load_rangetype_info(TypeCacheEntry *typentry); @@ -881,6 +883,12 @@ load_typcache_tupdesc(TypeCacheEntry *typentry) { Relation rel; + if (load_typcache_tupdesc_hook) + { + load_typcache_tupdesc_hook(typentry); + return; + } + if (!OidIsValid(typentry->typrelid)) /* should not happen */ elog(ERROR, "invalid typrelid for composite type %u", typentry->type_id); @@ -2563,6 +2571,12 @@ load_enum_cache_data(TypeCacheEntry *tcache) int bm_size, start_pos; + if (load_enum_cache_data_hook) + { + load_enum_cache_data_hook(tcache); + return; + } + /* Check that this is actually an enum */ if (tcache->typtype != TYPTYPE_ENUM) ereport(ERROR, diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c index b924b524d0b..9524530282e 100644 --- a/src/backend/utils/error/elog.c +++ b/src/backend/utils/error/elog.c @@ -3770,3 +3770,12 @@ write_stderr(const char *fmt,...) #endif va_end(ap); } + +CustomErrorCleanupHookType CustomErrorCleanupHook = NULL; + +void +CustomErrorCleanup(void) +{ + if (CustomErrorCleanupHook) + CustomErrorCleanupHook(); +} diff --git a/src/backend/utils/fmgr/fmgr.c b/src/backend/utils/fmgr/fmgr.c index e48a86be54b..5b7888c705f 100644 --- a/src/backend/utils/fmgr/fmgr.c +++ b/src/backend/utils/fmgr/fmgr.c @@ -72,7 +72,7 @@ extern Datum fmgr_security_definer(PG_FUNCTION_ARGS); * or name, but search by Oid is much faster. */ -static const FmgrBuiltin * +const FmgrBuiltin * fmgr_isbuiltin(Oid id) { uint16 index; @@ -97,7 +97,7 @@ fmgr_isbuiltin(Oid id) * the array with the same name, but they should all point to the same * routine. 
*/ -static const FmgrBuiltin * +const FmgrBuiltin * fmgr_lookupByName(const char *name) { int i; diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 0805398e24d..e4f9e14a91e 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -81,7 +81,7 @@ static void ClientCheckTimeoutHandler(void); static bool ThereIsAtLeastOneRole(void); static void process_startup_options(Port *port, bool am_superuser); static void process_settings(Oid databaseid, Oid roleid); - +base_init_startup_hook_type base_init_startup_hook = NULL; /*** InitPostgres support ***/ @@ -657,6 +657,9 @@ BaseInit(void) */ InitFileAccess(); + if (base_init_startup_hook) + base_init_startup_hook(); + /* * Initialize statistics reporting. This needs to happen early to ensure * that pgstat's shutdown callback runs after the shutdown callbacks of diff --git a/src/backend/utils/sort/tuplestore.c b/src/backend/utils/sort/tuplestore.c index 947a868e569..d3a41533552 100644 --- a/src/backend/utils/sort/tuplestore.c +++ b/src/backend/utils/sort/tuplestore.c @@ -1100,6 +1100,36 @@ tuplestore_gettupleslot(Tuplestorestate *state, bool forward, } } +/* + * Same as tuplestore_gettupleslot(), but foces tuple storage to slot. Thus, + * it can work with slot types different than minimal tuple. 
+ */ +bool +tuplestore_force_gettupleslot(Tuplestorestate *state, bool forward, + bool copy, TupleTableSlot *slot) +{ + MinimalTuple tuple; + bool should_free; + + tuple = (MinimalTuple) tuplestore_gettuple(state, forward, &should_free); + + if (tuple) + { + if (copy && !should_free) + { + tuple = heap_copy_minimal_tuple(tuple); + should_free = true; + } + ExecForceStoreMinimalTuple(tuple, slot, should_free); + return true; + } + else + { + ExecClearTuple(slot); + return false; + } +} + /* * tuplestore_advance - exported function to adjust position without fetching * diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 7d2b34d4f20..df9f4394f07 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -101,6 +101,10 @@ TransactionId RecentXmin = FirstNormalTransactionId; /* (table, ctid) => (cmin, cmax) mapping during timetravel */ static HTAB *tuplecid_data = NULL; +snapshot_hook_type snapshot_register_hook = NULL; +snapshot_hook_type snapshot_deregister_hook = NULL; +reset_xmin_hook_type reset_xmin_hook = NULL; + /* * Elements of the active snapshot stack. * @@ -201,6 +205,11 @@ typedef struct SerializedSnapshotData CommandId curcid; TimestampTz whenTaken; XLogRecPtr lsn; + CSNSnapshotData csnSnapshotData; + uint64 undoRegularLocation; + uint64 undoRegularXmin; + uint64 undoSystemLocation; + uint64 undoSystemXmin; } SerializedSnapshotData; /* @@ -263,6 +272,8 @@ GetTransactionSnapshot(void) /* Mark it as "registered" in FirstXactSnapshot */ FirstXactSnapshot->regd_count++; pairingheap_add(&RegisteredSnapshots, &FirstXactSnapshot->ph_node); + if (snapshot_register_hook) + snapshot_register_hook(FirstXactSnapshot); } else CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData); @@ -403,6 +414,8 @@ GetNonHistoricCatalogSnapshot(Oid relid) * CatalogSnapshot pointer is already valid. 
*/ pairingheap_add(&RegisteredSnapshots, &CatalogSnapshot->ph_node); + if (snapshot_register_hook) + snapshot_register_hook(CatalogSnapshot); } return CatalogSnapshot; @@ -424,6 +437,8 @@ InvalidateCatalogSnapshot(void) if (CatalogSnapshot) { pairingheap_remove(&RegisteredSnapshots, &CatalogSnapshot->ph_node); + if (snapshot_deregister_hook) + snapshot_deregister_hook(CatalogSnapshot); CatalogSnapshot = NULL; SnapshotResetXmin(); } @@ -501,6 +516,7 @@ SetTransactionSnapshot(Snapshot sourcesnap, VirtualTransactionId *sourcevxid, CurrentSnapshot->xmin = sourcesnap->xmin; CurrentSnapshot->xmax = sourcesnap->xmax; CurrentSnapshot->xcnt = sourcesnap->xcnt; + CurrentSnapshot->csnSnapshotData = sourcesnap->csnSnapshotData; Assert(sourcesnap->xcnt <= GetMaxSnapshotXidCount()); if (sourcesnap->xcnt > 0) memcpy(CurrentSnapshot->xip, sourcesnap->xip, @@ -558,6 +574,8 @@ SetTransactionSnapshot(Snapshot sourcesnap, VirtualTransactionId *sourcevxid, /* Mark it as "registered" in FirstXactSnapshot */ FirstXactSnapshot->regd_count++; pairingheap_add(&RegisteredSnapshots, &FirstXactSnapshot->ph_node); + if (snapshot_register_hook) + snapshot_register_hook(FirstXactSnapshot); } FirstSnapshotSet = true; @@ -820,7 +838,11 @@ RegisterSnapshotOnOwner(Snapshot snapshot, ResourceOwner owner) ResourceOwnerRememberSnapshot(owner, snap); if (snap->regd_count == 1) + { pairingheap_add(&RegisteredSnapshots, &snap->ph_node); + if (snapshot_register_hook) + snapshot_register_hook(snap); + } return snap; } @@ -863,7 +885,11 @@ UnregisterSnapshotNoOwner(Snapshot snapshot) snapshot->regd_count--; if (snapshot->regd_count == 0) + { pairingheap_remove(&RegisteredSnapshots, &snapshot->ph_node); + if (snapshot_deregister_hook) + snapshot_deregister_hook(snapshot); + } if (snapshot->regd_count == 0 && snapshot->active_count == 0) { @@ -915,6 +941,9 @@ SnapshotResetXmin(void) { Snapshot minSnapshot; + if (reset_xmin_hook) + reset_xmin_hook(); + if (ActiveSnapshot != NULL) return; @@ -1008,6 +1037,8 @@ 
AtEOXact_Snapshot(bool isCommit, bool resetXmin) Assert(FirstXactSnapshot->regd_count > 0); Assert(!pairingheap_is_empty(&RegisteredSnapshots)); pairingheap_remove(&RegisteredSnapshots, &FirstXactSnapshot->ph_node); + if (snapshot_deregister_hook) + snapshot_deregister_hook(FirstXactSnapshot); } FirstXactSnapshot = NULL; @@ -1039,6 +1070,8 @@ AtEOXact_Snapshot(bool isCommit, bool resetXmin) pairingheap_remove(&RegisteredSnapshots, &esnap->snapshot->ph_node); + if (snapshot_deregister_hook) + snapshot_deregister_hook(esnap->snapshot); } exportedSnapshots = NIL; @@ -1167,6 +1200,8 @@ ExportSnapshot(Snapshot snapshot) snapshot->regd_count++; pairingheap_add(&RegisteredSnapshots, &snapshot->ph_node); + if (snapshot_register_hook) + snapshot_register_hook(snapshot); /* * Fill buf with a text serialization of the snapshot, plus identification @@ -1729,6 +1764,13 @@ SerializeSnapshot(Snapshot snapshot, char *start_address) serialized_snapshot.curcid = snapshot->curcid; serialized_snapshot.whenTaken = snapshot->whenTaken; serialized_snapshot.lsn = snapshot->lsn; + serialized_snapshot.csnSnapshotData.xmin = snapshot->csnSnapshotData.xmin; + serialized_snapshot.csnSnapshotData.snapshotcsn = snapshot->csnSnapshotData.snapshotcsn; + serialized_snapshot.csnSnapshotData.xlogptr = snapshot->csnSnapshotData.xlogptr; + serialized_snapshot.undoRegularXmin = snapshot->undoRegularLocationPhNode.xmin; + serialized_snapshot.undoRegularLocation = snapshot->undoRegularLocationPhNode.undoLocation; + serialized_snapshot.undoSystemXmin = snapshot->undoSystemLocationPhNode.xmin; + serialized_snapshot.undoSystemLocation = snapshot->undoSystemLocationPhNode.undoLocation; /* * Ignore the SubXID array if it has overflowed, unless the snapshot was @@ -1804,6 +1846,13 @@ RestoreSnapshot(char *start_address) snapshot->whenTaken = serialized_snapshot.whenTaken; snapshot->lsn = serialized_snapshot.lsn; snapshot->snapXactCompletionCount = 0; + snapshot->csnSnapshotData.xmin = 
serialized_snapshot.csnSnapshotData.xmin; + snapshot->csnSnapshotData.snapshotcsn = serialized_snapshot.csnSnapshotData.snapshotcsn; + snapshot->csnSnapshotData.xlogptr = serialized_snapshot.csnSnapshotData.xlogptr; + snapshot->undoRegularLocationPhNode.xmin = serialized_snapshot.undoRegularXmin; + snapshot->undoRegularLocationPhNode.undoLocation = serialized_snapshot.undoRegularLocation; + snapshot->undoSystemLocationPhNode.xmin = serialized_snapshot.undoSystemXmin; + snapshot->undoSystemLocationPhNode.undoLocation = serialized_snapshot.undoSystemLocation; /* Copy XIDs, if present. */ if (serialized_snapshot.xcnt > 0) diff --git a/src/bin/pg_rewind/Makefile b/src/bin/pg_rewind/Makefile index 12b138b2f2c..4f93864cf7e 100644 --- a/src/bin/pg_rewind/Makefile +++ b/src/bin/pg_rewind/Makefile @@ -21,6 +21,7 @@ LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils $(libpq_pgport) OBJS = \ $(WIN32RES) \ datapagemap.o \ + extension.o \ file_ops.o \ filemap.o \ libpq_source.o \ @@ -35,19 +36,21 @@ EXTRA_CLEAN = xlogreader.c all: pg_rewind pg_rewind: $(OBJS) | submake-libpq submake-libpgport - $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X) + $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LDFLAGS_EX_BE) $(LIBS) -o $@$(X) xlogreader.c: % : $(top_srcdir)/src/backend/access/transam/% rm -f $@ && $(LN_S) $< . 
install: all installdirs $(INSTALL_PROGRAM) pg_rewind$(X) '$(DESTDIR)$(bindir)/pg_rewind$(X)' + $(INSTALL_DATA) $(srcdir)/pg_rewind_ext.h '$(DESTDIR)$(includedir)' installdirs: - $(MKDIR_P) '$(DESTDIR)$(bindir)' + $(MKDIR_P) '$(DESTDIR)$(bindir)' '$(DESTDIR)$(includedir)' uninstall: rm -f '$(DESTDIR)$(bindir)/pg_rewind$(X)' + rm -f '$(DESTDIR)$(includedir)/pg_rewind_ext.h' clean distclean: rm -f pg_rewind$(X) $(OBJS) xlogreader.c diff --git a/src/bin/pg_rewind/extension.c b/src/bin/pg_rewind/extension.c new file mode 100644 index 00000000000..29ec4b5a6f6 --- /dev/null +++ b/src/bin/pg_rewind/extension.c @@ -0,0 +1,132 @@ +/*------------------------------------------------------------------------- + * + * extension.c + * Functions for processing shared libraries loaded by pg_rewind. + * + * Copyright (c) 2013-2023, PostgreSQL Global Development Group + * + *------------------------------------------------------------------------- + */ + +#include "postgres_fe.h" + +#ifndef WIN32 +#include + +/* + * On macOS, insists on including . If we're not + * using stdbool, undef bool to undo the damage. 
+ */ +#ifndef PG_USE_STDBOOL +#ifdef bool +#undef bool +#endif +#endif +#endif /* !WIN32 */ + +#include + +#include "access/xlog_internal.h" +#include "pg_rewind.h" + +/* signature for pg_rewind extension library rewind function */ +typedef void (*PG_rewind_t) (const char *datadir_target, char *datadir_source, + char *connstr_source, XLogRecPtr startpoint, + int tliIndex, XLogRecPtr endpoint, + const char *restoreCommand, const char *argv0, + bool debug); + +static bool +file_exists(const char *argv0, const char *name) +{ + struct stat st; + + Assert(name != NULL); + + if (stat(name, &st) == 0) + return !S_ISDIR(st.st_mode); + else if (!(errno == ENOENT || errno == ENOTDIR || errno == EACCES)) + { + const char *progname; + + progname = get_progname(argv0); + pg_log_error("could not access file \"%s\": %m", name); + pg_log_error_hint("Try \"%s --help\" for more information.", progname); + exit(1); + } + + return false; +} + +static char * +expand_dynamic_library_name(const char *argv0, const char *name) +{ + char *full; + char my_exec_path[MAXPGPATH]; + char pkglib_path[MAXPGPATH]; + + Assert(name); + + if (find_my_exec(argv0, my_exec_path) < 0) + pg_fatal("%s: could not locate my own executable path", argv0); + get_pkglib_path(my_exec_path, pkglib_path); + full = palloc(strlen(pkglib_path) + 1 + strlen(name) + 1); + sprintf(full, "%s/%s", pkglib_path, name); + if (file_exists(argv0, full)) + return full; + pfree(full); + + full = palloc(strlen(pkglib_path) + 1 + strlen(name) + 1 + + strlen(DLSUFFIX) + 1); + sprintf(full, "%s/%s%s", pkglib_path, name, DLSUFFIX); + if (file_exists(argv0, full)) + return full; + pfree(full); + + return pstrdup(name); +} + +void +process_extensions(SimpleStringList *extensions, const char *datadir_target, + char *datadir_source, char *connstr_source, + XLogRecPtr startpoint, int tliIndex, XLogRecPtr endpoint, + const char *restoreCommand, const char *argv0, + bool debug) +{ + SimpleStringListCell *cell; + + if (extensions->head == 
NULL) + return; /* nothing to do */ + + for (cell = extensions->head; cell; cell = cell->next) + { + char *filename = cell->val; + char *fullname; + void *lib_handle; + PG_rewind_t PG_rewind; + char *load_error; + + fullname = expand_dynamic_library_name(argv0, filename); + + lib_handle = dlopen(fullname, RTLD_NOW | RTLD_GLOBAL); + if (lib_handle == NULL) + { + load_error = dlerror(); + pg_fatal("could not load library \"%s\": %s", fullname, load_error); + } + + PG_rewind = dlsym(lib_handle, "_PG_rewind"); + + if (PG_rewind == NULL) + pg_fatal("could not find function \"_PG_rewind\" in \"%s\"", + fullname); + pfree(fullname); + + if (showprogress) + pg_log_info("performing rewind for '%s' extension", filename); + PG_rewind(datadir_target, datadir_source, connstr_source, startpoint, + tliIndex, endpoint, restoreCommand, argv0, debug); + + pg_log_debug("loaded library \"%s\"", filename); + } +} diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c index 4458324c9d8..83a2476a7e1 100644 --- a/src/bin/pg_rewind/filemap.c +++ b/src/bin/pg_rewind/filemap.c @@ -53,6 +53,7 @@ #define FILEHASH_INITIAL_SIZE 1000 static filehash_hash *filehash; +static SimpleStringList extensions_exclude = {NULL, NULL}; static bool isRelDataFile(const char *path); static char *datasegpath(RelFileLocator rlocator, ForkNumber forknum, @@ -260,6 +261,8 @@ process_target_file(const char *path, file_type_t type, size_t size, * from the target data folder all paths which have been filtered out from * the source data folder when processing the source files. */ + if (check_file_excluded(path, false)) + return; /* * Like in process_source_file, pretend that pg_wal is always a directory. 
@@ -404,6 +407,31 @@ check_file_excluded(const char *path, bool is_source) } } + /* + * Exclude extensions directories + */ + if (extensions_exclude.head != NULL) + { + SimpleStringListCell *cell; + + for (cell = extensions_exclude.head; cell; cell = cell->next) + { + char *exclude_dir = cell->val; + + snprintf(localpath, sizeof(localpath), "%s/", exclude_dir); + if (strstr(path, localpath) == path) + { + if (is_source) + pg_log_debug("entry \"%s\" excluded from source file list", + path); + else + pg_log_debug("entry \"%s\" excluded from target file list", + path); + return true; + } + } + } + return false; } @@ -820,3 +848,15 @@ decide_file_actions(void) return filemap; } + +void +extensions_exclude_add(char **exclude_dirs) +{ + int i; + + for (i = 0; exclude_dirs[i] != NULL; i++) + { + simple_string_list_append(&extensions_exclude, + pstrdup(exclude_dirs[i])); + } +} diff --git a/src/bin/pg_rewind/meson.build b/src/bin/pg_rewind/meson.build index e0f88bde221..e56d5ae24f6 100644 --- a/src/bin/pg_rewind/meson.build +++ b/src/bin/pg_rewind/meson.build @@ -2,6 +2,7 @@ pg_rewind_sources = files( 'datapagemap.c', + 'extension.c', 'file_ops.c', 'filemap.c', 'libpq_source.c', @@ -23,6 +24,7 @@ pg_rewind = executable('pg_rewind', pg_rewind_sources, dependencies: [frontend_code, libpq, lz4, zstd], c_args: ['-DFRONTEND'], # needed for xlogreader et al + export_dynamic: true, kwargs: default_bin_args, ) bin_targets += pg_rewind @@ -48,3 +50,7 @@ tests += { } subdir('po', if_found: libintl) + +install_headers( + 'pg_rewind_ext.h' +) \ No newline at end of file diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index 22f7351fdcd..ca8ec05220e 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -38,7 +38,7 @@ static const char *const RmgrNames[RM_MAX_ID + 1] = { #define RmgrName(rmid) (((rmid) <= RM_MAX_BUILTIN_ID) ? 
\ RmgrNames[rmid] : "custom") -static void extractPageInfo(XLogReaderState *record); +static void extractPageInfo(XLogReaderState *record, void *arg); static int xlogreadfd = -1; static XLogSegNo xlogreadsegno = 0; @@ -54,17 +54,11 @@ static int SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *readBuf); -/* - * Read WAL from the datadir/pg_wal, starting from 'startpoint' on timeline - * index 'tliIndex' in target timeline history, until 'endpoint'. Make note of - * the data blocks touched by the WAL records, and return them in a page map. - * - * 'endpoint' is the end of the last record to read. The record starting at - * 'endpoint' is the first one that is not read. - */ void -extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, - XLogRecPtr endpoint, const char *restoreCommand) +SimpleXLogRead(const char *datadir, XLogRecPtr startpoint, int tliIndex, + XLogRecPtr endpoint, const char *restoreCommand, + void (*page_callback) (XLogReaderState *, void *arg), + void *arg) { XLogRecord *record; XLogReaderState *xlogreader; @@ -97,7 +91,7 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, LSN_FORMAT_ARGS(errptr)); } - extractPageInfo(xlogreader); + page_callback(xlogreader, arg); } while (xlogreader->EndRecPtr < endpoint); /* @@ -116,6 +110,22 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, } } +/* + * Read WAL from the datadir/pg_wal, starting from 'startpoint' on timeline + * index 'tliIndex' in target timeline history, until 'endpoint'. Make note of + * the data blocks touched by the WAL records, and return them in a page map. + * + * 'endpoint' is the end of the last record to read. The record starting at + * 'endpoint' is the first one that is not read. 
+ */ +void +extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, + XLogRecPtr endpoint, const char *restoreCommand) +{ + SimpleXLogRead(datadir, startpoint, tliIndex, endpoint, restoreCommand, + extractPageInfo, NULL); +} + /* * Reads one WAL record. Returns the end position of the record, without * doing anything with the record itself. @@ -365,7 +375,7 @@ SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, * Extract information on which blocks the current record modifies. */ static void -extractPageInfo(XLogReaderState *record) +extractPageInfo(XLogReaderState *record, void *arg) { int block_id; RmgrId rmid = XLogRecGetRmid(record); diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index 052c83b8757..50873c5d2cb 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -77,6 +77,8 @@ bool do_sync = true; bool restore_wal = false; DataDirSyncMethod sync_method = DATA_DIR_SYNC_METHOD_FSYNC; +static SimpleStringList extensions = {NULL, NULL}; + /* Target history */ TimeLineHistoryEntry *targetHistory; int targetNentries; @@ -110,6 +112,7 @@ usage(const char *progname) printf(_(" --debug write a lot of debug messages\n")); printf(_(" --no-ensure-shutdown do not automatically fix unclean shutdown\n")); printf(_(" --sync-method=METHOD set method for syncing files to disk\n")); + printf(_(" -e, --extension=PATH path to library performing rewind for extension\n")); printf(_(" -V, --version output version information, then exit\n")); printf(_(" -?, --help show this help, then exit\n")); printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT); @@ -135,6 +138,7 @@ main(int argc, char **argv) {"progress", no_argument, NULL, 'P'}, {"debug", no_argument, NULL, 3}, {"sync-method", required_argument, NULL, 6}, + {"extension", required_argument, NULL, 'e'}, {NULL, 0, NULL, 0} }; int option_index; @@ -173,7 +177,7 @@ main(int argc, char **argv) } } - while ((c = getopt_long(argc, argv, 
"cD:nNPR", long_options, &option_index)) != -1) + while ((c = getopt_long(argc, argv, "cD:nNPRe:", long_options, &option_index)) != -1) { switch (c) { @@ -227,6 +231,10 @@ main(int argc, char **argv) exit(1); break; + case 'e': /* -e or --extension */ + simple_string_list_append(&extensions, optarg); + break; + default: /* getopt_long already emitted a complaint */ pg_log_error_hint("Try \"%s --help\" for more information.", progname); @@ -463,6 +471,12 @@ main(int argc, char **argv) /* Initialize the hash table to track the status of each file */ filehash_init(); + if (extensions.head != NULL) + process_extensions(&extensions, datadir_target, datadir_source, + connstr_source, chkptrec, lastcommontliIndex, + target_wal_endrec, restore_command, argv[0], + debug); + /* * Collect information about all files in the both data directories. */ diff --git a/src/bin/pg_rewind/pg_rewind.h b/src/bin/pg_rewind/pg_rewind.h index ec43cbe2c67..4397259e0d0 100644 --- a/src/bin/pg_rewind/pg_rewind.h +++ b/src/bin/pg_rewind/pg_rewind.h @@ -15,7 +15,9 @@ #include "common/logging.h" #include "common/file_utils.h" #include "datapagemap.h" +#include "fe_utils/simple_list.h" #include "libpq-fe.h" +#include "pg_rewind_ext.h" #include "storage/block.h" #include "storage/relfilelocator.h" @@ -55,4 +57,12 @@ extern TimeLineHistoryEntry *rewind_parseTimeLineHistory(char *buffer, TimeLineID targetTLI, int *nentries); +/* in extension.c */ +extern void process_extensions(SimpleStringList *extensions, + const char *datadir_target, char *datadir_source, + char *connstr_source, XLogRecPtr startpoint, + int tliIndex, XLogRecPtr endpoint, + const char *restoreCommand, const char *argv0, + bool debug); + #endif /* PG_REWIND_H */ diff --git a/src/bin/pg_rewind/pg_rewind_ext.h b/src/bin/pg_rewind/pg_rewind_ext.h new file mode 100644 index 00000000000..3616d94f588 --- /dev/null +++ b/src/bin/pg_rewind/pg_rewind_ext.h @@ -0,0 +1,44 @@ 
+/*------------------------------------------------------------------------- + * + * pg_rewind_ext.h + * + * + * Copyright (c) 1996-2023, PostgreSQL Global Development Group + * + *------------------------------------------------------------------------- + */ +#ifndef PG_REWIND_EXT_H +#define PG_REWIND_EXT_H + +#include "access/xlogreader.h" + +/* in parsexlog.c */ +/* + * Read WAL from the datadir/pg_wal, starting from 'startpoint' on timeline + * index 'tliIndex' in target timeline history, until 'endpoint'. + * Pass all WAL records to 'page_callback'. + * + * 'endpoint' is the end of the last record to read. The record starting at + * 'endpoint' is the first one that is not read. + */ +extern void SimpleXLogRead(const char *datadir, XLogRecPtr startpoint, + int tliIndex, XLogRecPtr endpoint, + const char *restoreCommand, + void (*page_callback) (XLogReaderState *, + void *arg), + void *arg); + + +/* in filemap.c */ +/* Add NULL-terminated list of dirs that pg_rewind can skip copying */ +extern void extensions_exclude_add(char **exclude_dirs); + +/* signature for pg_rewind extension library rewind function */ +extern PGDLLEXPORT void _PG_rewind(const char *datadir_target, + char *datadir_source, char *connstr_source, + XLogRecPtr startpoint, int tliIndex, + XLogRecPtr endpoint, + const char *restoreCommand, + const char *argv0, bool debug); + +#endif /* PG_REWIND_EXT_H */ diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c index 86ffb3c8683..a53cd9fd236 100644 --- a/src/bin/pgbench/pgbench.c +++ b/src/bin/pgbench/pgbench.c @@ -210,6 +210,11 @@ double throttle_delay = 0; */ int64 latency_limit = 0; +/* + * tableam selection + */ +char *tableam = NULL; + /* * tablespace selection */ @@ -893,6 +898,7 @@ usage(void) " --partition-method=(range|hash)\n" " partition pgbench_accounts with this method (default: range)\n" " --partitions=NUM partition pgbench_accounts into NUM parts (default: 0)\n" + " --tableam=TABLEAM create tables using the specified 
tableam\n" " --tablespace=TABLESPACE create tables in the specified tablespace\n" " --unlogged-tables create tables as unlogged tables\n" "\nOptions to select what to run:\n" @@ -4778,14 +4784,34 @@ createPartitions(PGconn *con) appendPQExpBufferStr(&query, "maxvalue"); appendPQExpBufferChar(&query, ')'); + + if (tableam != NULL) + { + char *escape_tableam; + + escape_tableam = PQescapeIdentifier(con, tableam, strlen(tableam)); + appendPQExpBuffer(&query, " using %s", escape_tableam); + PQfreemem(escape_tableam); + } } else if (partition_method == PART_HASH) + { printfPQExpBuffer(&query, "create%s table pgbench_accounts_%d\n" " partition of pgbench_accounts\n" " for values with (modulus %d, remainder %d)", unlogged_tables ? " unlogged" : "", p, partitions, p - 1); + + if (tableam != NULL) + { + char *escape_tableam; + + escape_tableam = PQescapeIdentifier(con, tableam, strlen(tableam)); + appendPQExpBuffer(&query, " using %s", escape_tableam); + PQfreemem(escape_tableam); + } + } else /* cannot get there */ Assert(0); @@ -4872,10 +4898,20 @@ initCreateTables(PGconn *con) if (partition_method != PART_NONE && strcmp(ddl->table, "pgbench_accounts") == 0) appendPQExpBuffer(&query, " partition by %s (aid)", PARTITION_METHOD[partition_method]); - else if (ddl->declare_fillfactor) + else { + if (tableam != NULL) + { + char *escape_tableam; + + escape_tableam = PQescapeIdentifier(con, tableam, strlen(tableam)); + appendPQExpBuffer(&query, " using %s", escape_tableam); + PQfreemem(escape_tableam); + } + /* fillfactor is only expected on actual tables */ - appendPQExpBuffer(&query, " with (fillfactor=%d)", fillfactor); + if (ddl->declare_fillfactor) + appendPQExpBuffer(&query, " with (fillfactor=%d)", fillfactor); } if (tablespace != NULL) @@ -6663,6 +6699,7 @@ main(int argc, char **argv) {"verbose-errors", no_argument, NULL, 15}, {"exit-on-abort", no_argument, NULL, 16}, {"debug", no_argument, NULL, 17}, + {"tableam", required_argument, NULL, 18}, {NULL, 0, NULL, 0} }; @@ 
-7003,6 +7040,10 @@ main(int argc, char **argv) case 17: /* debug */ pg_logging_increase_verbosity(); break; + case 18: /* tableam */ + initialization_option_set = true; + tableam = pg_strdup(optarg); + break; default: /* getopt_long already emitted a complaint */ pg_log_error_hint("Try \"%s --help\" for more information.", progname); diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index f25c9d58a7d..c6f57f7d192 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -107,12 +107,42 @@ typedef void (*ambuildempty_function) (Relation indexRelation); typedef bool (*aminsert_function) (Relation indexRelation, Datum *values, bool *isnull, - ItemPointer heap_tid, + ItemPointer tupleid, Relation heapRelation, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); +/* extended version of aminsert taking Datum tupleid */ +typedef bool (*aminsert_extended_function) (Relation indexRelation, + Datum *values, + bool *isnull, + Datum tupleid, + Relation heapRelation, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + struct IndexInfo *indexInfo); + +/* update this tuple */ +typedef bool (*amupdate_function) (Relation indexRelation, + bool new_valid, + bool old_valid, + Datum *values, + bool *isnull, + Datum tupleid, + Datum *valuesOld, + bool *isnullOld, + Datum oldTupleid, + Relation heapRelation, + IndexUniqueCheck checkUnique, + struct IndexInfo *indexInfo); +/* delete this tuple */ +typedef bool (*amdelete_function) (Relation indexRelation, + Datum *values, bool *isnull, + Datum tupleid, + Relation heapRelation, + struct IndexInfo *indexInfo); + /* cleanup after insert */ typedef void (*aminsertcleanup_function) (Relation indexRelation, struct IndexInfo *indexInfo); @@ -252,6 +282,8 @@ typedef struct IndexAmRoutine bool amusemaintenanceworkmem; /* does AM store tuple information only at block granularity? 
*/ bool amsummarizing; + /* does AM can provide MVCC */ + bool ammvccaware; /* OR of parallel vacuum flags. See vacuum.h for flags. */ uint8 amparallelvacuumoptions; /* type of data stored in index, or InvalidOid if variable */ @@ -267,7 +299,10 @@ typedef struct IndexAmRoutine ambuild_function ambuild; ambuildempty_function ambuildempty; aminsert_function aminsert; + aminsert_extended_function aminsertextended; aminsertcleanup_function aminsertcleanup; + amupdate_function amupdate; + amdelete_function amdelete; ambulkdelete_function ambulkdelete; amvacuumcleanup_function amvacuumcleanup; amcanreturn_function amcanreturn; /* can be NULL */ @@ -293,7 +328,13 @@ typedef struct IndexAmRoutine /* Functions in access/index/amapi.c */ +extern IndexAmRoutine *GetIndexAmRoutineWithTableAM(Oid tamoid, Oid amhandler); extern IndexAmRoutine *GetIndexAmRoutine(Oid amhandler); -extern IndexAmRoutine *GetIndexAmRoutineByAmId(Oid amoid, bool noerror); +extern IndexAmRoutine *GetIndexAmRoutineExtended(Oid indoid, Oid amhandler); +extern IndexAmRoutine *GetIndexAmRoutineByAmId(Oid indoid, Oid amoid, bool noerror); + +typedef IndexAmRoutine *(*IndexAMRoutineHookType) (Oid tamoid, Oid amhandler); + +extern IndexAMRoutineHookType IndexAMRoutineHook; #endif /* AMAPI_H */ diff --git a/src/include/access/brin_internal.h b/src/include/access/brin_internal.h index a5a9772621c..442d2c96b7b 100644 --- a/src/include/access/brin_internal.h +++ b/src/include/access/brin_internal.h @@ -92,7 +92,7 @@ extern IndexBuildResult *brinbuild(Relation heap, Relation index, struct IndexInfo *indexInfo); extern void brinbuildempty(Relation index); extern bool brininsert(Relation idxRel, Datum *values, bool *nulls, - ItemPointer heaptid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git a/src/include/access/detoast.h b/src/include/access/detoast.h index 12d8cdb356a..9d78980e986 100644 --- 
a/src/include/access/detoast.h +++ b/src/include/access/detoast.h @@ -63,6 +63,13 @@ extern struct varlena *detoast_attr_slice(struct varlena *attr, int32 sliceoffset, int32 slicelength); +/* ---------- + * toast_decompress_datum - + * + * Decompress a compressed version of a varlena datum + */ +extern struct varlena *toast_decompress_datum(struct varlena *attr); + /* ---------- * toast_raw_datum_size - * @@ -79,4 +86,11 @@ extern Size toast_raw_datum_size(Datum value); */ extern Size toast_datum_size(Datum value); +/* + * for in_memory module + */ +typedef struct varlena* (*ToastFunc) (struct varlena *attr); +extern void register_o_detoast_func(ToastFunc func); +extern void deregister_o_detoast_func(void); + #endif /* DETOAST_H */ diff --git a/src/include/access/genam.h b/src/include/access/genam.h index fdcfbe8db74..5752a3cf1ef 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -144,13 +144,28 @@ extern void index_close(Relation relation, LOCKMODE lockmode); extern bool index_insert(Relation indexRelation, Datum *values, bool *isnull, - ItemPointer heap_t_ctid, + ItemPointer tupleid, Relation heapRelation, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); extern void index_insert_cleanup(Relation indexRelation, struct IndexInfo *indexInfo); +extern bool index_update(Relation indexRelation, + bool new_valid, + bool old_valid, + Datum *values, + bool *isnull, + Datum tupleid, + Datum *valuesOld, + bool *isnullOld, + Datum oldTupleid, + Relation heapRelation, + IndexUniqueCheck checkUnique, + struct IndexInfo *indexInfo); +extern bool index_delete(Relation indexRelation, Datum *values, bool *isnull, + Datum tupleid, Relation heapRelation, + struct IndexInfo *indexInfo); extern IndexScanDesc index_beginscan(Relation heapRelation, Relation indexRelation, @@ -176,6 +191,9 @@ extern IndexScanDesc index_beginscan_parallel(Relation heaprel, ParallelIndexScanDesc pscan); extern ItemPointer 
index_getnext_tid(IndexScanDesc scan, ScanDirection direction); +extern NullableDatum index_getnext_rowid(IndexScanDesc scan, + ScanDirection direction); +extern Datum index_getnext_tupleid(IndexScanDesc scan, ScanDirection direction); struct TupleTableSlot; extern bool index_fetch_heap(IndexScanDesc scan, struct TupleTableSlot *slot); extern bool index_getnext_slot(IndexScanDesc scan, ScanDirection direction, diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h index 3013a44bae1..2e81017f014 100644 --- a/src/include/access/gin_private.h +++ b/src/include/access/gin_private.h @@ -115,7 +115,7 @@ extern IndexBuildResult *ginbuild(Relation heap, Relation index, struct IndexInfo *indexInfo); extern void ginbuildempty(Relation index); extern bool gininsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 7b8749c8db0..284fb49c517 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -401,7 +401,7 @@ typedef struct GiSTOptions /* gist.c */ extern void gistbuildempty(Relation index); extern bool gistinsert(Relation r, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git a/src/include/access/hash.h b/src/include/access/hash.h index 9c7d81525b4..e787974a3cf 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -364,7 +364,7 @@ extern IndexBuildResult *hashbuild(Relation heap, Relation index, struct IndexInfo *indexInfo); extern void hashbuildempty(Relation index); extern bool hashinsert(Relation rel, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, 
IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 9e9aec88a62..871c640c8db 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -322,19 +322,22 @@ extern void heap_multi_insert(Relation relation, struct TupleTableSlot **slots, int ntuples, CommandId cid, int options, BulkInsertState bistate); extern TM_Result heap_delete(Relation relation, ItemPointer tid, - CommandId cid, Snapshot crosscheck, bool wait, - struct TM_FailureData *tmfd, bool changingPart); + CommandId cid, Snapshot crosscheck, int options, + struct TM_FailureData *tmfd, bool changingPart, + TupleTableSlot *oldSlot); extern void heap_finish_speculative(Relation relation, ItemPointer tid); extern void heap_abort_speculative(Relation relation, ItemPointer tid); extern TM_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, - CommandId cid, Snapshot crosscheck, bool wait, + CommandId cid, Snapshot crosscheck, int options, struct TM_FailureData *tmfd, LockTupleMode *lockmode, - TU_UpdateIndexes *update_indexes); -extern TM_Result heap_lock_tuple(Relation relation, HeapTuple tuple, - CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, - bool follow_updates, - Buffer *buffer, struct TM_FailureData *tmfd); + TU_UpdateIndexes *update_indexes, + TupleTableSlot *oldSlot); +extern TM_Result heap_lock_tuple(Relation relation, ItemPointer tid, + TupleTableSlot *slot, + CommandId cid, LockTupleMode mode, + LockWaitPolicy wait_policy, bool follow_updates, + struct TM_FailureData *tmfd); extern void heap_inplace_update(Relation relation, HeapTuple tuple); extern bool heap_prepare_freeze_tuple(HeapTupleHeader tuple, diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 74930433480..9ba149aa47d 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1164,7 +1164,7 @@ typedef struct BTOptions */ extern void 
btbuildempty(Relation index); extern bool btinsert(Relation rel, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); @@ -1288,6 +1288,7 @@ extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup); extern void _bt_freestack(BTStack stack); extern bool _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir); extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir); +extern bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir); extern void _bt_preprocess_keys(IndexScanDesc scan); extern bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, IndexTuple tuple, int tupnatts); diff --git a/src/include/access/reloptions.h b/src/include/access/reloptions.h index 81829b8270a..8ddc75df287 100644 --- a/src/include/access/reloptions.h +++ b/src/include/access/reloptions.h @@ -21,6 +21,7 @@ #include "access/amapi.h" #include "access/htup.h" +#include "access/tableam.h" #include "access/tupdesc.h" #include "nodes/pg_list.h" #include "storage/lock.h" @@ -224,6 +225,7 @@ extern Datum transformRelOptions(Datum oldOptions, List *defList, bool acceptOidsOff, bool isReset); extern List *untransformRelOptions(Datum options); extern bytea *extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, + const TableAmRoutine *tableam, amoptions_function amoptions); extern void *build_reloptions(Datum reloptions, bool validate, relopt_kind kind, diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 521043304ab..24b04709012 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -122,6 +122,7 @@ typedef struct IndexScanDescData struct ScanKeyData *keyData; /* array of index qualifier descriptors */ struct ScanKeyData *orderByData; /* array of ordering op descriptors */ bool xs_want_itup; /* caller requests index tuples */ + bool xs_want_rowid; /* 
caller requests index tuples */ bool xs_temp_snap; /* unregister snapshot at scan end? */ /* signaling to index AM about killing index tuples */ @@ -145,6 +146,7 @@ typedef struct IndexScanDescData struct TupleDescData *xs_hitupdesc; /* rowtype descriptor of xs_hitup */ ItemPointerData xs_heaptid; /* result */ + NullableDatum xs_rowid; /* result if xs_want_rowid */ bool xs_heap_continue; /* T if must keep walking, potential * further results */ IndexFetchTableData *xs_heapfetch; diff --git a/src/include/access/spgist.h b/src/include/access/spgist.h index d6a49531200..b9cc48aba37 100644 --- a/src/include/access/spgist.h +++ b/src/include/access/spgist.h @@ -197,7 +197,7 @@ extern IndexBuildResult *spgbuild(Relation heap, Relation index, struct IndexInfo *indexInfo); extern void spgbuildempty(Relation index); extern bool spginsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo); diff --git a/src/include/access/sysattr.h b/src/include/access/sysattr.h index e88dec71ee9..867b5eb489e 100644 --- a/src/include/access/sysattr.h +++ b/src/include/access/sysattr.h @@ -24,6 +24,7 @@ #define MaxTransactionIdAttributeNumber (-4) #define MaxCommandIdAttributeNumber (-5) #define TableOidAttributeNumber (-6) -#define FirstLowInvalidHeapAttributeNumber (-7) +#define RowIdAttributeNumber (-7) +#define FirstLowInvalidHeapAttributeNumber (-8) #endif /* SYSATTR_H */ diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index da661289c1f..573a2576935 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -17,11 +17,15 @@ #ifndef TABLEAM_H #define TABLEAM_H +#include "access/amapi.h" #include "access/relscan.h" #include "access/sdir.h" #include "access/xact.h" #include "executor/tuptable.h" #include "storage/read_stream.h" +#include "nodes/execnodes.h" +#include "storage/bufmgr.h" +#include 
"utils/guc.h" #include "utils/rel.h" #include "utils/snapshot.h" @@ -40,6 +44,16 @@ struct TBMIterateResult; struct VacuumParams; struct ValidateIndexState; +typedef int (*AcquireSampleRowsFunc) (Relation relation, int elevel, + HeapTuple *rows, int targrows, + double *totalrows, + double *totaldeadrows); + +/* in commands/analyze.c */ +extern int acquire_sample_rows(Relation onerel, int elevel, + HeapTuple *rows, int targrows, + double *totalrows, double *totaldeadrows); + /* * Bitmask values for the flags argument to the scan_begin callback. */ @@ -267,6 +281,11 @@ typedef struct TM_IndexDeleteOp /* Follow update chain and lock latest version of tuple */ #define TUPLE_LOCK_FLAG_FIND_LAST_VERSION (1 << 1) +/* "options" flag bits for table_tuple_update and table_tuple_delete */ +#define TABLE_MODIFY_WAIT 0x0001 +#define TABLE_MODIFY_FETCH_OLD_TUPLE 0x0002 +#define TABLE_MODIFY_LOCK_UPDATED 0x0004 + /* Typedef for callback function for table_index_build_scan */ typedef void (*IndexBuildCallback) (Relation index, @@ -303,6 +322,9 @@ typedef struct TableAmRoutine */ const TupleTableSlotOps *(*slot_callbacks) (Relation rel); + RowRefType (*get_row_ref_type) (Relation rel); + + void (*free_rd_amcache) (Relation rel); /* ------------------------------------------------------------------------ * Table scan callbacks. @@ -455,7 +477,7 @@ typedef struct TableAmRoutine * future searches. */ bool (*index_fetch_tuple) (struct IndexFetchTableData *scan, - ItemPointer tid, + Datum tupleid, Snapshot snapshot, TupleTableSlot *slot, bool *call_again, bool *all_dead); @@ -472,7 +494,7 @@ typedef struct TableAmRoutine * test, returns true, false otherwise. 
*/ bool (*tuple_fetch_row_version) (Relation rel, - ItemPointer tid, + Datum tupleid, Snapshot snapshot, TupleTableSlot *slot); @@ -508,23 +530,19 @@ typedef struct TableAmRoutine */ /* see table_tuple_insert() for reference about parameters */ - void (*tuple_insert) (Relation rel, TupleTableSlot *slot, + TupleTableSlot *(*tuple_insert) (Relation rel, TupleTableSlot *slot, CommandId cid, int options, struct BulkInsertStateData *bistate); - /* see table_tuple_insert_speculative() for reference about parameters */ - void (*tuple_insert_speculative) (Relation rel, - TupleTableSlot *slot, - CommandId cid, - int options, - struct BulkInsertStateData *bistate, - uint32 specToken); - - /* see table_tuple_complete_speculative() for reference about parameters */ - void (*tuple_complete_speculative) (Relation rel, - TupleTableSlot *slot, - uint32 specToken, - bool succeeded); + TupleTableSlot *(*tuple_insert_with_arbiter) (ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + CommandId cid, int options, + struct BulkInsertStateData *bistate, + List *arbiterIndexes, + EState *estate, + LockTupleMode lockmode, + TupleTableSlot *lockedSlot, + TupleTableSlot *tempSlot); /* see table_multi_insert() for reference about parameters */ void (*multi_insert) (Relation rel, TupleTableSlot **slots, int nslots, @@ -532,29 +550,31 @@ typedef struct TableAmRoutine /* see table_tuple_delete() for reference about parameters */ TM_Result (*tuple_delete) (Relation rel, - ItemPointer tid, + Datum tupleid, CommandId cid, Snapshot snapshot, Snapshot crosscheck, - bool wait, + int options, TM_FailureData *tmfd, - bool changingPart); + bool changingPart, + TupleTableSlot *oldSlot); /* see table_tuple_update() for reference about parameters */ TM_Result (*tuple_update) (Relation rel, - ItemPointer otid, + Datum tupleid, TupleTableSlot *slot, CommandId cid, Snapshot snapshot, Snapshot crosscheck, - bool wait, + int options, TM_FailureData *tmfd, LockTupleMode *lockmode, - TU_UpdateIndexes 
*update_indexes); + TU_UpdateIndexes *update_indexes, + TupleTableSlot *oldSlot); /* see table_tuple_lock() for reference about parameters */ TM_Result (*tuple_lock) (Relation rel, - ItemPointer tid, + Datum tupleid, Snapshot snapshot, TupleTableSlot *slot, CommandId cid, @@ -873,6 +893,14 @@ typedef struct TableAmRoutine struct SampleScanState *scanstate, TupleTableSlot *slot); + /* Check if tuple in the slot belongs to the current transaction */ + bool (*tuple_is_current) (Relation rel, TupleTableSlot *slot); + + void (*analyze_table) (Relation relation, + AcquireSampleRowsFunc *func, + BlockNumber *totalpages); + + bytea *(*reloptions) (char relkind, Datum reloptions, bool validate); } TableAmRoutine; @@ -1239,7 +1267,7 @@ table_index_fetch_end(struct IndexFetchTableData *scan) */ static inline bool table_index_fetch_tuple(struct IndexFetchTableData *scan, - ItemPointer tid, + Datum tupleid, Snapshot snapshot, TupleTableSlot *slot, bool *call_again, bool *all_dead) @@ -1252,7 +1280,7 @@ table_index_fetch_tuple(struct IndexFetchTableData *scan, if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) elog(ERROR, "unexpected table_index_fetch_tuple call during logical decoding"); - return scan->rel->rd_tableam->index_fetch_tuple(scan, tid, snapshot, + return scan->rel->rd_tableam->index_fetch_tuple(scan, tupleid, snapshot, slot, call_again, all_dead); } @@ -1286,7 +1314,7 @@ extern bool table_index_fetch_tuple_check(Relation rel, */ static inline bool table_tuple_fetch_row_version(Relation rel, - ItemPointer tid, + Datum tupleid, Snapshot snapshot, TupleTableSlot *slot) { @@ -1298,7 +1326,7 @@ table_tuple_fetch_row_version(Relation rel, if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) elog(ERROR, "unexpected table_tuple_fetch_row_version call during logical decoding"); - return rel->rd_tableam->tuple_fetch_row_version(rel, tid, snapshot, slot); + return rel->rd_tableam->tuple_fetch_row_version(rel, tupleid, snapshot, slot); } /* @@ -1398,45 
+1426,32 @@ table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate) * insertion. But note that any toasting of fields within the slot is NOT * reflected in the slots contents. */ -static inline void +static inline TupleTableSlot * table_tuple_insert(Relation rel, TupleTableSlot *slot, CommandId cid, int options, struct BulkInsertStateData *bistate) { - rel->rd_tableam->tuple_insert(rel, slot, cid, options, - bistate); + return rel->rd_tableam->tuple_insert(rel, slot, cid, options, bistate); } -/* - * Perform a "speculative insertion". These can be backed out afterwards - * without aborting the whole transaction. Other sessions can wait for the - * speculative insertion to be confirmed, turning it into a regular tuple, or - * aborted, as if it never existed. Speculatively inserted tuples behave as - * "value locks" of short duration, used to implement INSERT .. ON CONFLICT. - * - * A transaction having performed a speculative insertion has to either abort, - * or finish the speculative insertion with - * table_tuple_complete_speculative(succeeded = ...). - */ -static inline void -table_tuple_insert_speculative(Relation rel, TupleTableSlot *slot, - CommandId cid, int options, - struct BulkInsertStateData *bistate, - uint32 specToken) -{ - rel->rd_tableam->tuple_insert_speculative(rel, slot, cid, options, - bistate, specToken); -} - -/* - * Complete "speculative insertion" started in the same transaction. If - * succeeded is true, the tuple is fully inserted, if false, it's removed. 
- */ -static inline void -table_tuple_complete_speculative(Relation rel, TupleTableSlot *slot, - uint32 specToken, bool succeeded) +static inline TupleTableSlot * +table_tuple_insert_with_arbiter(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + CommandId cid, int options, + struct BulkInsertStateData *bistate, + List *arbiterIndexes, + EState *estate, + LockTupleMode lockmode, + TupleTableSlot *lockedSlot, + TupleTableSlot *tempSlot) { - rel->rd_tableam->tuple_complete_speculative(rel, slot, specToken, - succeeded); + Relation rel = resultRelInfo->ri_RelationDesc; + + return rel->rd_tableam->tuple_insert_with_arbiter(resultRelInfo, + slot, cid, options, + bistate, arbiterIndexes, + estate, + lockmode, lockedSlot, + tempSlot); } /* @@ -1462,7 +1477,7 @@ table_multi_insert(Relation rel, TupleTableSlot **slots, int nslots, } /* - * Delete a tuple. + * Delete a tuple (and optionally lock the last tuple version). * * NB: do not call this directly unless prepared to deal with * concurrent-update conditions. Use simple_table_tuple_delete instead. @@ -1473,11 +1488,21 @@ table_multi_insert(Relation rel, TupleTableSlot **slots, int nslots, * cid - delete command ID (used for visibility test, and stored into * cmax if successful) * crosscheck - if not InvalidSnapshot, also check tuple against this - * wait - true if should wait for any conflicting update to commit/abort + * options: + * If TABLE_MODIFY_WAIT, wait for any conflicting update to commit/abort. + * If TABLE_MODIFY_FETCH_OLD_TUPLE option is given, the existing tuple is + * fetched into oldSlot when the update is successful. + * If TABLE_MODIFY_LOCK_UPDATED option is given and the tuple is + * concurrently updated, then the last tuple version is locked and fetched + * into oldSlot. + * * Output parameters: * tmfd - filled in failure cases (see below) * changingPart - true iff the tuple is being moved to another partition * table due to an update of the partition key. Otherwise, false. 
+ * oldSlot - slot to save the deleted or locked tuple. Can be NULL if none of + * TABLE_MODIFY_FETCH_OLD_TUPLE or TABLE_MODIFY_LOCK_UPDATED options + * is specified. * * Normal, successful return value is TM_Ok, which means we did actually * delete it. Failure return codes are TM_SelfModified, TM_Updated, and @@ -1488,17 +1513,19 @@ table_multi_insert(Relation rel, TupleTableSlot **slots, int nslots, * TM_FailureData for additional info. */ static inline TM_Result -table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid, - Snapshot snapshot, Snapshot crosscheck, bool wait, - TM_FailureData *tmfd, bool changingPart) +table_tuple_delete(Relation rel, Datum tupleid, CommandId cid, + Snapshot snapshot, Snapshot crosscheck, int options, + TM_FailureData *tmfd, bool changingPart, + TupleTableSlot *oldSlot) { - return rel->rd_tableam->tuple_delete(rel, tid, cid, + return rel->rd_tableam->tuple_delete(rel, tupleid, cid, snapshot, crosscheck, - wait, tmfd, changingPart); + options, tmfd, changingPart, + oldSlot); } /* - * Update a tuple. + * Update a tuple (and optionally lock the last tuple version). * * NB: do not call this directly unless you are prepared to deal with * concurrent-update conditions. Use simple_table_tuple_update instead. @@ -1510,13 +1537,23 @@ table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid, * cid - update command ID (used for visibility test, and stored into * cmax/cmin if successful) * crosscheck - if not InvalidSnapshot, also check old tuple against this - * wait - true if should wait for any conflicting update to commit/abort + * options: + * If TABLE_MODIFY_WAIT, wait for any conflicting update to commit/abort. + * If TABLE_MODIFY_FETCH_OLD_TUPLE option is given, the existing tuple is + * fetched into oldSlot when the update is successful. + * If TABLE_MODIFY_LOCK_UPDATED option is given and the tuple is + * concurrently updated, then the last tuple version is locked and fetched + * into oldSlot. 
+ * * Output parameters: * tmfd - filled in failure cases (see below) * lockmode - filled with lock mode acquired on tuple * update_indexes - in success cases this is set to true if new index entries * are required for this tuple - * + * oldSlot - slot to save the deleted or locked tuple. Can be NULL if none of + * TABLE_MODIFY_FETCH_OLD_TUPLE or TABLE_MODIFY_LOCK_UPDATED options + * is specified. + * Normal, successful return value is TM_Ok, which means we did actually * update it. Failure return codes are TM_SelfModified, TM_Updated, and * TM_BeingModified (the last only possible if wait == false). @@ -1532,15 +1569,17 @@ table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid, * for additional info. */ static inline TM_Result -table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, +table_tuple_update(Relation rel, Datum tupleid, TupleTableSlot *slot, CommandId cid, Snapshot snapshot, Snapshot crosscheck, - bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode, - TU_UpdateIndexes *update_indexes) + int options, TM_FailureData *tmfd, LockTupleMode *lockmode, + TU_UpdateIndexes *update_indexes, + TupleTableSlot *oldSlot) { - return rel->rd_tableam->tuple_update(rel, otid, slot, + return rel->rd_tableam->tuple_update(rel, tupleid, slot, cid, snapshot, crosscheck, - wait, tmfd, - lockmode, update_indexes); + options, tmfd, + lockmode, update_indexes, + oldSlot); } /* @@ -1577,12 +1616,12 @@ table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, * comments for struct TM_FailureData for additional info. 
*/ static inline TM_Result -table_tuple_lock(Relation rel, ItemPointer tid, Snapshot snapshot, +table_tuple_lock(Relation rel, Datum tupleid, Snapshot snapshot, TupleTableSlot *slot, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, uint8 flags, TM_FailureData *tmfd) { - return rel->rd_tableam->tuple_lock(rel, tid, snapshot, slot, + return rel->rd_tableam->tuple_lock(rel, tupleid, snapshot, slot, cid, mode, wait_policy, flags, tmfd); } @@ -2046,6 +2085,11 @@ table_scan_sample_next_tuple(TableScanDesc scan, slot); } +static inline bool +table_tuple_is_current(Relation rel, TupleTableSlot *slot) +{ + return rel->rd_tableam->tuple_is_current(rel, slot); +} /* ---------------------------------------------------------------------------- * Functions to make modifications a bit simpler. @@ -2053,11 +2097,13 @@ table_scan_sample_next_tuple(TableScanDesc scan, */ extern void simple_table_tuple_insert(Relation rel, TupleTableSlot *slot); -extern void simple_table_tuple_delete(Relation rel, ItemPointer tid, - Snapshot snapshot); -extern void simple_table_tuple_update(Relation rel, ItemPointer otid, +extern void simple_table_tuple_delete(Relation rel, Datum tupleid, + Snapshot snapshot, + TupleTableSlot *oldSlot); +extern void simple_table_tuple_update(Relation rel, Datum tupleid, TupleTableSlot *slot, Snapshot snapshot, - TU_UpdateIndexes *update_indexes); + TU_UpdateIndexes *update_indexes, + TupleTableSlot *oldSlot); /* ---------------------------------------------------------------------------- @@ -2098,12 +2144,60 @@ extern void table_block_relation_estimate_size(Relation rel, */ extern const TableAmRoutine *GetTableAmRoutine(Oid amhandler); +extern const TableAmRoutine *GetTableAmRoutineByAmOid(Oid amoid); +extern const TableAmRoutine *GetHeapamTableAmRoutine(void); -/* ---------------------------------------------------------------------------- - * Functions in heapam_handler.c - * ---------------------------------------------------------------------------- 
- */ +static inline RowRefType +table_get_row_ref_type(Relation rel) +{ + if (rel->rd_tableam) + return rel->rd_tableam->get_row_ref_type(rel); + else + return ROW_REF_TID; +} -extern const TableAmRoutine *GetHeapamTableAmRoutine(void); +static inline void +table_free_rd_amcache(Relation rel) +{ + if (rel->rd_tableam) + { + rel->rd_tableam->free_rd_amcache(rel); + } + else + { + if (rel->rd_amcache) + pfree(rel->rd_amcache); + rel->rd_amcache = NULL; + } +} + +static inline void +table_analyze(Relation relation, AcquireSampleRowsFunc *func, + BlockNumber *totalpages) +{ + if (relation->rd_tableam->analyze_table) + { + relation->rd_tableam->analyze_table(relation, func, totalpages); + } + else + { + *func = acquire_sample_rows; + *totalpages = RelationGetNumberOfBlocks(relation); + } +} + +static inline bytea * +table_reloptions(Relation rel, char relkind, + Datum reloptions, bool validate) +{ + return rel->rd_tableam->reloptions(relkind, reloptions, validate); +} + +static inline bytea * +tableam_reloptions(const TableAmRoutine *tableam, char relkind, + Datum reloptions, bool validate) +{ + return tableam->reloptions(relkind, reloptions, validate); +} #endif /* TABLEAM_H */ diff --git a/src/include/access/transam.h b/src/include/access/transam.h index 28a2d287fd5..bd6430c2865 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -15,7 +15,9 @@ #define TRANSAM_H #include "access/xlogdefs.h" - +#ifndef FRONTEND +#include "port/atomics.h" +#endif /* ---------------- * Special transaction ID values @@ -196,6 +198,22 @@ FullTransactionIdAdvance(FullTransactionId *dest) #define FirstUnpinnedObjectId 12000 #define FirstNormalObjectId 16384 +#define COMMITSEQNO_INPROGRESS UINT64CONST(0x0) +#define COMMITSEQNO_NON_DELETED UINT64CONST(0x1) +#define COMMITSEQNO_ABORTED UINT64CONST(0x2) +#define COMMITSEQNO_FROZEN UINT64CONST(0x3) +#define COMMITSEQNO_COMMITTING UINT64CONST(0x4) +#define COMMITSEQNO_FIRST_NORMAL UINT64CONST(0x5) +#define 
COMMITSEQNO_MAX_NORMAL UINT64CONST(0x7FFFFFFFFFFFFFFF) + +#define COMMITSEQNO_IS_INPROGRESS(csn) ((csn) == COMMITSEQNO_INPROGRESS || (csn) == COMMITSEQNO_NON_DELETED) +#define COMMITSEQNO_IS_NON_DELETED(csn) ((csn) == COMMITSEQNO_NON_DELETED) +#define COMMITSEQNO_IS_ABORTED(csn) ((csn) == COMMITSEQNO_ABORTED) +#define COMMITSEQNO_IS_FROZEN(csn) ((csn) == COMMITSEQNO_FROZEN) +#define COMMITSEQNO_IS_NORMAL(csn) ((csn) >= COMMITSEQNO_FIRST_NORMAL) +#define COMMITSEQNO_IS_COMMITTING(csn) ((csn) == COMMITSEQNO_COMMITTING) +#define COMMITSEQNO_IS_COMMITTED(csn) ((csn) >= COMMITSEQNO_FROZEN) + /* * TransamVariables is a data structure in shared memory that is used to track * OID and XID assignment state. For largely historical reasons, there is @@ -252,9 +270,13 @@ typedef struct TransamVariablesData */ TransactionId oldestClogXid; /* oldest it's safe to look up in clog */ +#ifndef FRONTEND + pg_atomic_uint64 nextCommitSeqNo; +#else + CommitSeqNo nextCommitSeqNo; +#endif } TransamVariablesData; - /* ---------------- * extern declarations * ---------------- @@ -294,6 +316,7 @@ extern void AdvanceOldestClogXid(TransactionId oldest_datfrozenxid); extern bool ForceTransactionIdLimitUpdate(void); extern Oid GetNewObjectId(void); extern void StopGeneratingPinnedObjectIds(void); +extern CommitSeqNo GetCurrentCSN(void); #ifdef USE_ASSERT_CHECKING extern void AssertTransactionIdInAllowableRange(TransactionId xid); diff --git a/src/include/access/xact.h b/src/include/access/xact.h index 6d4439f0524..327328da54c 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -527,4 +527,7 @@ extern void EnterParallelMode(void); extern void ExitParallelMode(void); extern bool IsInParallelMode(void); +typedef void (*xact_redo_hook_type) (TransactionId xid, XLogRecPtr lsn); +extern xact_redo_hook_type xact_redo_hook; + #endif /* XACT_H */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 2c507ea618c..da077b00ee1 100644 --- a/src/include/access/xlog.h 
+++ b/src/include/access/xlog.h @@ -56,6 +56,7 @@ extern PGDLLIMPORT bool track_wal_io_timing; extern PGDLLIMPORT int wal_decode_buffer_size; extern PGDLLIMPORT int CheckPointSegments; +extern PGDLLIMPORT CommitSeqNo startupCommitSeqNo; /* Archive modes */ typedef enum ArchiveMode @@ -292,6 +293,7 @@ extern void do_pg_backup_start(const char *backupidstr, bool fast, StringInfo tblspcmapfile); extern void do_pg_backup_stop(BackupState *state, bool waitforarchive); extern void do_pg_abort_backup(int code, Datum arg); +extern bool have_backup_in_progress(void); extern void register_persistent_abort_backup_handler(void); extern SessionBackupState get_backup_status(void); @@ -307,4 +309,14 @@ extern SessionBackupState get_backup_status(void); /* files to signal promotion to primary */ #define PROMOTE_SIGNAL_FILE "promote" +typedef void (*CheckPoint_hook_type) (XLogRecPtr checkPointRedo, int flags); +extern PGDLLIMPORT CheckPoint_hook_type CheckPoint_hook; +extern double CheckPointProgress; +typedef void (*after_checkpoint_cleanup_hook_type)(XLogRecPtr checkPointRedo, + int flags); +extern PGDLLIMPORT after_checkpoint_cleanup_hook_type + after_checkpoint_cleanup_hook; + +extern void (*RedoShutdownHook) (void); + #endif /* XLOG_H */ diff --git a/src/include/archive/archive_module.h b/src/include/archive/archive_module.h index 763af76e542..d73b9661a4f 100644 --- a/src/include/archive/archive_module.h +++ b/src/include/archive/archive_module.h @@ -37,13 +37,17 @@ typedef struct ArchiveModuleState */ typedef void (*ArchiveStartupCB) (ArchiveModuleState *state); typedef bool (*ArchiveCheckConfiguredCB) (ArchiveModuleState *state); -typedef bool (*ArchiveFileCB) (ArchiveModuleState *state, const char *file, const char *path); +typedef void (*ArchivePreloadFileCB) (ArchiveModuleState *state, + const char *file, const char *path); +typedef bool (*ArchiveFileCB) (ArchiveModuleState *state, + const char *file, const char *path); typedef void (*ArchiveShutdownCB) 
(ArchiveModuleState *state); typedef struct ArchiveModuleCallbacks { ArchiveStartupCB startup_cb; ArchiveCheckConfiguredCB check_configured_cb; + ArchivePreloadFileCB archive_preload_file_cb; ArchiveFileCB archive_file_cb; ArchiveShutdownCB shutdown_cb; } ArchiveModuleCallbacks; diff --git a/src/include/c.h b/src/include/c.h index dc1841346cd..b8f75ac8329 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -637,7 +637,7 @@ typedef double float8; /* * Oid, RegProcedure, TransactionId, SubTransactionId, MultiXactId, - * CommandId + * CommandId, CommitSeqNo */ /* typedef Oid is in postgres_ext.h */ @@ -668,6 +668,8 @@ typedef uint32 CommandId; #define FirstCommandId ((CommandId) 0) #define InvalidCommandId (~(CommandId)0) +typedef uint64 CommitSeqNo; + /* ---------------- * Variable-length datatypes all share the 'struct varlena' header. diff --git a/src/include/catalog/dependency.h b/src/include/catalog/dependency.h index 6908ca7180a..c9b59706373 100644 --- a/src/include/catalog/dependency.h +++ b/src/include/catalog/dependency.h @@ -96,6 +96,8 @@ typedef struct ObjectAddresses ObjectAddresses; #define PERFORM_DELETION_SKIP_EXTENSIONS 0x0010 /* keep extensions */ #define PERFORM_DELETION_CONCURRENT_LOCK 0x0020 /* normal drop with * concurrent lock mode */ +#define PERFORM_DELETION_OF_RELATION 0x0040 /* used for orioledb + * extension */ /* in dependency.c */ diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 7d434f8e653..0beab397c79 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -215,4 +215,6 @@ itemptr_decode(ItemPointer itemptr, int64 encoded) ItemPointerSet(itemptr, block, offset); } +extern void index_update_stats(Relation rel, bool hasindex, double reltuples); + #endif /* INDEX_H */ diff --git a/src/include/commands/defrem.h b/src/include/commands/defrem.h index 29c511e3196..628e43dc33f 100644 --- a/src/include/commands/defrem.h +++ b/src/include/commands/defrem.h @@ -41,6 +41,10 @@ extern char 
*makeObjectName(const char *name1, const char *name2, extern char *ChooseRelationName(const char *name1, const char *name2, const char *label, Oid namespaceid, bool isconstraint); +extern List *ChooseIndexColumnNames(const List *indexElems); +extern char *ChooseIndexName(const char *tabname, Oid namespaceId, + const List *colnames, const List *exclusionOpNames, + bool primary, bool isconstraint); extern bool CheckIndexCompatible(Oid oldId, const char *accessMethodName, const List *attributeList, @@ -158,4 +162,7 @@ extern int defGetTypeLength(DefElem *def); extern List *defGetStringList(DefElem *def); extern void errorConflictingDefElem(DefElem *defel, ParseState *pstate) pg_attribute_noreturn(); +typedef Oid (*GetDefaultOpClass_hook_type)(Oid type_id, Oid am_id); +extern PGDLLIMPORT GetDefaultOpClass_hook_type GetDefaultOpClass_hook; + #endif /* DEFREM_H */ diff --git a/src/include/commands/explain.h b/src/include/commands/explain.h index 9b8b351d9a2..5a6fabe8ed9 100644 --- a/src/include/commands/explain.h +++ b/src/include/commands/explain.h @@ -107,6 +107,14 @@ extern void ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, const instr_time *planduration, const BufferUsage *bufusage, const MemoryContextCounters *mem_counters); +extern void ExplainNode(PlanState *planstate, List *ancestors, + const char *relationship, const char *plan_name, + ExplainState *es); +extern void show_scan_qual(List *qual, const char *qlabel, + PlanState *planstate, List *ancestors, + ExplainState *es); +extern void show_instrumentation_count(const char *qlabel, int which, + PlanState *planstate, ExplainState *es); extern void ExplainPrintPlan(ExplainState *es, QueryDesc *queryDesc); extern void ExplainPrintTriggers(ExplainState *es, QueryDesc *queryDesc); diff --git a/src/include/commands/trigger.h b/src/include/commands/trigger.h index 8a5a9fe6422..c16e6b6e5a0 100644 --- a/src/include/commands/trigger.h +++ b/src/include/commands/trigger.h @@ -209,15 +209,15 @@ extern void 
ExecASDeleteTriggers(EState *estate, extern bool ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple fdw_trigtuple, TupleTableSlot **epqslot, TM_Result *tmresult, TM_FailureData *tmfd); extern void ExecARDeleteTriggers(EState *estate, ResultRelInfo *relinfo, - ItemPointer tupleid, HeapTuple fdw_trigtuple, + TupleTableSlot *slot, TransitionCaptureState *transition_capture, bool is_crosspart_update); extern bool ExecIRDeleteTriggers(EState *estate, @@ -231,7 +231,7 @@ extern void ExecASUpdateTriggers(EState *estate, extern bool ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, ResultRelInfo *relinfo, - ItemPointer tupleid, + Datum tupleid, HeapTuple fdw_trigtuple, TupleTableSlot *newslot, TM_Result *tmresult, @@ -240,8 +240,8 @@ extern void ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, ResultRelInfo *src_partinfo, ResultRelInfo *dst_partinfo, - ItemPointer tupleid, HeapTuple fdw_trigtuple, + TupleTableSlot *oldslot, TupleTableSlot *newslot, List *recheckIndexes, TransitionCaptureState *transition_capture, diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index 759f9a87d38..dfea1e93e33 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -378,6 +378,9 @@ extern void parallel_vacuum_main(dsm_segment *seg, shm_toc *toc); extern void analyze_rel(Oid relid, RangeVar *relation, VacuumParams *params, List *va_cols, bool in_outer_xact, BufferAccessStrategy bstrategy); +extern int acquire_sample_rows(Relation onerel, int elevel, + HeapTuple *rows, int targrows, + double *totalrows, double *totaldeadrows); extern bool std_typanalyze(VacAttrStats *stats); /* in utils/misc/sampling.c --- duplicate of declarations in utils/sampling.h */ diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 9770752ea3c..1833f4d84b1 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h 
@@ -633,6 +633,16 @@ extern List *ExecInsertIndexTuples(ResultRelInfo *resultRelInfo, bool noDupErr, bool *specConflict, List *arbiterIndexes, bool onlySummarizing); +extern List *ExecUpdateIndexTuples(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + TupleTableSlot *oldSlot, + EState *estate, + bool noDupErr, + bool *specConflict, List *arbiterIndexes, + bool onlySummarizing); +extern void ExecDeleteIndexTuples(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + EState *estate); extern bool ExecCheckIndexConstraints(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate, ItemPointer conflictTid, diff --git a/src/include/foreign/fdwapi.h b/src/include/foreign/fdwapi.h index fcde3876b28..777e59c86e9 100644 --- a/src/include/foreign/fdwapi.h +++ b/src/include/foreign/fdwapi.h @@ -13,6 +13,7 @@ #define FDWAPI_H #include "access/parallel.h" +#include "access/tableam.h" #include "nodes/execnodes.h" #include "nodes/pathnodes.h" @@ -148,11 +149,6 @@ typedef void (*ExplainForeignModify_function) (ModifyTableState *mtstate, typedef void (*ExplainDirectModify_function) (ForeignScanState *node, struct ExplainState *es); -typedef int (*AcquireSampleRowsFunc) (Relation relation, int elevel, - HeapTuple *rows, int targrows, - double *totalrows, - double *totaldeadrows); - typedef bool (*AnalyzeForeignTable_function) (Relation relation, AcquireSampleRowsFunc *func, BlockNumber *totalpages); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index cd1b16296b5..48c7fec14ac 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -457,6 +457,8 @@ typedef struct ResultRelInfo /* relation descriptor for result relation */ Relation ri_RelationDesc; + RowRefType ri_RowRefType; + /* # of indices existing on result relation */ int ri_NumIndices; @@ -754,6 +756,7 @@ typedef struct ExecRowMark Index prti; /* parent range table index, if child */ Index rowmarkId; /* unique identifier for resjunk columns */ RowMarkType 
markType; /* see enum in nodes/plannodes.h */ + RowRefType refType; LockClauseStrength strength; /* LockingClause's strength, or LCS_NONE */ LockWaitPolicy waitPolicy; /* NOWAIT and SKIP LOCKED */ bool ermActive; /* is this mark relevant for current tuple? */ diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 67c90a2bd32..82443390a85 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -1103,6 +1103,7 @@ typedef struct RangeTblEntry Index perminfoindex pg_node_attr(query_jumble_ignore); /* sampling info, or NULL */ struct TableSampleClause *tablesample; + RowRefType reftype; /* * Fields valid for a subquery RTE (else NULL): @@ -2992,6 +2993,7 @@ typedef struct CreateAmStmt char *amname; /* access method name */ List *handler_name; /* handler function name */ char amtype; /* type of access method */ + char *tableam_name; /* table AM name */ } CreateAmStmt; /* ---------------------- diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 1aeeaec95e1..9b41e298b0b 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -1353,7 +1353,7 @@ typedef enum RowMarkType * child relations will also have entries with isParent = true. The child * entries have rti == child rel's RT index and prti == top parent's RT index, * and can therefore be recognized as children by the fact that prti != rti. 
- * The parent's allMarkTypes field gets the OR of (1<nextXid */ TransactionId oldestRunningXid; /* *not* oldestXmin */ TransactionId latestCompletedXid; /* so we can set xmax */ + CommitSeqNo csn; /* current csn */ TransactionId xids[FLEXIBLE_ARRAY_MEMBER]; } xl_running_xacts; diff --git a/src/include/utils/catcache.h b/src/include/utils/catcache.h index 3fb9647b87c..8b692cafea1 100644 --- a/src/include/utils/catcache.h +++ b/src/include/utils/catcache.h @@ -227,4 +227,28 @@ extern void PrepareToInvalidateCacheTuple(Relation relation, HeapTuple newtuple, void (*function) (int, uint32, Oid)); +typedef CatCTup *(*SearchCatCacheInternal_hook_type)(CatCache *cache, + int nkeys, + Datum v1, Datum v2, + Datum v3, Datum v4); +extern SearchCatCacheInternal_hook_type SearchCatCacheInternal_hook; + +typedef CatCList *(*SearchCatCacheList_hook_type)(CatCache *cache, + int nkeys, + Datum v1, + Datum v2, + Datum v3); +extern SearchCatCacheList_hook_type SearchCatCacheList_hook; + +typedef TupleDesc (*SysCacheGetAttr_hook_type)(CatCache *SysCache); +extern SysCacheGetAttr_hook_type SysCacheGetAttr_hook; + +typedef uint32 (*GetCatCacheHashValue_hook_type)(CatCache *cache, + int nkeys, + Datum v1, + Datum v2, + Datum v3, + Datum v4); +extern GetCatCacheHashValue_hook_type GetCatCacheHashValue_hook; + #endif /* CATCACHE_H */ diff --git a/src/include/utils/elog.h b/src/include/utils/elog.h index e54eca5b489..f583eca37ee 100644 --- a/src/include/utils/elog.h +++ b/src/include/utils/elog.h @@ -537,4 +537,10 @@ extern void write_jsonlog(ErrorData *edata); */ extern void write_stderr(const char *fmt,...) 
pg_attribute_printf(1, 2); +typedef void (*CustomErrorCleanupHookType) (void); + +extern CustomErrorCleanupHookType CustomErrorCleanupHook; + +extern void CustomErrorCleanup(void); + #endif /* ELOG_H */ diff --git a/src/include/utils/fmgrtab.h b/src/include/utils/fmgrtab.h index 151dd74055d..f8666ba7087 100644 --- a/src/include/utils/fmgrtab.h +++ b/src/include/utils/fmgrtab.h @@ -46,4 +46,7 @@ extern PGDLLIMPORT const Oid fmgr_last_builtin_oid; /* highest function OID in #define InvalidOidBuiltinMapping PG_UINT16_MAX extern PGDLLIMPORT const uint16 fmgr_builtin_oid_index[]; +extern const FmgrBuiltin *fmgr_isbuiltin(Oid id); +extern const FmgrBuiltin *fmgr_lookupByName(const char *name); + #endif /* FMGRTAB_H */ diff --git a/src/include/utils/inval.h b/src/include/utils/inval.h index 24695facf22..69498b9f77f 100644 --- a/src/include/utils/inval.h +++ b/src/include/utils/inval.h @@ -22,6 +22,7 @@ extern PGDLLIMPORT int debug_discard_caches; typedef void (*SyscacheCallbackFunction) (Datum arg, int cacheid, uint32 hashvalue); typedef void (*RelcacheCallbackFunction) (Datum arg, Oid relid); +typedef void (*UsercacheCallbackFunction) (Datum arg, Oid arg1, Oid arg2, Oid arg3); extern void AcceptInvalidationMessages(void); @@ -48,6 +49,8 @@ extern void CacheInvalidateRelcacheByTuple(HeapTuple classTuple); extern void CacheInvalidateRelcacheByRelid(Oid relid); +extern void CacheInvalidateRelcacheByDbidRelid(Oid dbid, Oid relid); + extern void CacheInvalidateSmgr(RelFileLocatorBackend rlocator); extern void CacheInvalidateRelmap(Oid databaseId); @@ -59,6 +62,9 @@ extern void CacheRegisterSyscacheCallback(int cacheid, extern void CacheRegisterRelcacheCallback(RelcacheCallbackFunction func, Datum arg); +extern void CacheRegisterUsercacheCallback(UsercacheCallbackFunction func, + Datum arg); + extern void CallSyscacheCallbacks(int cacheid, uint32 hashvalue); extern void InvalidateSystemCaches(void); diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h 
index 040968d6ff2..9ce2a266dce 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -101,6 +101,8 @@ extern void make_icu_collator(const char *iculocstr, extern bool pg_locale_deterministic(pg_locale_t locale); extern pg_locale_t pg_newlocale_from_collation(Oid collid); +typedef bool (*pg_newlocale_from_collation_hook_type)(); +extern pg_newlocale_from_collation_hook_type pg_newlocale_from_collation_hook; extern char *get_collation_actual_version(char collprovider, const char *collcollate); extern int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale); diff --git a/src/include/utils/resowner_private.h b/src/include/utils/resowner_private.h new file mode 100644 index 00000000000..d32a3a42ef0 --- /dev/null +++ b/src/include/utils/resowner_private.h @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * resowner_private.h + * POSTGRES resource owner private definitions. + * + * See utils/resowner/README for more info. 
+ * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/utils/resowner_private.h + * + *------------------------------------------------------------------------- + */ +#ifndef RESOWNER_PRIVATE_H +#define RESOWNER_PRIVATE_H + +#include "storage/dsm.h" +#include "storage/fd.h" +#include "storage/lock.h" +#include "utils/catcache.h" +#include "utils/plancache.h" +#include "utils/resowner.h" +#include "utils/snapshot.h" + + +extern void ResourceOwnerRememberCatCacheRef(ResourceOwner owner, + HeapTuple tuple); +extern void ResourceOwnerRememberCatCacheListRef(ResourceOwner owner, + CatCList *list); + +#endif /* RESOWNER_PRIVATE_H */ diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h index 9398a84051c..3f6952d9895 100644 --- a/src/include/utils/snapmgr.h +++ b/src/include/utils/snapmgr.h @@ -18,6 +18,9 @@ #include "utils/resowner.h" #include "utils/snapshot.h" +#ifndef SNAPSHOT_H +typedef void (*snapshot_hook_type) (Snapshot snapshot); +#endif extern PGDLLIMPORT bool FirstSnapshotSet; @@ -78,7 +81,7 @@ extern void PushActiveSnapshotWithLevel(Snapshot snapshot, int snap_level); extern void PushCopiedSnapshot(Snapshot snapshot); extern void UpdateActiveSnapshotCommandId(void); extern void PopActiveSnapshot(void); -extern Snapshot GetActiveSnapshot(void); +extern PGDLLIMPORT Snapshot GetActiveSnapshot(void); extern bool ActiveSnapshotSet(void); extern Snapshot RegisterSnapshot(Snapshot snapshot); @@ -127,4 +130,10 @@ extern void SerializeSnapshot(Snapshot snapshot, char *start_address); extern Snapshot RestoreSnapshot(char *start_address); extern void RestoreTransactionSnapshot(Snapshot snapshot, void *source_pgproc); +typedef void (*reset_xmin_hook_type) (void); + +extern snapshot_hook_type snapshot_register_hook; +extern snapshot_hook_type snapshot_deregister_hook; +extern reset_xmin_hook_type reset_xmin_hook; + #endif /* SNAPMGR_H */ 
diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index 8d1e31e888e..9eec035622d 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -122,6 +122,20 @@ typedef struct SnapshotData *Snapshot; #define InvalidSnapshot ((Snapshot) NULL) +typedef struct +{ + uint64 undoLocation; /* undo log location retained by this snapshot */ + uint64 xmin; + pairingheap_node ph_node; +} RetainUndoLocationPHNode; + +typedef struct CSNSnapshotData +{ + uint64 xmin; + CommitSeqNo snapshotcsn; + XLogRecPtr xlogptr; +} CSNSnapshotData; + /* * Struct representing all kind of possible snapshots. * @@ -214,6 +228,12 @@ typedef struct SnapshotData * transactions completed since the last GetSnapshotData(). */ uint64 snapXactCompletionCount; + + RetainUndoLocationPHNode undoRegularLocationPhNode; + RetainUndoLocationPHNode undoSystemLocationPhNode; + CSNSnapshotData csnSnapshotData; } SnapshotData; +typedef void (*snapshot_hook_type) (Snapshot snapshot); + #endif /* SNAPSHOT_H */ diff --git a/src/include/utils/tuplestore.h b/src/include/utils/tuplestore.h index 419613c17ba..cf291a0d17a 100644 --- a/src/include/utils/tuplestore.h +++ b/src/include/utils/tuplestore.h @@ -70,6 +70,9 @@ extern bool tuplestore_in_memory(Tuplestorestate *state); extern bool tuplestore_gettupleslot(Tuplestorestate *state, bool forward, bool copy, TupleTableSlot *slot); +extern bool tuplestore_force_gettupleslot(Tuplestorestate *state, bool forward, + bool copy, TupleTableSlot *slot); + extern bool tuplestore_advance(Tuplestorestate *state, bool forward); extern bool tuplestore_skiptuples(Tuplestorestate *state, diff --git a/src/include/utils/typcache.h b/src/include/utils/typcache.h index f506cc4aa35..7c84978b7fa 100644 --- a/src/include/utils/typcache.h +++ b/src/include/utils/typcache.h @@ -207,4 +207,9 @@ extern void SharedRecordTypmodRegistryInit(SharedRecordTypmodRegistry *, extern void SharedRecordTypmodRegistryAttach(SharedRecordTypmodRegistry *); +typedef void 
(*load_typcache_tupdesc_hook_type)(TypeCacheEntry *typentry); +extern PGDLLIMPORT load_typcache_tupdesc_hook_type load_typcache_tupdesc_hook; +typedef void (*load_enum_cache_data_hook_type)(TypeCacheEntry *tcache); +extern PGDLLIMPORT load_enum_cache_data_hook_type load_enum_cache_data_hook; + #endif /* TYPCACHE_H */ diff --git a/src/include/varatt.h b/src/include/varatt.h index f04435e9ef3..0b4c09e639d 100644 --- a/src/include/varatt.h +++ b/src/include/varatt.h @@ -38,6 +38,25 @@ typedef struct varatt_external Oid va_toastrelid; /* RelID of TOAST table containing it */ } varatt_external; +typedef struct OToastExternal +{ + uint16 data_size; /* length of OToastExternal data */ + int16 attnum; + int32 raw_size; /* original data size */ + int32 toasted_size; /* compressed original data size */ + /* for fetching data from TOAST tree */ + CommitSeqNo csn; + /* for finding TOAST tree */ + Oid datoid; + Oid relid; + Oid relnode; + /* for storing primary index tuple */ + uint8 formatFlags; /* primary index tuple flags */ + char data[FLEXIBLE_ARRAY_MEMBER]; /* data (primary index tuple) */ +} OToastExternal; + +#define ORIOLEDB_EXT_FORMAT_FLAGS_BITS 6 + /* * These macros define the "saved size" portion of va_extinfo. Its remaining * two high-order bits identify the compression method. @@ -86,17 +105,21 @@ typedef enum vartag_external VARTAG_INDIRECT = 1, VARTAG_EXPANDED_RO = 2, VARTAG_EXPANDED_RW = 3, - VARTAG_ONDISK = 18 + VARTAG_ONDISK = 18, + VARTAG_ORIOLEDB = 34 } vartag_external; /* this test relies on the specific tag values above */ #define VARTAG_IS_EXPANDED(tag) \ (((tag) & ~1) == VARTAG_EXPANDED_RO) +#define O_TOAST_EXTERNAL_SZ offsetof(OToastExternal, data) + #define VARTAG_SIZE(tag) \ ((tag) == VARTAG_INDIRECT ? sizeof(varatt_indirect) : \ VARTAG_IS_EXPANDED(tag) ? sizeof(varatt_expanded) : \ (tag) == VARTAG_ONDISK ? sizeof(varatt_external) : \ + (tag) == VARTAG_ORIOLEDB ? 
O_TOAST_EXTERNAL_SZ : \ (AssertMacro(false), 0)) /* @@ -282,11 +305,16 @@ typedef struct #define VARDATA_SHORT(PTR) VARDATA_1B(PTR) #define VARTAG_EXTERNAL(PTR) VARTAG_1B_E(PTR) -#define VARSIZE_EXTERNAL(PTR) (VARHDRSZ_EXTERNAL + VARTAG_SIZE(VARTAG_EXTERNAL(PTR))) +#define VARSIZE_EXTERNAL(PTR) (VARHDRSZ_EXTERNAL + VARTAG_SIZE(VARTAG_EXTERNAL(PTR)) \ + + (VARATT_IS_EXTERNAL_ORIOLEDB(PTR) ? \ + *((uint16 *) VARDATA_1B_E(PTR)) \ + : 0)) + #define VARDATA_EXTERNAL(PTR) VARDATA_1B_E(PTR) #define VARATT_IS_COMPRESSED(PTR) VARATT_IS_4B_C(PTR) #define VARATT_IS_EXTERNAL(PTR) VARATT_IS_1B_E(PTR) + #define VARATT_IS_EXTERNAL_ONDISK(PTR) \ (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_ONDISK) #define VARATT_IS_EXTERNAL_INDIRECT(PTR) \ @@ -299,6 +327,9 @@ typedef struct (VARATT_IS_EXTERNAL(PTR) && VARTAG_IS_EXPANDED(VARTAG_EXTERNAL(PTR))) #define VARATT_IS_EXTERNAL_NON_EXPANDED(PTR) \ (VARATT_IS_EXTERNAL(PTR) && !VARTAG_IS_EXPANDED(VARTAG_EXTERNAL(PTR))) +#define VARATT_IS_EXTERNAL_ORIOLEDB(PTR) \ + (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_ORIOLEDB) + #define VARATT_IS_SHORT(PTR) VARATT_IS_1B(PTR) #define VARATT_IS_EXTENDED(PTR) (!VARATT_IS_4B_U(PTR)) diff --git a/src/makefiles/meson.build b/src/makefiles/meson.build index 5618050b306..192d3303f55 100644 --- a/src/makefiles/meson.build +++ b/src/makefiles/meson.build @@ -37,6 +37,7 @@ pgxs_kv = { 'PACKAGE_VERSION': pg_version, 'PG_MAJORVERSION': pg_version_major, 'PG_VERSION_NUM': pg_version_num, + 'ORIOLEDB_PATCHSET_VERSION': orioledb_patchset_version, 'configure_input': 'meson', 'vpath_build': 'yes', diff --git a/src/test/isolation/expected/eval-plan-qual-2.out b/src/test/isolation/expected/eval-plan-qual-2.out new file mode 100644 index 00000000000..117a3d3be8d --- /dev/null +++ b/src/test/isolation/expected/eval-plan-qual-2.out @@ -0,0 +1,37 @@ +Parsed test spec with 3 sessions + +starting permutation: read_u wx2 wb1 c2 c1 read_u read +step read_u: SELECT * FROM accounts; 
+accountid|balance|balance2 +---------+-------+-------- +checking | 600| 1200 +savings | 600| 1200 +(2 rows) + +step wx2: UPDATE accounts SET balance = balance + 450 WHERE accountid = 'checking' RETURNING balance; +balance +------- + 1050 +(1 row) + +step wb1: DELETE FROM accounts WHERE balance = 600 RETURNING *; +step c2: COMMIT; +step wb1: <... completed> +accountid|balance|balance2 +---------+-------+-------- +savings | 600| 1200 +(1 row) + +step c1: COMMIT; +step read_u: SELECT * FROM accounts; +accountid|balance|balance2 +---------+-------+-------- +checking | 1050| 2100 +(1 row) + +step read: SELECT * FROM accounts ORDER BY accountid; +accountid|balance|balance2 +---------+-------+-------- +checking | 1050| 2100 +(1 row) + diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule index 143109aa4da..f4df2146488 100644 --- a/src/test/isolation/isolation_schedule +++ b/src/test/isolation/isolation_schedule @@ -36,6 +36,7 @@ test: fk-partitioned-2 test: fk-snapshot test: subxid-overflow test: eval-plan-qual +test: eval-plan-qual-2 test: eval-plan-qual-trigger test: inplace-inval test: intra-grant-inplace diff --git a/src/test/isolation/specs/eval-plan-qual-2.spec b/src/test/isolation/specs/eval-plan-qual-2.spec new file mode 100644 index 00000000000..30447bef24a --- /dev/null +++ b/src/test/isolation/specs/eval-plan-qual-2.spec @@ -0,0 +1,30 @@ +setup +{ + CREATE TABLE accounts (accountid text PRIMARY KEY, balance numeric not null, + balance2 numeric GENERATED ALWAYS AS (balance * 2) STORED); + INSERT INTO accounts VALUES ('checking', 600), ('savings', 600); +} + +teardown +{ + DROP TABLE accounts; +} + +session s1 +setup { BEGIN ISOLATION LEVEL READ COMMITTED; } +step wb1 { DELETE FROM accounts WHERE balance = 600 RETURNING *; } +step c1 { COMMIT; } + +session s2 +setup { BEGIN ISOLATION LEVEL READ COMMITTED; } +step wx2 { UPDATE accounts SET balance = balance + 450 WHERE accountid = 'checking' RETURNING balance; } +step c2 { 
COMMIT; } + +session s3 +setup { BEGIN ISOLATION LEVEL READ COMMITTED; } +step read { SELECT * FROM accounts ORDER BY accountid; } +step read_u { SELECT * FROM accounts; } + +teardown { COMMIT; } + +permutation read_u wx2 wb1 c2 c1 read_u read diff --git a/src/test/modules/dummy_index_am/dummy_index_am.c b/src/test/modules/dummy_index_am/dummy_index_am.c index 18185d02067..1c6825f391a 100644 --- a/src/test/modules/dummy_index_am/dummy_index_am.c +++ b/src/test/modules/dummy_index_am/dummy_index_am.c @@ -164,7 +164,7 @@ dibuildempty(Relation index) */ static bool diinsert(Relation index, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, + Datum tupleid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo) @@ -303,7 +303,8 @@ dihandler(PG_FUNCTION_ARGS) amroutine->ambuild = dibuild; amroutine->ambuildempty = dibuildempty; - amroutine->aminsert = diinsert; + amroutine->aminsert = NULL; + amroutine->aminsertextended = diinsert; amroutine->ambulkdelete = dibulkdelete; amroutine->amvacuumcleanup = divacuumcleanup; amroutine->amcanreturn = NULL; diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index cf6eac57349..1f74afeca8f 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -1233,6 +1233,24 @@ SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA'; 14 (1 row) +-- OR-clauses shouldn't be transformed into SAOP because hash indexes don't +-- support SAOP scans. 
+SET enable_seqscan = off; +EXPLAIN (COSTS OFF) +SELECT COUNT(*) FROM tenk1 WHERE stringu1 = 'TVAAAA' OR stringu1 = 'TVAAAB'; + QUERY PLAN +------------------------------------------------------------------------------------ + Aggregate + -> Bitmap Heap Scan on tenk1 + Recheck Cond: ((stringu1 = 'TVAAAA'::name) OR (stringu1 = 'TVAAAB'::name)) + -> BitmapOr + -> Bitmap Index Scan on hash_tuplesort_idx + Index Cond: (stringu1 = 'TVAAAA'::name) + -> Bitmap Index Scan on hash_tuplesort_idx + Index Cond: (stringu1 = 'TVAAAB'::name) +(8 rows) + +RESET enable_seqscan; DROP INDEX hash_tuplesort_idx; RESET maintenance_work_mem; -- @@ -1843,19 +1861,122 @@ DROP TABLE onek_with_null; -- EXPLAIN (COSTS OFF) SELECT * FROM tenk1 - WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------------ + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42 OR tenthous = 0); + QUERY PLAN +-------------------------------------------------------------------------------- + Index Scan using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand = 42) AND (tenthous = ANY ('{1,3,42,0}'::integer[]))) +(2 rows) + +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42 OR tenthous = 0); + unique1 | unique2 | two | four | ten | twenty | hundred | thousand | twothousand | fivethous | tenthous | odd | even | stringu1 | stringu2 | string4 +---------+---------+-----+------+-----+--------+---------+----------+-------------+-----------+----------+-----+------+----------+----------+--------- + 42 | 5530 | 0 | 2 | 2 | 2 | 42 | 42 | 42 | 42 | 42 | 84 | 85 | QBAAAA | SEIAAA | OOOOxx +(1 row) + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = (SELECT 1 + 2) OR tenthous = 42); + QUERY PLAN 
+---------------------------------------------------------------------------------------- + Index Scan using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand = 42) AND (tenthous = ANY (ARRAY[1, (InitPlan 1).col1, 42]))) + InitPlan 1 + -> Result +(4 rows) + +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = (SELECT 1 + 2) OR tenthous = 42); + unique1 | unique2 | two | four | ten | twenty | hundred | thousand | twothousand | fivethous | tenthous | odd | even | stringu1 | stringu2 | string4 +---------+---------+-----+------+-----+--------+---------+----------+-------------+-----------+----------+-----+------+----------+----------+--------- + 42 | 5530 | 0 | 2 | 2 | 2 | 42 | 42 | 42 | 42 | 42 | 84 | 85 | QBAAAA | SEIAAA | OOOOxx +(1 row) + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42 OR tenthous IS NULL); + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------- Bitmap Heap Scan on tenk1 - Recheck Cond: (((thousand = 42) AND (tenthous = 1)) OR ((thousand = 42) AND (tenthous = 3)) OR ((thousand = 42) AND (tenthous = 42))) + Recheck Cond: (((thousand = 42) AND (tenthous IS NULL)) OR ((thousand = 42) AND ((tenthous = 1) OR (tenthous = 3) OR (tenthous = 42)))) + Filter: ((tenthous = 1) OR (tenthous = 3) OR (tenthous = 42) OR (tenthous IS NULL)) -> BitmapOr -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: ((thousand = 42) AND (tenthous = 1)) + Index Cond: ((thousand = 42) AND (tenthous IS NULL)) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: ((thousand = 42) AND (tenthous = ANY ('{1,3,42}'::integer[]))) +(8 rows) + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1::int2 OR tenthous::int2 = 3::int8 OR tenthous = 42::int8); + QUERY PLAN 
+------------------------------------------------------------------------------------------------------------- + Bitmap Heap Scan on tenk1 + Recheck Cond: (thousand = 42) + Filter: ((tenthous = '1'::smallint) OR ((tenthous)::smallint = '3'::bigint) OR (tenthous = '42'::bigint)) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = 42) +(5 rows) + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1::int2 OR tenthous::int2 = 3::int8 OR tenthous::int2 = 42::int8); + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------- + Bitmap Heap Scan on tenk1 + Recheck Cond: (thousand = 42) + Filter: ((tenthous = '1'::smallint) OR ((tenthous)::smallint = '3'::bigint) OR ((tenthous)::smallint = '42'::bigint)) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = 42) +(5 rows) + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1::int2 OR tenthous = 3::int8 OR tenthous = 42::int8); + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------------------------------------- + Bitmap Heap Scan on tenk1 + Recheck Cond: (((thousand = 42) AND ((tenthous = '3'::bigint) OR (tenthous = '42'::bigint))) OR ((thousand = 42) AND (tenthous = '1'::smallint))) + Filter: ((tenthous = '1'::smallint) OR (tenthous = '3'::bigint) OR (tenthous = '42'::bigint)) + -> BitmapOr -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: ((thousand = 42) AND (tenthous = 3)) + Index Cond: ((thousand = 42) AND (tenthous = ANY ('{3,42}'::bigint[]))) -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: ((thousand = 42) AND (tenthous = 42)) -(9 rows) + Index Cond: ((thousand = 42) AND (tenthous = '1'::smallint)) +(8 rows) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); + QUERY PLAN 
+--------------------------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tenk1 + Recheck Cond: ((hundred = 42) AND ((thousand = 42) OR (thousand = 99))) + -> BitmapAnd + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred = 42) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = ANY ('{42,99}'::integer[])) +(8 rows) + +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); + count +------- + 10 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); + QUERY PLAN +------------------------------------------------------------------------------ + Index Scan using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand = 42) AND (tenthous = ANY ('{1,3,42}'::integer[]))) +(2 rows) SELECT * FROM tenk1 WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); @@ -1864,6 +1985,27 @@ SELECT * FROM tenk1 42 | 5530 | 0 | 2 | 2 | 2 | 42 | 42 | 42 | 42 | 42 | 84 | 85 | QBAAAA | SEIAAA | OOOOxx (1 row) +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1::numeric OR tenthous = 3::int4 OR tenthous = 42::numeric); + QUERY PLAN +------------------------------------------------------------------------------------------------------------- + Bitmap Heap Scan on tenk1 + Recheck Cond: (thousand = 42) + Filter: (((tenthous)::numeric = '1'::numeric) OR (tenthous = 3) OR ((tenthous)::numeric = '42'::numeric)) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = 42) +(5 rows) + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE tenthous = 1::numeric OR tenthous = 3::int4 OR tenthous = 42::numeric; + QUERY PLAN +------------------------------------------------------------------------------------------------------------- + Seq Scan on tenk1 + Filter: (((tenthous)::numeric = '1'::numeric) OR (tenthous = 3) OR ((tenthous)::numeric = '42'::numeric)) +(2 rows) 
+ EXPLAIN (COSTS OFF) SELECT count(*) FROM tenk1 WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); @@ -1872,23 +2014,191 @@ SELECT count(*) FROM tenk1 Aggregate -> Bitmap Heap Scan on tenk1 Recheck Cond: ((hundred = 42) AND ((thousand = 42) OR (thousand = 99))) + -> BitmapAnd + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred = 42) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = ANY ('{42,99}'::integer[])) +(8 rows) + +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); + count +------- + 10 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand < 42 OR thousand < 99 OR 43 > thousand OR 42 > thousand); + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tenk1 + Recheck Cond: ((hundred = 42) AND ((thousand < 42) OR (thousand < 99) OR (43 > thousand) OR (42 > thousand))) + -> BitmapAnd + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred = 42) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand < ANY ('{42,99,43,42}'::integer[])) +(8 rows) + +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand < 42 OR thousand < 99 OR 43 > thousand OR 42 > thousand); + count +------- + 10 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3) OR thousand = 41; + QUERY PLAN +----------------------------------------------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tenk1 + Recheck Cond: (((thousand = 42) AND ((tenthous = 1) OR (tenthous = 3))) OR (thousand = 41)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: ((thousand = 42) AND (tenthous = ANY ('{1,3}'::integer[]))) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = 41) +(8 rows) + +SELECT count(*) FROM 
tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3) OR thousand = 41; + count +------- + 10 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 99 OR tenthous < 2) OR thousand = 41; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tenk1 + Recheck Cond: (((hundred = 42) AND (((thousand = 42) OR (thousand = 99)) OR (tenthous < 2))) OR (thousand = 41)) + Filter: (((hundred = 42) AND ((thousand = 42) OR (thousand = 99) OR (tenthous < 2))) OR (thousand = 41)) + -> BitmapOr + -> BitmapAnd + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred = 42) + -> BitmapOr + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = ANY ('{42,99}'::integer[])) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (tenthous < 2) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = 41) +(15 rows) + +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 99 OR tenthous < 2) OR thousand = 41; + count +------- + 20 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 41 OR thousand = 99 AND tenthous = 2); + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tenk1 + Recheck Cond: ((hundred = 42) AND (((thousand = 99) AND (tenthous = 2)) OR ((thousand = 42) OR (thousand = 41)))) + Filter: ((thousand = 42) OR (thousand = 41) OR ((thousand = 99) AND (tenthous = 2))) -> BitmapAnd -> Bitmap Index Scan on tenk1_hundred Index Cond: (hundred = 42) -> BitmapOr -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand = 42) + Index Cond: ((thousand = 99) AND (tenthous = 2)) -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand 
= 99) -(11 rows) + Index Cond: (thousand = ANY ('{42,41}'::integer[])) +(12 rows) SELECT count(*) FROM tenk1 - WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); + WHERE hundred = 42 AND (thousand = 42 OR thousand = 41 OR thousand = 99 AND tenthous = 2); count ------- 10 (1 row) +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1, tenk2 + WHERE tenk1.hundred = 42 AND (tenk2.thousand = 42 OR tenk1.thousand = 41 OR tenk2.tenthous = 2) AND + tenk2.hundred = tenk1.hundred; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Aggregate + -> Nested Loop + Join Filter: ((tenk2.thousand = 42) OR (tenk1.thousand = 41) OR (tenk2.tenthous = 2)) + -> Bitmap Heap Scan on tenk1 + Recheck Cond: (hundred = 42) + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred = 42) + -> Materialize + -> Bitmap Heap Scan on tenk2 + Recheck Cond: (hundred = 42) + -> Bitmap Index Scan on tenk2_hundred + Index Cond: (hundred = 42) +(12 rows) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1, tenk2 + WHERE tenk1.hundred = 42 AND (tenk2.thousand = 42 OR tenk2.thousand = 41 OR tenk2.tenthous = 2) AND + tenk2.hundred = tenk1.hundred; + QUERY PLAN +------------------------------------------------------------------------------ + Aggregate + -> Nested Loop + -> Bitmap Heap Scan on tenk2 + Recheck Cond: (hundred = 42) + Filter: ((thousand = 42) OR (thousand = 41) OR (tenthous = 2)) + -> Bitmap Index Scan on tenk2_hundred + Index Cond: (hundred = 42) + -> Index Only Scan using tenk1_hundred on tenk1 + Index Cond: (hundred = 42) +(9 rows) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 JOIN tenk2 ON + tenk1.hundred = 42 AND (tenk2.thousand = 42 OR tenk2.thousand = 41 OR tenk2.tenthous = 2) AND + tenk2.hundred = tenk1.hundred; + QUERY PLAN +------------------------------------------------------------------------------ + Aggregate + -> Nested Loop + -> Bitmap Heap Scan on tenk2 + Recheck Cond: (hundred = 42) + Filter: ((thousand = 42) 
OR (thousand = 41) OR (tenthous = 2)) + -> Bitmap Index Scan on tenk2_hundred + Index Cond: (hundred = 42) + -> Index Only Scan using tenk1_hundred on tenk1 + Index Cond: (hundred = 42) +(9 rows) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 LEFT JOIN tenk2 ON + tenk1.hundred = 42 AND (tenk2.thousand = 42 OR tenk2.thousand = 41 OR tenk2.tenthous = 2) AND + tenk2.hundred = tenk1.hundred; + QUERY PLAN +------------------------------------------------------------------------------------ + Aggregate + -> Nested Loop Left Join + Join Filter: (tenk1.hundred = 42) + -> Index Only Scan using tenk1_hundred on tenk1 + -> Memoize + Cache Key: tenk1.hundred + Cache Mode: logical + -> Index Scan using tenk2_hundred on tenk2 + Index Cond: (hundred = tenk1.hundred) + Filter: ((thousand = 42) OR (thousand = 41) OR (tenthous = 2)) +(10 rows) + -- -- Check behavior with duplicate index column contents -- @@ -2904,6 +3214,49 @@ SELECT b.relname, (2 rows) DROP TABLE concur_temp_tab_1, concur_temp_tab_2, reindex_temp_before; +-- Check bitmap scan can consider similar OR arguments separately without +-- grouping them into SAOP. 
+CREATE TABLE bitmap_split_or (a int NOT NULL, b int NOT NULL, c int NOT NULL); +INSERT INTO bitmap_split_or (SELECT 1, 1, i FROM generate_series(1, 1000) i); +INSERT INTO bitmap_split_or (select i, 2, 2 FROM generate_series(1, 1000) i); +VACUUM ANALYZE bitmap_split_or; +CREATE INDEX t_b_partial_1_idx ON bitmap_split_or (b) WHERE a = 1; +CREATE INDEX t_b_partial_2_idx ON bitmap_split_or (b) WHERE a = 2; +EXPLAIN (COSTS OFF) +SELECT * FROM bitmap_split_or WHERE (a = 1 OR a = 2) AND b = 2; + QUERY PLAN +------------------------------------------------------------------ + Bitmap Heap Scan on bitmap_split_or + Recheck Cond: (((b = 2) AND (a = 1)) OR ((b = 2) AND (a = 2))) + -> BitmapOr + -> Bitmap Index Scan on t_b_partial_1_idx + Index Cond: (b = 2) + -> Bitmap Index Scan on t_b_partial_2_idx + Index Cond: (b = 2) +(7 rows) + +DROP INDEX t_b_partial_1_idx; +DROP INDEX t_b_partial_2_idx; +CREATE INDEX t_a_b_idx ON bitmap_split_or (a, b); +CREATE INDEX t_b_c_idx ON bitmap_split_or (b, c); +CREATE STATISTICS t_a_b_stat (mcv) ON a, b FROM bitmap_split_or; +CREATE STATISTICS t_b_c_stat (mcv) ON b, c FROM bitmap_split_or; +ANALYZE bitmap_split_or; +EXPLAIN (COSTS OFF) +SELECT * FROM bitmap_split_or WHERE a = 1 AND (b = 1 OR b = 2) AND c = 2; + QUERY PLAN +------------------------------------------------------------------ + Bitmap Heap Scan on bitmap_split_or + Recheck Cond: (((b = 1) AND (c = 2)) OR ((a = 1) AND (b = 2))) + Filter: ((a = 1) AND (c = 2)) + -> BitmapOr + -> Bitmap Index Scan on t_b_c_idx + Index Cond: ((b = 1) AND (c = 2)) + -> Bitmap Index Scan on t_a_b_idx + Index Cond: ((a = 1) AND (b = 2)) +(8 rows) + +DROP TABLE bitmap_split_or; -- -- REINDEX SCHEMA -- diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index 8d1d3ec1dcf..f1664516bf7 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -4225,20 +4225,20 @@ select * from tenk1 a join tenk1 b on Nested Loop Join Filter: (((a.unique1 = 
1) AND (b.unique1 = 2)) OR ((a.unique2 = 3) AND (b.hundred = 4))) -> Bitmap Heap Scan on tenk1 b - Recheck Cond: ((unique1 = 2) OR (hundred = 4)) + Recheck Cond: ((hundred = 4) OR (unique1 = 2)) -> BitmapOr - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 2) -> Bitmap Index Scan on tenk1_hundred Index Cond: (hundred = 4) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 2) -> Materialize -> Bitmap Heap Scan on tenk1 a - Recheck Cond: ((unique1 = 1) OR (unique2 = 3)) + Recheck Cond: ((unique2 = 3) OR (unique1 = 1)) -> BitmapOr - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 1) -> Bitmap Index Scan on tenk1_unique2 Index Cond: (unique2 = 3) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 1) (17 rows) explain (costs off) @@ -4252,12 +4252,12 @@ select * from tenk1 a join tenk1 b on Filter: ((unique1 = 2) OR (ten = 4)) -> Materialize -> Bitmap Heap Scan on tenk1 a - Recheck Cond: ((unique1 = 1) OR (unique2 = 3)) + Recheck Cond: ((unique2 = 3) OR (unique1 = 1)) -> BitmapOr - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 1) -> Bitmap Index Scan on tenk1_unique2 Index Cond: (unique2 = 3) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 1) (12 rows) explain (costs off) @@ -4269,23 +4269,70 @@ select * from tenk1 a join tenk1 b on Nested Loop Join Filter: (((a.unique1 = 1) AND (b.unique1 = 2)) OR (((a.unique2 = 3) OR (a.unique2 = 7)) AND (b.hundred = 4))) -> Bitmap Heap Scan on tenk1 b - Recheck Cond: ((unique1 = 2) OR (hundred = 4)) + Recheck Cond: ((hundred = 4) OR (unique1 = 2)) -> BitmapOr + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred = 4) -> Bitmap Index Scan on tenk1_unique1 Index Cond: (unique1 = 2) + -> Materialize + -> Bitmap Heap Scan on tenk1 a + Recheck Cond: (((unique2 = 3) OR (unique2 = 7)) OR (unique1 = 1)) + Filter: ((unique1 = 1) OR (unique2 = 3) OR (unique2 = 7)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_unique2 + Index Cond: (unique2 = 
ANY ('{3,7}'::integer[])) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 1) +(18 rows) + +explain (costs off) +select * from tenk1 a join tenk1 b on + (a.unique1 = 1 and b.unique1 = 2) or + ((a.unique2 = 3 or a.unique2 = 7) and b.hundred = 4); + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------- + Nested Loop + Join Filter: (((a.unique1 = 1) AND (b.unique1 = 2)) OR (((a.unique2 = 3) OR (a.unique2 = 7)) AND (b.hundred = 4))) + -> Bitmap Heap Scan on tenk1 b + Recheck Cond: ((hundred = 4) OR (unique1 = 2)) + -> BitmapOr -> Bitmap Index Scan on tenk1_hundred Index Cond: (hundred = 4) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 2) -> Materialize -> Bitmap Heap Scan on tenk1 a - Recheck Cond: ((unique1 = 1) OR (unique2 = 3) OR (unique2 = 7)) + Recheck Cond: (((unique2 = 3) OR (unique2 = 7)) OR (unique1 = 1)) + Filter: ((unique1 = 1) OR (unique2 = 3) OR (unique2 = 7)) -> BitmapOr + -> Bitmap Index Scan on tenk1_unique2 + Index Cond: (unique2 = ANY ('{3,7}'::integer[])) -> Bitmap Index Scan on tenk1_unique1 Index Cond: (unique1 = 1) +(18 rows) + +explain (costs off) +select * from tenk1 a join tenk1 b on + (a.unique1 < 20 or a.unique1 = 3 or a.unique1 = 1 and b.unique1 = 2) or + ((a.unique2 = 3 or a.unique2 = 7) and b.hundred = 4); + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------- + Nested Loop + Join Filter: ((a.unique1 < 20) OR (a.unique1 = 3) OR ((a.unique1 = 1) AND (b.unique1 = 2)) OR (((a.unique2 = 3) OR (a.unique2 = 7)) AND (b.hundred = 4))) + -> Seq Scan on tenk1 b + -> Materialize + -> Bitmap Heap Scan on tenk1 a + Recheck Cond: (((unique2 = 3) OR (unique2 = 7)) OR ((unique1 = 3) OR (unique1 = 1)) OR (unique1 < 20)) + Filter: ((unique1 < 20) OR (unique1 = 3) OR (unique1 = 1) OR (unique2 = 3) OR (unique2 = 7)) + -> BitmapOr -> 
Bitmap Index Scan on tenk1_unique2 - Index Cond: (unique2 = 3) - -> Bitmap Index Scan on tenk1_unique2 - Index Cond: (unique2 = 7) -(19 rows) + Index Cond: (unique2 = ANY ('{3,7}'::integer[])) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = ANY ('{3,1}'::integer[])) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 < 20) +(14 rows) -- -- test placement of movable quals in a parameterized join tree diff --git a/src/test/regress/expected/jsonb_jsonpath.out b/src/test/regress/expected/jsonb_jsonpath.out index 57c117ea580..8cf6ecfc7f8 100644 --- a/src/test/regress/expected/jsonb_jsonpath.out +++ b/src/test/regress/expected/jsonb_jsonpath.out @@ -2634,12 +2634,16 @@ select jsonb_path_query('"12:34:56 +5:30"', '$.time_tz().string()'); "12:34:56+05:30" (1 row) +-- this timetz usage will absorb the UTC offset of the current timezone setting +begin; +set local timezone = 'UTC-10'; select jsonb_path_query_tz('"12:34:56"', '$.time_tz().string()'); jsonb_path_query_tz --------------------- - "12:34:56-07:00" + "12:34:56+10:00" (1 row) +rollback; select jsonb_path_query('"12:34:56"', '$.time().string()'); jsonb_path_query ------------------ diff --git a/src/test/regress/expected/rowsecurity.out b/src/test/regress/expected/rowsecurity.out index 319190855bd..ef890b96cc6 100644 --- a/src/test/regress/expected/rowsecurity.out +++ b/src/test/regress/expected/rowsecurity.out @@ -4492,6 +4492,13 @@ SELECT * FROM rls_tbl WHERE a <<< 1000; --- (0 rows) +EXPLAIN (COSTS OFF) SELECT * FROM rls_tbl WHERE a <<< 1000 or a <<< 900; + QUERY PLAN +-------------------------- + Result + One-Time Filter: false +(2 rows) + DROP OPERATOR <<< (int, int); DROP FUNCTION op_leak(int, int); RESET SESSION AUTHORIZATION; diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out index 8c4da955084..a4c7be487ef 100644 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -3254,6 +3254,8 @@ CREATE 
OPERATOR <<< (procedure = op_leak, leftarg = int, rightarg = int, restrict = scalarltsel); SELECT * FROM tststats.priv_test_tbl WHERE a <<< 0 AND b <<< 0; -- Permission denied ERROR: permission denied for table priv_test_tbl +SELECT * FROM tststats.priv_test_tbl WHERE a <<< 0 OR b <<< 0; +ERROR: permission denied for table priv_test_tbl DELETE FROM tststats.priv_test_tbl WHERE a <<< 0 AND b <<< 0; -- Permission denied ERROR: permission denied for table priv_test_tbl -- Grant access via a security barrier view, but hide all data @@ -3268,6 +3270,11 @@ SELECT * FROM tststats.priv_test_view WHERE a <<< 0 AND b <<< 0; -- Should not l ---+--- (0 rows) +SELECT * FROM tststats.priv_test_view WHERE a <<< 0 OR b <<< 0; -- Should not leak + a | b +---+--- +(0 rows) + DELETE FROM tststats.priv_test_view WHERE a <<< 0 AND b <<< 0; -- Should not leak -- Grant table access, but hide all data with RLS RESET SESSION AUTHORIZATION; @@ -3280,6 +3287,11 @@ SELECT * FROM tststats.priv_test_tbl WHERE a <<< 0 AND b <<< 0; -- Should not le ---+--- (0 rows) +SELECT * FROM tststats.priv_test_tbl WHERE a <<< 0 OR b <<< 0; + a | b +---+--- +(0 rows) + DELETE FROM tststats.priv_test_tbl WHERE a <<< 0 AND b <<< 0; -- Should not leak -- privilege checks for pg_stats_ext and pg_stats_ext_exprs RESET SESSION AUTHORIZATION; diff --git a/src/test/regress/expected/uuid.out b/src/test/regress/expected/uuid.out index 6026e15ed31..8f4ef0d7a6a 100644 --- a/src/test/regress/expected/uuid.out +++ b/src/test/regress/expected/uuid.out @@ -129,6 +129,37 @@ CREATE INDEX guid1_btree ON guid1 USING BTREE (guid_field); CREATE INDEX guid1_hash ON guid1 USING HASH (guid_field); -- unique index test CREATE UNIQUE INDEX guid1_unique_BTREE ON guid1 USING BTREE (guid_field); +EXPLAIN (COSTS OFF) +SELECT COUNT(*) FROM guid1 WHERE guid_field <> '11111111111111111111111111111111' OR + guid_field <> '3f3e3c3b-3a30-3938-3736-353433a2313e'; + QUERY PLAN 
+------------------------------------------------------------------------------------------------------------------------------------------------ + Aggregate + -> Seq Scan on guid1 + Filter: ((guid_field <> '11111111-1111-1111-1111-111111111111'::uuid) OR (guid_field <> '3f3e3c3b-3a30-3938-3736-353433a2313e'::uuid)) +(3 rows) + +EXPLAIN (COSTS OFF) +SELECT COUNT(*) FROM guid1 WHERE guid_field <= '22222222-2222-2222-2222-222222222222' OR + guid_field <= '11111111111111111111111111111111' OR + guid_field <= '3f3e3c3b-3a30-3938-3736-353433a2313e'; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Aggregate + -> Seq Scan on guid1 + Filter: ((guid_field <= '22222222-2222-2222-2222-222222222222'::uuid) OR (guid_field <= '11111111-1111-1111-1111-111111111111'::uuid) OR (guid_field <= '3f3e3c3b-3a30-3938-3736-353433a2313e'::uuid)) +(3 rows) + +EXPLAIN (COSTS OFF) +SELECT COUNT(*) FROM guid1 WHERE guid_field = '3f3e3c3b-3a30-3938-3736-353433a2313e' OR + guid_field = '11111111111111111111111111111111'; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------------------- + Aggregate + -> Seq Scan on guid1 + Filter: ((guid_field = '3f3e3c3b-3a30-3938-3736-353433a2313e'::uuid) OR (guid_field = '11111111-1111-1111-1111-111111111111'::uuid)) +(3 rows) + -- should fail INSERT INTO guid1(guid_field) VALUES('11111111-1111-1111-1111-111111111111'); ERROR: duplicate key value violates unique constraint "guid1_unique_btree" diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c index 45a6ad3c49e..7b8e91d07b3 100644 --- a/src/test/regress/regress.c +++ b/src/test/regress/regress.c @@ -606,7 +606,7 @@ make_tuple_indirect(PG_FUNCTION_ARGS) continue; /* copy datum, so it still lives later */ - if 
(VARATT_IS_EXTERNAL_ONDISK(attr)) + if (VARATT_IS_EXTERNAL_ONDISK(attr) || VARATT_IS_EXTERNAL_ORIOLEDB(attr)) attr = detoast_external_attr(attr); else { diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index e296891cab8..6b683da30f9 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -372,6 +372,12 @@ CREATE INDEX hash_tuplesort_idx ON tenk1 USING hash (stringu1 name_ops) WITH (fi EXPLAIN (COSTS OFF) SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA'; SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA'; +-- OR-clauses shouldn't be transformed into SAOP because hash indexes don't +-- support SAOP scans. +SET enable_seqscan = off; +EXPLAIN (COSTS OFF) +SELECT COUNT(*) FROM tenk1 WHERE stringu1 = 'TVAAAA' OR stringu1 = 'TVAAAB'; +RESET enable_seqscan; DROP INDEX hash_tuplesort_idx; RESET maintenance_work_mem; @@ -726,18 +732,104 @@ DROP TABLE onek_with_null; -- Check bitmap index path planning -- +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42 OR tenthous = 0); +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42 OR tenthous = 0); + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = (SELECT 1 + 2) OR tenthous = 42); +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = (SELECT 1 + 2) OR tenthous = 42); + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42 OR tenthous IS NULL); + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1::int2 OR tenthous::int2 = 3::int8 OR tenthous = 42::int8); + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1::int2 OR tenthous::int2 = 3::int8 OR tenthous::int2 = 42::int8); + + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 
1::int2 OR tenthous = 3::int8 OR tenthous = 42::int8); + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); + EXPLAIN (COSTS OFF) SELECT * FROM tenk1 WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); SELECT * FROM tenk1 WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3 OR tenthous = 42); +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1::numeric OR tenthous = 3::int4 OR tenthous = 42::numeric); + +EXPLAIN (COSTS OFF) +SELECT * FROM tenk1 + WHERE tenthous = 1::numeric OR tenthous = 3::int4 OR tenthous = 42::numeric; + EXPLAIN (COSTS OFF) SELECT count(*) FROM tenk1 WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); SELECT count(*) FROM tenk1 WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand < 42 OR thousand < 99 OR 43 > thousand OR 42 > thousand); +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand < 42 OR thousand < 99 OR 43 > thousand OR 42 > thousand); + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3) OR thousand = 41; +SELECT count(*) FROM tenk1 + WHERE thousand = 42 AND (tenthous = 1 OR tenthous = 3) OR thousand = 41; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 99 OR tenthous < 2) OR thousand = 41; +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 99 OR tenthous < 2) OR thousand = 41; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 41 OR thousand = 99 AND tenthous = 2); +SELECT count(*) FROM tenk1 + WHERE hundred = 42 AND (thousand = 42 OR thousand = 41 OR thousand = 99 AND tenthous = 2); + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1, tenk2 + WHERE 
tenk1.hundred = 42 AND (tenk2.thousand = 42 OR tenk1.thousand = 41 OR tenk2.tenthous = 2) AND + tenk2.hundred = tenk1.hundred; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1, tenk2 + WHERE tenk1.hundred = 42 AND (tenk2.thousand = 42 OR tenk2.thousand = 41 OR tenk2.tenthous = 2) AND + tenk2.hundred = tenk1.hundred; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 JOIN tenk2 ON + tenk1.hundred = 42 AND (tenk2.thousand = 42 OR tenk2.thousand = 41 OR tenk2.tenthous = 2) AND + tenk2.hundred = tenk1.hundred; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM tenk1 LEFT JOIN tenk2 ON + tenk1.hundred = 42 AND (tenk2.thousand = 42 OR tenk2.thousand = 41 OR tenk2.tenthous = 2) AND + tenk2.hundred = tenk1.hundred; -- -- Check behavior with duplicate index column contents -- @@ -1252,6 +1344,27 @@ SELECT b.relname, ORDER BY 1; DROP TABLE concur_temp_tab_1, concur_temp_tab_2, reindex_temp_before; +-- Check bitmap scan can consider similar OR arguments separately without +-- grouping them into SAOP. +CREATE TABLE bitmap_split_or (a int NOT NULL, b int NOT NULL, c int NOT NULL); +INSERT INTO bitmap_split_or (SELECT 1, 1, i FROM generate_series(1, 1000) i); +INSERT INTO bitmap_split_or (select i, 2, 2 FROM generate_series(1, 1000) i); +VACUUM ANALYZE bitmap_split_or; +CREATE INDEX t_b_partial_1_idx ON bitmap_split_or (b) WHERE a = 1; +CREATE INDEX t_b_partial_2_idx ON bitmap_split_or (b) WHERE a = 2; +EXPLAIN (COSTS OFF) +SELECT * FROM bitmap_split_or WHERE (a = 1 OR a = 2) AND b = 2; +DROP INDEX t_b_partial_1_idx; +DROP INDEX t_b_partial_2_idx; +CREATE INDEX t_a_b_idx ON bitmap_split_or (a, b); +CREATE INDEX t_b_c_idx ON bitmap_split_or (b, c); +CREATE STATISTICS t_a_b_stat (mcv) ON a, b FROM bitmap_split_or; +CREATE STATISTICS t_b_c_stat (mcv) ON b, c FROM bitmap_split_or; +ANALYZE bitmap_split_or; +EXPLAIN (COSTS OFF) +SELECT * FROM bitmap_split_or WHERE a = 1 AND (b = 1 OR b = 2) AND c = 2; +DROP TABLE bitmap_split_or; + -- -- REINDEX SCHEMA -- diff --git 
a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index 8281bbd8ef8..b67b4caef23 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -1433,6 +1433,15 @@ select * from tenk1 a join tenk1 b on (a.unique1 = 1 and b.unique1 = 2) or ((a.unique2 = 3 or a.unique2 = 7) and b.hundred = 4); +explain (costs off) +select * from tenk1 a join tenk1 b on + (a.unique1 = 1 and b.unique1 = 2) or + ((a.unique2 = 3 or a.unique2 = 7) and b.hundred = 4); +explain (costs off) +select * from tenk1 a join tenk1 b on + (a.unique1 < 20 or a.unique1 = 3 or a.unique1 = 1 and b.unique1 = 2) or + ((a.unique2 = 3 or a.unique2 = 7) and b.hundred = 4); + -- -- test placement of movable quals in a parameterized join tree -- diff --git a/src/test/regress/sql/jsonb_jsonpath.sql b/src/test/regress/sql/jsonb_jsonpath.sql index c647af55e94..acb508c0dd2 100644 --- a/src/test/regress/sql/jsonb_jsonpath.sql +++ b/src/test/regress/sql/jsonb_jsonpath.sql @@ -596,7 +596,11 @@ select jsonb_path_query_tz('"2023-08-15 12:34:56"', '$.timestamp_tz().string()') select jsonb_path_query('"2023-08-15 12:34:56 +5:30"', '$.timestamp_tz().string()'); select jsonb_path_query('"2023-08-15 12:34:56"', '$.timestamp().string()'); select jsonb_path_query('"12:34:56 +5:30"', '$.time_tz().string()'); +-- this timetz usage will absorb the UTC offset of the current timezone setting +begin; +set local timezone = 'UTC-10'; select jsonb_path_query_tz('"12:34:56"', '$.time_tz().string()'); +rollback; select jsonb_path_query('"12:34:56"', '$.time().string()'); select jsonb_path_query('"2023-08-15"', '$.date().string()'); diff --git a/src/test/regress/sql/rowsecurity.sql b/src/test/regress/sql/rowsecurity.sql index 3011d71b12b..6d2414b6044 100644 --- a/src/test/regress/sql/rowsecurity.sql +++ b/src/test/regress/sql/rowsecurity.sql @@ -2177,6 +2177,7 @@ CREATE FUNCTION op_leak(int, int) RETURNS bool CREATE OPERATOR <<< (procedure = op_leak, leftarg = int, rightarg = int, restrict = scalarltsel); 
SELECT * FROM rls_tbl WHERE a <<< 1000; +EXPLAIN (COSTS OFF) SELECT * FROM rls_tbl WHERE a <<< 1000 or a <<< 900; DROP OPERATOR <<< (int, int); DROP FUNCTION op_leak(int, int); RESET SESSION AUTHORIZATION; diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql index 0c08a6cc42e..5c786b16c6f 100644 --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -1634,6 +1634,7 @@ CREATE FUNCTION op_leak(int, int) RETURNS bool CREATE OPERATOR <<< (procedure = op_leak, leftarg = int, rightarg = int, restrict = scalarltsel); SELECT * FROM tststats.priv_test_tbl WHERE a <<< 0 AND b <<< 0; -- Permission denied +SELECT * FROM tststats.priv_test_tbl WHERE a <<< 0 OR b <<< 0; DELETE FROM tststats.priv_test_tbl WHERE a <<< 0 AND b <<< 0; -- Permission denied -- Grant access via a security barrier view, but hide all data @@ -1645,6 +1646,7 @@ GRANT SELECT, DELETE ON tststats.priv_test_view TO regress_stats_user1; -- Should now have access via the view, but see nothing and leak nothing SET SESSION AUTHORIZATION regress_stats_user1; SELECT * FROM tststats.priv_test_view WHERE a <<< 0 AND b <<< 0; -- Should not leak +SELECT * FROM tststats.priv_test_view WHERE a <<< 0 OR b <<< 0; -- Should not leak DELETE FROM tststats.priv_test_view WHERE a <<< 0 AND b <<< 0; -- Should not leak -- Grant table access, but hide all data with RLS @@ -1655,6 +1657,7 @@ GRANT SELECT, DELETE ON tststats.priv_test_tbl TO regress_stats_user1; -- Should now have direct table access, but see nothing and leak nothing SET SESSION AUTHORIZATION regress_stats_user1; SELECT * FROM tststats.priv_test_tbl WHERE a <<< 0 AND b <<< 0; -- Should not leak +SELECT * FROM tststats.priv_test_tbl WHERE a <<< 0 OR b <<< 0; DELETE FROM tststats.priv_test_tbl WHERE a <<< 0 AND b <<< 0; -- Should not leak -- privilege checks for pg_stats_ext and pg_stats_ext_exprs diff --git a/src/test/regress/sql/uuid.sql b/src/test/regress/sql/uuid.sql index c88f6d087a7..75ee966ded0 
100644 --- a/src/test/regress/sql/uuid.sql +++ b/src/test/regress/sql/uuid.sql @@ -63,6 +63,18 @@ CREATE INDEX guid1_hash ON guid1 USING HASH (guid_field); -- unique index test CREATE UNIQUE INDEX guid1_unique_BTREE ON guid1 USING BTREE (guid_field); + +EXPLAIN (COSTS OFF) +SELECT COUNT(*) FROM guid1 WHERE guid_field <> '11111111111111111111111111111111' OR + guid_field <> '3f3e3c3b-3a30-3938-3736-353433a2313e'; +EXPLAIN (COSTS OFF) +SELECT COUNT(*) FROM guid1 WHERE guid_field <= '22222222-2222-2222-2222-222222222222' OR + guid_field <= '11111111111111111111111111111111' OR + guid_field <= '3f3e3c3b-3a30-3938-3736-353433a2313e'; +EXPLAIN (COSTS OFF) +SELECT COUNT(*) FROM guid1 WHERE guid_field = '3f3e3c3b-3a30-3938-3736-353433a2313e' OR + guid_field = '11111111111111111111111111111111'; + -- should fail INSERT INTO guid1(guid_field) VALUES('11111111-1111-1111-1111-111111111111'); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index d4e9515e9f4..0131f9a8d43 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1761,6 +1761,7 @@ OprCacheKey OprInfo OprProofCacheEntry OprProofCacheKey +OrArgIndexMatch OuterJoinClauseInfo OutputPluginCallbacks OutputPluginOptions @@ -3266,6 +3267,7 @@ amgetbitmap_function amgettuple_function aminitparallelscan_function aminsert_function +aminsert_extended_function aminsertcleanup_function ammarkpos_function amoptions_function