From 13f8feafbe2d148318aed6869ffb695c38f2d4b0 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Thu, 20 May 2021 21:16:09 +0200
Subject: [PATCH v39 2/5] Index skip scan

Allow IndexOnlyScan to skip duplicated tuples based on search key prefix
(a trick also known as Index Skip Scan or Loose Index Scan, see in the
wiki [1]). The idea is to avoid scanning all equal values of a key, as
soon as a new value is found, restart the search by looking for a larger
value. This approach is much faster when the index has many equal keys.

Implemented via equipping IndexPath with indexskipprefix field and
creating an extra IndexPath with such prefix if suitable unique
expressions are present. On the execution size a new index am function
amskip is introduced to provide index specific implementation for such
skipping. To simplify potential amskip implementations,
ExecSupportsBackwardScan now returns false in case if index skip scan is
used, otherwise amskip has to deal with scroll cursor and be prepared to
handle different advance/read directions. ExecSupportsBackwardScan may
seem to have too big scope, but looks like now it used only together
with cursorOptions checks for CURSOR_OPT_SCROLL.

Original patch and design were proposed by Thomas Munro [2], revived and
improved by Dmitry Dolgov and Jesper Pedersen.

[1] https://wiki.postgresql.org/wiki/Loose_indexscan
[2] https://www.postgresql.org/message-id/flat/CADLWmXXbTSBxP-MzJuPAYSsL_2f0iPm5VWPbCvDbVvfX93FKkw%40mail.gmail.com

Author: Jesper Pedersen, Dmitry Dolgov
Reviewed-by: Thomas Munro, David Rowley, Floris Van Nee, Kyotaro Horiguchi, Tomas Vondra, Peter Geoghegan
---
 contrib/bloom/blutils.c                       |   1 +
 doc/src/sgml/config.sgml                      |  15 ++
 doc/src/sgml/indexam.sgml                     |  43 ++++++
 doc/src/sgml/indices.sgml                     |  23 +++
 src/backend/access/brin/brin.c                |   1 +
 src/backend/access/gin/ginutil.c              |   1 +
 src/backend/access/gist/gist.c                |   1 +
 src/backend/access/hash/hash.c                |   1 +
 src/backend/access/index/indexam.c            |  16 ++
 src/backend/access/spgist/spgutils.c          |   1 +
 src/backend/commands/explain.c                |  23 +++
 src/backend/executor/execAmi.c                |  32 +++-
 src/backend/executor/nodeIndexonlyscan.c      |  47 +++++-
 src/backend/nodes/copyfuncs.c                 |   1 +
 src/backend/nodes/outfuncs.c                  |   1 +
 src/backend/nodes/readfuncs.c                 |   1 +
 src/backend/optimizer/path/costsize.c         |   1 +
 src/backend/optimizer/path/indxpath.c         | 140 +++++++++++++++++-
 src/backend/optimizer/path/pathkeys.c         |  54 ++++++-
 src/backend/optimizer/plan/createplan.c       |  10 +-
 src/backend/optimizer/util/pathnode.c         |  37 +++++
 src/backend/optimizer/util/plancat.c          |   1 +
 src/backend/utils/misc/guc.c                  |   9 ++
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/include/access/amapi.h                    |   6 +
 src/include/access/genam.h                    |   1 +
 src/include/access/sdir.h                     |   7 +
 src/include/nodes/execnodes.h                 |   2 +
 src/include/nodes/pathnodes.h                 |   4 +
 src/include/nodes/plannodes.h                 |   2 +
 src/include/optimizer/cost.h                  |   1 +
 src/include/optimizer/pathnode.h              |   4 +
 src/include/optimizer/paths.h                 |   5 +-
 src/test/regress/expected/sysviews.out        |   3 +-
 34 files changed, 477 insertions(+), 19 deletions(-)

diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c
index 754de008d4..f7288cf6dc 100644
--- a/contrib/bloom/blutils.c
+++ b/contrib/bloom/blutils.c
@@ -134,6 +134,7 @@ blhandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = blbulkdelete;
 	amroutine->amvacuumcleanup = blvacuumcleanup;
 	amroutine->amcanreturn = NULL;
+	amroutine->amskip = NULL;
 	amroutine->amcostestimate = blcostestimate;
 	amroutine->amoptions = bloptions;
 	amroutine->amproperty = NULL;
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 5efbfe97b5..114b4c9b5f 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -5056,6 +5056,21 @@ ANY <replaceable class="parameter">num_sync</replaceable> ( <replaceable class="
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-enable-indexskipscan" xreflabel="enable_indexskipscan">
+      <term><varname>enable_indexskipscan</varname> (<type>boolean</type>)
+      <indexterm>
+       <primary><varname>enable_indexskipscan</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Enables or disables the query planner's use of index-skip-scan plan
+        types (see <xref linkend="indexes-index-skip-scans"/>). The default is
+        <literal>on</literal>.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-enable-material" xreflabel="enable_material">
       <term><varname>enable_material</varname> (<type>boolean</type>)
       <indexterm>
diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml
index 29fa3b7933..6c364a53d8 100644
--- a/doc/src/sgml/indexam.sgml
+++ b/doc/src/sgml/indexam.sgml
@@ -151,6 +151,7 @@ typedef struct IndexAmRoutine
     amendscan_function amendscan;
     ammarkpos_function ammarkpos;       /* can be NULL */
     amrestrpos_function amrestrpos;     /* can be NULL */
+    amskip_function amskip;             /* can be NULL */
 
     /* interface functions to support parallel index scans */
     amestimateparallelscan_function amestimateparallelscan;    /* can be NULL */
@@ -768,6 +769,48 @@ amrestrpos (IndexScanDesc scan);
 
   <para>
 <programlisting>
+bool
+amskip (IndexScanDesc scan,
+        ScanDirection direction,
+        int prefix);
+</programlisting>
+  Skip past all tuples where the first 'prefix' columns have the same value as
+  the last tuple returned in the current scan. The arguments are:
+
+   <variablelist>
+    <varlistentry>
+     <term><parameter>scan</parameter></term>
+     <listitem>
+      <para>
+       Index scan information
+      </para>
+     </listitem>
+    </varlistentry>
+
+    <varlistentry>
+     <term><parameter>direction</parameter></term>
+     <listitem>
+      <para>
+       The direction in which data is advancing.
+      </para>
+     </listitem>
+    </varlistentry>
+
+    <varlistentry>
+     <term><parameter>prefix</parameter></term>
+     <listitem>
+      <para>
+        Distinct prefix size.
+      </para>
+     </listitem>
+    </varlistentry>
+
+   </variablelist>
+
+  </para>
+
+  <para>
+<programlisting>
 Size
 amestimateparallelscan (void);
 </programlisting>
diff --git a/doc/src/sgml/indices.sgml b/doc/src/sgml/indices.sgml
index 56fbd45178..b0c8744205 100644
--- a/doc/src/sgml/indices.sgml
+++ b/doc/src/sgml/indices.sgml
@@ -1297,6 +1297,29 @@ SELECT target FROM tests WHERE subject = 'some-subject' AND success;
    and later will recognize such cases and allow index-only scans to be
    generated, but older versions will not.
   </para>
+
+  <sect2 id="indexes-index-skip-scans">
+    <title>Index Skip Scans</title>
+
+    <indexterm zone="indexes-index-skip-scans">
+      <primary>index</primary>
+      <secondary>index-skip scans</secondary>
+    </indexterm>
+    <indexterm zone="indexes-index-skip-scans">
+      <primary>index-skip scan</primary>
+    </indexterm>
+
+    <para>
+     When the rows retrieved from an index scan are then deduplicated by
+     eliminating rows matching on a prefix of index keys (e.g. when using
+     <literal>SELECT DISTINCT</literal>), the planner will consider
+     skipping groups of rows with a matching key prefix. When a row with
+     a particular prefix is found, remaining rows with the same key prefix
+     are skipped.  The larger the number of rows with the same key prefix
+     rows (i.e. the lower the number of distinct key prefixes in the index),
+     the more efficient this is.
+    </para>
+  </sect2>
  </sect1>
 
 
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index c23ea44866..f0ce6f7334 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -118,6 +118,7 @@ brinhandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = brinbulkdelete;
 	amroutine->amvacuumcleanup = brinvacuumcleanup;
 	amroutine->amcanreturn = NULL;
+	amroutine->amskip = NULL;
 	amroutine->amcostestimate = brincostestimate;
 	amroutine->amoptions = brinoptions;
 	amroutine->amproperty = NULL;
diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c
index cdd626ff0a..aeb1d40b0f 100644
--- a/src/backend/access/gin/ginutil.c
+++ b/src/backend/access/gin/ginutil.c
@@ -66,6 +66,7 @@ ginhandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = ginbulkdelete;
 	amroutine->amvacuumcleanup = ginvacuumcleanup;
 	amroutine->amcanreturn = NULL;
+	amroutine->amskip = NULL;
 	amroutine->amcostestimate = gincostestimate;
 	amroutine->amoptions = ginoptions;
 	amroutine->amproperty = NULL;
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 0683f42c25..fed061184e 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -87,6 +87,7 @@ gisthandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = gistbulkdelete;
 	amroutine->amvacuumcleanup = gistvacuumcleanup;
 	amroutine->amcanreturn = gistcanreturn;
+	amroutine->amskip = NULL;
 	amroutine->amcostestimate = gistcostestimate;
 	amroutine->amoptions = gistoptions;
 	amroutine->amproperty = gistproperty;
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 0752fb38a9..fd7c13ee4c 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -84,6 +84,7 @@ hashhandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = hashbulkdelete;
 	amroutine->amvacuumcleanup = hashvacuumcleanup;
 	amroutine->amcanreturn = NULL;
+	amroutine->amskip = NULL;
 	amroutine->amcostestimate = hashcostestimate;
 	amroutine->amoptions = hashoptions;
 	amroutine->amproperty = NULL;
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 5e22479b7a..310aacca58 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -33,6 +33,7 @@
  *		index_can_return	- does index support index-only scans?
  *		index_getprocid - get a support procedure OID
  *		index_getprocinfo - get a support procedure's lookup info
+ *		index_skip		- advance past duplicate key values in a scan
  *
  * NOTES
  *		This file contains the index_ routines which used
@@ -739,6 +740,21 @@ index_can_return(Relation indexRelation, int attno)
 	return indexRelation->rd_indam->amcanreturn(indexRelation, attno);
 }
 
+/* ----------------
+ *		index_skip
+ *
+ *		Skip past all tuples where the first 'prefix' columns have the
+ *		same value as the last tuple returned in the current scan.
+ * ----------------
+ */
+bool
+index_skip(IndexScanDesc scan, ScanDirection direction, int prefix)
+{
+	SCAN_CHECKS;
+
+	return scan->indexRelation->rd_indam->amskip(scan, direction, prefix);
+}
+
 /* ----------------
  *		index_getprocid
  *
diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c
index 8d99c9b762..f9b1e395f4 100644
--- a/src/backend/access/spgist/spgutils.c
+++ b/src/backend/access/spgist/spgutils.c
@@ -71,6 +71,7 @@ spghandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = spgbulkdelete;
 	amroutine->amvacuumcleanup = spgvacuumcleanup;
 	amroutine->amcanreturn = spgcanreturn;
+	amroutine->amskip = NULL;
 	amroutine->amcostestimate = spgcostestimate;
 	amroutine->amoptions = spgoptions;
 	amroutine->amproperty = spgproperty;
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 8ab7bca866..37ca8612f4 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -152,6 +152,7 @@ static void ExplainXMLTag(const char *tagname, int flags, ExplainState *es);
 static void ExplainIndentText(ExplainState *es);
 static void ExplainJSONLineEnding(ExplainState *es);
 static void ExplainYAMLLineStarting(ExplainState *es);
+static void ExplainIndexSkipScanKeys(int skipPrefixSize, ExplainState *es);
 static void escape_yaml(StringInfo buf, const char *str);
 
 
@@ -1118,6 +1119,21 @@ ExplainPreScanNode(PlanState *planstate, Bitmapset **rels_used)
 	return planstate_tree_walker(planstate, ExplainPreScanNode, rels_used);
 }
 
+/*
+ * ExplainIndexSkipScanKeys -
+ *	  Append information about index skip scan to es->str.
+ *
+ * Can be used to print the skip prefix size.
+ */
+static void
+ExplainIndexSkipScanKeys(int skipPrefixSize, ExplainState *es)
+{
+	if (skipPrefixSize > 0)
+	{
+		ExplainPropertyInteger("Distinct Prefix", NULL, skipPrefixSize, es);
+	}
+}
+
 /*
  * ExplainNode -
  *	  Appends a description of a plan tree to es->str
@@ -1475,6 +1491,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
 			{
 				IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan;
 
+
 				ExplainIndexScanDetails(indexonlyscan->indexid,
 										indexonlyscan->indexorderdir,
 										es);
@@ -1748,6 +1765,12 @@ ExplainNode(PlanState *planstate, List *ancestors,
 										   planstate, es);
 			break;
 		case T_IndexOnlyScan:
+			if (((IndexOnlyScan *) plan)->indexskipprefixsize > 0)
+			{
+				IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan;
+				ExplainPropertyBool("Skip scan", true, es);
+				ExplainIndexSkipScanKeys(indexonlyscan->indexskipprefixsize, es);
+			}
 			show_scan_qual(((IndexOnlyScan *) plan)->indexqual,
 						   "Index Cond", planstate, ancestors, es);
 			if (((IndexOnlyScan *) plan)->indexqual)
diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c
index b3726a54f3..4248912239 100644
--- a/src/backend/executor/execAmi.c
+++ b/src/backend/executor/execAmi.c
@@ -64,7 +64,7 @@
 #include "utils/rel.h"
 #include "utils/syscache.h"
 
-static bool IndexSupportsBackwardScan(Oid indexid);
+static bool IndexSupportsBackwardScan(Plan *node);
 
 
 /*
@@ -557,10 +557,8 @@ ExecSupportsBackwardScan(Plan *node)
 			return false;
 
 		case T_IndexScan:
-			return IndexSupportsBackwardScan(((IndexScan *) node)->indexid);
-
 		case T_IndexOnlyScan:
-			return IndexSupportsBackwardScan(((IndexOnlyScan *) node)->indexid);
+			return IndexSupportsBackwardScan(node);
 
 		case T_SubqueryScan:
 			return ExecSupportsBackwardScan(((SubqueryScan *) node)->subplan);
@@ -604,16 +602,38 @@ ExecSupportsBackwardScan(Plan *node)
 
 /*
  * An IndexScan or IndexOnlyScan node supports backward scan only if the
- * index's AM does.
+ * index's AM does and no skip scan is used.
  */
 static bool
-IndexSupportsBackwardScan(Oid indexid)
+IndexSupportsBackwardScan(Plan *node)
 {
 	bool		result;
+	Oid			indexid = InvalidOid;
+	int         skip_prefix_size = 0;
 	HeapTuple	ht_idxrel;
 	Form_pg_class idxrelrec;
 	IndexAmRoutine *amroutine;
 
+	Assert(IsA(node, IndexScan) || IsA(node, IndexOnlyScan));
+	switch(nodeTag(node))
+	{
+		case T_IndexScan:
+			indexid = ((IndexScan *) node)->indexid;
+			break;
+
+		case T_IndexOnlyScan:
+			indexid = ((IndexOnlyScan *) node)->indexid;
+			skip_prefix_size = ((IndexOnlyScan *) node)->indexskipprefixsize;
+			break;
+
+		default:
+			elog(DEBUG2, "unrecognized node type: %d", (int) nodeTag(node));
+			break;
+	}
+
+	if (skip_prefix_size > 0)
+		return false;
+
 	/* Fetch the pg_class tuple of the index relation */
 	ht_idxrel = SearchSysCache1(RELOID, ObjectIdGetDatum(indexid));
 	if (!HeapTupleIsValid(ht_idxrel))
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index 0754e28a9a..e800e43e35 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -41,6 +41,7 @@
 #include "miscadmin.h"
 #include "storage/bufmgr.h"
 #include "storage/predicate.h"
+#include "storage/itemptr.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
 
@@ -62,9 +63,17 @@ IndexOnlyNext(IndexOnlyScanState *node)
 	EState	   *estate;
 	ExprContext *econtext;
 	ScanDirection direction;
+	ScanDirection readDirection;
 	IndexScanDesc scandesc;
 	TupleTableSlot *slot;
-	ItemPointer tid;
+	ItemPointer tid = NULL;
+	IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) node->ss.ps.plan;
+
+	/*
+	 * Tells if the current position was reached via skipping. In this case
+	 * there is no nead for the index_getnext_tid
+	 */
+	bool skipped = false;
 
 	/*
 	 * extract necessary information from index scan node
@@ -72,7 +81,7 @@ IndexOnlyNext(IndexOnlyScanState *node)
 	estate = node->ss.ps.state;
 	direction = estate->es_direction;
 	/* flip direction if this is an overall backward scan */
-	if (ScanDirectionIsBackward(((IndexOnlyScan *) node->ss.ps.plan)->indexorderdir))
+	if (ScanDirectionIsBackward(indexonlyscan->indexorderdir))
 	{
 		if (ScanDirectionIsForward(direction))
 			direction = BackwardScanDirection;
@@ -115,15 +124,43 @@ IndexOnlyNext(IndexOnlyScanState *node)
 						 node->ioss_NumOrderByKeys);
 	}
 
+	/*
+	 * Check if we need to skip to the next key prefix.
+	 */
+	if (node->ioss_SkipPrefixSize > 0 && node->ioss_FirstTupleEmitted)
+	{
+		if (!index_skip(scandesc, direction, node->ioss_SkipPrefixSize))
+		{
+			/*
+			 * Reached end of index. At this point currPos is invalidated, and
+			 * we need to reset ioss_FirstTupleEmitted, since otherwise after
+			 * going backwards, reaching the end of index, and going forward
+			 * again we apply skip again. It would be incorrect and lead to an
+			 * extra skipped item.
+			 */
+			node->ioss_FirstTupleEmitted = false;
+			return ExecClearTuple(slot);
+		}
+		else
+		{
+			skipped = true;
+			tid = &scandesc->xs_heaptid;
+		}
+	}
+
+	readDirection = skipped ? indexonlyscan->indexorderdir : direction;
+
 	/*
 	 * OK, now that we have what we need, fetch the next tuple.
 	 */
-	while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
+	while (skipped || (tid = index_getnext_tid(scandesc, readDirection)) != NULL)
 	{
 		bool		tuple_from_heap = false;
 
 		CHECK_FOR_INTERRUPTS();
 
+		skipped = false;
+
 		/*
 		 * We can skip the heap fetch if the TID references a heap page on
 		 * which all tuples are known visible to everybody.  In any case,
@@ -250,6 +287,8 @@ IndexOnlyNext(IndexOnlyScanState *node)
 							  ItemPointerGetBlockNumber(tid),
 							  estate->es_snapshot);
 
+		node->ioss_FirstTupleEmitted = true;
+
 		return slot;
 	}
 
@@ -504,6 +543,8 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
 	indexstate->ss.ps.plan = (Plan *) node;
 	indexstate->ss.ps.state = estate;
 	indexstate->ss.ps.ExecProcNode = ExecIndexOnlyScan;
+	indexstate->ioss_SkipPrefixSize = node->indexskipprefixsize;
+	indexstate->ioss_FirstTupleEmitted = false;
 
 	/*
 	 * Miscellaneous initialization
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index f5a7760740..320d648540 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -517,6 +517,7 @@ _copyIndexOnlyScan(const IndexOnlyScan *from)
 	COPY_NODE_FIELD(indexorderby);
 	COPY_NODE_FIELD(indextlist);
 	COPY_SCALAR_FIELD(indexorderdir);
+	COPY_SCALAR_FIELD(indexskipprefixsize);
 
 	return newnode;
 }
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index c723f6d635..a6daf3847a 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -575,6 +575,7 @@ _outIndexOnlyScan(StringInfo str, const IndexOnlyScan *node)
 	WRITE_NODE_FIELD(indexorderby);
 	WRITE_NODE_FIELD(indextlist);
 	WRITE_ENUM_FIELD(indexorderdir, ScanDirection);
+	WRITE_INT_FIELD(indexskipprefixsize);
 }
 
 static void
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index 3746668f52..ab5f237305 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -1887,6 +1887,7 @@ _readIndexOnlyScan(void)
 	READ_NODE_FIELD(indexorderby);
 	READ_NODE_FIELD(indextlist);
 	READ_ENUM_FIELD(indexorderdir, ScanDirection);
+	READ_INT_FIELD(indexskipprefixsize);
 
 	READ_DONE();
 }
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 8577c7b138..d449876855 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -133,6 +133,7 @@ int			max_parallel_workers_per_gather = 2;
 bool		enable_seqscan = true;
 bool		enable_indexscan = true;
 bool		enable_indexonlyscan = true;
+bool		enable_indexskipscan = true;
 bool		enable_bitmapscan = true;
 bool		enable_tidscan = true;
 bool		enable_sort = true;
diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c
index 0e4e00eaf0..2622f6e389 100644
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -784,6 +784,16 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel,
 	{
 		IndexPath  *ipath = (IndexPath *) lfirst(lc);
 
+		/*
+		 * To prevent unique paths from index skip scans being potentially used
+		 * when not needed scan keep them in a separate pathlist.
+		*/
+		if (ipath->indexskipprefix != 0)
+		{
+			add_unique_path(rel, (Path *) ipath);
+			continue;
+		}
+
 		if (index->amhasgettuple)
 			add_path(rel, (Path *) ipath);
 
@@ -866,12 +876,15 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
 	double		loop_count;
 	List	   *orderbyclauses;
 	List	   *orderbyclausecols;
-	List	   *index_pathkeys;
+	List	   *index_pathkeys = NIL;
 	List	   *useful_pathkeys;
+	List	   *index_pathkeys_pos = NIL;
 	bool		found_lower_saop_clause;
 	bool		pathkeys_possibly_useful;
 	bool		index_is_ordered;
 	bool		index_only_scan;
+	bool		not_empty_qual = false;
+	bool		can_skip;
 	int			indexcol;
 
 	/*
@@ -989,7 +1002,8 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
 	if (index_is_ordered && pathkeys_possibly_useful)
 	{
 		index_pathkeys = build_index_pathkeys(root, index,
-											  ForwardScanDirection);
+											  ForwardScanDirection,
+											  &index_pathkeys_pos);
 		useful_pathkeys = truncate_useless_pathkeys(root, rel,
 													index_pathkeys);
 		orderbyclauses = NIL;
@@ -1021,6 +1035,72 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
 	index_only_scan = (scantype != ST_BITMAPSCAN &&
 					   check_index_only(rel, index));
 
+	/* Check if an index skip scan is possible. */
+	can_skip = enable_indexskipscan & index->amcanskip & index_only_scan;
+
+	if (can_skip)
+	{
+		/*
+		 * Skip scan is not supported when there are qual conditions, which are
+		 * not covered by index. The reason for that is that those conditions
+		 * are evaluated later, already after skipping was applied.
+		 *
+		 * TODO: This implementation is too restrictive, and doesn't allow e.g.
+		 * index expressions. For that we need to examine index_clauses too.
+		 */
+		if (root->parse->jointree != NULL)
+		{
+			ListCell *lc;
+
+			foreach(lc, (List *) root->parse->jointree->quals)
+			{
+				Node *expr, *qual = (Node *) lfirst(lc);
+				OpExpr *expr_op;
+				Var *var;
+				bool found = false;
+
+				if (!is_opclause(qual))
+				{
+					not_empty_qual = true;
+					break;
+				}
+
+				expr = get_leftop(qual);
+				expr_op = (OpExpr *) qual;
+
+				if (!IsA(expr, Var))
+				{
+					not_empty_qual = true;
+					break;
+				}
+
+				var = (Var *) expr;
+
+				/*
+				 * Check if the qual operator is indexable by any columns of
+				 * the index, test collation and opfamily.
+				 */
+				for (int i = 0; i < index->ncolumns; i++)
+				{
+					if (index->indexkeys[i] == var->varattno &&
+						IndexCollMatchesExprColl(index->indexcollations[i],
+												 expr_op->inputcollid) &&
+						op_in_opfamily(expr_op->opno, index->opfamily[i]))
+					{
+						found = true;
+						break;
+					}
+				}
+
+				if (!found)
+				{
+					not_empty_qual = true;
+					break;
+				}
+			}
+		}
+	}
+
 	/*
 	 * 4. Generate an indexscan path if there are relevant restriction clauses
 	 * in the current clauses, OR the index ordering is potentially useful for
@@ -1044,6 +1124,33 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
 								  false);
 		result = lappend(result, ipath);
 
+		/* Consider index skip scan as well */
+		if (root->query_uniquekeys != NULL && can_skip && !not_empty_qual)
+		{
+			int numusefulkeys = list_length(useful_pathkeys);
+			int numsortkeys = list_length(root->query_pathkeys);
+
+			if (numusefulkeys == numsortkeys)
+			{
+				int prefix;
+				if (list_length(root->distinct_pathkeys) > 0)
+					prefix = find_index_prefix_for_pathkey(index_pathkeys,
+														   index_pathkeys_pos,
+														   llast_node(PathKey,
+														   root->distinct_pathkeys));
+				else
+					/*
+					 * All distinct keys are constant and optimized away.
+					 * Skipping with 1 is sufficient.
+					 */
+					prefix = 1;
+
+				result = lappend(result,
+								 create_skipscan_unique_path(root, index,
+															 (Path *) ipath, prefix));
+			}
+		}
+
 		/*
 		 * If appropriate, consider parallel index scan.  We don't allow
 		 * parallel index scan for bitmap index scans.
@@ -1082,7 +1189,8 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
 	if (index_is_ordered && pathkeys_possibly_useful)
 	{
 		index_pathkeys = build_index_pathkeys(root, index,
-											  BackwardScanDirection);
+											  BackwardScanDirection,
+											  &index_pathkeys_pos);
 		useful_pathkeys = truncate_useless_pathkeys(root, rel,
 													index_pathkeys);
 		if (useful_pathkeys != NIL)
@@ -1099,6 +1207,32 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
 									  false);
 			result = lappend(result, ipath);
 
+			/* Consider index skip scan as well */
+			if (root->query_uniquekeys != NULL && can_skip && !not_empty_qual)
+			{
+				int numusefulkeys = list_length(useful_pathkeys);
+				int numsortkeys = list_length(root->query_pathkeys);
+
+				if (numusefulkeys == numsortkeys)
+				{
+					int prefix;
+					if (list_length(root->distinct_pathkeys) > 0)
+						prefix = find_index_prefix_for_pathkey(index_pathkeys,
+															   index_pathkeys_pos,
+															   llast_node(PathKey,
+															   root->distinct_pathkeys));
+					else
+						/* all are distinct keys are constant and optimized away.
+						 * skipping with 1 is sufficient as all are constant anyway
+						 */
+						prefix = 1;
+
+					result = lappend(result,
+									 create_skipscan_unique_path(root, index,
+																 (Path *) ipath, prefix));
+				}
+			}
+
 			/* If appropriate, consider parallel index scan */
 			if (index->amcanparallel &&
 				rel->consider_parallel && outer_relids == NULL &&
diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c
index f28547148d..154cb30745 100644
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -523,6 +523,47 @@ get_cheapest_parallel_safe_total_inner(List *paths)
  *		NEW PATHKEY FORMATION
  ****************************************************************************/
 
+/*
+ * Find the prefix size for a specific path key in an index. For example, an
+ * index with (a,b,c) finding path key b will return prefix 2. Optionally
+ * pathkeys_positions can be provided, to specify at which position in the
+ * original pathkey list this particular key could be found (this is helpful
+ * when we deal with redundant pathkeys).
+ *
+ * Returns 0 when not found.
+ */
+int
+find_index_prefix_for_pathkey(List *index_pathkeys,
+							  List *pathkeys_positions,
+							  PathKey *target_pathkey)
+{
+	ListCell   *lc;
+	int			i;
+
+	i = 0;
+	foreach(lc, index_pathkeys)
+	{
+		PathKey    *cpathkey = (PathKey *) lfirst(lc);
+
+		if (cpathkey == target_pathkey)
+		{
+			/*
+			 * Prefix expected to start from 1, increment positions since
+			 * they're 0 based.
+			 */
+			if (pathkeys_positions != NIL &&
+				pathkeys_positions->length > i)
+				return list_nth_int(pathkeys_positions, i) + 1;
+			else
+				return i + 1;
+		}
+
+		i++;
+	}
+
+	return 0;
+}
+
 /*
  * build_index_pathkeys
  *	  Build a pathkeys list that describes the ordering induced by an index
@@ -535,7 +576,9 @@ get_cheapest_parallel_safe_total_inner(List *paths)
  * We iterate only key columns of covering indexes, since non-key columns
  * don't influence index ordering.  The result is canonical, meaning that
  * redundant pathkeys are removed; it may therefore have fewer entries than
- * there are key columns in the index.
+ * there are key columns in the index. Since by removing redundant pathkeys the
+ * information about original position is lost, return it via positions
+ * argument.
  *
  * Another reason for stopping early is that we may be able to tell that
  * an index column's sort order is uninteresting for this query.  However,
@@ -546,7 +589,8 @@ get_cheapest_parallel_safe_total_inner(List *paths)
 List *
 build_index_pathkeys(PlannerInfo *root,
 					 IndexOptInfo *index,
-					 ScanDirection scandir)
+					 ScanDirection scandir,
+					 List **positions)
 {
 	List	   *retval = NIL;
 	ListCell   *lc;
@@ -555,6 +599,8 @@ build_index_pathkeys(PlannerInfo *root,
 	if (index->sortopfamily == NULL)
 		return NIL;				/* non-orderable index */
 
+	*positions = NIL;
+
 	i = 0;
 	foreach(lc, index->indextlist)
 	{
@@ -608,7 +654,11 @@ build_index_pathkeys(PlannerInfo *root,
 			 * for this query.  Add it to list, unless it's redundant.
 			 */
 			if (!pathkey_is_redundant(cpathkey, retval))
+			{
 				retval = lappend(retval, cpathkey);
+				*positions = lappend_int(*positions,
+										 foreach_current_index(lc));
+			}
 		}
 		else
 		{
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index a9aff24831..513466f7b6 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -190,7 +190,8 @@ static IndexOnlyScan *make_indexonlyscan(List *qptlist, List *qpqual,
 										 Index scanrelid, Oid indexid,
 										 List *indexqual, List *indexorderby,
 										 List *indextlist,
-										 ScanDirection indexscandir);
+										 ScanDirection indexscandir,
+										 int skipprefix);
 static BitmapIndexScan *make_bitmap_indexscan(Index scanrelid, Oid indexid,
 											  List *indexqual,
 											  List *indexqualorig);
@@ -3067,7 +3068,8 @@ create_indexscan_plan(PlannerInfo *root,
 												fixed_indexquals,
 												fixed_indexorderbys,
 												best_path->indexinfo->indextlist,
-												best_path->indexscandir);
+												best_path->indexscandir,
+												best_path->indexskipprefix);
 	else
 		scan_plan = (Scan *) make_indexscan(tlist,
 											qpqual,
@@ -5439,7 +5441,8 @@ make_indexonlyscan(List *qptlist,
 				   List *indexqual,
 				   List *indexorderby,
 				   List *indextlist,
-				   ScanDirection indexscandir)
+				   ScanDirection indexscandir,
+				   int skipPrefixSize)
 {
 	IndexOnlyScan *node = makeNode(IndexOnlyScan);
 	Plan	   *plan = &node->scan.plan;
@@ -5454,6 +5457,7 @@ make_indexonlyscan(List *qptlist,
 	node->indexorderby = indexorderby;
 	node->indextlist = indextlist;
 	node->indexorderdir = indexscandir;
+	node->indexskipprefixsize = skipPrefixSize;
 
 	return node;
 }
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index e190b20675..0b24f670b4 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -3080,6 +3080,43 @@ create_upper_unique_path(PlannerInfo *root,
 	return pathnode;
 }
 
+/*
+ * create_skipscan_unique_path
+ *	  Creates a pathnode the same as an existing IndexPath except based on
+ *	  skipping duplicate values.  This may or may not be cheaper than using
+ *	  create_upper_unique_path.
+ *
+ * The input path must be an IndexPath for an index that supports amskip.
+ */
+IndexPath *
+create_skipscan_unique_path(PlannerInfo *root, IndexOptInfo *index,
+							Path *basepath, int prefix)
+{
+	IndexPath 	*pathnode = makeNode(IndexPath);
+	int 		numDistinctRows;
+	List 		*uniqExprs;
+
+	Assert(IsA(basepath, IndexPath));
+
+	/* We don't want to modify basepath, so make a copy. */
+	memcpy(pathnode, basepath, sizeof(IndexPath));
+
+	uniqExprs = linitial_node(List, root->query_uniquekeys);
+
+	Assert(prefix > 0);
+	pathnode->indexskipprefix = prefix;
+	pathnode->path.uniquekeys = root->query_uniquekeys;
+
+	numDistinctRows = estimate_num_groups(root, uniqExprs,
+										  pathnode->path.rows,
+										  NULL, NULL);
+
+	pathnode->path.total_cost = pathnode->path.startup_cost * numDistinctRows;
+	pathnode->path.rows = numDistinctRows;
+
+	return pathnode;
+}
+
 /*
  * create_agg_path
  *	  Creates a pathnode that represents performing aggregation/grouping
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index c5194fdbbf..df20b88316 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -271,6 +271,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 			info->amoptionalkey = amroutine->amoptionalkey;
 			info->amsearcharray = amroutine->amsearcharray;
 			info->amsearchnulls = amroutine->amsearchnulls;
+			info->amcanskip = (amroutine->amskip != NULL);
 			info->amcanparallel = amroutine->amcanparallel;
 			info->amhasgettuple = (amroutine->amgettuple != NULL);
 			info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index f2c7c2486b..f0671e406c 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -985,6 +985,15 @@ static struct config_bool ConfigureNamesBool[] =
 		true,
 		NULL, NULL, NULL
 	},
+	{
+		{"enable_indexskipscan", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("Enables the planner's use of index-skip-scan plans."),
+			NULL
+		},
+		&enable_indexskipscan,
+		true,
+		NULL, NULL, NULL
+	},
 	{
 		{"enable_bitmapscan", PGC_USERSET, QUERY_TUNING_METHOD,
 			gettext_noop("Enables the planner's use of bitmap-scan plans."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 0f7f49b949..0be508ef04 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -365,6 +365,7 @@
 #enable_hashjoin = on
 #enable_indexscan = on
 #enable_indexonlyscan = on
+#enable_indexskipscan = on
 #enable_material = on
 #enable_mergejoin = on
 #enable_nestloop = on
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h
index d357ebb559..b1e82f4fb5 100644
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -173,6 +173,11 @@ typedef void (*amrescan_function) (IndexScanDesc scan,
 typedef bool (*amgettuple_function) (IndexScanDesc scan,
 									 ScanDirection direction);
 
+/* skip past duplicates in a given prefix */
+typedef bool (*amskip_function) (IndexScanDesc scan,
+								 ScanDirection dir,
+								 int prefix);
+
 /* fetch all valid tuples */
 typedef int64 (*amgetbitmap_function) (IndexScanDesc scan,
 									   TIDBitmap *tbm);
@@ -275,6 +280,7 @@ typedef struct IndexAmRoutine
 	amendscan_function amendscan;
 	ammarkpos_function ammarkpos;	/* can be NULL */
 	amrestrpos_function amrestrpos; /* can be NULL */
+	amskip_function amskip;				/* can be NULL */
 
 	/* interface functions to support parallel index scans */
 	amestimateparallelscan_function amestimateparallelscan; /* can be NULL */
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index 480a4762f5..e757f01b0d 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -183,6 +183,7 @@ extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info,
 extern IndexBulkDeleteResult *index_vacuum_cleanup(IndexVacuumInfo *info,
 												   IndexBulkDeleteResult *istat);
 extern bool index_can_return(Relation indexRelation, int attno);
+extern bool index_skip(IndexScanDesc scan, ScanDirection direction, int prefix);
 extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum,
 									uint16 procnum);
 extern FmgrInfo *index_getprocinfo(Relation irel, AttrNumber attnum,
diff --git a/src/include/access/sdir.h b/src/include/access/sdir.h
index 8154adf3b8..6690c2a0b0 100644
--- a/src/include/access/sdir.h
+++ b/src/include/access/sdir.h
@@ -55,4 +55,11 @@ typedef enum ScanDirection
 #define ScanDirectionIsForward(direction) \
 	((bool) ((direction) == ForwardScanDirection))
 
+/*
+ * ScanDirectionsAreOpposite
+ *		True iff scan directions are backward/forward or forward/backward.
+ */
+#define ScanDirectionsAreOpposite(dirA, dirB) \
+	((bool) (dirA != NoMovementScanDirection && dirA == -dirB))
+
 #endif							/* SDIR_H */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index e7ae21c023..c480794f59 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1517,6 +1517,8 @@ typedef struct IndexOnlyScanState
 	struct IndexScanDescData *ioss_ScanDesc;
 	TupleTableSlot *ioss_TableSlot;
 	Buffer		ioss_VMBuffer;
+	int         ioss_SkipPrefixSize;
+	bool		ioss_FirstTupleEmitted;
 	Size		ioss_PscanLen;
 } IndexOnlyScanState;
 
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 6c04ba2bb2..d53dda7ef7 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -1234,6 +1234,9 @@ typedef struct Path
  * we need not recompute them when considering using the same index in a
  * bitmap index/heap scan (see BitmapHeapPath).  The costs of the IndexPath
  * itself represent the costs of an IndexScan or IndexOnlyScan plan type.
+ *
+ * 'indexskipprefix' represents the number of columns to consider for skip
+ * scans.
  *----------
  */
 typedef struct IndexPath
@@ -1246,6 +1249,7 @@ typedef struct IndexPath
 	ScanDirection indexscandir;
 	Cost		indextotalcost;
 	Selectivity indexselectivity;
+	int			indexskipprefix;
 } IndexPath;
 
 /*
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index d671328dfd..530e9ab875 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -434,6 +434,8 @@ typedef struct IndexOnlyScan
 	List	   *indexorderby;	/* list of index ORDER BY exprs */
 	List	   *indextlist;		/* TargetEntry list describing index's cols */
 	ScanDirection indexorderdir;	/* forward or backward or don't care */
+	int			indexskipprefixsize;	/* the size of the prefix for distinct
+										 * scans */
 } IndexOnlyScan;
 
 /* ----------------
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index 0fe60d82e4..baa8655c55 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -50,6 +50,7 @@ extern PGDLLIMPORT int max_parallel_workers_per_gather;
 extern PGDLLIMPORT bool enable_seqscan;
 extern PGDLLIMPORT bool enable_indexscan;
 extern PGDLLIMPORT bool enable_indexonlyscan;
+extern PGDLLIMPORT bool enable_indexskipscan;
 extern PGDLLIMPORT bool enable_bitmapscan;
 extern PGDLLIMPORT bool enable_tidscan;
 extern PGDLLIMPORT bool enable_sort;
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h
index b976f2963e..e9e56873a6 100644
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -217,6 +217,10 @@ extern UpperUniquePath *create_upper_unique_path(PlannerInfo *root,
 												 Path *subpath,
 												 int numCols,
 												 double numGroups);
+extern IndexPath *create_skipscan_unique_path(PlannerInfo *root,
+											  IndexOptInfo *index,
+											  Path *subpath,
+											  int prefix);
 extern AggPath *create_agg_path(PlannerInfo *root,
 								RelOptInfo *rel,
 								Path *subpath,
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h
index b613392e2f..d4288f8451 100644
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -212,8 +212,11 @@ extern Path *get_cheapest_fractional_path_for_pathkeys(List *paths,
 													   Relids required_outer,
 													   double fraction);
 extern Path *get_cheapest_parallel_safe_total_inner(List *paths);
+extern int find_index_prefix_for_pathkey(List *index_pathkeys,
+										 List *pathkey_positions,
+										 PathKey *target_pathkey);
 extern List *build_index_pathkeys(PlannerInfo *root, IndexOptInfo *index,
-								  ScanDirection scandir);
+								  ScanDirection scandir, List **positions);
 extern List *build_partition_pathkeys(PlannerInfo *root, RelOptInfo *partrel,
 									  ScanDirection scandir, bool *partialkeys);
 extern List *build_expression_pathkey(PlannerInfo *root, Expr *expr,
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index 0bb558d93c..fa54f00ae7 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -103,6 +103,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_incremental_sort        | on
  enable_indexonlyscan           | on
  enable_indexscan               | on
+ enable_indexskipscan           | on
  enable_material                | on
  enable_mergejoin               | on
  enable_nestloop                | on
@@ -115,7 +116,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_seqscan                 | on
  enable_sort                    | on
  enable_tidscan                 | on
-(20 rows)
+(21 rows)
 
 -- Test that the pg_timezone_names and pg_timezone_abbrevs views are
 -- more-or-less working.  We can't test their contents in any great detail
-- 
2.26.3

