*** pgsql/src/include/executor/hashjoin.h 2009/01/01 17:23:59 1.49 --- pgsql/src/include/executor/hashjoin.h 2009/03/21 00:04:40 1.50 *************** *** 7,13 **** * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * ! * $PostgreSQL: pgsql/src/include/executor/hashjoin.h,v 1.48 2008/01/01 19:45:57 momjian Exp $ * *------------------------------------------------------------------------- */ --- 7,13 ---- * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * ! * $PostgreSQL: pgsql/src/include/executor/hashjoin.h,v 1.49 2009/01/01 17:23:59 momjian Exp $ * *------------------------------------------------------------------------- */ *************** typedef struct HashJoinTupleData *** 72,77 **** --- 72,107 ---- #define HJTUPLE_MINTUPLE(hjtup) \ ((MinimalTuple) ((char *) (hjtup) + HJTUPLE_OVERHEAD)) + /* + * If the outer relation's distribution is sufficiently nonuniform, we attempt + * to optimize the join by treating the hash values corresponding to the outer + * relation's MCVs specially. Inner relation tuples matching these hash + * values go into the "skew" hashtable instead of the main hashtable, and + * outer relation tuples with these hash values are matched against that + * table instead of the main one. Thus, tuples with these hash values are + * effectively handled as part of the first batch and will never go to disk. + * The skew hashtable is limited to SKEW_WORK_MEM_PERCENT of the total memory + * allowed for the join; while building the hashtables, we decrease the number + * of MCVs being specially treated if needed to stay under this limit. + * + * Note: you might wonder why we look at the outer relation stats for this, + * rather than the inner. One reason is that the outer relation is typically + * bigger, so we get more I/O savings by optimizing for its most common values. + * Also, for similarly-sized relations, the planner prefers to put the more + * uniformly distributed relation on the inside, so we're more likely to find + * interesting skew in the outer relation. + */ + typedef struct HashSkewBucket + { + uint32 hashvalue; /* common hash value */ + HashJoinTuple tuples; /* linked list of inner-relation tuples */ + } HashSkewBucket; + + #define SKEW_BUCKET_OVERHEAD MAXALIGN(sizeof(HashSkewBucket)) + #define INVALID_SKEW_BUCKET_NO (-1) + #define SKEW_WORK_MEM_PERCENT 2 + #define SKEW_MIN_OUTER_FRACTION 0.01 + typedef struct HashJoinTableData { *************** typedef struct HashJoinTableData *** 82,87 **** --- 112,123 ---- struct HashJoinTupleData **buckets; /* buckets array is per-batch storage, as are all the tuples */ + bool skewEnabled; /* are we using skew optimization? */ + HashSkewBucket **skewBucket; /* hashtable of skew buckets */ + int skewBucketLen; /* size of skewBucket array (a power of 2!) */ + int nSkewBuckets; /* number of active skew buckets */ + int *skewBucketNums; /* array indexes of active skew buckets */ + int nbatch; /* number of batches */ int curbatch; /* current batch #; 0 during 1st pass */ *************** typedef struct HashJoinTableData *** 113,118 **** --- 149,156 ---- Size spaceUsed; /* memory space currently used by tuples */ Size spaceAllowed; /* upper limit for space used */ + Size spaceUsedSkew; /* skew hash table's current space usage */ + Size spaceAllowedSkew; /* upper limit for skew hashtable */ MemoryContext hashCxt; /* context for whole-hash-join storage */ MemoryContext batchCxt; /* context for this-batch-only storage */