Skip to content
This repository was archived by the owner on May 24, 2024. It is now read-only.

Commit cc5b942

Browse files
authored
get correlation from segments instead of calculating it on QD (#15357)
We cannot use the same method as PostgreSQL to calculate the correlation on the QD. Collecting the data from the segments on the QD changes the physical order of the data. For example, suppose segment 1 holds 1,3,5,7,9 and segment 2 holds 2,4,6,8,10. Within each segment the data is ordered, so the correlation is 1 on each segment. But after the data has been collected on the QD it may arrive as 1,3,5,2,4,7,9,6,8,10, so the correlation computed there is 0.3 or some other value, and it is not stable. This inflates the estimated cost of an index scan, which should not happen. So we get the correlations from the segments and then calculate the correlation for the QD from them. We use a weighted mean to calculate the correlation on the QD. However, in some situations we may not be able to obtain the reltuples of a table, such as for a non-leaf partition of a partitioned table or the parent table of an inherited table, so for these tables we can only use the plain (unweighted) mean to calculate the correlation.
1 parent 8059e30 commit cc5b942

File tree

11 files changed

+871
-63
lines changed

11 files changed

+871
-63
lines changed

src/backend/commands/analyze.c

Lines changed: 409 additions & 28 deletions
Large diffs are not rendered by default.

src/backend/commands/analyzefuncs.c

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "foreign/fdwapi.h"
2020
#include "miscadmin.h"
2121
#include "funcapi.h"
22+
#include "utils/syscache.h"
2223

2324
/**
2425
* Statistics related parameters.
@@ -350,3 +351,159 @@ gp_acquire_sample_rows_col_type(Oid typid)
350351
}
351352
return typid;
352353
}
354+
355+
/*
356+
* gp_acquire_correlations - Acquire each column's correlation for a table.
357+
* This is an internal function called in gp_acquire_correlations_dispatcher.
358+
* this function will return a result set, a row for each alive column.
359+
* each row contains 3 columns: attnum, the correlation for it and totalrows.
360+
* if correlation is null, set totalrows to 0 for it.
361+
*
362+
* So overall, this returns a result set like this:
363+
* create table t(tc1 int, tc2 int, tc3 int);
364+
* insert values.
365+
* alter table t drop column tc2;
366+
*
367+
* attnum | correlation| totalrows
368+
* ----------+------------|+------------
369+
* 0 | 0.8 | 200
370+
* 2 | | 0
371+
*/
372+
Datum
373+
gp_acquire_correlations(PG_FUNCTION_ARGS)
374+
{
375+
FuncCallContext *funcctx = NULL;
376+
gp_acquire_correlation_context *ctx;
377+
MemoryContext oldcontext;
378+
Oid relOid = PG_GETARG_OID(0);
379+
bool inherited = PG_GETARG_BOOL(1);
380+
TupleDesc relDesc;
381+
TupleDesc outDesc;
382+
383+
if (SRF_IS_FIRSTCALL())
384+
{
385+
Relation onerel;
386+
funcctx = SRF_FIRSTCALL_INIT();
387+
388+
/*
389+
* switch to memory context appropriate for multiple function
390+
* calls
391+
*/
392+
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
393+
394+
/* Construct the context to keep across calls. */
395+
ctx = (gp_acquire_correlation_context *) palloc0(sizeof(gp_acquire_correlation_context));
396+
397+
if (!pg_class_ownercheck(relOid, GetUserId()))
398+
aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_TABLE,
399+
get_rel_name(relOid));
400+
401+
onerel = table_open(relOid, AccessShareLock);
402+
relDesc = RelationGetDescr(onerel);
403+
404+
outDesc = CreateTemplateTupleDesc(3);
405+
TupleDescInitEntry(outDesc,
406+
1,
407+
"attnum",
408+
INT4OID,
409+
-1,
410+
0);
411+
TupleDescInitEntry(outDesc,
412+
2,
413+
"correlation",
414+
FLOAT4OID,
415+
-1,
416+
0);
417+
TupleDescInitEntry(outDesc,
418+
3,
419+
"totalrows",
420+
INT4OID,
421+
-1,
422+
0);
423+
424+
BlessTupleDesc(outDesc);
425+
funcctx->tuple_desc = outDesc;
426+
427+
ctx->onerel = onerel;
428+
funcctx->user_fctx = ctx;
429+
ctx->outDesc = outDesc;
430+
431+
ctx->index = 0;
432+
ctx->totalAttr = relDesc->natts;
433+
MemoryContextSwitchTo(oldcontext);
434+
}
435+
436+
/* stuff done on every call of the function */
437+
funcctx = SRF_PERCALL_SETUP();
438+
439+
ctx = funcctx->user_fctx;
440+
relDesc = RelationGetDescr(ctx->onerel);
441+
outDesc = ctx->outDesc;
442+
443+
Datum *outvalues = (Datum *) palloc(outDesc->natts * sizeof(Datum));
444+
bool *outnulls = (bool *) palloc(outDesc->natts * sizeof(bool));
445+
HeapTuple res;
446+
int attno = ctx->index;
447+
448+
/* Return all alive attribute correlation */
449+
for (; attno < ctx->totalAttr; attno++)
450+
{
451+
/* get the correlation of the column */
452+
int totalrows = 0;
453+
HeapTuple statsTuple;
454+
Form_pg_attribute relatt = TupleDescAttr(relDesc, attno);
455+
if (relatt->attisdropped)
456+
continue;
457+
statsTuple = SearchSysCache3(STATRELATTINH,
458+
ObjectIdGetDatum(relOid),
459+
Int16GetDatum(attno + 1),
460+
BoolGetDatum(inherited));
461+
outvalues[0] = Int32GetDatum(attno);
462+
outnulls[0] = false;
463+
464+
if (HeapTupleIsValid(statsTuple))
465+
{
466+
AttStatsSlot sslot;
467+
468+
if (get_attstatsslot(&sslot, statsTuple,
469+
STATISTIC_KIND_CORRELATION, InvalidOid,
470+
ATTSTATSSLOT_NUMBERS))
471+
{
472+
float4 varCorrelation;
473+
Assert(sslot.nnumbers == 1);
474+
varCorrelation = sslot.numbers[0];
475+
476+
free_attstatsslot(&sslot);
477+
478+
outvalues[1] = Float4GetDatum(varCorrelation);
479+
outnulls[1] = false;
480+
totalrows = ctx->onerel->rd_rel->reltuples;
481+
}
482+
else
483+
{
484+
outvalues[1] = (Datum) 0;
485+
outnulls[1] = true;
486+
}
487+
ReleaseSysCache(statsTuple);
488+
}
489+
else
490+
{
491+
outvalues[1] = (Datum) 0;
492+
outnulls[1] = true;
493+
}
494+
495+
outvalues[2] = Int32GetDatum(totalrows);
496+
outnulls[2] = false;
497+
498+
res = heap_form_tuple(outDesc, outvalues, outnulls);
499+
ctx->index = attno + 1;
500+
501+
SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(res));
502+
}
503+
504+
table_close(ctx->onerel, AccessShareLock);
505+
pfree(ctx);
506+
funcctx->user_fctx = NULL;
507+
508+
SRF_RETURN_DONE(funcctx);
509+
}

src/include/catalog/pg_proc.dat

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11137,6 +11137,8 @@
1113711137
# Analyze related
1113811138
{ oid => 6038, descr => 'Collect a random sample of rows from table',
1113911139
proname => 'gp_acquire_sample_rows', prorows => '1000', proretset => 't', provolatile => 'v', proparallel => 'u', prorettype => 'record', proargtypes => 'oid int4 bool', prosrc => 'gp_acquire_sample_rows', proexeclocation => 's' },
11140+
{ oid => 6040, descr => 'Collect correlations from segments',
11141+
proname => 'gp_acquire_correlations', prorows => '10', proretset => 't', provolatile => 'v', proparallel => 'u', prorettype => 'record', proargtypes => 'oid bool', prosrc => 'gp_acquire_correlations', proexeclocation => 's' },
1114011142

1114111143
# Backoff related
1114211144
{ oid => 5040, descr => 'change weight of all the backends for a given session id',

src/include/commands/vacuum.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,9 @@ typedef struct VacAttrStats
142142
bool *exprnulls;
143143
int rowstride;
144144
bool merge_stats;
145+
bool corrnull; /* whether correlation value is null */
146+
bool partitiontbl_qd; /* analyze is on QD and the policy of table is partitioned */
147+
float4 corrval; /* correlation gathered from segments */
145148
} VacAttrStats;
146149

147150
typedef enum VacuumOption
@@ -274,6 +277,24 @@ typedef struct
274277
bool summary_sent;
275278
} gp_acquire_sample_rows_context;
276279

280+
/*
 * Cross-call state of the gp_acquire_correlations() set-returning function.
 * It is allocated on the first call and stored in funcctx->user_fctx.
 */
typedef struct
281+
{
282+
/* Table being analyzed; opened with AccessShareLock on the first call and closed after the last row. */
283+
Relation onerel;
284+
285+
/* Whether to acquire the inherited table's correlations. */
286+
bool inherited;
287+
288+
/*
289+
* Result tuple descriptor: (attnum, correlation, totalrows).
290+
*/
291+
TupleDesc outDesc;
292+
293+
/*
 * SRF state, to track which rows have already been returned: index is the
 * zero-based attribute index at which the next call resumes, totalAttr is
 * the number of attributes in the table's tuple descriptor.
 */
294+
int index;
295+
int totalAttr;
296+
} gp_acquire_correlation_context;
297+
277298
/* GUC parameters */
278299
extern PGDLLIMPORT int default_statistics_target; /* PGDLLIMPORT for PostGIS */
279300
extern int vacuum_freeze_min_age;
@@ -353,6 +374,7 @@ extern int acquire_inherited_sample_rows(Relation onerel, int elevel,
353374

354375
/* in commands/analyzefuncs.c */
355376
extern Datum gp_acquire_sample_rows(PG_FUNCTION_ARGS);
377+
extern Datum gp_acquire_correlations(PG_FUNCTION_ARGS);
356378
extern Oid gp_acquire_sample_rows_col_type(Oid typid);
357379

358380
extern bool gp_vacuum_needs_update_stats(void);

src/test/isolation2/expected/lockmodes.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2388,7 +2388,7 @@ DROP
23882388
schemaname | tablename | attname | inherited | null_frac | avg_width | n_distinct | most_common_vals | most_common_freqs | histogram_bounds | correlation | most_common_elems | most_common_elem_freqs | elem_count_histogram
23892389
------------+---------------+---------+-----------+-----------+-----------+------------+------------------+-------------------+--------------------------------------------------+-------------+-------------------+------------------------+----------------------
23902390
public | analyzedrop | a | t | 0 | 4 | -1 | | | {0,1,2,3,4,5,6,7,8,9,11,12,13,14,15,16,17,18,19} | | | |
2391-
public | analyzedrop_2 | a | f | 0 | 4 | -1 | | | {10,11,12,13,14,15,16,17,18,19} | -0.345455 | | |
2391+
public | analyzedrop_2 | a | f | 0 | 4 | -1 | | | {10,11,12,13,14,15,16,17,18,19} | 1 | | |
23922392
(2 rows)
23932393
-- Case 2. No failure should happen when there's concurrent drop on parent as well.
23942394
1:select gp_inject_fault_infinite('merge_leaf_stats_after_find_children', 'suspend', dbid) from gp_segment_configuration where content = -1 and role = 'p';

src/test/isolation2/expected/lockmodes_optimizer.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2389,7 +2389,7 @@ DROP
23892389
schemaname | tablename | attname | inherited | null_frac | avg_width | n_distinct | most_common_vals | most_common_freqs | histogram_bounds | correlation | most_common_elems | most_common_elem_freqs | elem_count_histogram
23902390
------------+---------------+---------+-----------+-----------+-----------+------------+------------------+-------------------+--------------------------------------------------+-------------+-------------------+------------------------+----------------------
23912391
public | analyzedrop | a | t | 0 | 4 | -1 | | | {0,1,2,3,4,5,6,7,8,9,11,12,13,14,15,16,17,18,19} | | | |
2392-
public | analyzedrop_2 | a | f | 0 | 4 | -1 | | | {10,11,12,13,14,15,16,17,18,19} | -0.345455 | | |
2392+
public | analyzedrop_2 | a | f | 0 | 4 | -1 | | | {10,11,12,13,14,15,16,17,18,19} | 1 | | |
23932393
(2 rows)
23942394
-- Case 2. No failure should happen when there's concurrent drop on parent as well.
23952395
1:select gp_inject_fault_infinite('merge_leaf_stats_after_find_children', 'suspend', dbid) from gp_segment_configuration where content = -1 and role = 'p';

0 commit comments

Comments
 (0)