diff -cr postgresql-8.2.3/src/backend/access/heap/heapam.c postgresql-8.2.3-ss/src/backend/access/heap/heapam.c *** postgresql-8.2.3/src/backend/access/heap/heapam.c Sun Feb 4 12:00:49 2007 --- postgresql-8.2.3-ss/src/backend/access/heap/heapam.c Tue Mar 20 16:12:12 2007 *************** *** 65,70 **** --- 65,275 ---- * ---------------------------------------------------------------- */ + static BlockNumber ss_init(HeapScanDesc); + static int ss_store_hint(HeapScanDesc,BlockNumber); + static int ss_hash(HeapScanDesc); + bool Trace_sync_seqscan = false; + double sync_seqscan_threshold = DEFAULT_SYNC_SCAN_THRESHOLD; + double sync_seqscan_offset = DEFAULT_SYNC_SCAN_OFFSET; + + /* + * ss_init: + * + * This function reads the Sync Scan Hint Table + * (creating it if it doesn't already exist) to + * find a possible location for an already running + * sequential scan on this relation. + * + * By starting a sequential scan near the location + * of an already running scan, we improve the chance + * of finding pages in cache. + * + * Also, depending on sync_seqscan_offset, this + * function will subtract from the hint before + * starting the scan, in order to pick up pages that + * are likely to already be in cache. + * + * This function assumes that scan->rs_nblocks is + * already properly set, and sets scan->rs_start_page + * to a value based on the hint found. Unless the size check fails, it also sets + * scan->rs_hint to point to the location of the hint + * in the hint table. The return value is always 0. + */ + static BlockNumber ss_init(HeapScanDesc scan) + { + ss_hint_t *hint_table; + int table_offset; + bool found; + int threshold = sync_seqscan_threshold * NBuffers; + int offset = sync_seqscan_offset * NBuffers; + + /* + * If the table is not large enough, or sync_seqscan_threshold + * is disabled (negative), don't Sync Scan. 
+ */ + if(threshold < 0 || scan->rs_nblocks < threshold) + { + scan->rs_start_page = 0; + return 0; + } + + table_offset = ss_hash(scan); + hint_table = (ss_hint_t*)ShmemInitStruct("Sync Scan Hint Table", + SYNC_SCAN_TABLE_SIZE*sizeof(ss_hint_t),&found); + + scan->rs_hint = &hint_table[table_offset]; + + /* + * If we just created the hint table for the first time, + * initialize the table to zero and start the scan at page 0. + */ + if(!found) { + if(Trace_sync_seqscan) + elog(DEBUG2,"SYNC_SCAN: Created Hint Table"); + memset(hint_table,0,sizeof(ss_hint_t)*SYNC_SCAN_TABLE_SIZE); + scan->rs_start_page = 0; + return 0; + } + + /* + * If the hint's relid is 0, that means + * we have not previously created a hint + * at this location in the table. + */ + if(scan->rs_hint->relid == 0) { + if(Trace_sync_seqscan) + elog(DEBUG2, "SYNC_SCAN: Hint empty"); + scan->rs_start_page = 0; + return 0; + } + + /* + * If the relid doesn't match the one in the hint, + * we have a hash collision. + */ + if(RelationGetRelid(scan->rs_rd) != scan->rs_hint->relid) + { + if(Trace_sync_seqscan) + elog(DEBUG1,"SYNC_SCAN: Hash collision"); + scan->rs_start_page = 0; + return 0; + } + + /* + * If the hint is not a valid block number + * for this relation, start at 0. + * + * This can happen if, for instance, someone + * TRUNCATEd the table between when the hint + * was set and now. NOTE(review): the location < 0 test looks dead if BlockNumber is unsigned -- confirm. + */ + if(scan->rs_hint->location < 0 || + scan->rs_hint->location >= scan->rs_nblocks) + { + if(Trace_sync_seqscan) + elog(DEBUG2,"SYNC_SCAN: Hint %d out of range." \ + " Relation has %d pages.", + scan->rs_hint->location,scan->rs_nblocks); + scan->rs_start_page = 0; + return 0; + } + + scan->rs_start_page = scan->rs_hint->location; + + /* + * By starting at an offset earlier than the hint, + * it's likely that all of the blocks will already be + * cached, and the scan will quickly catch up to the head. + * + * offset is a non-negative value that will be + * subtracted from the hint. 
+ */ + if(offset > scan->rs_nblocks) + { + if(Trace_sync_seqscan) + elog(DEBUG2,"SYNC_SCAN: Relation smaller than start offset: %d", + offset); + return 0; + } + + /* + * If subtracting the offset would bring the value + * to less than 0, we circle backwards to the end of the + * file. + */ + if(offset > scan->rs_start_page) + scan->rs_start_page += scan->rs_nblocks; + + scan->rs_start_page -= offset; + + if(Trace_sync_seqscan) + elog(DEBUG2,"SYNC_SCAN: START: OID = %d; Location = %d; Size: %d", + RelationGetRelid(scan->rs_rd), + scan->rs_start_page,scan->rs_nblocks); + + return 0; + } + + /* + * ss_store_hint: + * + * Writes an entry in the Sync Scan Hint Table + * of the form (relid,blocknumber). This will + * overwrite any existing entry that may collide + * with this entry in the table. + * + * No locking is performed here. When this data is + * later read by ss_init(), sanity checking is + * performed to ensure we don't use an invalid + * relation block number. The return value is always 0. + */ + static int ss_store_hint(HeapScanDesc scan, BlockNumber location) + { + ss_hint_t hint; + int threshold = sync_seqscan_threshold * NBuffers; + int offset = sync_seqscan_offset * NBuffers; + + /* + * If the table is not large enough, or sync_seqscan_threshold + * is disabled (negative), don't Sync Scan. NOTE(review): rs_hint is only set in ss_init() when this same test passed there -- confirm the two cannot disagree mid-scan. + */ + if(threshold < 0 || scan->rs_nblocks < threshold) + return 0; + + /* + * If this scan has been progressing for less + * than offset pages, don't store the hint. 
+ */ + if(location >= scan->rs_start_page) + { + if((location - scan->rs_start_page) < offset) + return 0; + } + else + { + if((location + scan->rs_nblocks - scan->rs_start_page) + < offset) + return 0; + } + + hint.relid = RelationGetRelid(scan->rs_rd); + hint.location = location; + + *scan->rs_hint = hint; + + return 0; + } + + /* + * This is a simplistic function to hash + * the Oid of the relation for placement in + * the Sync Scan Hint Table (relid modulo SYNC_SCAN_TABLE_SIZE). + */ + static int ss_hash(HeapScanDesc scan) + { + return RelationGetRelid(scan->rs_rd) % SYNC_SCAN_TABLE_SIZE; + } + /* ---------------- * initscan - scan code common to heap_beginscan and heap_rescan * ---------------- *************** *** 81,86 **** --- 286,296 ---- */ scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd); + /* + * Choose a good place to start the relation scan. + */ + ss_init(scan); + scan->rs_inited = false; scan->rs_ctup.t_data = NULL; ItemPointerSetInvalid(&scan->rs_ctup.t_self); *************** *** 223,229 **** tuple->t_data = NULL; return; } ! page = 0; /* first page */ heapgetpage(scan, page); lineoff = FirstOffsetNumber; /* first offnum */ scan->rs_inited = true; --- 433,443 ---- tuple->t_data = NULL; return; } ! /* ! * start the scan at the location that we chose ! * in ss_init() ! */ ! page = scan->rs_start_page; heapgetpage(scan, page); lineoff = FirstOffsetNumber; /* first offnum */ scan->rs_inited = true; *************** *** 364,378 **** } /* ! * if we get here, it means we've exhausted the items on this page and * it's time to move to the next. */ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); /* ! * return NULL if we've exhausted all the pages */ ! if (backward ? (page == 0) : (page + 1 >= scan->rs_nblocks)) { if (BufferIsValid(scan->rs_cbuf)) ReleaseBuffer(scan->rs_cbuf); --- 578,615 ---- } /* ! * If we get here, it means we've exhausted the items on this page and * it's time to move to the next. 
+ * + * For the forward scan, we need to wrap around to the beginning + * of the relation file if we reach the end. */ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + if(backward) + page--; + else + page = (page + 1) % (scan->rs_nblocks); + + if(Trace_sync_seqscan) + { + if (!(page%50000)) + elog(DEBUG2,"page: %d",page); + else if (!(page%5000)) + elog(DEBUG3,"page: %d",page); + } + + if(! (page % SYNC_SCAN_REPORT_INTERVAL) ) + ss_store_hint(scan,page); + /* ! * Return NULL if we've exhausted all the pages. For reverse scans, that ! * means we've reached 0; for forward scans, the page on which we started. ! * NOTE(review): page was already stepped above, so a reverse scan appears ! * to stop before page 0 is read, and the start-page test applies to it too -- confirm. */ ! if ((backward && (page == 0)) || ! ((page%(scan->rs_nblocks)) == scan->rs_start_page)) { if (BufferIsValid(scan->rs_cbuf)) ReleaseBuffer(scan->rs_cbuf); *************** *** 383,390 **** return; } - page = backward ? (page - 1) : (page + 1); - heapgetpage(scan, page); LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); --- 620,625 ---- *************** *** 450,456 **** tuple->t_data = NULL; return; } ! page = 0; /* first page */ heapgetpage(scan, page); lineindex = 0; scan->rs_inited = true; --- 685,695 ---- tuple->t_data = NULL; return; } ! /* ! * start the scan at the location that we chose ! * in ss_init() ! */ ! page = scan->rs_start_page; heapgetpage(scan, page); lineindex = 0; scan->rs_inited = true; *************** *** 585,598 **** } /* ! * if we get here, it means we've exhausted the items on this page and * it's time to move to the next. */ /* ! * return NULL if we've exhausted all the pages */ ! if (backward ? (page == 0) : (page + 1 >= scan->rs_nblocks)) { if (BufferIsValid(scan->rs_cbuf)) ReleaseBuffer(scan->rs_cbuf); --- 824,859 ---- } /* ! * If we get here, it means we've exhausted the items on this page and * it's time to move to the next. + * + * For the forward scan, we need to wrap around to the beginning + * of the relation file if we reach the end. 
*/ + if(backward) + page--; + else + page = (page + 1) % (scan->rs_nblocks); + + if(Trace_sync_seqscan) + { + if (!(page%50000)) + elog(DEBUG2,"page: %d",page); + else if (!(page%5000)) + elog(DEBUG3,"page: %d",page); + } + + if(! (page % SYNC_SCAN_REPORT_INTERVAL) ) + ss_store_hint(scan,page); /* ! * Return NULL if we've exhausted all the pages. For reverse scans, that ! * means we've reached 0; for forward scans, the page on which we started. ! * NOTE(review): page was already stepped above, so a reverse scan appears ! * to stop before page 0 is read, and the start-page test applies to it too -- confirm. */ ! if ((backward && (page == 0)) || ! ((page%(scan->rs_nblocks)) == scan->rs_start_page)) { if (BufferIsValid(scan->rs_cbuf)) ReleaseBuffer(scan->rs_cbuf); *************** *** 603,609 **** return; } - page = backward ? (page - 1) : (page + 1); heapgetpage(scan, page); dp = (Page) BufferGetPage(scan->rs_cbuf); --- 864,869 ---- *************** *** 616,621 **** --- 876,892 ---- } } + /* + * SyncScanShmemSize: + * + * Called by CreateSharedMemoryAndSemaphores() + * to find out how much room the Sync Scan Hint + * Table will need to occupy. 
+ */ + Size SyncScanShmemSize(void) + { + return SYNC_SCAN_TABLE_SIZE*sizeof(ss_hint_t); + } #if defined(DISABLE_COMPLEX_MACRO) /* Only in postgresql-8.2.3-ss/src/backend/access/heap: heapam.c.orig diff -cr postgresql-8.2.3/src/backend/storage/ipc/ipci.c postgresql-8.2.3-ss/src/backend/storage/ipc/ipci.c *** postgresql-8.2.3/src/backend/storage/ipc/ipci.c Sun Oct 15 15:04:07 2006 --- postgresql-8.2.3-ss/src/backend/storage/ipc/ipci.c Tue Mar 20 16:10:31 2007 *************** *** 19,24 **** --- 19,25 ---- #include "access/nbtree.h" #include "access/subtrans.h" #include "access/twophase.h" + #include "access/heapam.h" #include "miscadmin.h" #include "pgstat.h" #include "postmaster/bgwriter.h" *************** *** 110,115 **** --- 111,117 ---- size = add_size(size, FreeSpaceShmemSize()); size = add_size(size, BgWriterShmemSize()); size = add_size(size, BTreeShmemSize()); + size = add_size(size, SyncScanShmemSize()); #ifdef EXEC_BACKEND size = add_size(size, ShmemBackendArraySize()); #endif Only in postgresql-8.2.3-ss/src/backend/storage/ipc: ipci.c.orig diff -cr postgresql-8.2.3/src/backend/utils/misc/guc.c postgresql-8.2.3-ss/src/backend/utils/misc/guc.c *** postgresql-8.2.3/src/backend/utils/misc/guc.c Wed Nov 29 06:50:07 2006 --- postgresql-8.2.3-ss/src/backend/utils/misc/guc.c Tue Mar 20 16:10:31 2007 *************** *** 25,31 **** #include #endif ! #include "access/gin.h" #include "access/twophase.h" #include "access/xact.h" --- 25,31 ---- #include #endif ! 
#include "access/heapam.h" #include "access/gin.h" #include "access/twophase.h" #include "access/xact.h" *************** *** 758,763 **** --- 758,773 ---- false, NULL, NULL }, + { + {"trace_sync_seqscan", PGC_USERSET, DEVELOPER_OPTIONS, + gettext_noop("Generates debugging output for Synchronized Scans."), + NULL, + GUC_NOT_IN_SAMPLE + }, + &Trace_sync_seqscan, + false, NULL, NULL + }, + #ifdef LOCK_DEBUG { {"trace_locks", PGC_SUSET, DEVELOPER_OPTIONS, *************** *** 1722,1727 **** --- 1732,1753 ---- &Geqo_selection_bias, DEFAULT_GEQO_SELECTION_BIAS, MIN_GEQO_SELECTION_BIAS, MAX_GEQO_SELECTION_BIAS, NULL, NULL + }, + { + {"sync_seqscan_threshold", PGC_USERSET, QUERY_TUNING_SYNC_SEQSCAN, + gettext_noop("Minimum size of table before synchronized scanning takes effect, as a fraction of shared_buffers."), + NULL + }, + &sync_seqscan_threshold, + DEFAULT_SYNC_SCAN_THRESHOLD, -1.0, 100.0, NULL, NULL + }, + { + {"sync_seqscan_offset", PGC_USERSET, QUERY_TUNING_SYNC_SEQSCAN, + gettext_noop("Start synchronized scans at this offset (as a fraction of shared_buffers) before other scans."), + NULL + }, + &sync_seqscan_offset, + DEFAULT_SYNC_SCAN_OFFSET, 0.0, 100.0, NULL, NULL }, { Only in postgresql-8.2.3-ss/src/backend/utils/misc: guc.c.orig diff -cr postgresql-8.2.3/src/include/access/heapam.h postgresql-8.2.3-ss/src/include/access/heapam.h *** postgresql-8.2.3/src/include/access/heapam.h Sun Nov 5 14:42:10 2006 --- postgresql-8.2.3-ss/src/include/access/heapam.h Tue Mar 20 16:10:31 2007 *************** *** 25,30 **** --- 25,49 ---- #include "utils/rel.h" #include "utils/tqual.h" + /* + * Size of the Sync Scan Hint Table, in entries. + */ + #define SYNC_SCAN_TABLE_SIZE 1000 + + /* + * Interval between reports of the location + * of the current scan, in pages. 
+ */ + #define SYNC_SCAN_REPORT_INTERVAL 16 + + #define DEFAULT_SYNC_SCAN_THRESHOLD 1.0 + #define DEFAULT_SYNC_SCAN_OFFSET 0.0 + + extern DLLIMPORT bool Trace_sync_seqscan; + extern DLLIMPORT double sync_seqscan_threshold; + extern DLLIMPORT double sync_seqscan_offset; + extern Size SyncScanShmemSize(void); + /* ---------------- * fastgetattr * Only in postgresql-8.2.3-ss/src/include/access: heapam.h.orig diff -cr postgresql-8.2.3/src/include/access/relscan.h postgresql-8.2.3-ss/src/include/access/relscan.h *** postgresql-8.2.3/src/include/access/relscan.h Tue Oct 3 17:30:07 2006 --- postgresql-8.2.3-ss/src/include/access/relscan.h Tue Mar 20 16:10:31 2007 *************** *** 19,24 **** --- 19,33 ---- #include "utils/tqual.h" + /* + * Structure of an entry in the + * Sync Scan Hint Table. + */ + typedef struct { + Oid relid; /* The relid that tags this hint entry; 0 = unused */ + BlockNumber location; /* Block number most recently stored for relid */ + } ss_hint_t; + typedef struct HeapScanDescData { /* scan parameters */ *************** *** 33,38 **** --- 42,49 ---- bool rs_inited; /* false = scan not init'd yet */ HeapTupleData rs_ctup; /* current tuple in scan, if any */ BlockNumber rs_cblock; /* current block # in scan, if any */ + BlockNumber rs_start_page; /* page where this scan began */ + ss_hint_t *rs_hint; /* this scan's slot in the Sync Scan Hint Table */ Buffer rs_cbuf; /* current buffer in scan, if any */ /* NB: if rs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ ItemPointerData rs_mctid; /* marked scan position, if any */ Only in postgresql-8.2.3-ss/src/include/access: relscan.h.orig diff -cr postgresql-8.2.3/src/include/utils/guc_tables.h postgresql-8.2.3-ss/src/include/utils/guc_tables.h *** postgresql-8.2.3/src/include/utils/guc_tables.h Tue Oct 3 14:11:55 2006 --- postgresql-8.2.3-ss/src/include/utils/guc_tables.h Tue Mar 20 16:10:31 2007 *************** *** 56,61 **** --- 56,62 ---- QUERY_TUNING_METHOD, QUERY_TUNING_COST, QUERY_TUNING_GEQO, + QUERY_TUNING_SYNC_SEQSCAN, 
QUERY_TUNING_OTHER, LOGGING, LOGGING_WHERE, Only in postgresql-8.2.3-ss/src/include/utils: guc_tables.h.orig