Skip to content

Commit 53f09d7

Browse files
committed
account for the bytes-written stat across all the indexing options
1 parent 0080cf1 commit 53f09d7

7 files changed

Lines changed: 110 additions & 20 deletions

File tree

contentcoder.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"encoding/binary"
2020
"io"
2121
"reflect"
22+
"sync/atomic"
2223

2324
"github.com/golang/snappy"
2425
)
@@ -48,6 +49,9 @@ type chunkedContentCoder struct {
4849
chunkMeta []MetaData
4950

5051
compressed []byte // temp buf for snappy compression
52+
53+
// atomic access to this variable
54+
bytesWritten uint64
5155
}
5256

5357
// MetaData represents the data information inside a
@@ -105,6 +109,14 @@ func (c *chunkedContentCoder) Close() error {
105109
return c.flushContents()
106110
}
107111

112+
func (c *chunkedContentCoder) incrementBytesWritten(val uint64) {
113+
atomic.AddUint64(&c.bytesWritten, val)
114+
}
115+
116+
func (c *chunkedContentCoder) getBytesWritten() uint64 {
117+
return atomic.LoadUint64(&c.bytesWritten)
118+
}
119+
108120
func (c *chunkedContentCoder) flushContents() error {
109121
// flush the contents, with meta information at first
110122
buf := make([]byte, binary.MaxVarintLen64)
@@ -127,6 +139,7 @@ func (c *chunkedContentCoder) flushContents() error {
127139
c.final = append(c.final, c.chunkMetaBuf.Bytes()...)
128140
// write the compressed data to the final data
129141
c.compressed = snappy.Encode(c.compressed[:cap(c.compressed)], c.chunkBuf.Bytes())
142+
c.incrementBytesWritten(uint64(len(c.compressed)))
130143
c.final = append(c.final, c.compressed...)
131144

132145
c.chunkLens[c.currChunk] = uint64(len(c.compressed) + len(metaData))

docvalues.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,16 +142,20 @@ func (di *docValueReader) BytesRead() uint64 {
142142
return atomic.LoadUint64(&di.bytesRead)
143143
}
144144

145-
func (di *docValueReader) SetBytesRead(val uint64) {
145+
func (di *docValueReader) ResetBytesRead(val uint64) {
146146
atomic.StoreUint64(&di.bytesRead, val)
147147
}
148148

149149
func (di *docValueReader) incrementBytesRead(val uint64) {
150-
if segment.CollectIOStats {
150+
if CollectDiskStats {
151151
atomic.AddUint64(&di.bytesRead, val)
152152
}
153153
}
154154

155+
func (di *docValueReader) BytesWritten() uint64 {
156+
return 0
157+
}
158+
155159
func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error {
156160
// advance to the chunk where the docValues
157161
// reside for the given docNum

intDecoder.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,6 @@ import (
1818
"encoding/binary"
1919
"fmt"
2020
"sync/atomic"
21-
22-
segment "github.com/blevesearch/scorch_segment_api/v2"
2321
)
2422

2523
type chunkedIntDecoder struct {
@@ -61,7 +59,7 @@ func newChunkedIntDecoder(buf []byte, offset uint64, rv *chunkedIntDecoder) *chu
6159
rv.chunkOffsets[i], read = binary.Uvarint(buf[offset+n : offset+n+binary.MaxVarintLen64])
6260
n += uint64(read)
6361
}
64-
if segment.CollectIOStats {
62+
if CollectDiskStats {
6563
atomic.AddUint64(&rv.bytesRead, n)
6664
}
6765
rv.dataStartOffset = offset + n
@@ -93,7 +91,7 @@ func (d *chunkedIntDecoder) loadChunk(chunk int) error {
9391
start += s
9492
end += e
9593
d.curChunkBytes = d.data[start:end]
96-
if segment.CollectIOStats {
94+
if CollectDiskStats {
9795
atomic.AddUint64(&d.bytesRead, uint64(len(d.curChunkBytes)))
9896
}
9997
if d.r == nil {

intcoder.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"bytes"
1919
"encoding/binary"
2020
"io"
21+
"sync/atomic"
2122
)
2223

2324
// We can safely use 0 to represent termNotEncoded since 0
@@ -34,6 +35,9 @@ type chunkedIntCoder struct {
3435
currChunk uint64
3536

3637
buf []byte
38+
39+
// atomic access to this variable
40+
bytesWritten uint64
3741
}
3842

3943
// newChunkedIntCoder returns a new chunk int coder which packs data into
@@ -73,6 +77,14 @@ func (c *chunkedIntCoder) SetChunkSize(chunkSize uint64, maxDocNum uint64) {
7377
}
7478
}
7579

80+
func (c *chunkedIntCoder) incrementBytesWritten(val uint64) {
81+
atomic.AddUint64(&c.bytesWritten, val)
82+
}
83+
84+
func (c *chunkedIntCoder) getBytesWritten() uint64 {
85+
return atomic.LoadUint64(&c.bytesWritten)
86+
}
87+
7688
// Add encodes the provided integers into the correct chunk for the provided
7789
// doc num. You MUST call Add() with increasing docNums.
7890
func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error {
@@ -94,6 +106,7 @@ func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error {
94106
if err != nil {
95107
return err
96108
}
109+
c.incrementBytesWritten(uint64(wb))
97110
}
98111

99112
return nil

new.go

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"math"
2121
"sort"
2222
"sync"
23+
"sync/atomic"
2324

2425
"github.com/RoaringBitmap/roaring"
2526
index "github.com/blevesearch/bleve_index_api"
@@ -32,6 +33,10 @@ var NewSegmentBufferNumResultsBump int = 100
3233
var NewSegmentBufferNumResultsFactor float64 = 1.0
3334
var NewSegmentBufferAvgBytesPerDocFactor float64 = 1.0
3435

36+
// This flag controls the disk stats collection from the segment files
37+
// during indexing and querying
38+
var CollectDiskStats bool
39+
3540
// ValidateDocFields can be set by applications to perform additional checks
3641
// on fields in a document being added to a new segment, by default it does
3742
// nothing.
@@ -80,6 +85,7 @@ func (*ZapPlugin) newWithChunkMode(results []index.Document,
8085
if err == nil && s.reset() == nil {
8186
s.lastNumDocs = len(results)
8287
s.lastOutSize = len(br.Bytes())
88+
sb.setBytesWritten(s.getBytesWritten())
8389
interimPool.Put(s)
8490
}
8591

@@ -141,6 +147,9 @@ type interim struct {
141147

142148
lastNumDocs int
143149
lastOutSize int
150+
151+
// atomic access to this variable
152+
bytesWritten uint64
144153
}
145154

146155
func (s *interim) reset() (err error) {
@@ -484,6 +493,16 @@ func (s *interim) processDocument(docNum uint64,
484493
}
485494
}
486495

496+
func (s *interim) getBytesWritten() uint64 {
497+
return atomic.LoadUint64(&s.bytesWritten)
498+
}
499+
500+
func (s *interim) incrementBytesWritten(val uint64) {
501+
if CollectDiskStats {
502+
atomic.AddUint64(&s.bytesWritten, val)
503+
}
504+
}
505+
487506
func (s *interim) writeStoredFields() (
488507
storedIndexOffset uint64, err error) {
489508
varBuf := make([]byte, binary.MaxVarintLen64)
@@ -559,7 +578,7 @@ func (s *interim) writeStoredFields() (
559578
metaBytes := s.metaBuf.Bytes()
560579

561580
compressed = snappy.Encode(compressed[:cap(compressed)], data)
562-
581+
s.incrementBytesWritten(uint64(len(compressed)))
563582
docStoredOffsets[docNum] = uint64(s.w.Count())
564583

565584
_, err := writeUvarints(s.w,
@@ -597,6 +616,10 @@ func (s *interim) writeStoredFields() (
597616
return storedIndexOffset, nil
598617
}
599618

619+
func (s *interim) setBytesWritten(val uint64) {
620+
atomic.StoreUint64(&s.bytesWritten, val)
621+
}
622+
600623
func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err error) {
601624
dictOffsets = make([]uint64, len(s.FieldsInv))
602625

@@ -682,7 +705,7 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err
682705
if err != nil {
683706
return 0, nil, err
684707
}
685-
708+
prevBytesWritten := locEncoder.getBytesWritten()
686709
for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
687710
err = locEncoder.Add(docNum,
688711
uint64(loc.fieldID), loc.pos, loc.start, loc.end,
@@ -696,7 +719,9 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err
696719
return 0, nil, err
697720
}
698721
}
699-
722+
if locEncoder.getBytesWritten()-prevBytesWritten > 0 {
723+
s.incrementBytesWritten(locEncoder.getBytesWritten() - prevBytesWritten)
724+
}
700725
locOffset += freqNorm.numLocs
701726
}
702727

@@ -750,6 +775,8 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err
750775
return 0, nil, err
751776
}
752777

778+
s.incrementBytesWritten(uint64(len(vellumData)))
779+
753780
// reset vellum for reuse
754781
s.builderBuf.Reset()
755782

@@ -764,6 +791,7 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err
764791
if err != nil {
765792
return 0, nil, err
766793
}
794+
767795
fdvEncoder := newChunkedContentCoder(chunkSize, uint64(len(s.results)-1), s.w, false)
768796
if s.IncludeDocValues[fieldID] {
769797
for docNum, docTerms := range docTermMap {
@@ -772,13 +800,16 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err
772800
if err != nil {
773801
return 0, nil, err
774802
}
803+
// s.incrementBytesWritten(uint64(len(docTerms)))
775804
}
776805
}
777806
err = fdvEncoder.Close()
778807
if err != nil {
779808
return 0, nil, err
780809
}
781810

811+
s.setBytesWritten(s.getBytesWritten())
812+
782813
fdvOffsetsStart[fieldID] = uint64(s.w.Count())
783814

784815
_, err = fdvEncoder.Write()

posting.go

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,7 @@ func (p *PostingsList) Count() uint64 {
254254
// The purpose of this implementation is to get
255255
// the bytes read from the postings lists stored
256256
// on disk, while querying
257-
func (p *PostingsList) SetBytesRead(val uint64) {
257+
func (p *PostingsList) ResetBytesRead(val uint64) {
258258
atomic.StoreUint64(&p.bytesRead, val)
259259
}
260260

@@ -263,11 +263,15 @@ func (p *PostingsList) BytesRead() uint64 {
263263
}
264264

265265
func (p *PostingsList) incrementBytesRead(val uint64) {
266-
if segment.CollectIOStats {
266+
if CollectDiskStats {
267267
atomic.AddUint64(&p.bytesRead, val)
268268
}
269269
}
270270

271+
func (p *PostingsList) BytesWritten() uint64 {
272+
return 0
273+
}
274+
271275
func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error {
272276
rv.postingsOffset = postingsOffset
273277

@@ -365,7 +369,7 @@ func (i *PostingsIterator) Size() int {
365369
// the bytes read from the disk which includes
366370
// the freqNorm and location specific information
367371
// of a hit
368-
func (i *PostingsIterator) SetBytesRead(val uint64) {
372+
func (i *PostingsIterator) ResetBytesRead(val uint64) {
369373
atomic.StoreUint64(&i.bytesRead, val)
370374
}
371375

@@ -374,11 +378,15 @@ func (i *PostingsIterator) BytesRead() uint64 {
374378
}
375379

376380
func (i *PostingsIterator) incrementBytesRead(val uint64) {
377-
if segment.CollectIOStats {
381+
if CollectDiskStats {
378382
atomic.AddUint64(&i.bytesRead, val)
379383
}
380384
}
381385

386+
func (i *PostingsIterator) BytesWritten() uint64 {
387+
return 0
388+
}
389+
382390
func (i *PostingsIterator) loadChunk(chunk int) error {
383391
if i.includeFreqNorm {
384392
err := i.freqNormReader.loadChunk(chunk)
@@ -390,15 +398,15 @@ func (i *PostingsIterator) loadChunk(chunk int) error {
390398
// the postingsIterator is tracking only the chunk loaded
391399
// and the cumulation is tracked correctly in the downstream
392400
// intDecoder
393-
i.SetBytesRead(i.freqNormReader.getBytesRead())
401+
i.ResetBytesRead(i.freqNormReader.getBytesRead())
394402
}
395403

396404
if i.includeLocs {
397405
err := i.locReader.loadChunk(chunk)
398406
if err != nil {
399407
return err
400408
}
401-
i.SetBytesRead(i.locReader.getBytesRead())
409+
i.ResetBytesRead(i.locReader.getBytesRead())
402410
}
403411

404412
i.currChunk = uint32(chunk)

segment.go

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,8 @@ type SegmentBase struct {
104104
size uint64
105105

106106
// atomic access to this variable
107-
bytesRead uint64
107+
bytesRead uint64
108+
bytesWritten uint64
108109

109110
m sync.Mutex
110111
fieldFSTs map[uint16]*vellum.FST
@@ -226,23 +227,45 @@ func (s *Segment) loadConfig() error {
226227
// interface, as the intention is to retrieve the bytes
227228
// read from the on-disk segment as part of the current
228229
// query.
229-
func (s *Segment) SetBytesRead(val uint64) {
230-
atomic.StoreUint64(&s.SegmentBase.bytesRead, val)
230+
func (s *Segment) ResetBytesRead(val uint64) {
231+
if CollectDiskStats {
232+
atomic.StoreUint64(&s.SegmentBase.bytesRead, val)
233+
}
231234
}
232235

233236
func (s *Segment) BytesRead() uint64 {
234237
return atomic.LoadUint64(&s.bytesRead) +
235238
atomic.LoadUint64(&s.SegmentBase.bytesRead)
236239
}
237240

241+
func (s *Segment) BytesWritten() uint64 {
242+
return 0
243+
}
244+
238245
func (s *Segment) incrementBytesRead(val uint64) {
239-
if segment.CollectIOStats {
246+
if CollectDiskStats {
240247
atomic.AddUint64(&s.bytesRead, val)
241248
}
242249
}
243250

251+
func (s *SegmentBase) BytesWritten() uint64 {
252+
return atomic.LoadUint64(&s.bytesWritten)
253+
}
254+
255+
func (s *SegmentBase) setBytesWritten(val uint64) {
256+
if CollectDiskStats {
257+
atomic.AddUint64(&s.bytesWritten, val)
258+
}
259+
}
260+
261+
func (s *SegmentBase) BytesRead() uint64 {
262+
return 0
263+
}
264+
265+
func (s *SegmentBase) ResetBytesRead(val uint64) {}
266+
244267
func (s *SegmentBase) incrementBytesRead(val uint64) {
245-
if segment.CollectIOStats {
268+
if CollectDiskStats {
246269
atomic.AddUint64(&s.bytesRead, val)
247270
}
248271
}

0 commit comments

Comments (0)