forked from zuoyebang/bitalostable
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathoptions.go
1473 lines (1344 loc) · 55.8 KB
/
options.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2011 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.
package bitalostable
import (
"bytes"
"fmt"
"io"
"runtime"
"strconv"
"strings"
"time"
"github.com/cockroachdb/errors"
"github.com/zuoyebang/bitalostable/internal/base"
"github.com/zuoyebang/bitalostable/internal/cache"
"github.com/zuoyebang/bitalostable/internal/humanize"
"github.com/zuoyebang/bitalostable/internal/manifest"
"github.com/zuoyebang/bitalostable/sstable"
"github.com/zuoyebang/bitalostable/vfs"
)
const (
// cacheDefaultSize is the default size, in bytes, of the block cache. It
// matches the 8 MB default documented on Options.Cache; presumably it is
// applied when no cache is supplied — confirm against the code that fills in
// Options defaults.
cacheDefaultSize = 8 << 20 // 8 MB
)
// Compression exports the sstable.Compression type.
type Compression = sstable.Compression
// Exported Compression constants. Each one re-exports the sstable constant of
// the same name.
const (
DefaultCompression = sstable.DefaultCompression
NoCompression = sstable.NoCompression
SnappyCompression = sstable.SnappyCompression
ZstdCompression = sstable.ZstdCompression
)
// FilterType exports the base.FilterType type.
type FilterType = base.FilterType
// Exported FilterType constants.
const (
// TableFilter re-exports base.TableFilter.
TableFilter = base.TableFilter
)
// FilterWriter exports the base.FilterWriter type.
type FilterWriter = base.FilterWriter
// FilterPolicy exports the base.FilterPolicy type.
type FilterPolicy = base.FilterPolicy
// TablePropertyCollector exports the sstable.TablePropertyCollector type.
type TablePropertyCollector = sstable.TablePropertyCollector
// BlockPropertyCollector exports the sstable.BlockPropertyCollector type.
type BlockPropertyCollector = sstable.BlockPropertyCollector
// BlockPropertyFilter exports the base.BlockPropertyFilter type.
type BlockPropertyFilter = base.BlockPropertyFilter
// IterKeyType selects the kinds of keys (point keys, range keys, or both)
// that an iterator surfaces.
type IterKeyType int8

const (
	// IterKeyTypePointsOnly restricts an iterator to point keys. It is the
	// zero value of IterKeyType.
	IterKeyTypePointsOnly IterKeyType = iota
	// IterKeyTypeRangesOnly restricts an iterator to range keys.
	IterKeyTypeRangesOnly
	// IterKeyTypePointsAndRanges configures an iterator to iterate over both
	// point keys and range keys simultaneously.
	IterKeyTypePointsAndRanges
)

// String implements fmt.Stringer. It panics if t is not one of the declared
// IterKeyType constants.
func (t IterKeyType) String() string {
	if t == IterKeyTypePointsOnly {
		return "points-only"
	}
	if t == IterKeyTypeRangesOnly {
		return "ranges-only"
	}
	if t == IterKeyTypePointsAndRanges {
		return "points-and-ranges"
	}
	// Any other value indicates a programming error in the caller.
	panic(fmt.Sprintf("unknown key type %d", t))
}
// IterOptions hold the optional per-query parameters for NewIter.
//
// Like Options, a nil *IterOptions is valid and means to use the default
// values.
type IterOptions struct {
// LowerBound specifies the smallest key (inclusive) that the iterator will
// return during iteration. If the iterator is seeked or iterated past this
// boundary the iterator will return Valid()==false. Setting LowerBound
// effectively truncates the key space visible to the iterator.
LowerBound []byte
// UpperBound specifies the largest key (exclusive) that the iterator will
// return during iteration. If the iterator is seeked or iterated past this
// boundary the iterator will return Valid()==false. Setting UpperBound
// effectively truncates the key space visible to the iterator.
UpperBound []byte
// TableFilter can be used to filter the tables that are scanned during
// iteration based on the user properties. Return true to scan the table and
// false to skip scanning. This function must be thread-safe since the same
// function can be used by multiple iterators, if the iterator is cloned.
TableFilter func(userProps map[string]string) bool
// PointKeyFilters can be used to avoid scanning tables and blocks in tables
// when iterating over point keys. It is required that this slice is sorted
// in increasing order of the BlockPropertyFilter.ShortID. This slice
// represents an intersection across all filters, i.e., all filters must
// indicate that the block is relevant.
PointKeyFilters []BlockPropertyFilter
// RangeKeyFilters can be used to avoid scanning tables and blocks in tables
// when iterating over range keys. The same requirements that apply to
// PointKeyFilters apply here too.
RangeKeyFilters []BlockPropertyFilter
// KeyTypes configures which types of keys to iterate over: point keys,
// range keys, or both. The zero value is IterKeyTypePointsOnly.
KeyTypes IterKeyType
// RangeKeyMasking can be used to enable automatic masking of point keys by
// range keys. Range key masking is only supported during combined range key
// and point key iteration mode (IterKeyTypePointsAndRanges).
RangeKeyMasking RangeKeyMasking
// OnlyReadGuaranteedDurable is an advanced option that is only supported by
// the Reader implemented by DB. When set to true, only the guaranteed to be
// durable state is visible in the iterator.
//   - This definition is made under the assumption that the FS
//     implementation is providing a durability guarantee when data is
//     synced.
//   - The visible state represents a consistent point in the history of the
//     DB.
//   - The implementation is free to choose a conservative definition of what
//     is guaranteed durable. For simplicity, the current implementation
//     ignores memtables. A more sophisticated implementation could track the
//     highest seqnum that is synced to the WAL and published and use that as
//     the visible seqnum for an iterator. Note that the latter approach is
//     not strictly better than the former since we can have DBs that are (a)
//     synced more rarely than memtable flushes, (b) have no WAL. (a) is
//     likely to be true in a future CockroachDB context where the DB
//     containing the state machine may be rarely synced.
//
// NB: this current implementation relies on the fact that memtables are
// flushed in seqnum order, and any ingested sstables that happen to have a
// lower seqnum than a non-flushed memtable don't have any overlapping keys.
// This is the fundamental level invariant used in other code too, like when
// merging iterators.
//
// Semantically, using this option provides the caller a "snapshot" as of
// the time the most recent memtable was flushed. An alternate interface
// would be to add a NewSnapshot variant. Creating a snapshot is heavier
// weight than creating an iterator, so we have opted to support this
// iterator option.
OnlyReadGuaranteedDurable bool
// UseL6Filters allows the caller to opt into reading filter blocks for L6
// sstables. Helpful if a lot of SeekPrefixGEs are expected in quick
// succession, that are also likely to not yield a single key. Filter blocks in
// L6 can be relatively large, often larger than data blocks, so the benefit of
// loading them in the cache is minimized if the probability of the key
// existing is not low or if we just expect a one-time Seek (where loading the
// data block directly is better).
UseL6Filters bool
// Internal options.

// logger is the Logger returned by getLogger; when it is nil, getLogger
// falls back to DefaultLogger.
logger Logger
// Level corresponding to this file. Only passed in if constructed by a
// levelIter.
level manifest.Level
// NB: If adding new Options, you must account for them in iterator
// construction and Iterator.SetOptions.
}
// GetLowerBound returns o.LowerBound. It is safe to call on a nil receiver,
// in which case it returns nil.
func (o *IterOptions) GetLowerBound() []byte {
	if o != nil {
		return o.LowerBound
	}
	return nil
}
// GetUpperBound returns o.UpperBound. It is safe to call on a nil receiver,
// in which case it returns nil.
func (o *IterOptions) GetUpperBound() []byte {
	if o != nil {
		return o.UpperBound
	}
	return nil
}
// pointKeys reports whether point keys should be surfaced: true for a nil
// receiver, and otherwise true only when KeyTypes is IterKeyTypePointsOnly or
// IterKeyTypePointsAndRanges.
func (o *IterOptions) pointKeys() bool {
	if o == nil {
		return true
	}
	switch o.KeyTypes {
	case IterKeyTypePointsOnly, IterKeyTypePointsAndRanges:
		return true
	}
	return false
}
// rangeKeys reports whether range keys should be surfaced: false for a nil
// receiver, and otherwise true only when KeyTypes is IterKeyTypeRangesOnly or
// IterKeyTypePointsAndRanges.
func (o *IterOptions) rangeKeys() bool {
	if o == nil {
		return false
	}
	switch o.KeyTypes {
	case IterKeyTypeRangesOnly, IterKeyTypePointsAndRanges:
		return true
	}
	return false
}
// getLogger returns the logger configured on o. It falls back to
// DefaultLogger when the receiver is nil or no logger has been set.
func (o *IterOptions) getLogger() Logger {
	if o != nil && o.logger != nil {
		return o.logger
	}
	return DefaultLogger
}
// RangeKeyMasking configures automatic hiding of point keys by range keys. A
// non-nil Suffix enables range-key masking. When enabled, range keys with
// suffixes ≥ Suffix behave as masks. All point keys that are contained within a
// masking range key's bounds and have suffixes greater than the range key's
// suffix are automatically skipped.
//
// Specifically, when configured with a RangeKeyMasking.Suffix _s_, and there
// exists a range key with suffix _r_ covering a point key with suffix _p_, and
//
//   _s_ ≤ _r_ < _p_
//
// then the point key is elided.
//
// Range-key masking may only be used when iterating over both point keys and
// range keys with IterKeyTypePointsAndRanges.
type RangeKeyMasking struct {
// Suffix configures which range keys may mask point keys. Only range keys
// that are defined at suffixes greater than or equal to Suffix will mask
// point keys.
Suffix []byte
// Filter is an optional field that may be used to improve performance of
// range-key masking through a block-property filter defined over key
// suffixes. If non-nil, Filter is called by Pebble to construct a
// block-property filter mask at iterator creation. The filter is used to
// skip whole point-key blocks containing point keys with suffixes greater
// than a covering range-key's suffix.
//
// To use this functionality, the caller must create and configure (through
// Options.BlockPropertyCollectors) a block-property collector that records
// the maximum suffix contained within a block. The caller then must write
// and provide a BlockPropertyFilterMask implementation on that same
// property. See the BlockPropertyFilterMask type for more information.
Filter func() BlockPropertyFilterMask
}
// BlockPropertyFilterMask extends the BlockPropertyFilter interface for use
// with range-key masking. Unlike an ordinary block property filter, a
// BlockPropertyFilterMask's filtering criteria is allowed to change when Pebble
// invokes its SetSuffix method.
//
// When a Pebble iterator steps into a range key's bounds and the range key has
// a suffix greater than or equal to RangeKeyMasking.Suffix, the range key acts
// as a mask. The masking range key hides all point keys that fall within the
// range key's bounds and have suffixes > the range key's suffix. Without a
// filter mask configured, Pebble performs this hiding by stepping through point
// keys and comparing suffixes. If large numbers of point keys are masked, this
// requires Pebble to load, iterate through and discard a large number of
// sstable blocks containing masked point keys.
//
// If a block-property collector and a filter mask are configured, Pebble may
// skip loading some point-key blocks altogether. If a block's keys are known to
// all fall within the bounds of the masking range key and the block was
// annotated by a block-property collector with the maximal suffix, Pebble can
// ask the filter mask to compare the property to the current masking range
// key's suffix. If the mask reports no intersection, the block may be skipped.
//
// If unsuffixed and suffixed keys are written to the database, care must be
// taken to avoid unintentionally masking un-suffixed keys located in the same
// block as suffixed keys. One solution is to interpret unsuffixed keys as
// containing the maximal suffix value, ensuring that blocks containing
// unsuffixed keys are always loaded.
//
// NOTE(review): the text above describes the collected property as the block's
// "maximal suffix", while the SetSuffix documentation below speaks of the
// block's "minimum suffix". Presumably the two refer to different orderings
// (raw suffix value vs. the Comparer's order) — confirm and align the wording.
type BlockPropertyFilterMask interface {
BlockPropertyFilter
// SetSuffix configures the mask with the suffix of a range key. The filter
// should return false from Intersects whenever it's provided with a
// property encoding a block's minimum suffix that's greater (according to
// Compare) than the provided suffix.
SetSuffix(suffix []byte) error
}
// WriteOptions hold the optional per-query parameters for Set and Delete
// operations.
//
// As with Options, a nil *WriteOptions is valid and selects the default
// values.
type WriteOptions struct {
	// Sync controls whether a write is synced through the OS buffer cache and
	// down onto the actual disk, if applicable. Setting Sync is required for
	// durability of individual write operations but can result in slower
	// writes.
	//
	// If false, and the process or machine crashes, then a recent write may be
	// lost. This is due to the recently written data being buffered inside the
	// process running Pebble. This differs from the semantics of a write system
	// call in which the data is buffered in the OS buffer cache and would thus
	// survive a process crash.
	//
	// The default value is true: GetSync reports true for a nil *WriteOptions.
	// Note that a non-nil zero-value WriteOptions has Sync == false.
	Sync bool
}

// Sync specifies the default write options for writes which synchronize to
// disk.
var Sync = &WriteOptions{Sync: true}

// NoSync specifies the default write options for writes which do not
// synchronize to disk.
var NoSync = &WriteOptions{Sync: false}

// GetSync returns the Sync value, or true if the receiver is nil.
func (o *WriteOptions) GetSync() bool {
	if o == nil {
		// A nil receiver means "use the defaults", and the default is to sync.
		return true
	}
	return o.Sync
}
// LevelOptions holds the optional per-level parameters. Fields left at a
// non-positive (or, for Compression, out-of-range) value are replaced with
// their defaults by EnsureDefaults.
type LevelOptions struct {
// BlockRestartInterval is the number of keys between restart points
// for delta encoding of keys.
//
// The default value is 16.
BlockRestartInterval int
// BlockSize is the target uncompressed size in bytes of each table block.
//
// The default value is 4096.
BlockSize int
// BlockSizeThreshold finishes a block if the block size is larger than the
// specified percentage of the target block size and adding the next entry
// would cause the block to be larger than the target block size.
//
// The default value is 90.
BlockSizeThreshold int
// Compression defines the per-block compression to use.
//
// The default value (DefaultCompression) uses snappy compression.
Compression Compression
// FilterPolicy defines a filter algorithm (such as a Bloom filter) that can
// reduce disk reads for Get calls.
//
// One such implementation is bloom.FilterPolicy(10) from the bitalostable/bloom
// package.
//
// The default value means to use no filter.
FilterPolicy FilterPolicy
// FilterType defines whether an existing filter policy is applied at a
// block-level or table-level. Block-level filters use less memory to create,
// but are slower to access as a check for the key in the index must first be
// performed to locate the filter block. A table-level filter will require
// memory proportional to the number of keys in an sstable to create, but
// avoids the index lookup when determining if a key is present. Table-level
// filters should be preferred except under constrained memory situations.
FilterType FilterType
// IndexBlockSize is the target uncompressed size in bytes of each index
// block. When the index block size is larger than this target, two-level
// indexes are automatically enabled. Setting this option to a large value
// (such as math.MaxInt32) disables the automatic creation of two-level
// indexes.
//
// The default value is the value of BlockSize.
IndexBlockSize int
// TargetFileSize is the target file size, in bytes, for the level.
//
// The default value is 2 MB.
TargetFileSize int64
}
// EnsureDefaults replaces every unset field (non-positive, or for Compression
// outside the valid range) with its default value and returns the options.
// Calling it on a nil receiver is valid: a fresh LevelOptions is allocated, so
// the result is never nil. A non-nil receiver is updated in place and
// returned.
func (o *LevelOptions) EnsureDefaults() *LevelOptions {
	if o == nil {
		o = new(LevelOptions)
	}
	if o.BlockRestartInterval < 1 {
		o.BlockRestartInterval = base.DefaultBlockRestartInterval
	}
	if o.BlockSize < 1 {
		o.BlockSize = base.DefaultBlockSize
	}
	if o.BlockSizeThreshold < 1 {
		o.BlockSizeThreshold = base.DefaultBlockSizeThreshold
	}
	// Only values strictly between DefaultCompression and sstable.NCompression
	// are kept; everything else falls back to snappy.
	validCompression := o.Compression > DefaultCompression && o.Compression < sstable.NCompression
	if !validCompression {
		o.Compression = SnappyCompression
	}
	// IndexBlockSize defaults to BlockSize, so this must run after BlockSize
	// has been defaulted above.
	if o.IndexBlockSize < 1 {
		o.IndexBlockSize = o.BlockSize
	}
	if o.TargetFileSize < 1 {
		o.TargetFileSize = 2 << 20 // 2 MB
	}
	return o
}
// Options holds the optional parameters for configuring bitalostable. These options
// apply to the DB at large; per-query options are defined by the IterOptions
// and WriteOptions types.
type Options struct {
// Sync sstables periodically in order to smooth out writes to disk. This
// option does not provide any persistency guarantee, but is used to avoid
// latency spikes if the OS automatically decides to write out a large chunk
// of dirty filesystem buffers. This option only controls SSTable syncs; WAL
// syncs are controlled by WALBytesPerSync.
//
// The default value is 512KB.
BytesPerSync int
// Cache is used to cache uncompressed blocks from sstables.
//
// The default cache size is 8 MB.
Cache *cache.Cache
// Cleaner cleans obsolete files.
//
// The default cleaner uses the DeleteCleaner.
Cleaner Cleaner
// Comparer defines a total ordering over the space of []byte keys: a 'less
// than' relationship. The same comparison algorithm must be used for reads
// and writes over the lifetime of the DB.
//
// The default value uses the same ordering as bytes.Compare.
Comparer *Comparer
// DebugCheck is invoked, if non-nil, whenever a new version is being
// installed. Typically, this is set to bitalostable.DebugCheckLevels in tests
// or tools only, to check invariants over all the data in the database.
DebugCheck func(*DB) error
// Disable the write-ahead log (WAL). Disabling the write-ahead log prohibits
// crash recovery, but can improve performance if crash recovery is not
// needed (e.g. when only temporary state is being stored in the database).
//
// TODO(peter): untested
DisableWAL bool
// ErrorIfExists is whether it is an error if the database already exists.
//
// The default value is false.
ErrorIfExists bool
// ErrorIfNotExists is whether it is an error if the database does not
// already exist.
//
// The default value is false which will cause a database to be created if it
// does not already exist.
ErrorIfNotExists bool
// EventListener provides hooks to listening to significant DB events such as
// flushes, compactions, and table deletion.
EventListener EventListener
// Experimental contains experimental options which are off by default.
// These options are temporary and will eventually either be deleted, moved
// out of the experimental group, or made the non-adjustable default. These
// options may change at any time, so do not rely on them.
Experimental struct {
// The threshold of L0 read-amplification at which compaction concurrency
// is enabled (if CompactionDebtConcurrency was not already exceeded).
// Every multiple of this value enables another concurrent
// compaction up to MaxConcurrentCompactions.
L0CompactionConcurrency int
// CompactionDebtConcurrency controls the threshold of compaction debt
// at which additional compaction concurrency slots are added. For every
// multiple of this value in compaction debt bytes, an additional
// concurrent compaction is added. This works "on top" of
// L0CompactionConcurrency, so the higher of the count of compaction
// concurrency slots as determined by the two options is chosen.
CompactionDebtConcurrency int
// MinDeletionRate is the minimum number of bytes per second that would
// be deleted. Deletion pacing is used to slow down deletions when
// compactions finish up or readers close, and newly-obsolete files need
// cleaning up. Deleting lots of files at once can cause disk latency to
// go up on some SSDs, which this functionality guards against. This is a
// minimum as the maximum is theoretically unlimited; pacing is disabled
// when there are too many obsolete files relative to live bytes, or
// there isn't enough disk space available. Setting this to 0 disables
// deletion pacing, which is also the default.
MinDeletionRate int
// ReadCompactionRate controls the frequency of read triggered
// compactions by adjusting `AllowedSeeks` in manifest.FileMetadata:
//
// AllowedSeeks = FileSize / ReadCompactionRate
//
// From LevelDB:
// ```
// We arrange to automatically compact this file after
// a certain number of seeks. Let's assume:
// (1) One seek costs 10ms
// (2) Writing or reading 1MB costs 10ms (100MB/s)
// (3) A compaction of 1MB does 25MB of IO:
// 1MB read from this level
// 10-12MB read from next level (boundaries may be misaligned)
// 10-12MB written to next level
// This implies that 25 seeks cost the same as the compaction
// of 1MB of data. I.e., one seek costs approximately the
// same as the compaction of 40KB of data. We are a little
// conservative and allow approximately one seek for every 16KB
// of data before triggering a compaction.
// ```
ReadCompactionRate int64
// ReadSamplingMultiplier is a multiplier for the readSamplingPeriod in
// iterator.maybeSampleRead() to control the frequency of read sampling
// to trigger a read triggered compaction. A value of -1 prevents sampling
// and disables read triggered compactions. The default is 1 << 4. which
// gets multiplied with a constant of 1 << 16 to yield 1 << 20 (1MB).
ReadSamplingMultiplier int64
// TableCacheShards is the number of shards per table cache.
// Reducing the value can reduce the number of idle goroutines per DB
// instance which can be useful in scenarios with a lot of DB instances
// and a large number of CPUs, but doing so can lead to higher contention
// in the table cache and reduced performance.
//
// The default value is the number of logical CPUs, which can be
// limited by runtime.GOMAXPROCS.
TableCacheShards int
// KeyValidationFunc is a function to validate a user key in an SSTable.
//
// Currently, this function is used to validate the smallest and largest
// keys in an SSTable undergoing compaction. In this case, returning an
// error from the validation function will result in a panic at runtime,
// given that there is rarely any way of recovering from malformed keys
// present in compacted files. By default, validation is not performed.
//
// Additional use-cases may be added in the future.
//
// NOTE: callers should take care to not mutate the key being validated.
KeyValidationFunc func(userKey []byte) error
// ValidateOnIngest schedules validation of sstables after they have
// been ingested.
//
// By default, this value is false.
ValidateOnIngest bool
// MultiLevelCompaction allows the compaction of SSTs from more than two
// levels iff a conventional two level compaction will quickly trigger a
// compaction in the output level.
MultiLevelCompaction bool
// MaxWriterConcurrency is used to indicate the maximum number of
// compression workers the compression queue is allowed to use. If
// MaxWriterConcurrency > 0, then the Writer will use parallelism, to
// compress and write blocks to disk. Otherwise, the writer will
// compress and write blocks to disk synchronously.
MaxWriterConcurrency int
// ForceWriterParallelism is used to force parallelism in the sstable
// Writer for the metamorphic tests. Even with the MaxWriterConcurrency
// option set, we only enable parallelism in the sstable Writer if there
// is enough CPU available, and this option bypasses that.
ForceWriterParallelism bool
// CPUWorkPermissionGranter should be set if Pebble should be given the
// ability to optionally schedule additional CPU. See the documentation
// for CPUWorkPermissionGranter for more details.
CPUWorkPermissionGranter CPUWorkPermissionGranter
}
// Filters is a map from filter policy name to filter policy. It is used for
// debugging tools which may be used on multiple databases configured with
// different filter policies. It is not necessary to populate this filters
// map during normal usage of a DB.
Filters map[string]FilterPolicy
// FlushDelayDeleteRange configures how long the database should wait before
// forcing a flush of a memtable that contains a range deletion. Disk space
// cannot be reclaimed until the range deletion is flushed. No automatic
// flush occurs if zero.
FlushDelayDeleteRange time.Duration
// FlushDelayRangeKey configures how long the database should wait before
// forcing a flush of a memtable that contains a range key. Range keys in
// the memtable prevent lazy combined iteration, so it's desirable to flush
// range keys promptly. No automatic flush occurs if zero.
FlushDelayRangeKey time.Duration
// FlushSplitBytes denotes the target number of bytes per sublevel in
// each flush split interval (i.e. range between two flush split keys)
// in L0 sstables. When set to zero, only a single sstable is generated
// by each flush. When set to a non-zero value, flushes are split at
// points to meet L0's TargetFileSize, any grandparent-related overlap
// options, and at boundary keys of L0 flush split intervals (which are
// targeted to contain around FlushSplitBytes bytes in each sublevel
// between pairs of boundary keys). Splitting sstables during flush
// allows increased compaction flexibility and concurrency when those
// tables are compacted to lower levels.
FlushSplitBytes int64
// FormatMajorVersion sets the format of on-disk files. It is
// recommended to set the format major version to an explicit
// version, as the default may change over time.
//
// At Open if the existing database is formatted using a later
// format major version that is known to this version of Pebble,
// Pebble will continue to use the later format major version. If
// the existing database's version is unknown, the caller may use
// FormatMostCompatible and will be able to open the database
// regardless of its actual version.
//
// If the existing database is formatted using a format major
// version earlier than the one specified, Open will automatically
// ratchet the database to the specified format major version.
FormatMajorVersion FormatMajorVersion
// FS provides the interface for persistent file storage.
//
// The default value uses the underlying operating system's file system.
FS vfs.FS
// The count of L0 files necessary to trigger an L0 compaction.
L0CompactionFileThreshold int
// The amount of L0 read-amplification necessary to trigger an L0 compaction.
L0CompactionThreshold int
// Hard limit on L0 read-amplification, computed as the number of L0
// sublevels. Writes are stopped when this threshold is reached.
L0StopWritesThreshold int
// The maximum number of bytes for LBase. The base level is the level which
// L0 is compacted into. The base level is determined dynamically based on
// the existing data in the LSM. The maximum number of bytes for other levels
// is computed dynamically based on the base level's maximum size. When the
// maximum number of bytes for a level is exceeded, compaction is requested.
LBaseMaxBytes int64
// Per-level options. Options for at least one level must be specified. The
// options for the last level are used for all subsequent levels.
Levels []LevelOptions
// Logger used to write log messages.
//
// The default logger uses the Go standard library log package.
Logger Logger
LogTag string
Verbose bool
// MaxManifestFileSize is the maximum size the MANIFEST file is allowed to
// become. When the MANIFEST exceeds this size it is rolled over and a new
// MANIFEST is created.
MaxManifestFileSize int64
// MaxOpenFiles is a soft limit on the number of open files that can be
// used by the DB.
//
// The default value is 1000.
MaxOpenFiles int
// The size of a MemTable in steady state. The actual MemTable size starts at
// min(256KB, MemTableSize) and doubles for each subsequent MemTable up to
// MemTableSize. This reduces the memory pressure caused by MemTables for
// short lived (test) DB instances. Note that more than one MemTable can be
// in existence since flushing a MemTable involves creating a new one and
// writing the contents of the old one in the
// background. MemTableStopWritesThreshold places a hard limit on the size of
// the queued MemTables.
MemTableSize int
// Hard limit on the size of queued of MemTables. Writes are stopped when the
// sum of the queued memtable sizes exceeds
// MemTableStopWritesThreshold*MemTableSize. This value should be at least 2
// or writes will stop whenever a MemTable is being flushed.
MemTableStopWritesThreshold int
// Merger defines the associative merge operation to use for merging values
// written with {Batch,DB}.Merge.
//
// The default merger concatenates values.
Merger *Merger
// MaxConcurrentCompactions specifies the maximum number of concurrent
// compactions. The default is 1. Concurrent compactions are performed
// - when L0 read-amplification passes the L0CompactionConcurrency threshold
// - for automatic background compactions
// - when a manual compaction for a level is split and parallelized
// MaxConcurrentCompactions must be greater than 0.
MaxConcurrentCompactions func() int
// DisableAutomaticCompactions dictates whether automatic compactions are
// scheduled or not. The default is false (enabled). This option is only used
// externally when running a manual compaction, and internally for tests.
DisableAutomaticCompactions bool
// NoSyncOnClose decides whether the Pebble instance will enforce a
// close-time synchronization (e.g., fdatasync() or sync_file_range())
// on files it writes to. Setting this to true removes the guarantee for a
// sync on close. Some implementations can still issue a non-blocking sync.
NoSyncOnClose bool
// NumPrevManifest is the number of non-current or older manifests which
// we want to keep around for debugging purposes. By default, we're going
// to keep one older manifest.
NumPrevManifest int
// ReadOnly indicates that the DB should be opened in read-only mode. Writes
// to the DB will return an error, background compactions are disabled, and
// the flush that normally occurs after replaying the WAL at startup is
// disabled.
ReadOnly bool
// TableCache is an initialized TableCache which should be set as an
// option if the DB needs to be initialized with a pre-existing table cache.
// If TableCache is nil, then a table cache which is unique to the DB instance
// is created. TableCache can be shared between db instances by setting it here.
// The TableCache set here must use the same underlying cache as Options.Cache
// and bitalostable will panic otherwise.
TableCache *TableCache
// TablePropertyCollectors is a list of TablePropertyCollector creation
// functions. A new TablePropertyCollector is created for each sstable built
// and lives for the lifetime of the table.
TablePropertyCollectors []func() TablePropertyCollector
// BlockPropertyCollectors is a list of BlockPropertyCollector creation
// functions. A new BlockPropertyCollector is created for each sstable
// built and lives for the lifetime of writing that table.
BlockPropertyCollectors []func() BlockPropertyCollector
// WALBytesPerSync sets the number of bytes to write to a WAL before calling
// Sync on it in the background. Just like with BytesPerSync above, this
// helps smooth out disk write latencies, and avoids cases where the OS
// writes a lot of buffered data to disk at once. However, this is less
// necessary with WALs, as many write operations already pass in
// Sync = true.
//
// The default value is 0, i.e. no background syncing. This matches the
// default behaviour in RocksDB.
WALBytesPerSync int
// WALDir specifies the directory to store write-ahead logs (WALs) in. If
// empty (the default), WALs will be stored in the same directory as sstables
// (i.e. the directory passed to bitalostable.Open).
WALDir string
// WALMinSyncInterval is the minimum duration between syncs of the WAL. If
// WAL syncs are requested faster than this interval, they will be
// artificially delayed. Introducing a small artificial delay (500us) between
// WAL syncs can allow more operations to arrive and reduce IO operations
// while having a minimal impact on throughput. This option is supplied as a
// closure in order to allow the value to be changed dynamically. The default
// value is 0.
//
// TODO(peter): rather than a closure, should there be another mechanism for
// changing options dynamically?
WALMinSyncInterval func() time.Duration
Id int
FlushReporter func(int)
KvCheckExpireFunc func([]byte, []byte) bool
// private options are only used by internal tests or are used internally
// for facilitating upgrade paths of unconfigurable functionality.
private struct {
// strictWALTail configures whether or not a database's WALs created
// prior to the most recent one should be interpreted strictly,
// requiring a clean EOF. RocksDB 6.2.1 and the version of Pebble
// included in CockroachDB 20.1 do not guarantee that closed WALs end
// cleanly. If this option is set within an OPTIONS file, Pebble
// interprets previous WALs strictly, requiring a clean EOF.
// Otherwise, it interprets them permissively in the same manner as
// RocksDB 6.2.1.
strictWALTail bool
// A private option to disable stats collection.
disableTableStats bool
// fsCloser holds a closer that should be invoked after a DB using these
// Options is closed. This is used to automatically stop the
// long-running goroutine associated with the disk-health-checking FS.
// See the initialization of FS in EnsureDefaults. Note that care has
// been taken to ensure that it is still safe to continue using the FS
// after this closer has been invoked. However, if write operations
// against the FS are made after the DB is closed, the FS may leak a
// goroutine indefinitely.
fsCloser io.Closer
logInit bool
}
}
// DebugCheckLevels runs db.CheckLevels with a nil argument and returns its
// result. It is suitable for use as the DebugCheck field of Options so that
// level invariants are verified whenever a new version is installed.
func DebugCheckLevels(db *DB) error {
	err := db.CheckLevels(nil)
	return err
}
// EnsureDefaults ensures that the default values for all options are set if a
// valid value was not already specified. Returns the new options.
//
// A nil receiver is allowed: a fresh Options is allocated, populated with the
// defaults and returned. Otherwise the receiver is modified in place and
// returned.
func (o *Options) EnsureDefaults() *Options {
	if o == nil {
		o = &Options{}
	}
	if o.BytesPerSync <= 0 {
		o.BytesPerSync = 512 << 10 // 512 KB
	}
	if o.Cleaner == nil {
		o.Cleaner = DeleteCleaner{}
	}
	if o.Comparer == nil {
		o.Comparer = DefaultComparer
	}
	if o.Experimental.L0CompactionConcurrency <= 0 {
		o.Experimental.L0CompactionConcurrency = 10
	}
	if o.Experimental.CompactionDebtConcurrency <= 0 {
		o.Experimental.CompactionDebtConcurrency = 1 << 30 // 1 GB
	}
	if o.Experimental.KeyValidationFunc == nil {
		// Default validation accepts every key.
		o.Experimental.KeyValidationFunc = func([]byte) error { return nil }
	}
	if o.L0CompactionThreshold <= 0 {
		o.L0CompactionThreshold = 4
	}
	if o.L0CompactionFileThreshold <= 0 {
		// Some justification for the default of 500:
		// Why not smaller?:
		//   - The default target file size for L0 is 2MB, so 500 files is <= 1GB
		//     of data. At observed compaction speeds of > 20MB/s, L0 can be
		//     cleared of all files in < 1min, so this backlog is not huge.
		//   - 500 files is low overhead for instantiating L0 sublevels from
		//     scratch.
		//   - Lower values were observed to cause excessive and inefficient
		//     compactions out of L0 in a TPCC import benchmark.
		// Why not larger?:
		//   - More than 1min to compact everything out of L0.
		//   - CockroachDB's admission control system uses a threshold of 1000
		//     files to start throttling writes to Pebble. Using 500 here gives
		//     us headroom between when Pebble should start compacting L0 and
		//     when the admission control threshold is reached.
		//
		// We can revisit this default in the future based on better
		// experimental understanding.
		//
		// TODO(jackson): Experiment with slightly lower thresholds [or higher
		// admission control thresholds] to see whether a higher L0 score at the
		// threshold (currently 2.0) is necessary for some workloads to avoid
		// starving L0 in favor of lower-level compactions.
		o.L0CompactionFileThreshold = 500
	}
	if o.L0StopWritesThreshold <= 0 {
		o.L0StopWritesThreshold = 12
	}
	if o.LBaseMaxBytes <= 0 {
		o.LBaseMaxBytes = 64 << 20 // 64 MB
	}
	// Treat an empty but non-nil Levels slice the same as a nil one. Without
	// this, the o.Levels[0] access used for the FlushSplitBytes default below
	// would panic with an index-out-of-range error.
	if len(o.Levels) == 0 {
		o.Levels = make([]LevelOptions, 1)
	}
	for i := range o.Levels {
		o.Levels[i].EnsureDefaults()
	}
	if o.Logger == nil {
		o.Logger = DefaultLogger
	}
	o.EventListener.EnsureDefaults(o.Logger)
	if o.MaxManifestFileSize == 0 {
		o.MaxManifestFileSize = 128 << 20 // 128 MB
	}
	if o.MaxOpenFiles == 0 {
		o.MaxOpenFiles = 1000
	}
	if o.MemTableSize <= 0 {
		o.MemTableSize = 4 << 20 // 4 MB
	}
	if o.MemTableStopWritesThreshold <= 0 {
		o.MemTableStopWritesThreshold = 2
	}
	if o.Merger == nil {
		o.Merger = DefaultMerger
	}
	// The logInit flag guards the logger wrapping so that it is performed at
	// most once per Options value, even if EnsureDefaults is called repeatedly.
	if !o.private.logInit {
		o.Logger = base.NewLogger(o.Logger, o.LogTag)
		if o.Verbose {
			// Verbose replaces the configured listener with the logging one.
			o.EventListener = MakeLoggingEventListener(o.Logger)
		} else {
			o.EventListener.EnsureDefaults(o.Logger)
		}
		o.private.logInit = true
	}
	// strictWALTail is unconditionally enabled here.
	o.private.strictWALTail = true
	if o.MaxConcurrentCompactions == nil {
		o.MaxConcurrentCompactions = func() int { return 1 }
	}
	if o.NumPrevManifest <= 0 {
		o.NumPrevManifest = 1
	}
	if o.FormatMajorVersion == FormatDefault {
		o.FormatMajorVersion = FormatMostCompatible
	}
	if o.FS == nil {
		// Default to the OS filesystem wrapped with disk-health checking. The
		// callback forwards slow-disk reports to the event listener; the
		// returned closer is kept so it can be invoked once the DB is closed.
		o.FS, o.private.fsCloser = vfs.WithDiskHealthChecks(vfs.Default, 5*time.Second,
			func(name string, duration time.Duration) {
				o.EventListener.DiskSlow(DiskSlowInfo{
					Path:     name,
					Duration: duration,
				})
			})
	}
	if o.FlushSplitBytes <= 0 {
		// Levels is guaranteed to be non-empty at this point.
		o.FlushSplitBytes = 2 * o.Levels[0].TargetFileSize
	}
	if o.Experimental.ReadCompactionRate == 0 {
		o.Experimental.ReadCompactionRate = 16000
	}
	if o.Experimental.ReadSamplingMultiplier == 0 {
		o.Experimental.ReadSamplingMultiplier = 1 << 4
	}
	if o.Experimental.TableCacheShards <= 0 {
		o.Experimental.TableCacheShards = runtime.GOMAXPROCS(0)
	}
	if o.KvCheckExpireFunc == nil {
		// Default expiry check never reports a key/value pair as expired.
		o.KvCheckExpireFunc = func([]byte, []byte) bool { return false }
	}
	o.initMaps()
	return o
}
// equal returns the key-equality function to use: the Comparer's Equal when
// one is configured, and bytes.Equal otherwise.
func (o *Options) equal() Equal {
	if eq := o.Comparer.Equal; eq != nil {
		return eq
	}
	return bytes.Equal
}
// initMaps registers the filter policy of every configured level in the
// Filters map, keyed by the policy's name. The map is allocated lazily on the
// first non-nil policy, and an entry that already exists under a given name is
// left untouched.
func (o *Options) initMaps() {
	for idx := range o.Levels {
		policy := o.Levels[idx].FilterPolicy
		if policy == nil {
			continue
		}
		if o.Filters == nil {
			o.Filters = make(map[string]FilterPolicy)
		}
		key := policy.Name()
		if _, exists := o.Filters[key]; exists {
			continue
		}
		o.Filters[key] = policy
	}
}
// Level returns the LevelOptions for the specified level. Levels that have an
// explicit entry in o.Levels return that entry. Deeper levels return a copy of
// the last configured entry with TargetFileSize doubled once for every level
// beyond it.
func (o *Options) Level(level int) LevelOptions {
	last := len(o.Levels) - 1
	if level <= last {
		return o.Levels[level]
	}
	opts := o.Levels[last]
	for doublings := level - last; doublings > 0; doublings-- {
		opts.TargetFileSize *= 2
	}
	return opts
}
// Clone returns a shallow copy of the options: the struct is copied by value,
// so any slices, maps, pointers and function values it holds are shared with
// the original. Calling Clone on a nil receiver returns a new zero-valued
// Options.
func (o *Options) Clone() *Options {
	if o == nil {
		return &Options{}
	}
	dup := *o
	return &dup
}
// filterPolicyName returns the name reported by the filter policy, or the
// literal "none" when no policy is set.
func filterPolicyName(p FilterPolicy) string {
	if p != nil {
		return p.Name()
	}
	return "none"
}
func (o *Options) String() string {
var buf bytes.Buffer