Documentation for design of new extent format (in dev branch):
/*
* In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
* preceded by checksum/compression information (bch_extent_crc32 or
* bch_extent_crc64).
*
* One major determining factor in the format of extents is how we handle and
* represent extents that have been partially overwritten and thus trimmed:
*
* If an extent is not checksummed or compressed, when the extent is trimmed we
* don't have to remember the extent we originally allocated and wrote: we can
* merely adjust ptr->offset to point to the start of the data that is
* currently live. The size field in struct bkey records the current (live)
* size of the extent, and is also used to mean "size of region on disk that we
* point to" in this case.
*
* Thus an extent that is not checksummed or compressed will consist only of a
* list of bch_extent_ptrs, with none of the fields in
* bch_extent_crc32/bch_extent_crc64.
*
* When an extent is checksummed or compressed, it's not possible to read only
* the data that is currently live: we have to read the entire extent that was
* originally written, and then return only the part of the extent that is
* currently live.
*
* Thus, in addition to the current size of the extent in struct bkey, we need
* to store the size of the originally allocated space - this is the
* compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
* when the extent is trimmed, instead of modifying the offset field of the
* pointer, we keep a second smaller offset field - "offset into the original
* extent of the currently live region".
*
* The other major determining factor is replication and data migration:
*
* Each pointer may have its own bch_extent_crc32/64. When doing a replicated
* write, we will initially write all the replicas in the same format, with the
* same checksum type and compression format - however, when copygc runs later (or
* tiering/cache promotion, anything that moves data), it is not in general
* going to rewrite all the pointers at once - one of the replicas may be in a
* bucket on one device that has very little fragmentation while another lives
* in a bucket that has become heavily fragmented, and thus is being rewritten
* sooner than the rest.
*
* Thus it will only move a subset of the pointers (or in the case of
* tiering/cache promotion perhaps add a single pointer without dropping any
* current pointers), and if the extent has been partially overwritten it must
* write only the currently live portion (or copygc would not be able to reduce
* fragmentation!) - which necessitates a different bch_extent_crc format for
* the new pointer.
*
* But in the interests of space efficiency, we don't want to store one
* bch_extent_crc for each pointer if we don't have to.
*
* Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
* bch_extent_ptrs appended arbitrarily one after the other. We determine the
* type of a given entry with a scheme similar to utf8, encoding the type in the
* position of the first set bit:
*
* bch_extent_crc32 - field_type 1   (binary; first set bit is bit 0)
* bch_extent_ptr   - field_type 10  (binary; first set bit is bit 1)
* bch_extent_crc64 - field_type 100 (binary; first set bit is bit 2)
*
* We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
* bch_extent_crc64 is the least constrained).
*
* Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
* until the next bch_extent_crc32/64.
*
* If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
* is neither checksummed nor compressed.
*/
/*
 * Compact checksum/compression entry: one 32-bit word of bitfields
 * (1 + 8 + 8 + 7 + 4 + 4 = 32 bits) followed by a 32-bit checksum.
 *
 * field_type        - entry type tag; for this struct it is the single bit 1
 * compressed_size,
 * uncompressed_size - size of the extent as it was originally written (the
 *                     size in struct bkey only covers the live part)
 * offset            - offset of the currently live region within the
 *                     originally written extent
 * csum_type         - selects the checksum kind stored in csum (encoding not
 *                     visible here)
 * compression_type  - selects the compression format (encoding not visible
 *                     here)
 * csum              - the checksum itself
 *
 * NOTE(review): the units of the size/offset fields are not visible here
 * (presumably sectors) - confirm.
 *
 * The two branches below must be exact mirror images of each other: with
 * __LITTLE_ENDIAN_BITFIELD the first declared field lands in the least
 * significant bits, with __BIG_ENDIAN_BITFIELD in the most significant bits.
 * Declaring the fields in reverse order therefore gives every field the same
 * bit position inside the 32-bit value on both kinds of ABI. In particular
 * compression_type has to come *before* csum_type in the big endian branch;
 * otherwise the two 4-bit fields trade places depending on the machine.
 */
struct bch_extent_crc32 {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u32			field_type:1,
				compressed_size:8,
				uncompressed_size:8,
				offset:7,
				csum_type:4,
				compression_type:4;
#elif defined (__BIG_ENDIAN_BITFIELD)
	__u32			compression_type:4,
				csum_type:4,
				offset:7,
				uncompressed_size:8,
				compressed_size:8,
				field_type:1;
#endif
	__u32			csum;
};

/* 1 << 7 matches the 7-bit offset field of struct bch_extent_crc32. */
#define CRC32_EXTENT_SIZE_MAX	(1U << 7)
/*
 * Wide checksum/compression entry: one 64-bit word of bitfields
 * (3 + 18 + 18 + 17 + 4 + 4 = 64 bits) followed by a 64-bit checksum.
 *
 * field_type        - entry type tag; for this struct it is binary 100
 * compressed_size,
 * uncompressed_size - size of the extent as it was originally written (the
 *                     size in struct bkey only covers the live part)
 * offset            - offset of the currently live region within the
 *                     originally written extent
 * csum_type         - selects the checksum kind stored in csum (encoding not
 *                     visible here)
 * compression_type  - selects the compression format (encoding not visible
 *                     here)
 * csum              - the checksum itself
 *
 * NOTE(review): the units of the size/offset fields are not visible here
 * (presumably sectors) - confirm.
 *
 * The two branches below must be exact mirror images of each other: with
 * __LITTLE_ENDIAN_BITFIELD the first declared field lands in the least
 * significant bits, with __BIG_ENDIAN_BITFIELD in the most significant bits.
 * Declaring the fields in reverse order therefore gives every field the same
 * bit position inside the 64-bit value on both kinds of ABI. In particular
 * compression_type has to come *before* csum_type in the big endian branch;
 * otherwise the two 4-bit fields trade places depending on the machine.
 */
struct bch_extent_crc64 {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u64			field_type:3,
				compressed_size:18,
				uncompressed_size:18,
				offset:17,
				csum_type:4,
				compression_type:4;
#elif defined (__BIG_ENDIAN_BITFIELD)
	__u64			compression_type:4,
				csum_type:4,
				offset:17,
				uncompressed_size:18,
				compressed_size:18,
				field_type:3;
#endif
	__u64			csum;
};

/* 1 << 17 matches the 17-bit offset field of struct bch_extent_crc64. */
#define CRC64_EXTENT_SIZE_MAX	(1U << 17)
/*
 * One pointer entry of an extent: a single 64-bit word of bitfields
 * (2 + 1 + 45 + 8 + 8 = 64 bits), overlaid with _val so the whole entry can
 * also be read or written as one integer.
 *
 * field_type    - entry type tag; binary 10 identifies a pointer
 * erasure_coded - one-bit flag; NOTE(review): its exact meaning is not
 *                 visible here - confirm
 * offset        - where the data starts on the device; for extents that are
 *                 neither checksummed nor compressed this is moved forward
 *                 when the extent is trimmed. NOTE(review): 45 bits being
 *                 "16 petabytes" implies 512-byte units - confirm
 * dev           - device number; PTR_LOST_DEV (255) is a dummy value
 * gen           - NOTE(review): presumably the generation number of the
 *                 bucket pointed to - confirm
 *
 * The big endian branch lists the same fields in exactly the reverse order,
 * so each field keeps the same bit position inside the 64-bit value.
 */
struct bch_extent_ptr {
union {
struct {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 field_type:2,
erasure_coded:1,
offset:45, /* 16 petabytes */
dev:8,
gen:8;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 gen:8,
dev:8,
offset:45,
erasure_coded:1,
field_type:2;
#endif
};
/* The same 64 bits viewed as a single integer. */
__u64 _val;
};
};
/*
 * Build a struct bch_extent_ptr from a generation, an offset and a device
 * number.
 *
 * Arguments wider than their destination bitfield are truncated (offset keeps
 * 45 bits, dev and gen keep 8 bits each). field_type and erasure_coded are
 * left at zero.
 *
 * NOTE(review): a zero field_type does not match the pointer tag (binary 10)
 * given in the format description; presumably callers fill it in before the
 * pointer is stored in a key - confirm.
 */
static inline struct bch_extent_ptr PTR(__u64 gen, __u64 offset, __u64 dev)
{
	/* Start from an entry whose remaining bits are all zero. */
	struct bch_extent_ptr ptr = { .field_type = 0, .erasure_coded = 0 };

	ptr.offset	= offset;
	ptr.dev		= dev;
	ptr.gen		= gen;

	return ptr;
}
/*
 * Dummy device numbers, i.e. values of bch_extent_ptr.dev with a special
 * meaning. 0xff is the largest value the 8-bit dev field can hold.
 */
#define PTR_LOST_DEV		0xff	/* XXX: kill */
/*
 * Key types used for extents.
 *
 * BCH_EXTENT and BCH_EXTENT_CACHED share one and the same value type. The
 * only difference between them is a boolean ("cached") that really belongs in
 * the value but is encoded by overloading the key type instead - which is
 * admittedly something of a hack.
 */
enum {
	BCH_EXTENT		= 128,
	BCH_EXTENT_CACHED	= 129,
};
/*
 * Value of an extent key (key types BCH_EXTENT and BCH_EXTENT_CACHED).
 *
 * The fixed part is just the generic value header; everything after it is
 * variable-length and declared as a C99 flexible array member rather than a
 * GNU zero-length array, so the compiler knows the array has no fixed bound.
 *
 * NOTE(review): per the format description, the trailing data is a sequence
 * of bch_extent_crc32, bch_extent_crc64 and bch_extent_ptr entries appended
 * one after the other, so ptr[i] is not necessarily a pointer entry; the
 * field_type tag in the low bits of each entry says what it is - confirm how
 * callers are expected to walk it.
 */
struct bch_extent {
	struct bch_val		v;

	struct bch_extent_ptr	ptr[];
};
BKEY_VAL_TYPE(extent, BCH_EXTENT);