diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt index 7f69ae4855f6ce..9fe25c5341576c 100644 --- a/Documentation/git-pack-objects.txt +++ b/Documentation/git-pack-objects.txt @@ -374,6 +374,15 @@ breaking most of the collisions from a similarly-named file appearing in many different directories. At the moment, this version is not allowed when writing reachability bitmap files with `--write-bitmap-index` and it will be automatically changed to version `1`. ++ +The name hash version `3` abandons the locality features of versions `1` +and `2` in favor of minimizing collisions. The goal here is to separate +objects by their full path and abandon hope for cross-path delta +compression. For this reason, this option is preferred for repacking large +repositories with many versions and many name hash collisions when using +the first two versions. At the moment, this version is not allowed when +writing reachability bitmap files with `--write-bitmap-index` and it will +be automatically changed to version `1`. DELTA ISLANDS diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 4d10baf7ac9822..8297af1a272ca3 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -270,7 +270,7 @@ static int name_hash_version = -1; static void validate_name_hash_version(void) { - if (name_hash_version < 1 || name_hash_version > 2) + if (name_hash_version < 1 || name_hash_version > 3) die(_("invalid --name-hash-version option: %d"), name_hash_version); } @@ -292,6 +292,9 @@ static inline uint32_t pack_name_hash_fn(const char *name) case 2: return pack_name_hash_v2((const unsigned char *)name); + case 3: + return pack_name_hash_v3(name); + default: BUG("invalid name-hash version: %d", name_hash_version); } diff --git a/pack-objects.h b/pack-objects.h index 681c111648664d..2b20a56fc99399 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -235,6 +235,32 @@ static inline uint32_t pack_name_hash_v2(const unsigned char *name) return (base >> 6) ^ hash; } +static inline uint32_t pack_name_hash_v3(const char *name) +{ + /* + * This 'bigp' value is a large prime, at least 25% of the max + * value of an uint32_t. Multiplying by this value (modulo 2^32) + * causes the 32 bits to change pseudo-randomly. + */ + const uint32_t bigp = 1234572167U; + uint32_t c, hash = bigp; + + if (!name) + return 0; + + /* + * Do the simplest thing that will resemble pseudo-randomness: add + * random multiples of a large prime number with a binary shift. + * The goal is not to be cryptographic, but to be generally + * uniformly distributed. + */ + while ((c = *name++) != 0) { + hash += c * bigp; + hash = (hash >> 5) | (hash << 27); + } + return hash; +} + static inline enum object_type oe_type(const struct object_entry *e) { return e->type_valid ? e->type_ : OBJ_BAD; diff --git a/t/helper/test-name-hash.c b/t/helper/test-name-hash.c index af1d52de101438..cc5acd58a658b9 100644 --- a/t/helper/test-name-hash.c +++ b/t/helper/test-name-hash.c @@ -15,6 +15,7 @@ int cmd__name_hash(int argc UNUSED, const char **argv UNUSED) while (!strbuf_getline(&line, stdin)) { printf("%10u ", pack_name_hash(line.buf)); printf("%10u ", pack_name_hash_v2((unsigned const char *)line.buf)); + printf("%10u ", pack_name_hash_v3(line.buf)); printf("%s\n", line.buf); } diff --git a/t/perf/p5313-pack-objects.sh b/t/perf/p5313-pack-objects.sh index be5229a0ecdcf5..493872e656d295 100755 --- a/t/perf/p5313-pack-objects.sh +++ b/t/perf/p5313-pack-objects.sh @@ -25,7 +25,7 @@ test_expect_success 'create rev input' ' EOF ' -for version in 1 2 +for version in 1 2 3 do export version diff --git a/t/perf/p5314-name-hash.sh b/t/perf/p5314-name-hash.sh index 4ef0ba771143d8..e58a218d1ae4e9 100755 --- a/t/perf/p5314-name-hash.sh +++ b/t/perf/p5314-name-hash.sh @@ -14,7 +14,7 @@ test_size 'paths at head' ' test-tool name-hash name-hashes ' -for version in 1 2 +for version in 1 2 3 do test_size "distinct hash value: v$version" ' awk "{ print \$$version; }" err && test_grep "invalid --name-hash-version option" err || return 1 diff --git a/t/t5310-pack-bitmaps.sh b/t/t5310-pack-bitmaps.sh index 871ce01401a11e..2bf75e2a5d0e6a 100755 --- a/t/t5310-pack-bitmaps.sh +++ b/t/t5310-pack-bitmaps.sh @@ -45,13 +45,13 @@ test_expect_success 'name-hash value stability' ' test-tool name-hash out && cat >expect <<-\EOF && - 2582249472 1763573760 first - 2289942528 1188134912 second - 2300837888 1130758144 third - 2544516325 3963087891 a/one-long-enough-for-collisions - 2544516325 4013419539 b/two-long-enough-for-collisions - 1420111091 1709547268 many/parts/to/this/path/enough/to/collide/in/v2 - 1420111091 1709547268 enough/parts/to/this/path/enough/to/collide/in/v2 + 2582249472 1763573760 3109209818 first + 2289942528 1188134912 3781118409 second + 2300837888 1130758144 3028707182 third + 2544516325 3963087891 3586976147 a/one-long-enough-for-collisions + 2544516325 4013419539 1701624798 b/two-long-enough-for-collisions + 1420111091 1709547268 2676129939 many/parts/to/this/path/enough/to/collide/in/v2 + 1420111091 1709547268 2740459187 enough/parts/to/this/path/enough/to/collide/in/v2 EOF test_cmp expect out diff --git a/t/t5616-partial-clone.sh b/t/t5616-partial-clone.sh index c53e93be2f7779..4e7863af9e0af2 100755 --- a/t/t5616-partial-clone.sh +++ b/t/t5616-partial-clone.sh @@ -516,7 +516,18 @@ test_expect_success 'fetch lazy-fetches only to resolve deltas' ' # Exercise to make sure it works. Git will not fetch anything from the # promisor remote other than for the big tree (because it needs to # resolve the delta). - GIT_TRACE_PACKET="$(pwd)/trace" git -C client \ + # + # TODO: the --name-hash-version option is disabled here, since this test + # is fundamentally broken! When GIT_TEST_NAME_HASH_VERSION=3, the server + # recognizes delta bases in a different way and then sends a _blob_ to + # the client with a delta base that the client does not have! This is + # because the client is cloned from "promisor-server" with tree:0 but + # is now fetching from "server" without any filter. This is violating the + # promise to the server that all reachable objects exist and could be + # used as delta bases! + GIT_TRACE_PACKET="$(pwd)/trace" \ + GIT_TEST_NAME_HASH_VERSION=1 \ + git -C client \ fetch "file://$(pwd)/server" main && # Verify the assumption that the client needed to fetch the delta base @@ -535,7 +546,18 @@ test_expect_success 'fetch lazy-fetches only to resolve deltas, protocol v2' ' # Exercise to make sure it works. Git will not fetch anything from the # promisor remote other than for the big blob (because it needs to # resolve the delta). - GIT_TRACE_PACKET="$(pwd)/trace" git -C client \ + # + # TODO: the --name-hash-verion option is disabled here, since this test + # is fundamentally broken! When GIT_TEST_NAME_HASH_VERSION=3, the server + # recognizes delta bases in a different way and then sends a _blob_ to + # the client with a delta base that the client does not have! This is + # because the client is cloned from "promisor-server" with tree:0 but + # is now fetching from "server" without any filter. This is violating the + # promise to the server that all reachable objects exist and could be + # used as delta bases! + GIT_TRACE_PACKET="$(pwd)/trace" \ + GIT_TEST_NAME_HASH_VERSION=1 \ + git -C client \ fetch "file://$(pwd)/server" main && # Verify that protocol version 2 was used.