001/* 002 * Copyright (C) 2011 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.hash; 016 017import static com.google.common.base.Preconditions.checkArgument; 018 019import com.google.common.annotations.Beta; 020import com.google.common.annotations.VisibleForTesting; 021import com.google.common.base.Supplier; 022 023import java.nio.ByteBuffer; 024import java.security.MessageDigest; 025import java.util.Iterator; 026import java.util.zip.Adler32; 027import java.util.zip.CRC32; 028import java.util.zip.Checksum; 029 030import javax.annotation.Nullable; 031 032/** 033 * Static methods to obtain {@link HashFunction} instances, and other static hashing-related 034 * utilities. 035 * 036 * <p>A comparison of the various hash functions can be found 037 * <a href="http://goo.gl/jS7HH">here</a>. 038 * 039 * @author Kevin Bourrillion 040 * @author Dimitris Andreou 041 * @author Kurt Alfred Kluever 042 * @since 11.0 043 */ 044@Beta 045public final class Hashing { 046 private Hashing() {} 047 048 /** 049 * Used to randomize {@link #goodFastHash} instances, so that programs which persist anything 050 * dependent on hashcodes of those, will fail sooner than later. 051 */ 052 private static final int GOOD_FAST_HASH_SEED = (int) System.currentTimeMillis(); 053 054 // Used by goodFastHash when minimumBits == 32. 055 private static final HashFunction GOOD_FAST_HASH_FUNCTION_32 = murmur3_32(GOOD_FAST_HASH_SEED); 056 057 // Used by goodFastHash when 32 < minimumBits <= 128. 058 private static final HashFunction GOOD_FAST_HASH_FUNCTION_128 = murmur3_128(GOOD_FAST_HASH_SEED); 059 060 /** 061 * Returns a general-purpose, <b>non-cryptographic-strength</b>, streaming hash function that 062 * produces hash codes of length at least {@code minimumBits}. Users without specific 063 * compatibility requirements and who do not persist the hash codes are encouraged to 064 * choose this hash function. 065 * 066 * <p>Repeated calls to {@link #goodFastHash} with the same {@code minimumBits} value will 067 * return {@link HashFunction} instances with identical behavior (but not necessarily the 068 * same instance) for the duration of the current virtual machine. 069 * 070 * <p><b>Warning: the implementation is unspecified and is subject to change.</b> 071 * 072 * @throws IllegalArgumentException if {@code minimumBits} is not positive 073 */ 074 public static HashFunction goodFastHash(int minimumBits) { 075 int bits = checkPositiveAndMakeMultipleOf32(minimumBits); 076 077 if (bits == 32) { 078 return GOOD_FAST_HASH_FUNCTION_32; 079 } 080 if (bits <= 128) { 081 return GOOD_FAST_HASH_FUNCTION_128; 082 } 083 084 // Otherwise, join together some 128-bit murmur3s 085 int hashFunctionsNeeded = (bits + 127) / 128; 086 HashFunction[] hashFunctions = new HashFunction[hashFunctionsNeeded]; 087 hashFunctions[0] = GOOD_FAST_HASH_FUNCTION_128; 088 int seed = GOOD_FAST_HASH_SEED; 089 for (int i = 1; i < hashFunctionsNeeded; i++) { 090 seed += 1500450271; // a prime; shouldn't matter 091 hashFunctions[i] = murmur3_128(seed); 092 } 093 return new ConcatenatedHashFunction(hashFunctions); 094 } 095 096 /** 097 * Returns a hash function implementing the 098 * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp"> 099 * 32-bit murmur3 algorithm, x86 variant</a> (little-endian variant), 100 * using the given seed value. 101 * 102 * <p>The exact C++ equivalent is the MurmurHash3_x86_32 function (Murmur3A). 103 */ 104 public static HashFunction murmur3_32(int seed) { 105 return new Murmur3_32HashFunction(seed); 106 } 107 108 /** 109 * Returns a hash function implementing the 110 * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp"> 111 * 32-bit murmur3 algorithm, x86 variant</a> (little-endian variant), 112 * using a seed value of zero. 113 * 114 * <p>The exact C++ equivalent is the MurmurHash3_x86_32 function (Murmur3A). 115 */ 116 public static HashFunction murmur3_32() { 117 return MURMUR3_32; 118 } 119 120 private static final HashFunction MURMUR3_32 = new Murmur3_32HashFunction(0); 121 122 /** 123 * Returns a hash function implementing the 124 * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp"> 125 * 128-bit murmur3 algorithm, x64 variant</a> (little-endian variant), 126 * using the given seed value. 127 * 128 * <p>The exact C++ equivalent is the MurmurHash3_x64_128 function (Murmur3F). 129 */ 130 public static HashFunction murmur3_128(int seed) { 131 return new Murmur3_128HashFunction(seed); 132 } 133 134 /** 135 * Returns a hash function implementing the 136 * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp"> 137 * 128-bit murmur3 algorithm, x64 variant</a> (little-endian variant), 138 * using a seed value of zero. 139 * 140 * <p>The exact C++ equivalent is the MurmurHash3_x64_128 function (Murmur3F). 141 */ 142 public static HashFunction murmur3_128() { 143 return MURMUR3_128; 144 } 145 146 private static final HashFunction MURMUR3_128 = new Murmur3_128HashFunction(0); 147 148 /** 149 * Returns a hash function implementing the MD5 hash algorithm (128 hash bits) by delegating to 150 * the MD5 {@link MessageDigest}. 151 */ 152 public static HashFunction md5() { 153 return MD5; 154 } 155 156 private static final HashFunction MD5 = new MessageDigestHashFunction("MD5", "Hashing.md5()"); 157 158 /** 159 * Returns a hash function implementing the SHA-1 algorithm (160 hash bits) by delegating to the 160 * SHA-1 {@link MessageDigest}. 161 */ 162 public static HashFunction sha1() { 163 return SHA_1; 164 } 165 166 private static final HashFunction SHA_1 = 167 new MessageDigestHashFunction("SHA-1", "Hashing.sha1()"); 168 169 /** 170 * Returns a hash function implementing the SHA-256 algorithm (256 hash bits) by delegating to 171 * the SHA-256 {@link MessageDigest}. 172 */ 173 public static HashFunction sha256() { 174 return SHA_256; 175 } 176 177 private static final HashFunction SHA_256 = 178 new MessageDigestHashFunction("SHA-256", "Hashing.sha256()"); 179 180 /** 181 * Returns a hash function implementing the SHA-512 algorithm (512 hash bits) by delegating to the 182 * SHA-512 {@link MessageDigest}. 183 */ 184 public static HashFunction sha512() { 185 return SHA_512; 186 } 187 188 private static final HashFunction SHA_512 = 189 new MessageDigestHashFunction("SHA-512", "Hashing.sha512()"); 190 191 /** 192 * Returns a hash function implementing the CRC-32 checksum algorithm (32 hash bits) by delegating 193 * to the {@link CRC32} {@link Checksum}. 194 * 195 * <p>To get the {@code long} value equivalent to {@link Checksum#getValue()} for a 196 * {@code HashCode} produced by this function, use {@link HashCode#padToLong()}. 197 * 198 * @since 14.0 199 */ 200 public static HashFunction crc32() { 201 return CRC_32; 202 } 203 204 private static final HashFunction CRC_32 = 205 checksumHashFunction(ChecksumType.CRC_32, "Hashing.crc32()"); 206 207 /** 208 * Returns a hash function implementing the Adler-32 checksum algorithm (32 hash bits) by 209 * delegating to the {@link Adler32} {@link Checksum}. 210 * 211 * <p>To get the {@code long} value equivalent to {@link Checksum#getValue()} for a 212 * {@code HashCode} produced by this function, use {@link HashCode#padToLong()}. 213 * 214 * @since 14.0 215 */ 216 public static HashFunction adler32() { 217 return ADLER_32; 218 } 219 220 private static final HashFunction ADLER_32 = 221 checksumHashFunction(ChecksumType.ADLER_32, "Hashing.adler32()"); 222 223 private static HashFunction checksumHashFunction(ChecksumType type, String toString) { 224 return new ChecksumHashFunction(type, type.bits, toString); 225 } 226 227 enum ChecksumType implements Supplier<Checksum> { 228 CRC_32(32) { 229 @Override 230 public Checksum get() { 231 return new CRC32(); 232 } 233 }, 234 ADLER_32(32) { 235 @Override 236 public Checksum get() { 237 return new Adler32(); 238 } 239 }; 240 241 private final int bits; 242 243 ChecksumType(int bits) { 244 this.bits = bits; 245 } 246 247 @Override 248 public abstract Checksum get(); 249 } 250 251 // Lazy initialization holder class idiom. 252 // TODO(user): Investigate whether we need to still use this idiom now that we have a fallback 253 // option for our use of Unsafe. 254 255 /** 256 * Assigns to {@code hashCode} a "bucket" in the range {@code [0, buckets)}, in a uniform 257 * manner that minimizes the need for remapping as {@code buckets} grows. That is, 258 * {@code consistentHash(h, n)} equals: 259 * 260 * <ul> 261 * <li>{@code n - 1}, with approximate probability {@code 1/n} 262 * <li>{@code consistentHash(h, n - 1)}, otherwise (probability {@code 1 - 1/n}) 263 * </ul> 264 * 265 * <p>See the <a href="http://en.wikipedia.org/wiki/Consistent_hashing">wikipedia 266 * article on consistent hashing</a> for more information. 267 * <p> 268 * If you might want to have weights for the buckets in the future, take a look at 269 * {@code weightedConsistentHash}. 270 */ 271 public static int consistentHash(HashCode hashCode, int buckets) { 272 return consistentHash(hashCode.padToLong(), buckets); 273 } 274 275 /** 276 * Assigns to {@code input} a "bucket" in the range {@code [0, buckets)}, in a uniform 277 * manner that minimizes the need for remapping as {@code buckets} grows. That is, 278 * {@code consistentHash(h, n)} equals: 279 * 280 * <ul> 281 * <li>{@code n - 1}, with approximate probability {@code 1/n} 282 * <li>{@code consistentHash(h, n - 1)}, otherwise (probability {@code 1 - 1/n}) 283 * </ul> 284 * 285 * <p>See the <a href="http://en.wikipedia.org/wiki/Consistent_hashing">wikipedia 286 * article on consistent hashing</a> for more information. 287 * <p> 288 * If you might want to have weights for the buckets in the future, take a look at 289 * {@code weightedConsistentHash}. 290 */ 291 public static int consistentHash(long input, int buckets) { 292 checkArgument(buckets > 0, "buckets must be positive: %s", buckets); 293 LinearCongruentialGenerator generator = new LinearCongruentialGenerator(input); 294 int candidate = 0; 295 int next; 296 297 // Jump from bucket to bucket until we go out of range 298 while (true) { 299 next = (int) ((candidate + 1) / generator.nextDouble()); 300 if (next >= 0 && next < buckets) { 301 candidate = next; 302 } else { 303 return candidate; 304 } 305 } 306 } 307 308 /** 309 * Returns a hash code, having the same bit length as each of the input hash codes, 310 * that combines the information of these hash codes in an ordered fashion. That 311 * is, whenever two equal hash codes are produced by two calls to this method, it 312 * is <i>as likely as possible</i> that each was computed from the <i>same</i> 313 * input hash codes in the <i>same</i> order. 314 * 315 * @throws IllegalArgumentException if {@code hashCodes} is empty, or the hash codes 316 * do not all have the same bit length 317 */ 318 public static HashCode combineOrdered(Iterable<HashCode> hashCodes) { 319 Iterator<HashCode> iterator = hashCodes.iterator(); 320 checkArgument(iterator.hasNext(), "Must be at least 1 hash code to combine."); 321 int bits = iterator.next().bits(); 322 byte[] resultBytes = new byte[bits / 8]; 323 for (HashCode hashCode : hashCodes) { 324 byte[] nextBytes = hashCode.asBytes(); 325 checkArgument(nextBytes.length == resultBytes.length, 326 "All hashcodes must have the same bit length."); 327 for (int i = 0; i < nextBytes.length; i++) { 328 resultBytes[i] = (byte) (resultBytes[i] * 37 ^ nextBytes[i]); 329 } 330 } 331 return HashCodes.fromBytesNoCopy(resultBytes); 332 } 333 334 /** 335 * Returns a hash code, having the same bit length as each of the input hash codes, 336 * that combines the information of these hash codes in an unordered fashion. That 337 * is, whenever two equal hash codes are produced by two calls to this method, it 338 * is <i>as likely as possible</i> that each was computed from the <i>same</i> 339 * input hash codes in <i>some</i> order. 340 * 341 * @throws IllegalArgumentException if {@code hashCodes} is empty, or the hash codes 342 * do not all have the same bit length 343 */ 344 public static HashCode combineUnordered(Iterable<HashCode> hashCodes) { 345 Iterator<HashCode> iterator = hashCodes.iterator(); 346 checkArgument(iterator.hasNext(), "Must be at least 1 hash code to combine."); 347 byte[] resultBytes = new byte[iterator.next().bits() / 8]; 348 for (HashCode hashCode : hashCodes) { 349 byte[] nextBytes = hashCode.asBytes(); 350 checkArgument(nextBytes.length == resultBytes.length, 351 "All hashcodes must have the same bit length."); 352 for (int i = 0; i < nextBytes.length; i++) { 353 resultBytes[i] += nextBytes[i]; 354 } 355 } 356 return HashCodes.fromBytesNoCopy(resultBytes); 357 } 358 359 /** 360 * Checks that the passed argument is positive, and ceils it to a multiple of 32. 361 */ 362 static int checkPositiveAndMakeMultipleOf32(int bits) { 363 checkArgument(bits > 0, "Number of bits must be positive"); 364 return (bits + 31) & ~31; 365 } 366 367 // TODO(kevinb): Maybe expose this class via a static Hashing method? 368 @VisibleForTesting 369 static final class ConcatenatedHashFunction extends AbstractCompositeHashFunction { 370 private final int bits; 371 372 ConcatenatedHashFunction(HashFunction... functions) { 373 super(functions); 374 int bitSum = 0; 375 for (HashFunction function : functions) { 376 bitSum += function.bits(); 377 } 378 this.bits = bitSum; 379 } 380 381 @Override 382 HashCode makeHash(Hasher[] hashers) { 383 // TODO(user): Get rid of the ByteBuffer here? 384 byte[] bytes = new byte[bits / 8]; 385 ByteBuffer buffer = ByteBuffer.wrap(bytes); 386 for (Hasher hasher : hashers) { 387 buffer.put(hasher.hash().asBytes()); 388 } 389 return HashCodes.fromBytesNoCopy(bytes); 390 } 391 392 @Override 393 public int bits() { 394 return bits; 395 } 396 397 @Override 398 public boolean equals(@Nullable Object object) { 399 if (object instanceof ConcatenatedHashFunction) { 400 ConcatenatedHashFunction other = (ConcatenatedHashFunction) object; 401 if (bits != other.bits || functions.length != other.functions.length) { 402 return false; 403 } 404 for (int i = 0; i < functions.length; i++) { 405 if (!functions[i].equals(other.functions[i])) { 406 return false; 407 } 408 } 409 return true; 410 } 411 return false; 412 } 413 414 @Override 415 public int hashCode() { 416 int hash = bits; 417 for (HashFunction function : functions) { 418 hash ^= function.hashCode(); 419 } 420 return hash; 421 } 422 } 423 424 /** 425 * Linear CongruentialGenerator to use for consistent hashing. 426 * See http://en.wikipedia.org/wiki/Linear_congruential_generator 427 */ 428 private static final class LinearCongruentialGenerator { 429 private long state; 430 431 public LinearCongruentialGenerator(long seed) { 432 this.state = seed; 433 } 434 435 public double nextDouble() { 436 state = 2862933555777941757L * state + 1; 437 return ((double) ((int) (state >>> 33) + 1)) / (0x1.0p31); 438 } 439 } 440}