001/*
002 * Copyright (C) 2011 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015package com.google.common.hash;
016
017import static com.google.common.base.Preconditions.checkArgument;
018
019import com.google.common.annotations.Beta;
020import com.google.common.annotations.VisibleForTesting;
021import com.google.common.base.Supplier;
022
023import java.nio.ByteBuffer;
024import java.security.MessageDigest;
025import java.util.Iterator;
026import java.util.zip.Adler32;
027import java.util.zip.CRC32;
028import java.util.zip.Checksum;
029
030import javax.annotation.Nullable;
031
032/**
033 * Static methods to obtain {@link HashFunction} instances, and other static hashing-related
034 * utilities.
035 *
036 * <p>A comparison of the various hash functions can be found
037 * <a href="http://goo.gl/jS7HH">here</a>.
038 *
039 * @author Kevin Bourrillion
040 * @author Dimitris Andreou
041 * @author Kurt Alfred Kluever
042 * @since 11.0
043 */
044@Beta
045public final class Hashing {
046  private Hashing() {}
047
048  /**
049   * Used to randomize {@link #goodFastHash} instances, so that programs which persist anything
050   * dependent on hashcodes of those, will fail sooner than later.
051   */
052  private static final int GOOD_FAST_HASH_SEED = (int) System.currentTimeMillis();
053
054  // Used by goodFastHash when minimumBits == 32.
055  private static final HashFunction GOOD_FAST_HASH_FUNCTION_32 = murmur3_32(GOOD_FAST_HASH_SEED);
056
057  // Used by goodFastHash when 32 < minimumBits <= 128.
058  private static final HashFunction GOOD_FAST_HASH_FUNCTION_128 = murmur3_128(GOOD_FAST_HASH_SEED);
059
060  /**
061   * Returns a general-purpose, <b>non-cryptographic-strength</b>, streaming hash function that
062   * produces hash codes of length at least {@code minimumBits}. Users without specific
063   * compatibility requirements and who do not persist the hash codes are encouraged to
064   * choose this hash function.
065   *
066   * <p>Repeated calls to {@link #goodFastHash} with the same {@code minimumBits} value will
067   * return {@link HashFunction} instances with identical behavior (but not necessarily the
068   * same instance) for the duration of the current virtual machine.
069   *
070   * <p><b>Warning: the implementation is unspecified and is subject to change.</b>
071   *
072   * @throws IllegalArgumentException if {@code minimumBits} is not positive
073   */
074  public static HashFunction goodFastHash(int minimumBits) {
075    int bits = checkPositiveAndMakeMultipleOf32(minimumBits);
076
077    if (bits == 32) {
078      return GOOD_FAST_HASH_FUNCTION_32;
079    }
080    if (bits <= 128) {
081      return GOOD_FAST_HASH_FUNCTION_128;
082    }
083
084    // Otherwise, join together some 128-bit murmur3s
085    int hashFunctionsNeeded = (bits + 127) / 128;
086    HashFunction[] hashFunctions = new HashFunction[hashFunctionsNeeded];
087    hashFunctions[0] = GOOD_FAST_HASH_FUNCTION_128;
088    int seed = GOOD_FAST_HASH_SEED;
089    for (int i = 1; i < hashFunctionsNeeded; i++) {
090      seed += 1500450271; // a prime; shouldn't matter
091      hashFunctions[i] = murmur3_128(seed);
092    }
093    return new ConcatenatedHashFunction(hashFunctions);
094  }
095
096  /**
097   * Returns a hash function implementing the
098   * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp">
099   * 32-bit murmur3 algorithm, x86 variant</a> (little-endian variant),
100   * using the given seed value.
101   *
102   * <p>The exact C++ equivalent is the MurmurHash3_x86_32 function (Murmur3A).
103   */
104  public static HashFunction murmur3_32(int seed) {
105    return new Murmur3_32HashFunction(seed);
106  }
107
108  /**
109   * Returns a hash function implementing the
110   * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp">
111   * 32-bit murmur3 algorithm, x86 variant</a> (little-endian variant),
112   * using a seed value of zero.
113   *
114   * <p>The exact C++ equivalent is the MurmurHash3_x86_32 function (Murmur3A).
115   */
116  public static HashFunction murmur3_32() {
117    return MURMUR3_32;
118  }
119
120  private static final HashFunction MURMUR3_32 = new Murmur3_32HashFunction(0);
121
122  /**
123   * Returns a hash function implementing the
124   * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp">
125   * 128-bit murmur3 algorithm, x64 variant</a> (little-endian variant),
126   * using the given seed value.
127   *
128   * <p>The exact C++ equivalent is the MurmurHash3_x64_128 function (Murmur3F).
129   */
130  public static HashFunction murmur3_128(int seed) {
131    return new Murmur3_128HashFunction(seed);
132  }
133
134  /**
135   * Returns a hash function implementing the
136   * <a href="http://smhasher.googlecode.com/svn/trunk/MurmurHash3.cpp">
137   * 128-bit murmur3 algorithm, x64 variant</a> (little-endian variant),
138   * using a seed value of zero.
139   *
140   * <p>The exact C++ equivalent is the MurmurHash3_x64_128 function (Murmur3F).
141   */
142  public static HashFunction murmur3_128() {
143    return MURMUR3_128;
144  }
145
146  private static final HashFunction MURMUR3_128 = new Murmur3_128HashFunction(0);
147
148  /**
149   * Returns a hash function implementing the MD5 hash algorithm (128 hash bits) by delegating to
150   * the MD5 {@link MessageDigest}.
151   */
152  public static HashFunction md5() {
153    return MD5;
154  }
155
156  private static final HashFunction MD5 = new MessageDigestHashFunction("MD5", "Hashing.md5()");
157
158  /**
159   * Returns a hash function implementing the SHA-1 algorithm (160 hash bits) by delegating to the
160   * SHA-1 {@link MessageDigest}.
161   */
162  public static HashFunction sha1() {
163    return SHA_1;
164  }
165
166  private static final HashFunction SHA_1 =
167      new MessageDigestHashFunction("SHA-1", "Hashing.sha1()");
168
169  /**
170   * Returns a hash function implementing the SHA-256 algorithm (256 hash bits) by delegating to
171   * the SHA-256 {@link MessageDigest}.
172   */
173  public static HashFunction sha256() {
174    return SHA_256;
175  }
176
177  private static final HashFunction SHA_256 =
178      new MessageDigestHashFunction("SHA-256", "Hashing.sha256()");
179
180  /**
181   * Returns a hash function implementing the SHA-512 algorithm (512 hash bits) by delegating to the
182   * SHA-512 {@link MessageDigest}.
183   */
184  public static HashFunction sha512() {
185    return SHA_512;
186  }
187
188  private static final HashFunction SHA_512 =
189      new MessageDigestHashFunction("SHA-512", "Hashing.sha512()");
190
191  /**
192   * Returns a hash function implementing the CRC-32 checksum algorithm (32 hash bits) by delegating
193   * to the {@link CRC32} {@link Checksum}.
194   *
195   * <p>To get the {@code long} value equivalent to {@link Checksum#getValue()} for a
196   * {@code HashCode} produced by this function, use {@link HashCode#padToLong()}.
197   *
198   * @since 14.0
199   */
200  public static HashFunction crc32() {
201    return CRC_32;
202  }
203
204  private static final HashFunction CRC_32 =
205      checksumHashFunction(ChecksumType.CRC_32, "Hashing.crc32()");
206
207  /**
208   * Returns a hash function implementing the Adler-32 checksum algorithm (32 hash bits) by
209   * delegating to the {@link Adler32} {@link Checksum}.
210   *
211   * <p>To get the {@code long} value equivalent to {@link Checksum#getValue()} for a
212   * {@code HashCode} produced by this function, use {@link HashCode#padToLong()}.
213   *
214   * @since 14.0
215   */
216  public static HashFunction adler32() {
217    return ADLER_32;
218  }
219
220  private static final HashFunction ADLER_32 =
221      checksumHashFunction(ChecksumType.ADLER_32, "Hashing.adler32()");
222
223  private static HashFunction checksumHashFunction(ChecksumType type, String toString) {
224    return new ChecksumHashFunction(type, type.bits, toString);
225  }
226
227  enum ChecksumType implements Supplier<Checksum> {
228    CRC_32(32) {
229      @Override
230      public Checksum get() {
231        return new CRC32();
232      }
233    },
234    ADLER_32(32) {
235      @Override
236      public Checksum get() {
237        return new Adler32();
238      }
239    };
240
241    private final int bits;
242
243    ChecksumType(int bits) {
244      this.bits = bits;
245    }
246
247    @Override
248    public abstract Checksum get();
249  }
250
251  // Lazy initialization holder class idiom.
252  // TODO(user): Investigate whether we need to still use this idiom now that we have a fallback
253  // option for our use of Unsafe.
254
255  /**
256   * Assigns to {@code hashCode} a "bucket" in the range {@code [0, buckets)}, in a uniform
257   * manner that minimizes the need for remapping as {@code buckets} grows. That is,
258   * {@code consistentHash(h, n)} equals:
259   *
260   * <ul>
261   * <li>{@code n - 1}, with approximate probability {@code 1/n}
262   * <li>{@code consistentHash(h, n - 1)}, otherwise (probability {@code 1 - 1/n})
263   * </ul>
264   *
265   * <p>See the <a href="http://en.wikipedia.org/wiki/Consistent_hashing">wikipedia
266   * article on consistent hashing</a> for more information.
267   * <p>
268   * If you might want to have weights for the buckets in the future, take a look at
269   * {@code weightedConsistentHash}.
270   */
271  public static int consistentHash(HashCode hashCode, int buckets) {
272    return consistentHash(hashCode.padToLong(), buckets);
273  }
274
275  /**
276   * Assigns to {@code input} a "bucket" in the range {@code [0, buckets)}, in a uniform
277   * manner that minimizes the need for remapping as {@code buckets} grows. That is,
278   * {@code consistentHash(h, n)} equals:
279   *
280   * <ul>
281   * <li>{@code n - 1}, with approximate probability {@code 1/n}
282   * <li>{@code consistentHash(h, n - 1)}, otherwise (probability {@code 1 - 1/n})
283   * </ul>
284   *
285   * <p>See the <a href="http://en.wikipedia.org/wiki/Consistent_hashing">wikipedia
286   * article on consistent hashing</a> for more information.
287   * <p>
288   * If you might want to have weights for the buckets in the future, take a look at
289   * {@code weightedConsistentHash}.
290   */
291  public static int consistentHash(long input, int buckets) {
292    checkArgument(buckets > 0, "buckets must be positive: %s", buckets);
293    LinearCongruentialGenerator generator = new LinearCongruentialGenerator(input);
294    int candidate = 0;
295    int next;
296
297    // Jump from bucket to bucket until we go out of range
298    while (true) {
299      next = (int) ((candidate + 1) / generator.nextDouble());
300      if (next >= 0 && next < buckets) {
301        candidate = next;
302      } else {
303        return candidate;
304      }
305    }
306  }
307
308  /**
309   * Returns a hash code, having the same bit length as each of the input hash codes,
310   * that combines the information of these hash codes in an ordered fashion. That
311   * is, whenever two equal hash codes are produced by two calls to this method, it
312   * is <i>as likely as possible</i> that each was computed from the <i>same</i>
313   * input hash codes in the <i>same</i> order.
314   *
315   * @throws IllegalArgumentException if {@code hashCodes} is empty, or the hash codes
316   *     do not all have the same bit length
317   */
318  public static HashCode combineOrdered(Iterable<HashCode> hashCodes) {
319    Iterator<HashCode> iterator = hashCodes.iterator();
320    checkArgument(iterator.hasNext(), "Must be at least 1 hash code to combine.");
321    int bits = iterator.next().bits();
322    byte[] resultBytes = new byte[bits / 8];
323    for (HashCode hashCode : hashCodes) {
324      byte[] nextBytes = hashCode.asBytes();
325      checkArgument(nextBytes.length == resultBytes.length,
326          "All hashcodes must have the same bit length.");
327      for (int i = 0; i < nextBytes.length; i++) {
328        resultBytes[i] = (byte) (resultBytes[i] * 37 ^ nextBytes[i]);
329      }
330    }
331    return HashCodes.fromBytesNoCopy(resultBytes);
332  }
333
334  /**
335   * Returns a hash code, having the same bit length as each of the input hash codes,
336   * that combines the information of these hash codes in an unordered fashion. That
337   * is, whenever two equal hash codes are produced by two calls to this method, it
338   * is <i>as likely as possible</i> that each was computed from the <i>same</i>
339   * input hash codes in <i>some</i> order.
340   *
341   * @throws IllegalArgumentException if {@code hashCodes} is empty, or the hash codes
342   *     do not all have the same bit length
343   */
344  public static HashCode combineUnordered(Iterable<HashCode> hashCodes) {
345    Iterator<HashCode> iterator = hashCodes.iterator();
346    checkArgument(iterator.hasNext(), "Must be at least 1 hash code to combine.");
347    byte[] resultBytes = new byte[iterator.next().bits() / 8];
348    for (HashCode hashCode : hashCodes) {
349      byte[] nextBytes = hashCode.asBytes();
350      checkArgument(nextBytes.length == resultBytes.length,
351          "All hashcodes must have the same bit length.");
352      for (int i = 0; i < nextBytes.length; i++) {
353        resultBytes[i] += nextBytes[i];
354      }
355    }
356    return HashCodes.fromBytesNoCopy(resultBytes);
357  }
358
359  /**
360   * Checks that the passed argument is positive, and ceils it to a multiple of 32.
361   */
362  static int checkPositiveAndMakeMultipleOf32(int bits) {
363    checkArgument(bits > 0, "Number of bits must be positive");
364    return (bits + 31) & ~31;
365  }
366
367  // TODO(kevinb): Maybe expose this class via a static Hashing method?
368  @VisibleForTesting
369  static final class ConcatenatedHashFunction extends AbstractCompositeHashFunction {
370    private final int bits;
371
372    ConcatenatedHashFunction(HashFunction... functions) {
373      super(functions);
374      int bitSum = 0;
375      for (HashFunction function : functions) {
376        bitSum += function.bits();
377      }
378      this.bits = bitSum;
379    }
380
381    @Override
382    HashCode makeHash(Hasher[] hashers) {
383      // TODO(user): Get rid of the ByteBuffer here?
384      byte[] bytes = new byte[bits / 8];
385      ByteBuffer buffer = ByteBuffer.wrap(bytes);
386      for (Hasher hasher : hashers) {
387        buffer.put(hasher.hash().asBytes());
388      }
389      return HashCodes.fromBytesNoCopy(bytes);
390    }
391
392    @Override
393    public int bits() {
394      return bits;
395    }
396
397    @Override
398    public boolean equals(@Nullable Object object) {
399      if (object instanceof ConcatenatedHashFunction) {
400        ConcatenatedHashFunction other = (ConcatenatedHashFunction) object;
401        if (bits != other.bits || functions.length != other.functions.length) {
402          return false;
403        }
404        for (int i = 0; i < functions.length; i++) {
405          if (!functions[i].equals(other.functions[i])) {
406            return false;
407          }
408        }
409        return true;
410      }
411      return false;
412    }
413
414    @Override
415    public int hashCode() {
416      int hash = bits;
417      for (HashFunction function : functions) {
418        hash ^= function.hashCode();
419      }
420      return hash;
421    }
422  }
423
424  /**
425   * Linear CongruentialGenerator to use for consistent hashing.
426   * See http://en.wikipedia.org/wiki/Linear_congruential_generator
427   */
428  private static final class LinearCongruentialGenerator {
429    private long state;
430
431    public LinearCongruentialGenerator(long seed) {
432      this.state = seed;
433    }
434
435    public double nextDouble() {
436      state = 2862933555777941757L * state + 1;
437      return ((double) ((int) (state >>> 33) + 1)) / (0x1.0p31);
438    }
439  }
440}