001/*
002 * Copyright (C) 2008 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package com.google.common.base;
018
019import static com.google.common.base.Preconditions.checkArgument;
020import static com.google.common.base.Preconditions.checkNotNull;
021
022import com.google.common.annotations.Beta;
023import com.google.common.annotations.GwtCompatible;
024import com.google.common.annotations.GwtIncompatible;
025
026import java.util.Arrays;
027import java.util.BitSet;
028
029import javax.annotation.CheckReturnValue;
030
031/**
032 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
033 * for any {@link Object}. Also offers basic text processing methods based on this function.
034 * Implementations are strongly encouraged to be side-effect-free and immutable.
035 *
036 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean
037 * "any character {@code c} for which {@code this.matches(c)} returns {@code true}".
038 *
039 * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand
040 * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical
041 * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher}
042 * treats these just as two separate characters.
043 *
044 * <p>Example usages: <pre>
045 *   String trimmed = {@link #WHITESPACE WHITESPACE}.{@link #trimFrom trimFrom}(userInput);
046 *   if ({@link #ASCII ASCII}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre>
047 *
048 * <p>See the Guava User Guide article on <a href=
049 * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#CharMatcher">
050 * {@code CharMatcher}</a>.
051 *
052 * @author Kevin Bourrillion
053 * @since 1.0
054 */
055@Beta // Possibly change from chars to code points; decide constants vs. methods
056@GwtCompatible(emulated = true)
057public abstract class CharMatcher implements Predicate<Character> {
058  // Constants
059  /**
060   * Determines whether a character is a breaking whitespace (that is, a whitespace which can be
061   * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a
062   * discussion of that term.
063   *
064   * @since 2.0
065   */
066  public static final CharMatcher BREAKING_WHITESPACE = new CharMatcher() {
067    @Override
068    public boolean matches(char c) {
069      switch (c) {
070        case '\t':
071        case '\n':
072        case '\013':
073        case '\f':
074        case '\r':
075        case ' ':
076        case '\u0085':
077        case '\u1680':
078        case '\u2028':
079        case '\u2029':
080        case '\u205f':
081        case '\u3000':
082          return true;
083        case '\u2007':
084          return false;
085        default:
086          return c >= '\u2000' && c <= '\u200a';
087      }
088    }
089
090    @Override
091    public String toString() {
092      return "CharMatcher.BREAKING_WHITESPACE";
093    }
094  };
095
096  /**
097   * Determines whether a character is ASCII, meaning that its code point is less than 128.
098   */
099  public static final CharMatcher ASCII = inRange('\0', '\u007f', "CharMatcher.ASCII");
100
101  private static class RangesMatcher extends CharMatcher {
102    private final char[] rangeStarts;
103    private final char[] rangeEnds;
104
105    RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) {
106      super(description);
107      this.rangeStarts = rangeStarts;
108      this.rangeEnds = rangeEnds;
109      checkArgument(rangeStarts.length == rangeEnds.length);
110      for (int i = 0; i < rangeStarts.length; i++) {
111        checkArgument(rangeStarts[i] <= rangeEnds[i]);
112        if (i + 1 < rangeStarts.length) {
113          checkArgument(rangeEnds[i] < rangeStarts[i + 1]);
114        }
115      }
116    }
117
118    @Override
119    public boolean matches(char c) {
120      int index = Arrays.binarySearch(rangeStarts, c);
121      if (index >= 0) {
122        return true;
123      } else {
124        index = ~index - 1;
125        return index >= 0 && c <= rangeEnds[index];
126      }
127    }
128  }
129
130  // Must be in ascending order.
131  private static final String ZEROES = "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6"
132      + "\u0c66\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1b50\u1bb0"
133      + "\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10";
134
135  private static final String NINES;
136  static {
137    StringBuilder builder = new StringBuilder(ZEROES.length());
138    for (int i = 0; i < ZEROES.length(); i++) {
139      builder.append((char) (ZEROES.charAt(i) + 9));
140    }
141    NINES = builder.toString();
142  }
143
144  /**
145   * Determines whether a character is a digit according to
146   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>.
147   */
148  public static final CharMatcher DIGIT = new RangesMatcher(
149      "CharMatcher.DIGIT", ZEROES.toCharArray(), NINES.toCharArray());
150
151  /**
152   * Determines whether a character is a digit according to {@link Character#isDigit(char) Java's
153   * definition}. If you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
154   */
155  public static final CharMatcher JAVA_DIGIT = new CharMatcher("CharMatcher.JAVA_DIGIT") {
156    @Override public boolean matches(char c) {
157      return Character.isDigit(c);
158    }
159  };
160
161  /**
162   * Determines whether a character is a letter according to {@link Character#isLetter(char) Java's
163   * definition}. If you only care to match letters of the Latin alphabet, you can use {@code
164   * inRange('a', 'z').or(inRange('A', 'Z'))}.
165   */
166  public static final CharMatcher JAVA_LETTER = new CharMatcher("CharMatcher.JAVA_LETTER") {
167    @Override public boolean matches(char c) {
168      return Character.isLetter(c);
169    }
170  };
171
172  /**
173   * Determines whether a character is a letter or digit according to {@link
174   * Character#isLetterOrDigit(char) Java's definition}.
175   */
176  public static final CharMatcher JAVA_LETTER_OR_DIGIT =
177      new CharMatcher("CharMatcher.JAVA_LETTER_OR_DIGIT") {
178    @Override public boolean matches(char c) {
179      return Character.isLetterOrDigit(c);
180    }
181  };
182
183  /**
184   * Determines whether a character is upper case according to {@link Character#isUpperCase(char)
185   * Java's definition}.
186   */
187  public static final CharMatcher JAVA_UPPER_CASE =
188      new CharMatcher("CharMatcher.JAVA_UPPER_CASE") {
189    @Override public boolean matches(char c) {
190      return Character.isUpperCase(c);
191    }
192  };
193
194  /**
195   * Determines whether a character is lower case according to {@link Character#isLowerCase(char)
196   * Java's definition}.
197   */
198  public static final CharMatcher JAVA_LOWER_CASE =
199      new CharMatcher("CharMatcher.JAVA_LOWER_CASE") {
200    @Override public boolean matches(char c) {
201      return Character.isLowerCase(c);
202    }
203  };
204
205  /**
206   * Determines whether a character is an ISO control character as specified by {@link
207   * Character#isISOControl(char)}.
208   */
209  public static final CharMatcher JAVA_ISO_CONTROL =
210      inRange('\u0000', '\u001f')
211      .or(inRange('\u007f', '\u009f'))
212      .withToString("CharMatcher.JAVA_ISO_CONTROL");
213
214  /**
215   * Determines whether a character is invisible; that is, if its Unicode category is any of
216   * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
217   * PRIVATE_USE according to ICU4J.
218   */
219  public static final CharMatcher INVISIBLE = new RangesMatcher("CharMatcher.INVISIBLE", (
220      "\u0000\u007f\u00ad\u0600\u06dd\u070f\u1680\u180e\u2000\u2028\u205f\u206a\u3000\ud800\ufeff"
221      + "\ufff9\ufffa").toCharArray(), (
222      "\u0020\u00a0\u00ad\u0604\u06dd\u070f\u1680\u180e\u200f\u202f\u2064\u206f\u3000\uf8ff\ufeff"
223      + "\ufff9\ufffb").toCharArray());
224
225  private static String showCharacter(char c) {
226    String hex = "0123456789ABCDEF";
227    char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'};
228    for (int i = 0; i < 4; i++) {
229      tmp[5 - i] = hex.charAt(c & 0xF);
230      c >>= 4;
231    }
232    return String.copyValueOf(tmp);
233
234  }
235
236  /**
237   * Determines whether a character is single-width (not double-width). When in doubt, this matcher
238   * errs on the side of returning {@code false} (that is, it tends to assume a character is
239   * double-width).
240   *
241   * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to
242   * date.
243   */
244  public static final CharMatcher SINGLE_WIDTH = new RangesMatcher("CharMatcher.SINGLE_WIDTH",
245      "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(),
246      "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray());
247
248  /** Matches any character. */
249  public static final CharMatcher ANY =
250      new FastMatcher("CharMatcher.ANY") {
251        @Override public boolean matches(char c) {
252          return true;
253        }
254
255        @Override public int indexIn(CharSequence sequence) {
256          return (sequence.length() == 0) ? -1 : 0;
257        }
258
259        @Override public int indexIn(CharSequence sequence, int start) {
260          int length = sequence.length();
261          Preconditions.checkPositionIndex(start, length);
262          return (start == length) ? -1 : start;
263        }
264
265        @Override public int lastIndexIn(CharSequence sequence) {
266          return sequence.length() - 1;
267        }
268
269        @Override public boolean matchesAllOf(CharSequence sequence) {
270          checkNotNull(sequence);
271          return true;
272        }
273
274        @Override public boolean matchesNoneOf(CharSequence sequence) {
275          return sequence.length() == 0;
276        }
277
278        @Override public String removeFrom(CharSequence sequence) {
279          checkNotNull(sequence);
280          return "";
281        }
282
283        @Override public String replaceFrom(CharSequence sequence, char replacement) {
284          char[] array = new char[sequence.length()];
285          Arrays.fill(array, replacement);
286          return new String(array);
287        }
288
289        @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
290          StringBuilder retval = new StringBuilder(sequence.length() * replacement.length());
291          for (int i = 0; i < sequence.length(); i++) {
292            retval.append(replacement);
293          }
294          return retval.toString();
295        }
296
297        @Override public String collapseFrom(CharSequence sequence, char replacement) {
298          return (sequence.length() == 0) ? "" : String.valueOf(replacement);
299        }
300
301        @Override public String trimFrom(CharSequence sequence) {
302          checkNotNull(sequence);
303          return "";
304        }
305
306        @Override public int countIn(CharSequence sequence) {
307          return sequence.length();
308        }
309
310        @Override public CharMatcher and(CharMatcher other) {
311          return checkNotNull(other);
312        }
313
314        @Override public CharMatcher or(CharMatcher other) {
315          checkNotNull(other);
316          return this;
317        }
318
319        @Override public CharMatcher negate() {
320          return NONE;
321        }
322      };
323
324  /** Matches no characters. */
325  public static final CharMatcher NONE =
326      new FastMatcher("CharMatcher.NONE") {
327        @Override public boolean matches(char c) {
328          return false;
329        }
330
331        @Override public int indexIn(CharSequence sequence) {
332          checkNotNull(sequence);
333          return -1;
334        }
335
336        @Override public int indexIn(CharSequence sequence, int start) {
337          int length = sequence.length();
338          Preconditions.checkPositionIndex(start, length);
339          return -1;
340        }
341
342        @Override public int lastIndexIn(CharSequence sequence) {
343          checkNotNull(sequence);
344          return -1;
345        }
346
347        @Override public boolean matchesAllOf(CharSequence sequence) {
348          return sequence.length() == 0;
349        }
350
351        @Override public boolean matchesNoneOf(CharSequence sequence) {
352          checkNotNull(sequence);
353          return true;
354        }
355
356        @Override public String removeFrom(CharSequence sequence) {
357          return sequence.toString();
358        }
359
360        @Override public String replaceFrom(CharSequence sequence, char replacement) {
361          return sequence.toString();
362        }
363
364        @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
365          checkNotNull(replacement);
366          return sequence.toString();
367        }
368
369        @Override public String collapseFrom(CharSequence sequence, char replacement) {
370          return sequence.toString();
371        }
372
373        @Override public String trimFrom(CharSequence sequence) {
374          return sequence.toString();
375        }
376
377        @Override
378        public String trimLeadingFrom(CharSequence sequence) {
379          return sequence.toString();
380        }
381
382        @Override
383        public String trimTrailingFrom(CharSequence sequence) {
384          return sequence.toString();
385        }
386
387        @Override public int countIn(CharSequence sequence) {
388          checkNotNull(sequence);
389          return 0;
390        }
391
392        @Override public CharMatcher and(CharMatcher other) {
393          checkNotNull(other);
394          return this;
395        }
396
397        @Override public CharMatcher or(CharMatcher other) {
398          return checkNotNull(other);
399        }
400
401        @Override public CharMatcher negate() {
402          return ANY;
403        }
404      };
405
406  // Static factories
407
408  /**
409   * Returns a {@code char} matcher that matches only one specified character.
410   */
411  public static CharMatcher is(final char match) {
412    String description = "CharMatcher.is('" + showCharacter(match) + "')";
413    return new FastMatcher(description) {
414      @Override public boolean matches(char c) {
415        return c == match;
416      }
417
418      @Override public String replaceFrom(CharSequence sequence, char replacement) {
419        return sequence.toString().replace(match, replacement);
420      }
421
422      @Override public CharMatcher and(CharMatcher other) {
423        return other.matches(match) ? this : NONE;
424      }
425
426      @Override public CharMatcher or(CharMatcher other) {
427        return other.matches(match) ? other : super.or(other);
428      }
429
430      @Override public CharMatcher negate() {
431        return isNot(match);
432      }
433
434      @GwtIncompatible("java.util.BitSet")
435      @Override
436      void setBits(BitSet table) {
437        table.set(match);
438      }
439    };
440  }
441
442  /**
443   * Returns a {@code char} matcher that matches any character except the one specified.
444   *
445   * <p>To negate another {@code CharMatcher}, use {@link #negate()}.
446   */
447  public static CharMatcher isNot(final char match) {
448    String description = "CharMatcher.isNot(" + Integer.toHexString(match) + ")";
449    return new FastMatcher(description) {
450      @Override public boolean matches(char c) {
451        return c != match;
452      }
453
454      @Override public CharMatcher and(CharMatcher other) {
455        return other.matches(match) ? super.and(other) : other;
456      }
457
458      @Override public CharMatcher or(CharMatcher other) {
459        return other.matches(match) ? ANY : this;
460      }
461
462      @GwtIncompatible("java.util.BitSet")
463      @Override
464      void setBits(BitSet table) {
465        table.set(0, match);
466        table.set(match + 1, Character.MAX_VALUE + 1);
467      }
468
469      @Override public CharMatcher negate() {
470        return is(match);
471      }
472    };
473  }
474
475  /**
476   * Returns a {@code char} matcher that matches any character present in the given character
477   * sequence.
478   */
479  public static CharMatcher anyOf(final CharSequence sequence) {
480    switch (sequence.length()) {
481      case 0:
482        return NONE;
483      case 1:
484        return is(sequence.charAt(0));
485      case 2:
486        return isEither(sequence.charAt(0), sequence.charAt(1));
487      default:
488        // continue below to handle the general case
489    }
490    // TODO(user): is it potentially worth just going ahead and building a precomputed matcher?
491    final char[] chars = sequence.toString().toCharArray();
492    Arrays.sort(chars);
493    StringBuilder description = new StringBuilder("CharMatcher.anyOf(\"");
494    for (char c : chars) {
495      description.append(showCharacter(c));
496    }
497    description.append("\")");
498    return new CharMatcher(description.toString()) {
499      @Override public boolean matches(char c) {
500        return Arrays.binarySearch(chars, c) >= 0;
501      }
502
503      @Override
504      @GwtIncompatible("java.util.BitSet")
505      void setBits(BitSet table) {
506        for (char c : chars) {
507          table.set(c);
508        }
509      }
510    };
511  }
512
513  private static CharMatcher isEither(
514      final char match1,
515      final char match2) {
516    String description = "CharMatcher.anyOf(\"" +
517        showCharacter(match1) + showCharacter(match2) + "\")";
518    return new FastMatcher(description) {
519      @Override public boolean matches(char c) {
520        return c == match1 || c == match2;
521      }
522
523      @GwtIncompatible("java.util.BitSet")
524      @Override void setBits(BitSet table) {
525        table.set(match1);
526        table.set(match2);
527      }
528    };
529  }
530
531  /**
532   * Returns a {@code char} matcher that matches any character not present in the given character
533   * sequence.
534   */
535  public static CharMatcher noneOf(CharSequence sequence) {
536    return anyOf(sequence).negate();
537  }
538
539  /**
540   * Returns a {@code char} matcher that matches any character in a given range (both endpoints are
541   * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code
542   * CharMatcher.inRange('a', 'z')}.
543   *
544   * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
545   */
546  public static CharMatcher inRange(final char startInclusive, final char endInclusive) {
547    checkArgument(endInclusive >= startInclusive);
548    String description = "CharMatcher.inRange('" +
549        showCharacter(startInclusive) + "', '" +
550        showCharacter(endInclusive) + "')";
551    return inRange(startInclusive, endInclusive, description);
552  }
553
554  static CharMatcher inRange(final char startInclusive, final char endInclusive,
555      String description) {
556    return new FastMatcher(description) {
557      @Override public boolean matches(char c) {
558        return startInclusive <= c && c <= endInclusive;
559      }
560
561      @GwtIncompatible("java.util.BitSet")
562      @Override void setBits(BitSet table) {
563        table.set(startInclusive, endInclusive + 1);
564      }
565    };
566  }
567
568  /**
569   * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but
570   * which operates on primitive {@code char} instances instead.
571   */
572  public static CharMatcher forPredicate(final Predicate<? super Character> predicate) {
573    checkNotNull(predicate);
574    if (predicate instanceof CharMatcher) {
575      return (CharMatcher) predicate;
576    }
577    String description = "CharMatcher.forPredicate(" + predicate + ")";
578    return new CharMatcher(description) {
579      @Override public boolean matches(char c) {
580        return predicate.apply(c);
581      }
582
583      @Override public boolean apply(Character character) {
584        return predicate.apply(checkNotNull(character));
585      }
586    };
587  }
588
589  // State
590  final String description;
591
592  // Constructors
593
594  /**
595   * Sets the {@code toString()} from the given description.
596   */
597  CharMatcher(String description) {
598    this.description = description;
599  }
600
601  /**
602   * Constructor for use by subclasses. When subclassing, you may want to override
603   * {@code toString()} to provide a useful description.
604   */
605  protected CharMatcher() {
606    description = super.toString();
607  }
608
609  // Abstract methods
610
611  /** Determines a true or false value for the given character. */
612  public abstract boolean matches(char c);
613
614  // Non-static factories
615
616  /**
617   * Returns a matcher that matches any character not matched by this matcher.
618   */
619  public CharMatcher negate() {
620    return new NegatedMatcher(this);
621  }
622
623  private static class NegatedMatcher extends CharMatcher {
624    final CharMatcher original;
625
626    NegatedMatcher(String toString, CharMatcher original) {
627      super(toString);
628      this.original = original;
629    }
630
631    NegatedMatcher(CharMatcher original) {
632      this(original + ".negate()", original);
633    }
634
635    @Override public boolean matches(char c) {
636      return !original.matches(c);
637    }
638
639    @Override public boolean matchesAllOf(CharSequence sequence) {
640      return original.matchesNoneOf(sequence);
641    }
642
643    @Override public boolean matchesNoneOf(CharSequence sequence) {
644      return original.matchesAllOf(sequence);
645    }
646
647    @Override public int countIn(CharSequence sequence) {
648      return sequence.length() - original.countIn(sequence);
649    }
650
651    @GwtIncompatible("java.util.BitSet")
652    @Override
653    void setBits(BitSet table) {
654      BitSet tmp = new BitSet();
655      original.setBits(tmp);
656      tmp.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1);
657      table.or(tmp);
658    }
659
660    @Override public CharMatcher negate() {
661      return original;
662    }
663
664    @Override
665    CharMatcher withToString(String description) {
666      return new NegatedMatcher(description, original);
667    }
668  }
669
670  /**
671   * Returns a matcher that matches any character matched by both this matcher and {@code other}.
672   */
673  public CharMatcher and(CharMatcher other) {
674    return new And(this, checkNotNull(other));
675  }
676
677  private static class And extends CharMatcher {
678    final CharMatcher first;
679    final CharMatcher second;
680
681    And(CharMatcher a, CharMatcher b) {
682      this(a, b, "CharMatcher.and(" + a + ", " + b + ")");
683    }
684
685    And(CharMatcher a, CharMatcher b, String description) {
686      super(description);
687      first = checkNotNull(a);
688      second = checkNotNull(b);
689    }
690
691    @Override
692    public boolean matches(char c) {
693      return first.matches(c) && second.matches(c);
694    }
695
696    @GwtIncompatible("java.util.BitSet")
697    @Override
698    void setBits(BitSet table) {
699      BitSet tmp1 = new BitSet();
700      first.setBits(tmp1);
701      BitSet tmp2 = new BitSet();
702      second.setBits(tmp2);
703      tmp1.and(tmp2);
704      table.or(tmp1);
705    }
706
707    @Override
708    CharMatcher withToString(String description) {
709      return new And(first, second, description);
710    }
711  }
712
713  /**
714   * Returns a matcher that matches any character matched by either this matcher or {@code other}.
715   */
716  public CharMatcher or(CharMatcher other) {
717    return new Or(this, checkNotNull(other));
718  }
719
720  private static class Or extends CharMatcher {
721    final CharMatcher first;
722    final CharMatcher second;
723
724    Or(CharMatcher a, CharMatcher b, String description) {
725      super(description);
726      first = checkNotNull(a);
727      second = checkNotNull(b);
728    }
729
730    Or(CharMatcher a, CharMatcher b) {
731      this(a, b, "CharMatcher.or(" + a + ", " + b + ")");
732    }
733
734    @GwtIncompatible("java.util.BitSet")
735    @Override
736    void setBits(BitSet table) {
737      first.setBits(table);
738      second.setBits(table);
739    }
740
741    @Override
742    public boolean matches(char c) {
743      return first.matches(c) || second.matches(c);
744    }
745
746    @Override
747    CharMatcher withToString(String description) {
748      return new Or(first, second, description);
749    }
750  }
751
752  /**
753   * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to
754   * query than the original; your mileage may vary. Precomputation takes time and is likely to be
755   * worthwhile only if the precomputed matcher is queried many thousands of times.
756   *
757   * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a
758   * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a
759   * worthwhile tradeoff in a browser.
760   */
761  public CharMatcher precomputed() {
762    return Platform.precomputeCharMatcher(this);
763  }
764
765  /**
766   * Subclasses should provide a new CharMatcher with the same characteristics as {@code this},
767   * but with their {@code toString} method overridden with the new description.
768   *
769   * <p>This is unsupported by default.
770   */
771  CharMatcher withToString(String description) {
772    throw new UnsupportedOperationException();
773  }
774
775  private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1;
776
777  /**
778   * This is the actual implementation of {@link #precomputed}, but we bounce calls through a
779   * method on {@link Platform} so that we can have different behavior in GWT.
780   *
781   * <p>This implementation tries to be smart in a number of ways.  It recognizes cases where
782   * the negation is cheaper to precompute than the matcher itself; it tries to build small
783   * hash tables for matchers that only match a few characters, and so on.  In the worst-case
784   * scenario, it constructs an eight-kilobyte bit array and queries that.
785   * In many situations this produces a matcher which is faster to query than the original.
786   */
787  @GwtIncompatible("java.util.BitSet")
788  CharMatcher precomputedInternal() {
789    final BitSet table = new BitSet();
790    setBits(table);
791    int totalCharacters = table.cardinality();
792    if (totalCharacters * 2 <= DISTINCT_CHARS) {
793      return precomputedPositive(totalCharacters, table, description);
794    } else {
795      // TODO(user): is it worth it to worry about the last character of large matchers?
796      table.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1);
797      int negatedCharacters = DISTINCT_CHARS - totalCharacters;
798      return new NegatedFastMatcher(toString(),
799          precomputedPositive(negatedCharacters, table, description + ".negate()"));
800    }
801  }
802
803  /**
804   * A matcher for which precomputation will not yield any significant benefit.
805   */
806  abstract static class FastMatcher extends CharMatcher {
807    FastMatcher() {
808      super();
809    }
810
811    FastMatcher(String description) {
812      super(description);
813    }
814
815    @Override
816    public final CharMatcher precomputed() {
817      return this;
818    }
819
820    @Override
821    public CharMatcher negate() {
822      return new NegatedFastMatcher(this);
823    }
824  }
825
826  static final class NegatedFastMatcher extends NegatedMatcher {
827    NegatedFastMatcher(CharMatcher original) {
828      super(original);
829    }
830
831    NegatedFastMatcher(String toString, CharMatcher original) {
832      super(toString, original);
833    }
834
835    @Override
836    public final CharMatcher precomputed() {
837      return this;
838    }
839
840    @Override
841    CharMatcher withToString(String description) {
842      return new NegatedFastMatcher(description, original);
843    }
844  }
845
846  /**
847   * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper.
848   */
849  @GwtIncompatible("java.util.BitSet")
850  private static CharMatcher precomputedPositive(
851      int totalCharacters,
852      BitSet table,
853      String description) {
854    switch (totalCharacters) {
855      case 0:
856        return NONE;
857      case 1:
858        return is((char) table.nextSetBit(0));
859      case 2:
860        char c1 = (char) table.nextSetBit(0);
861        char c2 = (char) table.nextSetBit(c1 + 1);
862        return isEither(c1, c2);
863      default:
864        return isSmall(totalCharacters, table.length())
865            ? SmallCharMatcher.from(table, description)
866            : new BitSetMatcher(table, description);
867    }
868  }
869
870  private static boolean isSmall(int totalCharacters, int tableLength) {
871    return totalCharacters <= SmallCharMatcher.MAX_SIZE
872        && tableLength > (totalCharacters * 4 * Character.SIZE);
873        // err on the side of BitSetMatcher
874  }
875
876  @GwtIncompatible("java.util.BitSet")
877  private static class BitSetMatcher extends FastMatcher {
878    private final BitSet table;
879
880    private BitSetMatcher(BitSet table, String description) {
881      super(description);
882      if (table.length() + Long.SIZE < table.size()) {
883        table = (BitSet) table.clone();
884        // If only we could actually call BitSet.trimToSize() ourselves...
885      }
886      this.table = table;
887    }
888
889    @Override public boolean matches(char c) {
890      return table.get(c);
891    }
892
893    @Override
894    void setBits(BitSet bitSet) {
895      bitSet.or(table);
896    }
897  }
898
899  /**
900   * Sets bits in {@code table} matched by this matcher.
901   */
902  @GwtIncompatible("java.util.BitSet")
903  void setBits(BitSet table) {
904    for (int c = Character.MAX_VALUE; c >= Character.MIN_VALUE; c--) {
905      if (matches((char) c)) {
906        table.set(c);
907      }
908    }
909  }
910
911  // Text processing routines
912
913  /**
914   * Returns {@code true} if a character sequence contains at least one matching character.
915   * Equivalent to {@code !matchesNoneOf(sequence)}.
916   *
917   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
918   * character, until this returns {@code true} or the end is reached.
919   *
920   * @param sequence the character sequence to examine, possibly empty
921   * @return {@code true} if this matcher matches at least one character in the sequence
922   * @since 8.0
923   */
924  public boolean matchesAnyOf(CharSequence sequence) {
925    return !matchesNoneOf(sequence);
926  }
927
928  /**
929   * Returns {@code true} if a character sequence contains only matching characters.
930   *
931   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
932   * character, until this returns {@code false} or the end is reached.
933   *
934   * @param sequence the character sequence to examine, possibly empty
935   * @return {@code true} if this matcher matches every character in the sequence, including when
936   *         the sequence is empty
937   */
938  public boolean matchesAllOf(CharSequence sequence) {
939    for (int i = sequence.length() - 1; i >= 0; i--) {
940      if (!matches(sequence.charAt(i))) {
941        return false;
942      }
943    }
944    return true;
945  }
946
947  /**
948   * Returns {@code true} if a character sequence contains no matching characters. Equivalent to
949   * {@code !matchesAnyOf(sequence)}.
950   *
951   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
952   * character, until this returns {@code false} or the end is reached.
953   *
954   * @param sequence the character sequence to examine, possibly empty
955   * @return {@code true} if this matcher matches every character in the sequence, including when
956   *         the sequence is empty
957   */
958  public boolean matchesNoneOf(CharSequence sequence) {
959    return indexIn(sequence) == -1;
960  }
961
962  /**
963   * Returns the index of the first matching character in a character sequence, or {@code -1} if no
964   * matching character is present.
965   *
966   * <p>The default implementation iterates over the sequence in forward order calling {@link
967   * #matches} for each character.
968   *
969   * @param sequence the character sequence to examine from the beginning
970   * @return an index, or {@code -1} if no character matches
971   */
972  public int indexIn(CharSequence sequence) {
973    int length = sequence.length();
974    for (int i = 0; i < length; i++) {
975      if (matches(sequence.charAt(i))) {
976        return i;
977      }
978    }
979    return -1;
980  }
981
982  /**
983   * Returns the index of the first matching character in a character sequence, starting from a
984   * given position, or {@code -1} if no character matches after that position.
985   *
986   * <p>The default implementation iterates over the sequence in forward order, beginning at {@code
987   * start}, calling {@link #matches} for each character.
988   *
989   * @param sequence the character sequence to examine
990   * @param start the first index to examine; must be nonnegative and no greater than {@code
991   *        sequence.length()}
992   * @return the index of the first matching character, guaranteed to be no less than {@code start},
993   *         or {@code -1} if no character matches
994   * @throws IndexOutOfBoundsException if start is negative or greater than {@code
995   *         sequence.length()}
996   */
997  public int indexIn(CharSequence sequence, int start) {
998    int length = sequence.length();
999    Preconditions.checkPositionIndex(start, length);
1000    for (int i = start; i < length; i++) {
1001      if (matches(sequence.charAt(i))) {
1002        return i;
1003      }
1004    }
1005    return -1;
1006  }
1007
1008  /**
1009   * Returns the index of the last matching character in a character sequence, or {@code -1} if no
1010   * matching character is present.
1011   *
1012   * <p>The default implementation iterates over the sequence in reverse order calling {@link
1013   * #matches} for each character.
1014   *
1015   * @param sequence the character sequence to examine from the end
1016   * @return an index, or {@code -1} if no character matches
1017   */
1018  public int lastIndexIn(CharSequence sequence) {
1019    for (int i = sequence.length() - 1; i >= 0; i--) {
1020      if (matches(sequence.charAt(i))) {
1021        return i;
1022      }
1023    }
1024    return -1;
1025  }
1026
1027  /**
1028   * Returns the number of matching characters found in a character sequence.
1029   */
1030  public int countIn(CharSequence sequence) {
1031    int count = 0;
1032    for (int i = 0; i < sequence.length(); i++) {
1033      if (matches(sequence.charAt(i))) {
1034        count++;
1035      }
1036    }
1037    return count;
1038  }
1039
1040  /**
1041   * Returns a string containing all non-matching characters of a character sequence, in order. For
1042   * example: <pre>   {@code
1043   *
1044   *   CharMatcher.is('a').removeFrom("bazaar")}</pre>
1045   *
1046   * ... returns {@code "bzr"}.
1047   */
1048  @CheckReturnValue
1049  public String removeFrom(CharSequence sequence) {
1050    String string = sequence.toString();
1051    int pos = indexIn(string);
1052    if (pos == -1) {
1053      return string;
1054    }
1055
1056    char[] chars = string.toCharArray();
1057    int spread = 1;
1058
1059    // This unusual loop comes from extensive benchmarking
1060    OUT: while (true) {
1061      pos++;
1062      while (true) {
1063        if (pos == chars.length) {
1064          break OUT;
1065        }
1066        if (matches(chars[pos])) {
1067          break;
1068        }
1069        chars[pos - spread] = chars[pos];
1070        pos++;
1071      }
1072      spread++;
1073    }
1074    return new String(chars, 0, pos - spread);
1075  }
1076
1077  /**
1078   * Returns a string containing all matching characters of a character sequence, in order. For
1079   * example: <pre>   {@code
1080   *
1081   *   CharMatcher.is('a').retainFrom("bazaar")}</pre>
1082   *
1083   * ... returns {@code "aaa"}.
1084   */
1085  @CheckReturnValue
1086  public String retainFrom(CharSequence sequence) {
1087    return negate().removeFrom(sequence);
1088  }
1089
1090  /**
1091   * Returns a string copy of the input character sequence, with each character that matches this
1092   * matcher replaced by a given replacement character. For example: <pre>   {@code
1093   *
1094   *   CharMatcher.is('a').replaceFrom("radar", 'o')}</pre>
1095   *
1096   * ... returns {@code "rodor"}.
1097   *
1098   * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1099   * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1100   * character.
1101   *
1102   * @param sequence the character sequence to replace matching characters in
1103   * @param replacement the character to append to the result string in place of each matching
1104   *        character in {@code sequence}
1105   * @return the new string
1106   */
1107  @CheckReturnValue
1108  public String replaceFrom(CharSequence sequence, char replacement) {
1109    String string = sequence.toString();
1110    int pos = indexIn(string);
1111    if (pos == -1) {
1112      return string;
1113    }
1114    char[] chars = string.toCharArray();
1115    chars[pos] = replacement;
1116    for (int i = pos + 1; i < chars.length; i++) {
1117      if (matches(chars[i])) {
1118        chars[i] = replacement;
1119      }
1120    }
1121    return new String(chars);
1122  }
1123
1124  /**
1125   * Returns a string copy of the input character sequence, with each character that matches this
1126   * matcher replaced by a given replacement sequence. For example: <pre>   {@code
1127   *
1128   *   CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre>
1129   *
1130   * ... returns {@code "yoohoo"}.
1131   *
1132   * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better
1133   * off calling {@link #replaceFrom(CharSequence, char)} directly.
1134   *
1135   * @param sequence the character sequence to replace matching characters in
1136   * @param replacement the characters to append to the result string in place of each matching
1137   *        character in {@code sequence}
1138   * @return the new string
1139   */
1140  @CheckReturnValue
1141  public String replaceFrom(CharSequence sequence, CharSequence replacement) {
1142    int replacementLen = replacement.length();
1143    if (replacementLen == 0) {
1144      return removeFrom(sequence);
1145    }
1146    if (replacementLen == 1) {
1147      return replaceFrom(sequence, replacement.charAt(0));
1148    }
1149
1150    String string = sequence.toString();
1151    int pos = indexIn(string);
1152    if (pos == -1) {
1153      return string;
1154    }
1155
1156    int len = string.length();
1157    StringBuilder buf = new StringBuilder((len * 3 / 2) + 16);
1158
1159    int oldpos = 0;
1160    do {
1161      buf.append(string, oldpos, pos);
1162      buf.append(replacement);
1163      oldpos = pos + 1;
1164      pos = indexIn(string, oldpos);
1165    } while (pos != -1);
1166
1167    buf.append(string, oldpos, len);
1168    return buf.toString();
1169  }
1170
1171  /**
1172   * Returns a substring of the input character sequence that omits all characters this matcher
1173   * matches from the beginning and from the end of the string. For example: <pre>   {@code
1174   *
1175   *   CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre>
1176   *
1177   * ... returns {@code "cat"}.
1178   *
1179   * <p>Note that: <pre>   {@code
1180   *
1181   *   CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre>
1182   *
1183   * ... is equivalent to {@link String#trim()}.
1184   */
1185  @CheckReturnValue
1186  public String trimFrom(CharSequence sequence) {
1187    int len = sequence.length();
1188    int first;
1189    int last;
1190
1191    for (first = 0; first < len; first++) {
1192      if (!matches(sequence.charAt(first))) {
1193        break;
1194      }
1195    }
1196    for (last = len - 1; last > first; last--) {
1197      if (!matches(sequence.charAt(last))) {
1198        break;
1199      }
1200    }
1201
1202    return sequence.subSequence(first, last + 1).toString();
1203  }
1204
1205  /**
1206   * Returns a substring of the input character sequence that omits all characters this matcher
1207   * matches from the beginning of the string. For example: <pre> {@code
1208   *
1209   *   CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre>
1210   *
1211   * ... returns {@code "catbab"}.
1212   */
1213  @CheckReturnValue
1214  public String trimLeadingFrom(CharSequence sequence) {
1215    int len = sequence.length();
1216    for (int first = 0; first < len; first++) {
1217      if (!matches(sequence.charAt(first))) {
1218        return sequence.subSequence(first, len).toString();
1219      }
1220    }
1221    return "";
1222  }
1223
1224  /**
1225   * Returns a substring of the input character sequence that omits all characters this matcher
1226   * matches from the end of the string. For example: <pre> {@code
1227   *
1228   *   CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre>
1229   *
1230   * ... returns {@code "abacat"}.
1231   */
1232  @CheckReturnValue
1233  public String trimTrailingFrom(CharSequence sequence) {
1234    int len = sequence.length();
1235    for (int last = len - 1; last >= 0; last--) {
1236      if (!matches(sequence.charAt(last))) {
1237        return sequence.subSequence(0, last + 1).toString();
1238      }
1239    }
1240    return "";
1241  }
1242
1243  /**
1244   * Returns a string copy of the input character sequence, with each group of consecutive
1245   * characters that match this matcher replaced by a single replacement character. For example:
1246   * <pre>   {@code
1247   *
1248   *   CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre>
1249   *
1250   * ... returns {@code "b-p-r"}.
1251   *
1252   * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1253   * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1254   * character.
1255   *
1256   * @param sequence the character sequence to replace matching groups of characters in
1257   * @param replacement the character to append to the result string in place of each group of
1258   *        matching characters in {@code sequence}
1259   * @return the new string
1260   */
1261  @CheckReturnValue
1262  public String collapseFrom(CharSequence sequence, char replacement) {
1263    // This implementation avoids unnecessary allocation.
1264    int len = sequence.length();
1265    for (int i = 0; i < len; i++) {
1266      char c = sequence.charAt(i);
1267      if (matches(c)) {
1268        if (c == replacement
1269            && (i == len - 1 || !matches(sequence.charAt(i + 1)))) {
1270          // a no-op replacement
1271          i++;
1272        } else {
1273          StringBuilder builder = new StringBuilder(len)
1274              .append(sequence.subSequence(0, i))
1275              .append(replacement);
1276          return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true);
1277        }
1278      }
1279    }
1280    // no replacement needed
1281    return sequence.toString();
1282  }
1283
1284  /**
1285   * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that
1286   * groups of matching characters at the start or end of the sequence are removed without
1287   * replacement.
1288   */
1289  @CheckReturnValue
1290  public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
1291    // This implementation avoids unnecessary allocation.
1292    int len = sequence.length();
1293    int first;
1294    int last;
1295
1296    for (first = 0; first < len && matches(sequence.charAt(first)); first++) {}
1297    for (last = len - 1; last > first && matches(sequence.charAt(last)); last--) {}
1298
1299    return (first == 0 && last == len - 1)
1300        ? collapseFrom(sequence, replacement)
1301        : finishCollapseFrom(
1302              sequence, first, last + 1, replacement,
1303              new StringBuilder(last + 1 - first),
1304              false);
1305  }
1306
1307  private String finishCollapseFrom(
1308      CharSequence sequence, int start, int end, char replacement,
1309      StringBuilder builder, boolean inMatchingGroup) {
1310    for (int i = start; i < end; i++) {
1311      char c = sequence.charAt(i);
1312      if (matches(c)) {
1313        if (!inMatchingGroup) {
1314          builder.append(replacement);
1315          inMatchingGroup = true;
1316        }
1317      } else {
1318        builder.append(c);
1319        inMatchingGroup = false;
1320      }
1321    }
1322    return builder.toString();
1323  }
1324
1325  // Predicate interface
1326
1327  /**
1328   * Equivalent to {@link #matches}; provided only to satisfy the {@link Predicate} interface. When
1329   * using a reference of type {@code CharMatcher}, invoke {@link #matches} directly instead.
1330   */
1331  @Override public boolean apply(Character character) {
1332    return matches(character);
1333  }
1334
1335  /**
1336   * Returns a string representation of this {@code CharMatcher}, such as
1337   * {@code CharMatcher.or(WHITESPACE, JAVA_DIGIT)}.
1338   */
1339  @Override
1340  public String toString() {
1341    return description;
1342  }
1343
1344  /**
1345   * A special-case CharMatcher for Unicode whitespace characters that is extremely
1346   * efficient both in space required and in time to check for matches.
1347   *
1348   * Implementation details.
1349   * It turns out that all current (early 2012) Unicode characters are unique modulo 79:
1350   * so we can construct a lookup table of exactly 79 entries, and just check the character code
1351   * mod 79, and see if that character is in the table.
1352   *
1353   * There is a 1 at the beginning of the table so that the null character is not listed
1354   * as whitespace.
1355   *
1356   * Other things we tried that did not prove to be beneficial, mostly due to speed concerns:
1357   *
1358   *   * Binary search into the sorted list of characters, i.e., what
1359   *     CharMatcher.anyOf() does</li>
1360   *   * Perfect hash function into a table of size 26 (using an offset table and a special
1361   *     Jenkins hash function)</li>
1362   *   * Perfect-ish hash function that required two lookups into a single table of size 26.</li>
1363   *   * Using a power-of-2 sized hash table (size 64) with linear probing.</li>
1364   *
1365   * --Christopher Swenson, February 2012.
1366   */
1367  private static final String WHITESPACE_TABLE = "\u0001\u0000\u00a0\u0000\u0000\u0000\u0000\u0000"
1368      + "\u0000\u0009\n\u000b\u000c\r\u0000\u0000\u2028\u2029\u0000\u0000\u0000\u0000\u0000\u202f"
1369      + "\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0020\u0000\u0000\u0000\u0000\u0000"
1370      + "\u0000\u0000\u0000\u0000\u0000\u3000\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000"
1371      + "\u0000\u0000\u0085\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a"
1372      + "\u0000\u0000\u0000\u0000\u0000\u205f\u1680\u0000\u0000\u180e\u0000\u0000\u0000";
1373
1374  /**
1375   * Determines whether a character is whitespace according to the latest Unicode standard, as
1376   * illustrated
1377   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
1378   * This is not the same definition used by other Java APIs. (See a
1379   * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several
1380   * definitions of "whitespace"</a>.)
1381   *
1382   * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up
1383   * to date.
1384   */
1385  public static final CharMatcher WHITESPACE = new FastMatcher("CharMatcher.WHITESPACE") {
1386
1387    @Override public boolean matches(char c) {
1388      return WHITESPACE_TABLE.charAt(c % 79) == c;
1389    }
1390  };
1391}