GNU Classpath (0.95) | |
Frames | No Frames |
1: /* java.lang.Character -- Wrapper class for char, and Unicode subsets 2: Copyright (C) 1998, 1999, 2001, 2002, 2004, 2005 Free Software Foundation, Inc. 3: 4: This file is part of GNU Classpath. 5: 6: GNU Classpath is free software; you can redistribute it and/or modify 7: it under the terms of the GNU General Public License as published by 8: the Free Software Foundation; either version 2, or (at your option) 9: any later version. 10: 11: GNU Classpath is distributed in the hope that it will be useful, but 12: WITHOUT ANY WARRANTY; without even the implied warranty of 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14: General Public License for more details. 15: 16: You should have received a copy of the GNU General Public License 17: along with GNU Classpath; see the file COPYING. If not, write to the 18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19: 02110-1301 USA. 20: 21: Linking this library statically or dynamically with other modules is 22: making a combined work based on this library. Thus, the terms and 23: conditions of the GNU General Public License cover the whole 24: combination. 25: 26: As a special exception, the copyright holders of this library give you 27: permission to link this library with independent modules to produce an 28: executable, regardless of the license terms of these independent 29: modules, and to copy and distribute the resulting executable under 30: terms of your choice, provided that you also meet, for each linked 31: independent module, the terms and conditions of the license of that 32: module. An independent module is a module which is not derived from 33: or based on this library. If you modify this library, you may extend 34: this exception to your version of the library, but you are not 35: obligated to do so. If you do not wish to do so, delete this 36: exception statement from your version. 
*/ 37: 38: 39: package java.lang; 40: 41: import gnu.java.lang.CharData; 42: 43: import java.io.Serializable; 44: import java.text.Collator; 45: import java.util.Locale; 46: 47: /** 48: * Wrapper class for the primitive char data type. In addition, this class 49: * allows one to retrieve property information and perform transformations 50: * on the defined characters in the Unicode Standard, Version 4.0.0. 51: * java.lang.Character is designed to be very dynamic, and as such, it 52: * retrieves information on the Unicode character set from a separate 53: * database, gnu.java.lang.CharData, which can be easily upgraded. 54: * 55: * <p>For predicates, boundaries are used to describe 56: * the set of characters for which the method will return true. 57: * This syntax uses fairly normal regular expression notation. 58: * See 5.13 of the Unicode Standard, Version 4.0, for the 59: * boundary specification. 60: * 61: * <p>See <a href="http://www.unicode.org">http://www.unicode.org</a> 62: * for more information on the Unicode Standard. 63: * 64: * @author Tom Tromey (tromey@cygnus.com) 65: * @author Paul N. Fisher 66: * @author Jochen Hoenicke 67: * @author Eric Blake (ebb9@email.byu.edu) 68: * @author Andrew John Hughes (gnu_andrew@member.fsf.org) 69: * @see CharData 70: * @since 1.0 71: * @status partly updated to 1.5; some things still missing 72: */ 73: public final class Character implements Serializable, Comparable<Character> 74: { 75: /** 76: * A subset of Unicode blocks. 77: * 78: * @author Paul N. Fisher 79: * @author Eric Blake (ebb9@email.byu.edu) 80: * @since 1.2 81: */ 82: public static class Subset 83: { 84: /** The name of the subset. */ 85: private final String name; 86: 87: /** 88: * Construct a new subset of characters. 89: * 90: * @param name the name of the subset 91: * @throws NullPointerException if name is null 92: */ 93: protected Subset(String name) 94: { 95: // Note that name.toString() is name, unless name was null. 
96: this.name = name.toString(); 97: } 98: 99: /** 100: * Compares two Subsets for equality. This is <code>final</code>, and 101: * restricts the comparison on the <code>==</code> operator, so it returns 102: * true only for the same object. 103: * 104: * @param o the object to compare 105: * @return true if o is this 106: */ 107: public final boolean equals(Object o) 108: { 109: return o == this; 110: } 111: 112: /** 113: * Makes the original hashCode of Object final, to be consistent with 114: * equals. 115: * 116: * @return the hash code for this object 117: */ 118: public final int hashCode() 119: { 120: return super.hashCode(); 121: } 122: 123: /** 124: * Returns the name of the subset. 125: * 126: * @return the name 127: */ 128: public final String toString() 129: { 130: return name; 131: } 132: } // class Subset 133: 134: /** 135: * A family of character subsets in the Unicode specification. A character 136: * is in at most one of these blocks. 137: * 138: * This inner class was generated automatically from 139: * <code>doc/unicode/Blocks-4.0.0.txt</code>, by some perl scripts. 140: * This Unicode definition file can be found on the 141: * <a href="http://www.unicode.org">http://www.unicode.org</a> website. 142: * JDK 1.5 uses Unicode version 4.0.0. 143: * 144: * @author scripts/unicode-blocks.pl (written by Eric Blake) 145: * @since 1.2 146: */ 147: public static final class UnicodeBlock extends Subset 148: { 149: /** The start of the subset. */ 150: private final int start; 151: 152: /** The end of the subset. */ 153: private final int end; 154: 155: /** The canonical name of the block according to the Unicode standard. */ 156: private final String canonicalName; 157: 158: /** Enumeration for the <code>forName()</code> method */ 159: private enum NameType { CANONICAL, NO_SPACES, CONSTANT; } 160: 161: /** 162: * Constructor for strictly defined blocks. 
163: * 164: * @param start the start character of the range 165: * @param end the end character of the range 166: * @param name the block name 167: * @param canonicalName the name of the block as defined in the Unicode 168: * standard. 169: */ 170: private UnicodeBlock(int start, int end, String name, 171: String canonicalName) 172: { 173: super(name); 174: this.start = start; 175: this.end = end; 176: this.canonicalName = canonicalName; 177: } 178: 179: /** 180: * Returns the Unicode character block which a character belongs to. 181: * <strong>Note</strong>: This method does not support the use of 182: * supplementary characters. For such support, <code>of(int)</code> 183: * should be used instead. 184: * 185: * @param ch the character to look up 186: * @return the set it belongs to, or null if it is not in one 187: */ 188: public static UnicodeBlock of(char ch) 189: { 190: return of((int) ch); 191: } 192: 193: /** 194: * Returns the Unicode character block which a code point belongs to. 195: * 196: * @param codePoint the character to look up 197: * @return the set it belongs to, or null if it is not in one. 198: * @throws IllegalArgumentException if the specified code point is 199: * invalid. 200: * @since 1.5 201: */ 202: public static UnicodeBlock of(int codePoint) 203: { 204: if (codePoint > MAX_CODE_POINT) 205: throw new IllegalArgumentException("The supplied integer value is " + 206: "too large to be a codepoint."); 207: // Simple binary search for the correct block. 208: int low = 0; 209: int hi = sets.length - 1; 210: while (low <= hi) 211: { 212: int mid = (low + hi) >> 1; 213: UnicodeBlock b = sets[mid]; 214: if (codePoint < b.start) 215: hi = mid - 1; 216: else if (codePoint > b.end) 217: low = mid + 1; 218: else 219: return b; 220: } 221: return null; 222: } 223: 224: /** 225: * <p> 226: * Returns the <code>UnicodeBlock</code> with the given name, as defined 227: * by the Unicode standard. 
The version of Unicode in use is defined by 228: * the <code>Character</code> class, and the names are given in the 229: * <code>Blocks-<version>.txt</code> file corresponding to that version. 230: * The name may be specified in one of three ways: 231: * </p> 232: * <ol> 233: * <li>The canonical, human-readable name used by the Unicode standard. 234: * This is the name with all spaces and hyphens retained. For example, 235: * `Basic Latin' retrieves the block, UnicodeBlock.BASIC_LATIN.</li> 236: * <li>The canonical name with all spaces removed e.g. `BasicLatin'.</li> 237: * <li>The name used for the constants specified by this class, which 238: * is the canonical name with all spaces and hyphens replaced with 239: * underscores e.g. `BASIC_LATIN'</li> 240: * </ol> 241: * <p> 242: * The names are compared case-insensitively using the case comparison 243: * associated with the U.S. English locale. The method recognises the 244: * previous names used for blocks as well as the current ones. At 245: * present, this simply means that the deprecated `SURROGATES_AREA' 246: * will be recognised by this method (the <code>of()</code> methods 247: * only return one of the three new surrogate blocks). 248: * </p> 249: * 250: * @param blockName the name of the block to look up. 251: * @return the specified block. 252: * @throws NullPointerException if the <code>blockName</code> is 253: * <code>null</code>. 254: * @throws IllegalArgumentException if the name does not match any Unicode 255: * block. 
256: * @since 1.5 257: */ 258: public static final UnicodeBlock forName(String blockName) 259: { 260: NameType type; 261: if (blockName.indexOf(' ') != -1) 262: type = NameType.CANONICAL; 263: else if (blockName.indexOf('_') != -1) 264: type = NameType.CONSTANT; 265: else 266: type = NameType.NO_SPACES; 267: Collator usCollator = Collator.getInstance(Locale.US); 268: usCollator.setStrength(Collator.PRIMARY); 269: /* Special case for deprecated blocks not in sets */ 270: switch (type) 271: { 272: case CANONICAL: 273: if (usCollator.compare(blockName, "Surrogates Area") == 0) 274: return SURROGATES_AREA; 275: break; 276: case NO_SPACES: 277: if (usCollator.compare(blockName, "SurrogatesArea") == 0) 278: return SURROGATES_AREA; 279: break; 280: case CONSTANT: 281: if (usCollator.compare(blockName, "SURROGATES_AREA") == 0) 282: return SURROGATES_AREA; 283: break; 284: } 285: /* Other cases */ 286: switch (type) 287: { 288: case CANONICAL: 289: for (UnicodeBlock block : sets) 290: if (usCollator.compare(blockName, block.canonicalName) == 0) 291: return block; 292: break; 293: case NO_SPACES: 294: for (UnicodeBlock block : sets) 295: { 296: String nsName = block.canonicalName.replaceAll(" ",""); 297: if (usCollator.compare(blockName, nsName) == 0) 298: return block; 299: } 300: break; 301: case CONSTANT: 302: for (UnicodeBlock block : sets) 303: if (usCollator.compare(blockName, block.toString()) == 0) 304: return block; 305: break; 306: } 307: throw new IllegalArgumentException("No Unicode block found for " + 308: blockName + "."); 309: } 310: 311: /** 312: * Basic Latin. 313: * 0x0000 - 0x007F. 314: */ 315: public static final UnicodeBlock BASIC_LATIN 316: = new UnicodeBlock(0x0000, 0x007F, 317: "BASIC_LATIN", 318: "Basic Latin"); 319: 320: /** 321: * Latin-1 Supplement. 322: * 0x0080 - 0x00FF. 
323: */ 324: public static final UnicodeBlock LATIN_1_SUPPLEMENT 325: = new UnicodeBlock(0x0080, 0x00FF, 326: "LATIN_1_SUPPLEMENT", 327: "Latin-1 Supplement"); 328: 329: /** 330: * Latin Extended-A. 331: * 0x0100 - 0x017F. 332: */ 333: public static final UnicodeBlock LATIN_EXTENDED_A 334: = new UnicodeBlock(0x0100, 0x017F, 335: "LATIN_EXTENDED_A", 336: "Latin Extended-A"); 337: 338: /** 339: * Latin Extended-B. 340: * 0x0180 - 0x024F. 341: */ 342: public static final UnicodeBlock LATIN_EXTENDED_B 343: = new UnicodeBlock(0x0180, 0x024F, 344: "LATIN_EXTENDED_B", 345: "Latin Extended-B"); 346: 347: /** 348: * IPA Extensions. 349: * 0x0250 - 0x02AF. 350: */ 351: public static final UnicodeBlock IPA_EXTENSIONS 352: = new UnicodeBlock(0x0250, 0x02AF, 353: "IPA_EXTENSIONS", 354: "IPA Extensions"); 355: 356: /** 357: * Spacing Modifier Letters. 358: * 0x02B0 - 0x02FF. 359: */ 360: public static final UnicodeBlock SPACING_MODIFIER_LETTERS 361: = new UnicodeBlock(0x02B0, 0x02FF, 362: "SPACING_MODIFIER_LETTERS", 363: "Spacing Modifier Letters"); 364: 365: /** 366: * Combining Diacritical Marks. 367: * 0x0300 - 0x036F. 368: */ 369: public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS 370: = new UnicodeBlock(0x0300, 0x036F, 371: "COMBINING_DIACRITICAL_MARKS", 372: "Combining Diacritical Marks"); 373: 374: /** 375: * Greek. 376: * 0x0370 - 0x03FF. 377: */ 378: public static final UnicodeBlock GREEK 379: = new UnicodeBlock(0x0370, 0x03FF, 380: "GREEK", 381: "Greek"); 382: 383: /** 384: * Cyrillic. 385: * 0x0400 - 0x04FF. 386: */ 387: public static final UnicodeBlock CYRILLIC 388: = new UnicodeBlock(0x0400, 0x04FF, 389: "CYRILLIC", 390: "Cyrillic"); 391: 392: /** 393: * Cyrillic Supplementary. 394: * 0x0500 - 0x052F. 395: * @since 1.5 396: */ 397: public static final UnicodeBlock CYRILLIC_SUPPLEMENTARY 398: = new UnicodeBlock(0x0500, 0x052F, 399: "CYRILLIC_SUPPLEMENTARY", 400: "Cyrillic Supplementary"); 401: 402: /** 403: * Armenian. 404: * 0x0530 - 0x058F. 
405: */ 406: public static final UnicodeBlock ARMENIAN 407: = new UnicodeBlock(0x0530, 0x058F, 408: "ARMENIAN", 409: "Armenian"); 410: 411: /** 412: * Hebrew. 413: * 0x0590 - 0x05FF. 414: */ 415: public static final UnicodeBlock HEBREW 416: = new UnicodeBlock(0x0590, 0x05FF, 417: "HEBREW", 418: "Hebrew"); 419: 420: /** 421: * Arabic. 422: * 0x0600 - 0x06FF. 423: */ 424: public static final UnicodeBlock ARABIC 425: = new UnicodeBlock(0x0600, 0x06FF, 426: "ARABIC", 427: "Arabic"); 428: 429: /** 430: * Syriac. 431: * 0x0700 - 0x074F. 432: * @since 1.4 433: */ 434: public static final UnicodeBlock SYRIAC 435: = new UnicodeBlock(0x0700, 0x074F, 436: "SYRIAC", 437: "Syriac"); 438: 439: /** 440: * Thaana. 441: * 0x0780 - 0x07BF. 442: * @since 1.4 443: */ 444: public static final UnicodeBlock THAANA 445: = new UnicodeBlock(0x0780, 0x07BF, 446: "THAANA", 447: "Thaana"); 448: 449: /** 450: * Devanagari. 451: * 0x0900 - 0x097F. 452: */ 453: public static final UnicodeBlock DEVANAGARI 454: = new UnicodeBlock(0x0900, 0x097F, 455: "DEVANAGARI", 456: "Devanagari"); 457: 458: /** 459: * Bengali. 460: * 0x0980 - 0x09FF. 461: */ 462: public static final UnicodeBlock BENGALI 463: = new UnicodeBlock(0x0980, 0x09FF, 464: "BENGALI", 465: "Bengali"); 466: 467: /** 468: * Gurmukhi. 469: * 0x0A00 - 0x0A7F. 470: */ 471: public static final UnicodeBlock GURMUKHI 472: = new UnicodeBlock(0x0A00, 0x0A7F, 473: "GURMUKHI", 474: "Gurmukhi"); 475: 476: /** 477: * Gujarati. 478: * 0x0A80 - 0x0AFF. 479: */ 480: public static final UnicodeBlock GUJARATI 481: = new UnicodeBlock(0x0A80, 0x0AFF, 482: "GUJARATI", 483: "Gujarati"); 484: 485: /** 486: * Oriya. 487: * 0x0B00 - 0x0B7F. 488: */ 489: public static final UnicodeBlock ORIYA 490: = new UnicodeBlock(0x0B00, 0x0B7F, 491: "ORIYA", 492: "Oriya"); 493: 494: /** 495: * Tamil. 496: * 0x0B80 - 0x0BFF. 497: */ 498: public static final UnicodeBlock TAMIL 499: = new UnicodeBlock(0x0B80, 0x0BFF, 500: "TAMIL", 501: "Tamil"); 502: 503: /** 504: * Telugu. 
505: * 0x0C00 - 0x0C7F. 506: */ 507: public static final UnicodeBlock TELUGU 508: = new UnicodeBlock(0x0C00, 0x0C7F, 509: "TELUGU", 510: "Telugu"); 511: 512: /** 513: * Kannada. 514: * 0x0C80 - 0x0CFF. 515: */ 516: public static final UnicodeBlock KANNADA 517: = new UnicodeBlock(0x0C80, 0x0CFF, 518: "KANNADA", 519: "Kannada"); 520: 521: /** 522: * Malayalam. 523: * 0x0D00 - 0x0D7F. 524: */ 525: public static final UnicodeBlock MALAYALAM 526: = new UnicodeBlock(0x0D00, 0x0D7F, 527: "MALAYALAM", 528: "Malayalam"); 529: 530: /** 531: * Sinhala. 532: * 0x0D80 - 0x0DFF. 533: * @since 1.4 534: */ 535: public static final UnicodeBlock SINHALA 536: = new UnicodeBlock(0x0D80, 0x0DFF, 537: "SINHALA", 538: "Sinhala"); 539: 540: /** 541: * Thai. 542: * 0x0E00 - 0x0E7F. 543: */ 544: public static final UnicodeBlock THAI 545: = new UnicodeBlock(0x0E00, 0x0E7F, 546: "THAI", 547: "Thai"); 548: 549: /** 550: * Lao. 551: * 0x0E80 - 0x0EFF. 552: */ 553: public static final UnicodeBlock LAO 554: = new UnicodeBlock(0x0E80, 0x0EFF, 555: "LAO", 556: "Lao"); 557: 558: /** 559: * Tibetan. 560: * 0x0F00 - 0x0FFF. 561: */ 562: public static final UnicodeBlock TIBETAN 563: = new UnicodeBlock(0x0F00, 0x0FFF, 564: "TIBETAN", 565: "Tibetan"); 566: 567: /** 568: * Myanmar. 569: * 0x1000 - 0x109F. 570: * @since 1.4 571: */ 572: public static final UnicodeBlock MYANMAR 573: = new UnicodeBlock(0x1000, 0x109F, 574: "MYANMAR", 575: "Myanmar"); 576: 577: /** 578: * Georgian. 579: * 0x10A0 - 0x10FF. 580: */ 581: public static final UnicodeBlock GEORGIAN 582: = new UnicodeBlock(0x10A0, 0x10FF, 583: "GEORGIAN", 584: "Georgian"); 585: 586: /** 587: * Hangul Jamo. 588: * 0x1100 - 0x11FF. 589: */ 590: public static final UnicodeBlock HANGUL_JAMO 591: = new UnicodeBlock(0x1100, 0x11FF, 592: "HANGUL_JAMO", 593: "Hangul Jamo"); 594: 595: /** 596: * Ethiopic. 597: * 0x1200 - 0x137F. 
598: * @since 1.4 599: */ 600: public static final UnicodeBlock ETHIOPIC 601: = new UnicodeBlock(0x1200, 0x137F, 602: "ETHIOPIC", 603: "Ethiopic"); 604: 605: /** 606: * Cherokee. 607: * 0x13A0 - 0x13FF. 608: * @since 1.4 609: */ 610: public static final UnicodeBlock CHEROKEE 611: = new UnicodeBlock(0x13A0, 0x13FF, 612: "CHEROKEE", 613: "Cherokee"); 614: 615: /** 616: * Unified Canadian Aboriginal Syllabics. 617: * 0x1400 - 0x167F. 618: * @since 1.4 619: */ 620: public static final UnicodeBlock UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS 621: = new UnicodeBlock(0x1400, 0x167F, 622: "UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS", 623: "Unified Canadian Aboriginal Syllabics"); 624: 625: /** 626: * Ogham. 627: * 0x1680 - 0x169F. 628: * @since 1.4 629: */ 630: public static final UnicodeBlock OGHAM 631: = new UnicodeBlock(0x1680, 0x169F, 632: "OGHAM", 633: "Ogham"); 634: 635: /** 636: * Runic. 637: * 0x16A0 - 0x16FF. 638: * @since 1.4 639: */ 640: public static final UnicodeBlock RUNIC 641: = new UnicodeBlock(0x16A0, 0x16FF, 642: "RUNIC", 643: "Runic"); 644: 645: /** 646: * Tagalog. 647: * 0x1700 - 0x171F. 648: * @since 1.5 649: */ 650: public static final UnicodeBlock TAGALOG 651: = new UnicodeBlock(0x1700, 0x171F, 652: "TAGALOG", 653: "Tagalog"); 654: 655: /** 656: * Hanunoo. 657: * 0x1720 - 0x173F. 658: * @since 1.5 659: */ 660: public static final UnicodeBlock HANUNOO 661: = new UnicodeBlock(0x1720, 0x173F, 662: "HANUNOO", 663: "Hanunoo"); 664: 665: /** 666: * Buhid. 667: * 0x1740 - 0x175F. 668: * @since 1.5 669: */ 670: public static final UnicodeBlock BUHID 671: = new UnicodeBlock(0x1740, 0x175F, 672: "BUHID", 673: "Buhid"); 674: 675: /** 676: * Tagbanwa. 677: * 0x1760 - 0x177F. 678: * @since 1.5 679: */ 680: public static final UnicodeBlock TAGBANWA 681: = new UnicodeBlock(0x1760, 0x177F, 682: "TAGBANWA", 683: "Tagbanwa"); 684: 685: /** 686: * Khmer. 687: * 0x1780 - 0x17FF. 
688: * @since 1.4 689: */ 690: public static final UnicodeBlock KHMER 691: = new UnicodeBlock(0x1780, 0x17FF, 692: "KHMER", 693: "Khmer"); 694: 695: /** 696: * Mongolian. 697: * 0x1800 - 0x18AF. 698: * @since 1.4 699: */ 700: public static final UnicodeBlock MONGOLIAN 701: = new UnicodeBlock(0x1800, 0x18AF, 702: "MONGOLIAN", 703: "Mongolian"); 704: 705: /** 706: * Limbu. 707: * 0x1900 - 0x194F. 708: * @since 1.5 709: */ 710: public static final UnicodeBlock LIMBU 711: = new UnicodeBlock(0x1900, 0x194F, 712: "LIMBU", 713: "Limbu"); 714: 715: /** 716: * Tai Le. 717: * 0x1950 - 0x197F. 718: * @since 1.5 719: */ 720: public static final UnicodeBlock TAI_LE 721: = new UnicodeBlock(0x1950, 0x197F, 722: "TAI_LE", 723: "Tai Le"); 724: 725: /** 726: * Khmer Symbols. 727: * 0x19E0 - 0x19FF. 728: * @since 1.5 729: */ 730: public static final UnicodeBlock KHMER_SYMBOLS 731: = new UnicodeBlock(0x19E0, 0x19FF, 732: "KHMER_SYMBOLS", 733: "Khmer Symbols"); 734: 735: /** 736: * Phonetic Extensions. 737: * 0x1D00 - 0x1D7F. 738: * @since 1.5 739: */ 740: public static final UnicodeBlock PHONETIC_EXTENSIONS 741: = new UnicodeBlock(0x1D00, 0x1D7F, 742: "PHONETIC_EXTENSIONS", 743: "Phonetic Extensions"); 744: 745: /** 746: * Latin Extended Additional. 747: * 0x1E00 - 0x1EFF. 748: */ 749: public static final UnicodeBlock LATIN_EXTENDED_ADDITIONAL 750: = new UnicodeBlock(0x1E00, 0x1EFF, 751: "LATIN_EXTENDED_ADDITIONAL", 752: "Latin Extended Additional"); 753: 754: /** 755: * Greek Extended. 756: * 0x1F00 - 0x1FFF. 757: */ 758: public static final UnicodeBlock GREEK_EXTENDED 759: = new UnicodeBlock(0x1F00, 0x1FFF, 760: "GREEK_EXTENDED", 761: "Greek Extended"); 762: 763: /** 764: * General Punctuation. 765: * 0x2000 - 0x206F. 766: */ 767: public static final UnicodeBlock GENERAL_PUNCTUATION 768: = new UnicodeBlock(0x2000, 0x206F, 769: "GENERAL_PUNCTUATION", 770: "General Punctuation"); 771: 772: /** 773: * Superscripts and Subscripts. 774: * 0x2070 - 0x209F. 
775: */ 776: public static final UnicodeBlock SUPERSCRIPTS_AND_SUBSCRIPTS 777: = new UnicodeBlock(0x2070, 0x209F, 778: "SUPERSCRIPTS_AND_SUBSCRIPTS", 779: "Superscripts and Subscripts"); 780: 781: /** 782: * Currency Symbols. 783: * 0x20A0 - 0x20CF. 784: */ 785: public static final UnicodeBlock CURRENCY_SYMBOLS 786: = new UnicodeBlock(0x20A0, 0x20CF, 787: "CURRENCY_SYMBOLS", 788: "Currency Symbols"); 789: 790: /** 791: * Combining Marks for Symbols. 792: * 0x20D0 - 0x20FF. 793: */ 794: public static final UnicodeBlock COMBINING_MARKS_FOR_SYMBOLS 795: = new UnicodeBlock(0x20D0, 0x20FF, 796: "COMBINING_MARKS_FOR_SYMBOLS", 797: "Combining Marks for Symbols"); 798: 799: /** 800: * Letterlike Symbols. 801: * 0x2100 - 0x214F. 802: */ 803: public static final UnicodeBlock LETTERLIKE_SYMBOLS 804: = new UnicodeBlock(0x2100, 0x214F, 805: "LETTERLIKE_SYMBOLS", 806: "Letterlike Symbols"); 807: 808: /** 809: * Number Forms. 810: * 0x2150 - 0x218F. 811: */ 812: public static final UnicodeBlock NUMBER_FORMS 813: = new UnicodeBlock(0x2150, 0x218F, 814: "NUMBER_FORMS", 815: "Number Forms"); 816: 817: /** 818: * Arrows. 819: * 0x2190 - 0x21FF. 820: */ 821: public static final UnicodeBlock ARROWS 822: = new UnicodeBlock(0x2190, 0x21FF, 823: "ARROWS", 824: "Arrows"); 825: 826: /** 827: * Mathematical Operators. 828: * 0x2200 - 0x22FF. 829: */ 830: public static final UnicodeBlock MATHEMATICAL_OPERATORS 831: = new UnicodeBlock(0x2200, 0x22FF, 832: "MATHEMATICAL_OPERATORS", 833: "Mathematical Operators"); 834: 835: /** 836: * Miscellaneous Technical. 837: * 0x2300 - 0x23FF. 838: */ 839: public static final UnicodeBlock MISCELLANEOUS_TECHNICAL 840: = new UnicodeBlock(0x2300, 0x23FF, 841: "MISCELLANEOUS_TECHNICAL", 842: "Miscellaneous Technical"); 843: 844: /** 845: * Control Pictures. 846: * 0x2400 - 0x243F. 
847: */ 848: public static final UnicodeBlock CONTROL_PICTURES 849: = new UnicodeBlock(0x2400, 0x243F, 850: "CONTROL_PICTURES", 851: "Control Pictures"); 852: 853: /** 854: * Optical Character Recognition. 855: * 0x2440 - 0x245F. 856: */ 857: public static final UnicodeBlock OPTICAL_CHARACTER_RECOGNITION 858: = new UnicodeBlock(0x2440, 0x245F, 859: "OPTICAL_CHARACTER_RECOGNITION", 860: "Optical Character Recognition"); 861: 862: /** 863: * Enclosed Alphanumerics. 864: * 0x2460 - 0x24FF. 865: */ 866: public static final UnicodeBlock ENCLOSED_ALPHANUMERICS 867: = new UnicodeBlock(0x2460, 0x24FF, 868: "ENCLOSED_ALPHANUMERICS", 869: "Enclosed Alphanumerics"); 870: 871: /** 872: * Box Drawing. 873: * 0x2500 - 0x257F. 874: */ 875: public static final UnicodeBlock BOX_DRAWING 876: = new UnicodeBlock(0x2500, 0x257F, 877: "BOX_DRAWING", 878: "Box Drawing"); 879: 880: /** 881: * Block Elements. 882: * 0x2580 - 0x259F. 883: */ 884: public static final UnicodeBlock BLOCK_ELEMENTS 885: = new UnicodeBlock(0x2580, 0x259F, 886: "BLOCK_ELEMENTS", 887: "Block Elements"); 888: 889: /** 890: * Geometric Shapes. 891: * 0x25A0 - 0x25FF. 892: */ 893: public static final UnicodeBlock GEOMETRIC_SHAPES 894: = new UnicodeBlock(0x25A0, 0x25FF, 895: "GEOMETRIC_SHAPES", 896: "Geometric Shapes"); 897: 898: /** 899: * Miscellaneous Symbols. 900: * 0x2600 - 0x26FF. 901: */ 902: public static final UnicodeBlock MISCELLANEOUS_SYMBOLS 903: = new UnicodeBlock(0x2600, 0x26FF, 904: "MISCELLANEOUS_SYMBOLS", 905: "Miscellaneous Symbols"); 906: 907: /** 908: * Dingbats. 909: * 0x2700 - 0x27BF. 910: */ 911: public static final UnicodeBlock DINGBATS 912: = new UnicodeBlock(0x2700, 0x27BF, 913: "DINGBATS", 914: "Dingbats"); 915: 916: /** 917: * Miscellaneous Mathematical Symbols-A. 918: * 0x27C0 - 0x27EF. 
919: * @since 1.5 920: */ 921: public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A 922: = new UnicodeBlock(0x27C0, 0x27EF, 923: "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A", 924: "Miscellaneous Mathematical Symbols-A"); 925: 926: /** 927: * Supplemental Arrows-A. 928: * 0x27F0 - 0x27FF. 929: * @since 1.5 930: */ 931: public static final UnicodeBlock SUPPLEMENTAL_ARROWS_A 932: = new UnicodeBlock(0x27F0, 0x27FF, 933: "SUPPLEMENTAL_ARROWS_A", 934: "Supplemental Arrows-A"); 935: 936: /** 937: * Braille Patterns. 938: * 0x2800 - 0x28FF. 939: * @since 1.4 940: */ 941: public static final UnicodeBlock BRAILLE_PATTERNS 942: = new UnicodeBlock(0x2800, 0x28FF, 943: "BRAILLE_PATTERNS", 944: "Braille Patterns"); 945: 946: /** 947: * Supplemental Arrows-B. 948: * 0x2900 - 0x297F. 949: * @since 1.5 950: */ 951: public static final UnicodeBlock SUPPLEMENTAL_ARROWS_B 952: = new UnicodeBlock(0x2900, 0x297F, 953: "SUPPLEMENTAL_ARROWS_B", 954: "Supplemental Arrows-B"); 955: 956: /** 957: * Miscellaneous Mathematical Symbols-B. 958: * 0x2980 - 0x29FF. 959: * @since 1.5 960: */ 961: public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B 962: = new UnicodeBlock(0x2980, 0x29FF, 963: "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B", 964: "Miscellaneous Mathematical Symbols-B"); 965: 966: /** 967: * Supplemental Mathematical Operators. 968: * 0x2A00 - 0x2AFF. 969: * @since 1.5 970: */ 971: public static final UnicodeBlock SUPPLEMENTAL_MATHEMATICAL_OPERATORS 972: = new UnicodeBlock(0x2A00, 0x2AFF, 973: "SUPPLEMENTAL_MATHEMATICAL_OPERATORS", 974: "Supplemental Mathematical Operators"); 975: 976: /** 977: * Miscellaneous Symbols and Arrows. 978: * 0x2B00 - 0x2BFF. 979: * @since 1.5 980: */ 981: public static final UnicodeBlock MISCELLANEOUS_SYMBOLS_AND_ARROWS 982: = new UnicodeBlock(0x2B00, 0x2BFF, 983: "MISCELLANEOUS_SYMBOLS_AND_ARROWS", 984: "Miscellaneous Symbols and Arrows"); 985: 986: /** 987: * CJK Radicals Supplement. 988: * 0x2E80 - 0x2EFF. 
989: * @since 1.4 990: */ 991: public static final UnicodeBlock CJK_RADICALS_SUPPLEMENT 992: = new UnicodeBlock(0x2E80, 0x2EFF, 993: "CJK_RADICALS_SUPPLEMENT", 994: "CJK Radicals Supplement"); 995: 996: /** 997: * Kangxi Radicals. 998: * 0x2F00 - 0x2FDF. 999: * @since 1.4 1000: */ 1001: public static final UnicodeBlock KANGXI_RADICALS 1002: = new UnicodeBlock(0x2F00, 0x2FDF, 1003: "KANGXI_RADICALS", 1004: "Kangxi Radicals"); 1005: 1006: /** 1007: * Ideographic Description Characters. 1008: * 0x2FF0 - 0x2FFF. 1009: * @since 1.4 1010: */ 1011: public static final UnicodeBlock IDEOGRAPHIC_DESCRIPTION_CHARACTERS 1012: = new UnicodeBlock(0x2FF0, 0x2FFF, 1013: "IDEOGRAPHIC_DESCRIPTION_CHARACTERS", 1014: "Ideographic Description Characters"); 1015: 1016: /** 1017: * CJK Symbols and Punctuation. 1018: * 0x3000 - 0x303F. 1019: */ 1020: public static final UnicodeBlock CJK_SYMBOLS_AND_PUNCTUATION 1021: = new UnicodeBlock(0x3000, 0x303F, 1022: "CJK_SYMBOLS_AND_PUNCTUATION", 1023: "CJK Symbols and Punctuation"); 1024: 1025: /** 1026: * Hiragana. 1027: * 0x3040 - 0x309F. 1028: */ 1029: public static final UnicodeBlock HIRAGANA 1030: = new UnicodeBlock(0x3040, 0x309F, 1031: "HIRAGANA", 1032: "Hiragana"); 1033: 1034: /** 1035: * Katakana. 1036: * 0x30A0 - 0x30FF. 1037: */ 1038: public static final UnicodeBlock KATAKANA 1039: = new UnicodeBlock(0x30A0, 0x30FF, 1040: "KATAKANA", 1041: "Katakana"); 1042: 1043: /** 1044: * Bopomofo. 1045: * 0x3100 - 0x312F. 1046: */ 1047: public static final UnicodeBlock BOPOMOFO 1048: = new UnicodeBlock(0x3100, 0x312F, 1049: "BOPOMOFO", 1050: "Bopomofo"); 1051: 1052: /** 1053: * Hangul Compatibility Jamo. 1054: * 0x3130 - 0x318F. 1055: */ 1056: public static final UnicodeBlock HANGUL_COMPATIBILITY_JAMO 1057: = new UnicodeBlock(0x3130, 0x318F, 1058: "HANGUL_COMPATIBILITY_JAMO", 1059: "Hangul Compatibility Jamo"); 1060: 1061: /** 1062: * Kanbun. 1063: * 0x3190 - 0x319F. 
1064: */ 1065: public static final UnicodeBlock KANBUN 1066: = new UnicodeBlock(0x3190, 0x319F, 1067: "KANBUN", 1068: "Kanbun"); 1069: 1070: /** 1071: * Bopomofo Extended. 1072: * 0x31A0 - 0x31BF. 1073: * @since 1.4 1074: */ 1075: public static final UnicodeBlock BOPOMOFO_EXTENDED 1076: = new UnicodeBlock(0x31A0, 0x31BF, 1077: "BOPOMOFO_EXTENDED", 1078: "Bopomofo Extended"); 1079: 1080: /** 1081: * Katakana Phonetic Extensions. 1082: * 0x31F0 - 0x31FF. 1083: * @since 1.5 1084: */ 1085: public static final UnicodeBlock KATAKANA_PHONETIC_EXTENSIONS 1086: = new UnicodeBlock(0x31F0, 0x31FF, 1087: "KATAKANA_PHONETIC_EXTENSIONS", 1088: "Katakana Phonetic Extensions"); 1089: 1090: /** 1091: * Enclosed CJK Letters and Months. 1092: * 0x3200 - 0x32FF. 1093: */ 1094: public static final UnicodeBlock ENCLOSED_CJK_LETTERS_AND_MONTHS 1095: = new UnicodeBlock(0x3200, 0x32FF, 1096: "ENCLOSED_CJK_LETTERS_AND_MONTHS", 1097: "Enclosed CJK Letters and Months"); 1098: 1099: /** 1100: * CJK Compatibility. 1101: * 0x3300 - 0x33FF. 1102: */ 1103: public static final UnicodeBlock CJK_COMPATIBILITY 1104: = new UnicodeBlock(0x3300, 0x33FF, 1105: "CJK_COMPATIBILITY", 1106: "CJK Compatibility"); 1107: 1108: /** 1109: * CJK Unified Ideographs Extension A. 1110: * 0x3400 - 0x4DBF. 1111: * @since 1.4 1112: */ 1113: public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A 1114: = new UnicodeBlock(0x3400, 0x4DBF, 1115: "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A", 1116: "CJK Unified Ideographs Extension A"); 1117: 1118: /** 1119: * Yijing Hexagram Symbols. 1120: * 0x4DC0 - 0x4DFF. 1121: * @since 1.5 1122: */ 1123: public static final UnicodeBlock YIJING_HEXAGRAM_SYMBOLS 1124: = new UnicodeBlock(0x4DC0, 0x4DFF, 1125: "YIJING_HEXAGRAM_SYMBOLS", 1126: "Yijing Hexagram Symbols"); 1127: 1128: /** 1129: * CJK Unified Ideographs. 1130: * 0x4E00 - 0x9FFF. 
1131: */ 1132: public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS 1133: = new UnicodeBlock(0x4E00, 0x9FFF, 1134: "CJK_UNIFIED_IDEOGRAPHS", 1135: "CJK Unified Ideographs"); 1136: 1137: /** 1138: * Yi Syllables. 1139: * 0xA000 - 0xA48F. 1140: * @since 1.4 1141: */ 1142: public static final UnicodeBlock YI_SYLLABLES 1143: = new UnicodeBlock(0xA000, 0xA48F, 1144: "YI_SYLLABLES", 1145: "Yi Syllables"); 1146: 1147: /** 1148: * Yi Radicals. 1149: * 0xA490 - 0xA4CF. 1150: * @since 1.4 1151: */ 1152: public static final UnicodeBlock YI_RADICALS 1153: = new UnicodeBlock(0xA490, 0xA4CF, 1154: "YI_RADICALS", 1155: "Yi Radicals"); 1156: 1157: /** 1158: * Hangul Syllables. 1159: * 0xAC00 - 0xD7AF. 1160: */ 1161: public static final UnicodeBlock HANGUL_SYLLABLES 1162: = new UnicodeBlock(0xAC00, 0xD7AF, 1163: "HANGUL_SYLLABLES", 1164: "Hangul Syllables"); 1165: 1166: /** 1167: * High Surrogates. 1168: * 0xD800 - 0xDB7F. 1169: * @since 1.5 1170: */ 1171: public static final UnicodeBlock HIGH_SURROGATES 1172: = new UnicodeBlock(0xD800, 0xDB7F, 1173: "HIGH_SURROGATES", 1174: "High Surrogates"); 1175: 1176: /** 1177: * High Private Use Surrogates. 1178: * 0xDB80 - 0xDBFF. 1179: * @since 1.5 1180: */ 1181: public static final UnicodeBlock HIGH_PRIVATE_USE_SURROGATES 1182: = new UnicodeBlock(0xDB80, 0xDBFF, 1183: "HIGH_PRIVATE_USE_SURROGATES", 1184: "High Private Use Surrogates"); 1185: 1186: /** 1187: * Low Surrogates. 1188: * 0xDC00 - 0xDFFF. 1189: * @since 1.5 1190: */ 1191: public static final UnicodeBlock LOW_SURROGATES 1192: = new UnicodeBlock(0xDC00, 0xDFFF, 1193: "LOW_SURROGATES", 1194: "Low Surrogates"); 1195: 1196: /** 1197: * Private Use Area. 1198: * 0xE000 - 0xF8FF. 1199: */ 1200: public static final UnicodeBlock PRIVATE_USE_AREA 1201: = new UnicodeBlock(0xE000, 0xF8FF, 1202: "PRIVATE_USE_AREA", 1203: "Private Use Area"); 1204: 1205: /** 1206: * CJK Compatibility Ideographs. 1207: * 0xF900 - 0xFAFF. 
1208: */ 1209: public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS 1210: = new UnicodeBlock(0xF900, 0xFAFF, 1211: "CJK_COMPATIBILITY_IDEOGRAPHS", 1212: "CJK Compatibility Ideographs"); 1213: 1214: /** 1215: * Alphabetic Presentation Forms. 1216: * 0xFB00 - 0xFB4F. 1217: */ 1218: public static final UnicodeBlock ALPHABETIC_PRESENTATION_FORMS 1219: = new UnicodeBlock(0xFB00, 0xFB4F, 1220: "ALPHABETIC_PRESENTATION_FORMS", 1221: "Alphabetic Presentation Forms"); 1222: 1223: /** 1224: * Arabic Presentation Forms-A. 1225: * 0xFB50 - 0xFDFF. 1226: */ 1227: public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_A 1228: = new UnicodeBlock(0xFB50, 0xFDFF, 1229: "ARABIC_PRESENTATION_FORMS_A", 1230: "Arabic Presentation Forms-A"); 1231: 1232: /** 1233: * Variation Selectors. 1234: * 0xFE00 - 0xFE0F. 1235: * @since 1.5 1236: */ 1237: public static final UnicodeBlock VARIATION_SELECTORS 1238: = new UnicodeBlock(0xFE00, 0xFE0F, 1239: "VARIATION_SELECTORS", 1240: "Variation Selectors"); 1241: 1242: /** 1243: * Combining Half Marks. 1244: * 0xFE20 - 0xFE2F. 1245: */ 1246: public static final UnicodeBlock COMBINING_HALF_MARKS 1247: = new UnicodeBlock(0xFE20, 0xFE2F, 1248: "COMBINING_HALF_MARKS", 1249: "Combining Half Marks"); 1250: 1251: /** 1252: * CJK Compatibility Forms. 1253: * 0xFE30 - 0xFE4F. 1254: */ 1255: public static final UnicodeBlock CJK_COMPATIBILITY_FORMS 1256: = new UnicodeBlock(0xFE30, 0xFE4F, 1257: "CJK_COMPATIBILITY_FORMS", 1258: "CJK Compatibility Forms"); 1259: 1260: /** 1261: * Small Form Variants. 1262: * 0xFE50 - 0xFE6F. 1263: */ 1264: public static final UnicodeBlock SMALL_FORM_VARIANTS 1265: = new UnicodeBlock(0xFE50, 0xFE6F, 1266: "SMALL_FORM_VARIANTS", 1267: "Small Form Variants"); 1268: 1269: /** 1270: * Arabic Presentation Forms-B. 1271: * 0xFE70 - 0xFEFF. 
1272: */ 1273: public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_B 1274: = new UnicodeBlock(0xFE70, 0xFEFF, 1275: "ARABIC_PRESENTATION_FORMS_B", 1276: "Arabic Presentation Forms-B"); 1277: 1278: /** 1279: * Halfwidth and Fullwidth Forms. 1280: * 0xFF00 - 0xFFEF. 1281: */ 1282: public static final UnicodeBlock HALFWIDTH_AND_FULLWIDTH_FORMS 1283: = new UnicodeBlock(0xFF00, 0xFFEF, 1284: "HALFWIDTH_AND_FULLWIDTH_FORMS", 1285: "Halfwidth and Fullwidth Forms"); 1286: 1287: /** 1288: * Specials. 1289: * 0xFFF0 - 0xFFFF. 1290: */ 1291: public static final UnicodeBlock SPECIALS 1292: = new UnicodeBlock(0xFFF0, 0xFFFF, 1293: "SPECIALS", 1294: "Specials"); 1295: 1296: /** 1297: * Linear B Syllabary. 1298: * 0x10000 - 0x1007F. 1299: * @since 1.5 1300: */ 1301: public static final UnicodeBlock LINEAR_B_SYLLABARY 1302: = new UnicodeBlock(0x10000, 0x1007F, 1303: "LINEAR_B_SYLLABARY", 1304: "Linear B Syllabary"); 1305: 1306: /** 1307: * Linear B Ideograms. 1308: * 0x10080 - 0x100FF. 1309: * @since 1.5 1310: */ 1311: public static final UnicodeBlock LINEAR_B_IDEOGRAMS 1312: = new UnicodeBlock(0x10080, 0x100FF, 1313: "LINEAR_B_IDEOGRAMS", 1314: "Linear B Ideograms"); 1315: 1316: /** 1317: * Aegean Numbers. 1318: * 0x10100 - 0x1013F. 1319: * @since 1.5 1320: */ 1321: public static final UnicodeBlock AEGEAN_NUMBERS 1322: = new UnicodeBlock(0x10100, 0x1013F, 1323: "AEGEAN_NUMBERS", 1324: "Aegean Numbers"); 1325: 1326: /** 1327: * Old Italic. 1328: * 0x10300 - 0x1032F. 1329: * @since 1.5 1330: */ 1331: public static final UnicodeBlock OLD_ITALIC 1332: = new UnicodeBlock(0x10300, 0x1032F, 1333: "OLD_ITALIC", 1334: "Old Italic"); 1335: 1336: /** 1337: * Gothic. 1338: * 0x10330 - 0x1034F. 1339: * @since 1.5 1340: */ 1341: public static final UnicodeBlock GOTHIC 1342: = new UnicodeBlock(0x10330, 0x1034F, 1343: "GOTHIC", 1344: "Gothic"); 1345: 1346: /** 1347: * Ugaritic. 1348: * 0x10380 - 0x1039F. 
1349: * @since 1.5 1350: */ 1351: public static final UnicodeBlock UGARITIC 1352: = new UnicodeBlock(0x10380, 0x1039F, 1353: "UGARITIC", 1354: "Ugaritic"); 1355: 1356: /** 1357: * Deseret. 1358: * 0x10400 - 0x1044F. 1359: * @since 1.5 1360: */ 1361: public static final UnicodeBlock DESERET 1362: = new UnicodeBlock(0x10400, 0x1044F, 1363: "DESERET", 1364: "Deseret"); 1365: 1366: /** 1367: * Shavian. 1368: * 0x10450 - 0x1047F. 1369: * @since 1.5 1370: */ 1371: public static final UnicodeBlock SHAVIAN 1372: = new UnicodeBlock(0x10450, 0x1047F, 1373: "SHAVIAN", 1374: "Shavian"); 1375: 1376: /** 1377: * Osmanya. 1378: * 0x10480 - 0x104AF. 1379: * @since 1.5 1380: */ 1381: public static final UnicodeBlock OSMANYA 1382: = new UnicodeBlock(0x10480, 0x104AF, 1383: "OSMANYA", 1384: "Osmanya"); 1385: 1386: /** 1387: * Cypriot Syllabary. 1388: * 0x10800 - 0x1083F. 1389: * @since 1.5 1390: */ 1391: public static final UnicodeBlock CYPRIOT_SYLLABARY 1392: = new UnicodeBlock(0x10800, 0x1083F, 1393: "CYPRIOT_SYLLABARY", 1394: "Cypriot Syllabary"); 1395: 1396: /** 1397: * Byzantine Musical Symbols. 1398: * 0x1D000 - 0x1D0FF. 1399: * @since 1.5 1400: */ 1401: public static final UnicodeBlock BYZANTINE_MUSICAL_SYMBOLS 1402: = new UnicodeBlock(0x1D000, 0x1D0FF, 1403: "BYZANTINE_MUSICAL_SYMBOLS", 1404: "Byzantine Musical Symbols"); 1405: 1406: /** 1407: * Musical Symbols. 1408: * 0x1D100 - 0x1D1FF. 1409: * @since 1.5 1410: */ 1411: public static final UnicodeBlock MUSICAL_SYMBOLS 1412: = new UnicodeBlock(0x1D100, 0x1D1FF, 1413: "MUSICAL_SYMBOLS", 1414: "Musical Symbols"); 1415: 1416: /** 1417: * Tai Xuan Jing Symbols. 1418: * 0x1D300 - 0x1D35F. 1419: * @since 1.5 1420: */ 1421: public static final UnicodeBlock TAI_XUAN_JING_SYMBOLS 1422: = new UnicodeBlock(0x1D300, 0x1D35F, 1423: "TAI_XUAN_JING_SYMBOLS", 1424: "Tai Xuan Jing Symbols"); 1425: 1426: /** 1427: * Mathematical Alphanumeric Symbols. 1428: * 0x1D400 - 0x1D7FF. 
1429: * @since 1.5 1430: */ 1431: public static final UnicodeBlock MATHEMATICAL_ALPHANUMERIC_SYMBOLS 1432: = new UnicodeBlock(0x1D400, 0x1D7FF, 1433: "MATHEMATICAL_ALPHANUMERIC_SYMBOLS", 1434: "Mathematical Alphanumeric Symbols"); 1435: 1436: /** 1437: * CJK Unified Ideographs Extension B. 1438: * 0x20000 - 0x2A6DF. 1439: * @since 1.5 1440: */ 1441: public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B 1442: = new UnicodeBlock(0x20000, 0x2A6DF, 1443: "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B", 1444: "CJK Unified Ideographs Extension B"); 1445: 1446: /** 1447: * CJK Compatibility Ideographs Supplement. 1448: * 0x2F800 - 0x2FA1F. 1449: * @since 1.5 1450: */ 1451: public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT 1452: = new UnicodeBlock(0x2F800, 0x2FA1F, 1453: "CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT", 1454: "CJK Compatibility Ideographs Supplement"); 1455: 1456: /** 1457: * Tags. 1458: * 0xE0000 - 0xE007F. 1459: * @since 1.5 1460: */ 1461: public static final UnicodeBlock TAGS 1462: = new UnicodeBlock(0xE0000, 0xE007F, 1463: "TAGS", 1464: "Tags"); 1465: 1466: /** 1467: * Variation Selectors Supplement. 1468: * 0xE0100 - 0xE01EF. 1469: * @since 1.5 1470: */ 1471: public static final UnicodeBlock VARIATION_SELECTORS_SUPPLEMENT 1472: = new UnicodeBlock(0xE0100, 0xE01EF, 1473: "VARIATION_SELECTORS_SUPPLEMENT", 1474: "Variation Selectors Supplement"); 1475: 1476: /** 1477: * Supplementary Private Use Area-A. 1478: * 0xF0000 - 0xFFFFF. 1479: * @since 1.5 1480: */ 1481: public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_A 1482: = new UnicodeBlock(0xF0000, 0xFFFFF, 1483: "SUPPLEMENTARY_PRIVATE_USE_AREA_A", 1484: "Supplementary Private Use Area-A"); 1485: 1486: /** 1487: * Supplementary Private Use Area-B. 1488: * 0x100000 - 0x10FFFF. 
1489: * @since 1.5 1490: */ 1491: public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_B 1492: = new UnicodeBlock(0x100000, 0x10FFFF, 1493: "SUPPLEMENTARY_PRIVATE_USE_AREA_B", 1494: "Supplementary Private Use Area-B"); 1495: 1496: /** 1497: * Surrogates Area. 1498: * 'D800' - 'DFFF'. 1499: * @deprecated As of 1.5, the three areas, 1500: * <a href="#HIGH_SURROGATES">HIGH_SURROGATES</a>, 1501: * <a href="#HIGH_PRIVATE_USE_SURROGATES">HIGH_PRIVATE_USE_SURROGATES</a> 1502: * and <a href="#LOW_SURROGATES">LOW_SURROGATES</a>, as defined 1503: * by the Unicode standard, should be used in preference to 1504: * this. These are also returned from calls to <code>of(int)</code> 1505: * and <code>of(char)</code>. 1506: */ 1507: @Deprecated 1508: public static final UnicodeBlock SURROGATES_AREA 1509: = new UnicodeBlock(0xD800, 0xDFFF, 1510: "SURROGATES_AREA", 1511: "Surrogates Area"); 1512: 1513: /** 1514: * The defined subsets. 1515: */ 1516: private static final UnicodeBlock sets[] = { 1517: BASIC_LATIN, 1518: LATIN_1_SUPPLEMENT, 1519: LATIN_EXTENDED_A, 1520: LATIN_EXTENDED_B, 1521: IPA_EXTENSIONS, 1522: SPACING_MODIFIER_LETTERS, 1523: COMBINING_DIACRITICAL_MARKS, 1524: GREEK, 1525: CYRILLIC, 1526: CYRILLIC_SUPPLEMENTARY, 1527: ARMENIAN, 1528: HEBREW, 1529: ARABIC, 1530: SYRIAC, 1531: THAANA, 1532: DEVANAGARI, 1533: BENGALI, 1534: GURMUKHI, 1535: GUJARATI, 1536: ORIYA, 1537: TAMIL, 1538: TELUGU, 1539: KANNADA, 1540: MALAYALAM, 1541: SINHALA, 1542: THAI, 1543: LAO, 1544: TIBETAN, 1545: MYANMAR, 1546: GEORGIAN, 1547: HANGUL_JAMO, 1548: ETHIOPIC, 1549: CHEROKEE, 1550: UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS, 1551: OGHAM, 1552: RUNIC, 1553: TAGALOG, 1554: HANUNOO, 1555: BUHID, 1556: TAGBANWA, 1557: KHMER, 1558: MONGOLIAN, 1559: LIMBU, 1560: TAI_LE, 1561: KHMER_SYMBOLS, 1562: PHONETIC_EXTENSIONS, 1563: LATIN_EXTENDED_ADDITIONAL, 1564: GREEK_EXTENDED, 1565: GENERAL_PUNCTUATION, 1566: SUPERSCRIPTS_AND_SUBSCRIPTS, 1567: CURRENCY_SYMBOLS, 1568: COMBINING_MARKS_FOR_SYMBOLS, 
1569: LETTERLIKE_SYMBOLS, 1570: NUMBER_FORMS, 1571: ARROWS, 1572: MATHEMATICAL_OPERATORS, 1573: MISCELLANEOUS_TECHNICAL, 1574: CONTROL_PICTURES, 1575: OPTICAL_CHARACTER_RECOGNITION, 1576: ENCLOSED_ALPHANUMERICS, 1577: BOX_DRAWING, 1578: BLOCK_ELEMENTS, 1579: GEOMETRIC_SHAPES, 1580: MISCELLANEOUS_SYMBOLS, 1581: DINGBATS, 1582: MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A, 1583: SUPPLEMENTAL_ARROWS_A, 1584: BRAILLE_PATTERNS, 1585: SUPPLEMENTAL_ARROWS_B, 1586: MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B, 1587: SUPPLEMENTAL_MATHEMATICAL_OPERATORS, 1588: MISCELLANEOUS_SYMBOLS_AND_ARROWS, 1589: CJK_RADICALS_SUPPLEMENT, 1590: KANGXI_RADICALS, 1591: IDEOGRAPHIC_DESCRIPTION_CHARACTERS, 1592: CJK_SYMBOLS_AND_PUNCTUATION, 1593: HIRAGANA, 1594: KATAKANA, 1595: BOPOMOFO, 1596: HANGUL_COMPATIBILITY_JAMO, 1597: KANBUN, 1598: BOPOMOFO_EXTENDED, 1599: KATAKANA_PHONETIC_EXTENSIONS, 1600: ENCLOSED_CJK_LETTERS_AND_MONTHS, 1601: CJK_COMPATIBILITY, 1602: CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A, 1603: YIJING_HEXAGRAM_SYMBOLS, 1604: CJK_UNIFIED_IDEOGRAPHS, 1605: YI_SYLLABLES, 1606: YI_RADICALS, 1607: HANGUL_SYLLABLES, 1608: HIGH_SURROGATES, 1609: HIGH_PRIVATE_USE_SURROGATES, 1610: LOW_SURROGATES, 1611: PRIVATE_USE_AREA, 1612: CJK_COMPATIBILITY_IDEOGRAPHS, 1613: ALPHABETIC_PRESENTATION_FORMS, 1614: ARABIC_PRESENTATION_FORMS_A, 1615: VARIATION_SELECTORS, 1616: COMBINING_HALF_MARKS, 1617: CJK_COMPATIBILITY_FORMS, 1618: SMALL_FORM_VARIANTS, 1619: ARABIC_PRESENTATION_FORMS_B, 1620: HALFWIDTH_AND_FULLWIDTH_FORMS, 1621: SPECIALS, 1622: LINEAR_B_SYLLABARY, 1623: LINEAR_B_IDEOGRAMS, 1624: AEGEAN_NUMBERS, 1625: OLD_ITALIC, 1626: GOTHIC, 1627: UGARITIC, 1628: DESERET, 1629: SHAVIAN, 1630: OSMANYA, 1631: CYPRIOT_SYLLABARY, 1632: BYZANTINE_MUSICAL_SYMBOLS, 1633: MUSICAL_SYMBOLS, 1634: TAI_XUAN_JING_SYMBOLS, 1635: MATHEMATICAL_ALPHANUMERIC_SYMBOLS, 1636: CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, 1637: CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, 1638: TAGS, 1639: VARIATION_SELECTORS_SUPPLEMENT, 1640: 
SUPPLEMENTARY_PRIVATE_USE_AREA_A, 1641: SUPPLEMENTARY_PRIVATE_USE_AREA_B, 1642: }; 1643: } // class UnicodeBlock 1644: 1645: /** 1646: * A class to encompass all the properties of characters in the 1647: * private use blocks in the Unicode standard. This class extends 1648: * UnassignedCharacters and redefines the static methods getType(), isDefined() and 1649: * getDirectionality(), whose returned values differ for private-use code points. 1650: * @author Anthony Balkissoon abalkiss at redhat dot com 1651: * 1652: */ 1653: private static class PrivateUseCharacters extends UnassignedCharacters 1654: { 1655: /** 1656: * Returns the type of the character cp. 1657: */ 1658: static int getType(int cp) 1659: { 1660: // The upper 2 code points in any plane are considered unassigned, 1661: // even in the private-use planes. 1662: if ((cp & 0xffff) >= 0xfffe) 1663: return UnassignedCharacters.getType(cp); 1664: return PRIVATE_USE; 1665: } 1666: 1667: /** 1668: * Returns true if the character cp is defined. 1669: */ 1670: static boolean isDefined(int cp) 1671: { 1672: // The upper 2 code points in any plane are considered unassigned, 1673: // even in the private-use planes. 1674: if ((cp & 0xffff) >= 0xfffe) 1675: return UnassignedCharacters.isDefined(cp); 1676: return true; 1677: } 1678: 1679: /** 1680: * Gets the directionality for the character cp: left-to-right, except for the upper 2 code points of a plane, which are treated as unassigned. 1681: */ 1682: static byte getDirectionality(int cp) 1683: { 1684: if ((cp & 0xffff) >= 0xfffe) 1685: return UnassignedCharacters.getDirectionality(cp); 1686: return DIRECTIONALITY_LEFT_TO_RIGHT; 1687: } 1688: } 1689: 1690: /** 1691: * A class to encompass all the properties of code points that are 1692: * currently undefined in the Unicode standard. 1693: * @author Anthony Balkissoon abalkiss at redhat dot com 1694: * 1695: */ 1696: private static class UnassignedCharacters 1697: { 1698: /** 1699: * Returns the numeric value for the unassigned characters.
1700: * @param cp the character 1701: * @param radix the radix (not used) 1702: * @return the numeric value of this character in this radix 1703: */ 1704: static int digit(int cp, int radix) 1705: { 1706: return -1; 1707: } 1708: 1709: /** 1710: * Returns the Unicode directionality property for unassigned 1711: * characters. 1712: * @param cp the character 1713: * @return DIRECTIONALITY_UNDEFINED 1714: */ 1715: static byte getDirectionality(int cp) 1716: { 1717: return DIRECTIONALITY_UNDEFINED; 1718: } 1719: 1720: /** 1721: * Returns -1, the numeric value for unassigned Unicode characters. 1722: * @param cp the character 1723: * @return -1 1724: */ 1725: static int getNumericValue(int cp) 1726: { 1727: return -1; 1728: } 1729: 1730: /** 1731: * Returns UNASSIGNED, the type of unassigned Unicode characters. 1732: * @param cp the character 1733: * @return UNASSIGNED 1734: */ 1735: static int getType(int cp) 1736: { 1737: return UNASSIGNED; 1738: } 1739: 1740: /** 1741: * Returns false to indicate that the character is not defined in the 1742: * Unicode standard. 1743: * @param cp the character 1744: * @return false 1745: */ 1746: static boolean isDefined(int cp) 1747: { 1748: return false; 1749: } 1750: 1751: /** 1752: * Returns false to indicate that the character is not a digit. 1753: * @param cp the character 1754: * @return false 1755: */ 1756: static boolean isDigit(int cp) 1757: { 1758: return false; 1759: } 1760: 1761: /** 1762: * Returns false to indicate that the character cannot be ignored 1763: * within an identifier. 1764: * @param cp the character 1765: * @return false 1766: */ 1767: static boolean isIdentifierIgnorable(int cp) 1768: { 1769: return false; 1770: } 1771: 1772: /** 1773: * Returns false to indicate that the character cannot be part of a 1774: * Java identifier.
1775: * @param cp the character 1776: * @return false 1777: */ 1778: static boolean isJavaIdentifierPart(int cp) 1779: { 1780: return false; 1781: } 1782: 1783: /** 1784: * Returns false to indicate that the character cannot start a 1785: * Java identifier. NOTE(review): the method name is spelled "Identifer"; correcting it would require updating any callers. 1786: * @param cp the character 1787: * @return false 1788: */ 1789: static boolean isJavaIdentiferStart(int cp) 1790: { 1791: return false; 1792: } 1793: 1794: /** 1795: * Returns false to indicate that the character is not a letter. 1796: * @param cp the character 1797: * @return false 1798: */ 1799: static boolean isLetter(int cp) 1800: { 1801: return false; 1802: } 1803: 1804: /** 1805: * Returns false to indicate that the character is neither a letter 1806: * nor a digit. 1807: * @param cp the character 1808: * @return false 1809: */ 1810: static boolean isLetterOrDigit(int cp) 1811: { 1812: return false; 1813: } 1814: 1815: /** 1816: * Returns false to indicate that the character is not a lowercase letter. 1817: * @param cp the character 1818: * @return false 1819: */ 1820: static boolean isLowerCase(int cp) 1821: { 1822: return false; 1823: } 1824: 1825: /** 1826: * Returns false to indicate that the character is not mirrored. 1827: * @param cp the character 1828: * @return false 1829: */ 1830: static boolean isMirrored(int cp) 1831: { 1832: return false; 1833: } 1834: 1835: /** 1836: * Returns false to indicate that the character is not a space character. 1837: * @param cp the character 1838: * @return false 1839: */ 1840: static boolean isSpaceChar(int cp) 1841: { 1842: return false; 1843: } 1844: 1845: /** 1846: * Returns false to indicate that the character is not a titlecase letter. 1847: * @param cp the character 1848: * @return false 1849: */ 1850: static boolean isTitleCase(int cp) 1851: { 1852: return false; 1853: } 1854: 1855: /** 1856: * Returns false to indicate that the character cannot be part of a 1857: * Unicode identifier.
1858: * @param cp the character 1859: * @return false 1860: */ 1861: static boolean isUnicodeIdentifierPart(int cp) 1862: { 1863: return false; 1864: } 1865: 1866: /** 1867: * Returns false to indicate that the character cannot start a 1868: * Unicode identifier. 1869: * @param cp the character 1870: * @return false 1871: */ 1872: static boolean isUnicodeIdentifierStart(int cp) 1873: { 1874: return false; 1875: } 1876: 1877: /** 1878: * Returns false to indicate that the character is not an uppercase letter. 1879: * @param cp the character 1880: * @return false 1881: */ 1882: static boolean isUpperCase(int cp) 1883: { 1884: return false; 1885: } 1886: 1887: /** 1888: * Returns false to indicate that the character is not a whitespace 1889: * character. 1890: * @param cp the character 1891: * @return false 1892: */ 1893: static boolean isWhiteSpace(int cp) 1894: { 1895: return false; 1896: } 1897: 1898: /** 1899: * Returns cp to indicate this character has no lowercase conversion. 1900: * @param cp the character 1901: * @return cp 1902: */ 1903: static int toLowerCase(int cp) 1904: { 1905: return cp; 1906: } 1907: 1908: /** 1909: * Returns cp to indicate this character has no titlecase conversion. 1910: * @param cp the character 1911: * @return cp 1912: */ 1913: static int toTitleCase(int cp) 1914: { 1915: return cp; 1916: } 1917: 1918: /** 1919: * Returns cp to indicate this character has no uppercase conversion. 1920: * @param cp the character 1921: * @return cp 1922: */ 1923: static int toUpperCase(int cp) 1924: { 1925: return cp; 1926: } 1927: } 1928: 1929: /** 1930: * The immutable value of this Character. 1931: * 1932: * @serial the value of this Character 1933: */ 1934: private final char value; 1935: 1936: /** 1937: * Compatible with JDK 1.0+. 1938: */ 1939: private static final long serialVersionUID = 3786198910865385080L; 1940: 1941: /** 1942: * Smallest value allowed for radix arguments in Java. This value is 2. 
1943: * 1944: * @see #digit(char, int) 1945: * @see #forDigit(int, int) 1946: * @see Integer#toString(int, int) 1947: * @see Integer#valueOf(String) 1948: */ 1949: public static final int MIN_RADIX = 2; 1950: 1951: /** 1952: * Largest value allowed for radix arguments in Java. This value is 36. 1953: * 1954: * @see #digit(char, int) 1955: * @see #forDigit(int, int) 1956: * @see Integer#toString(int, int) 1957: * @see Integer#valueOf(String) 1958: */ 1959: public static final int MAX_RADIX = 36; 1960: 1961: /** 1962: * The minimum value the char data type can hold. 1963: * This value is <code>'\\u0000'</code>. 1964: */ 1965: public static final char MIN_VALUE = '\u0000'; 1966: 1967: /** 1968: * The maximum value the char data type can hold. 1969: * This value is <code>'\\uFFFF'</code>. 1970: */ 1971: public static final char MAX_VALUE = '\uFFFF'; 1972: 1973: /** 1974: * The minimum Unicode 4.0 code point. This value is <code>0</code>. 1975: * @since 1.5 1976: */ 1977: public static final int MIN_CODE_POINT = 0; 1978: 1979: /** 1980: * The maximum Unicode 4.0 code point, which is greater than the range 1981: * of the char data type. 1982: * This value is <code>0x10FFFF</code>. 1983: * @since 1.5 1984: */ 1985: public static final int MAX_CODE_POINT = 0x10FFFF; 1986: 1987: /** 1988: * The minimum Unicode high surrogate code unit, or 1989: * <em>leading-surrogate</em>, in the UTF-16 character encoding. 1990: * This value is <code>'\uD800'</code>. 1991: * @since 1.5 1992: */ 1993: public static final char MIN_HIGH_SURROGATE = '\uD800'; 1994: 1995: /** 1996: * The maximum Unicode high surrogate code unit, or 1997: * <em>leading-surrogate</em>, in the UTF-16 character encoding. 1998: * This value is <code>'\uDBFF'</code>. 1999: * @since 1.5 2000: */ 2001: public static final char MAX_HIGH_SURROGATE = '\uDBFF'; 2002: 2003: /** 2004: * The minimum Unicode low surrogate code unit, or 2005: * <em>trailing-surrogate</em>, in the UTF-16 character encoding. 
2006: * This value is <code>'\uDC00'</code>. 2007: * @since 1.5 2008: */ 2009: public static final char MIN_LOW_SURROGATE = '\uDC00'; 2010: 2011: /** 2012: * The maximum Unicode low surrogate code unit, or 2013: * <em>trailing-surrogate</em>, in the UTF-16 character encoding. 2014: * This value is <code>'\uDFFF'</code>. 2015: * @since 1.5 2016: */ 2017: public static final char MAX_LOW_SURROGATE = '\uDFFF'; 2018: 2019: /** 2020: * The minimum Unicode surrogate code unit in the UTF-16 character encoding. 2021: * This value is <code>'\uD800'</code>. 2022: * @since 1.5 2023: */ 2024: public static final char MIN_SURROGATE = MIN_HIGH_SURROGATE; 2025: 2026: /** 2027: * The maximum Unicode surrogate code unit in the UTF-16 character encoding. 2028: * This value is <code>'\uDFFF'</code>. 2029: * @since 1.5 2030: */ 2031: public static final char MAX_SURROGATE = MAX_LOW_SURROGATE; 2032: 2033: /** 2034: * The lowest possible supplementary Unicode code point (the first code 2035: * point outside the basic multilingual plane (BMP)). 2036: * This value is <code>0x10000</code>. 2037: */ 2038: public static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000; 2039: 2040: /** 2041: * Class object representing the primitive char data type. 2042: * 2043: * @since 1.1 2044: */ 2045: public static final Class<Character> TYPE = (Class<Character>) VMClassLoader.getPrimitiveClass('C'); 2046: 2047: /** 2048: * The number of bits needed to represent a <code>char</code>. 2049: * @since 1.5 2050: */ 2051: public static final int SIZE = 16; 2052: 2053: // This caches some Character values, and is used by boxing 2054: // conversions via valueOf(). We must cache at least 0..127; 2055: // this constant controls how much we actually cache. 2056: private static final int MAX_CACHE = 127; 2057: private static Character[] charCache = new Character[MAX_CACHE + 1]; 2058: 2059: /** 2060: * Lu = Letter, Uppercase (Informative).
2061: * 2062: * @since 1.1 2063: */ 2064: public static final byte UPPERCASE_LETTER = 1; 2065: 2066: /** 2067: * Ll = Letter, Lowercase (Informative). 2068: * 2069: * @since 1.1 2070: */ 2071: public static final byte LOWERCASE_LETTER = 2; 2072: 2073: /** 2074: * Lt = Letter, Titlecase (Informative). 2075: * 2076: * @since 1.1 2077: */ 2078: public static final byte TITLECASE_LETTER = 3; 2079: 2080: /** 2081: * Mn = Mark, Non-Spacing (Normative). 2082: * 2083: * @since 1.1 2084: */ 2085: public static final byte NON_SPACING_MARK = 6; 2086: 2087: /** 2088: * Mc = Mark, Spacing Combining (Normative). 2089: * 2090: * @since 1.1 2091: */ 2092: public static final byte COMBINING_SPACING_MARK = 8; 2093: 2094: /** 2095: * Me = Mark, Enclosing (Normative). 2096: * 2097: * @since 1.1 2098: */ 2099: public static final byte ENCLOSING_MARK = 7; 2100: 2101: /** 2102: * Nd = Number, Decimal Digit (Normative). 2103: * 2104: * @since 1.1 2105: */ 2106: public static final byte DECIMAL_DIGIT_NUMBER = 9; 2107: 2108: /** 2109: * Nl = Number, Letter (Normative). 2110: * 2111: * @since 1.1 2112: */ 2113: public static final byte LETTER_NUMBER = 10; 2114: 2115: /** 2116: * No = Number, Other (Normative). 2117: * 2118: * @since 1.1 2119: */ 2120: public static final byte OTHER_NUMBER = 11; 2121: 2122: /** 2123: * Zs = Separator, Space (Normative). 2124: * 2125: * @since 1.1 2126: */ 2127: public static final byte SPACE_SEPARATOR = 12; 2128: 2129: /** 2130: * Zl = Separator, Line (Normative). 2131: * 2132: * @since 1.1 2133: */ 2134: public static final byte LINE_SEPARATOR = 13; 2135: 2136: /** 2137: * Zp = Separator, Paragraph (Normative). 2138: * 2139: * @since 1.1 2140: */ 2141: public static final byte PARAGRAPH_SEPARATOR = 14; 2142: 2143: /** 2144: * Cc = Other, Control (Normative). 2145: * 2146: * @since 1.1 2147: */ 2148: public static final byte CONTROL = 15; 2149: 2150: /** 2151: * Cf = Other, Format (Normative). 
2152: * 2153: * @since 1.1 2154: */ 2155: public static final byte FORMAT = 16; 2156: 2157: /** 2158: * Cs = Other, Surrogate (Normative). 2159: * 2160: * @since 1.1 2161: */ 2162: public static final byte SURROGATE = 19; 2163: 2164: /** 2165: * Co = Other, Private Use (Normative). 2166: * 2167: * @since 1.1 2168: */ 2169: public static final byte PRIVATE_USE = 18; 2170: 2171: /** 2172: * Cn = Other, Not Assigned (Normative). 2173: * 2174: * @since 1.1 2175: */ 2176: public static final byte UNASSIGNED = 0; 2177: 2178: /** 2179: * Lm = Letter, Modifier (Informative). 2180: * 2181: * @since 1.1 2182: */ 2183: public static final byte MODIFIER_LETTER = 4; 2184: 2185: /** 2186: * Lo = Letter, Other (Informative). 2187: * 2188: * @since 1.1 2189: */ 2190: public static final byte OTHER_LETTER = 5; 2191: 2192: /** 2193: * Pc = Punctuation, Connector (Informative). 2194: * 2195: * @since 1.1 2196: */ 2197: public static final byte CONNECTOR_PUNCTUATION = 23; 2198: 2199: /** 2200: * Pd = Punctuation, Dash (Informative). 2201: * 2202: * @since 1.1 2203: */ 2204: public static final byte DASH_PUNCTUATION = 20; 2205: 2206: /** 2207: * Ps = Punctuation, Open (Informative). 2208: * 2209: * @since 1.1 2210: */ 2211: public static final byte START_PUNCTUATION = 21; 2212: 2213: /** 2214: * Pe = Punctuation, Close (Informative). 2215: * 2216: * @since 1.1 2217: */ 2218: public static final byte END_PUNCTUATION = 22; 2219: 2220: /** 2221: * Pi = Punctuation, Initial Quote (Informative). 2222: * 2223: * @since 1.4 2224: */ 2225: public static final byte INITIAL_QUOTE_PUNCTUATION = 29; 2226: 2227: /** 2228: * Pf = Punctuation, Final Quote (Informative). 2229: * 2230: * @since 1.4 2231: */ 2232: public static final byte FINAL_QUOTE_PUNCTUATION = 30; 2233: 2234: /** 2235: * Po = Punctuation, Other (Informative). 2236: * 2237: * @since 1.1 2238: */ 2239: public static final byte OTHER_PUNCTUATION = 24; 2240: 2241: /** 2242: * Sm = Symbol, Math (Informative). 
2243: * 2244: * @since 1.1 2245: */ 2246: public static final byte MATH_SYMBOL = 25; 2247: 2248: /** 2249: * Sc = Symbol, Currency (Informative). 2250: * 2251: * @since 1.1 2252: */ 2253: public static final byte CURRENCY_SYMBOL = 26; 2254: 2255: /** 2256: * Sk = Symbol, Modifier (Informative). 2257: * 2258: * @since 1.1 2259: */ 2260: public static final byte MODIFIER_SYMBOL = 27; 2261: 2262: /** 2263: * So = Symbol, Other (Informative). 2264: * 2265: * @since 1.1 2266: */ 2267: public static final byte OTHER_SYMBOL = 28; 2268: 2269: /** 2270: * Undefined bidirectional character type. Undefined char values have 2271: * undefined directionality in the Unicode specification. 2272: * 2273: * @since 1.4 2274: */ 2275: public static final byte DIRECTIONALITY_UNDEFINED = -1; 2276: 2277: /** 2278: * Strong bidirectional character type "L". 2279: * 2280: * @since 1.4 2281: */ 2282: public static final byte DIRECTIONALITY_LEFT_TO_RIGHT = 0; 2283: 2284: /** 2285: * Strong bidirectional character type "R". 2286: * 2287: * @since 1.4 2288: */ 2289: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT = 1; 2290: 2291: /** 2292: * Strong bidirectional character type "AL". 2293: * 2294: * @since 1.4 2295: */ 2296: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 2; 2297: 2298: /** 2299: * Weak bidirectional character type "EN". 2300: * 2301: * @since 1.4 2302: */ 2303: public static final byte DIRECTIONALITY_EUROPEAN_NUMBER = 3; 2304: 2305: /** 2306: * Weak bidirectional character type "ES". 2307: * 2308: * @since 1.4 2309: */ 2310: public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 4; 2311: 2312: /** 2313: * Weak bidirectional character type "ET". 2314: * 2315: * @since 1.4 2316: */ 2317: public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 5; 2318: 2319: /** 2320: * Weak bidirectional character type "AN". 
2321: * 2322: * @since 1.4 2323: */ 2324: public static final byte DIRECTIONALITY_ARABIC_NUMBER = 6; 2325: 2326: /** 2327: * Weak bidirectional character type "CS". 2328: * 2329: * @since 1.4 2330: */ 2331: public static final byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 7; 2332: 2333: /** 2334: * Weak bidirectional character type "NSM". 2335: * 2336: * @since 1.4 2337: */ 2338: public static final byte DIRECTIONALITY_NONSPACING_MARK = 8; 2339: 2340: /** 2341: * Weak bidirectional character type "BN". 2342: * 2343: * @since 1.4 2344: */ 2345: public static final byte DIRECTIONALITY_BOUNDARY_NEUTRAL = 9; 2346: 2347: /** 2348: * Neutral bidirectional character type "B". 2349: * 2350: * @since 1.4 2351: */ 2352: public static final byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10; 2353: 2354: /** 2355: * Neutral bidirectional character type "S". 2356: * 2357: * @since 1.4 2358: */ 2359: public static final byte DIRECTIONALITY_SEGMENT_SEPARATOR = 11; 2360: 2361: /** 2362: * Neutral bidirectional character type "WS". 2363: * 2364: * @since 1.4 2365: */ 2366: public static final byte DIRECTIONALITY_WHITESPACE = 12; 2367: 2368: /** 2369: * Neutral bidirectional character type "ON". 2370: * 2371: * @since 1.4 2372: */ 2373: public static final byte DIRECTIONALITY_OTHER_NEUTRALS = 13; 2374: 2375: /** 2376: * Strong bidirectional character type "LRE". 2377: * 2378: * @since 1.4 2379: */ 2380: public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14; 2381: 2382: /** 2383: * Strong bidirectional character type "LRO". 2384: * 2385: * @since 1.4 2386: */ 2387: public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15; 2388: 2389: /** 2390: * Strong bidirectional character type "RLE". 2391: * 2392: * @since 1.4 2393: */ 2394: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16; 2395: 2396: /** 2397: * Strong bidirectional character type "RLO".
2398: * 2399: * @since 1.4 2400: */ 2401: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17; 2402: 2403: /** 2404: * Weak bidirectional character type "PDF". 2405: * 2406: * @since 1.4 2407: */ 2408: public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18; 2409: 2410: /** 2411: * Stores unicode block offset lookup table. Exploit package visibility of 2412: * String.value to avoid copying the array. 2413: * @see #readCodePoint(int) 2414: * @see CharData#BLOCKS 2415: */ 2416: private static final char[][] blocks = 2417: new char[][]{ 2418: String.zeroBasedStringValue(CharData.BLOCKS[0]), 2419: String.zeroBasedStringValue(CharData.BLOCKS[1]), 2420: String.zeroBasedStringValue(CharData.BLOCKS[2]), 2421: String.zeroBasedStringValue(CharData.BLOCKS[3]), 2422: String.zeroBasedStringValue(CharData.BLOCKS[4]), 2423: String.zeroBasedStringValue(CharData.BLOCKS[5]), 2424: String.zeroBasedStringValue(CharData.BLOCKS[6]), 2425: String.zeroBasedStringValue(CharData.BLOCKS[7]), 2426: String.zeroBasedStringValue(CharData.BLOCKS[8]), 2427: String.zeroBasedStringValue(CharData.BLOCKS[9]), 2428: String.zeroBasedStringValue(CharData.BLOCKS[10]), 2429: String.zeroBasedStringValue(CharData.BLOCKS[11]), 2430: String.zeroBasedStringValue(CharData.BLOCKS[12]), 2431: String.zeroBasedStringValue(CharData.BLOCKS[13]), 2432: String.zeroBasedStringValue(CharData.BLOCKS[14]), 2433: String.zeroBasedStringValue(CharData.BLOCKS[15]), 2434: String.zeroBasedStringValue(CharData.BLOCKS[16])}; 2435: 2436: /** 2437: * Stores unicode attribute offset lookup table. Exploit package visibility 2438: * of String.value to avoid copying the array. 
* @see CharData#DATA
 */
private static final char[][] data = new char[17][];
static
{
  // Rows are filled in ascending index order, one per CharData.DATA entry.
  for (int plane = 0; plane < data.length; plane++)
    data[plane] = String.zeroBasedStringValue(CharData.DATA[plane]);
}

/**
 * Stores unicode numeric value attribute table.  Exploit package visibility
 * of String.value to avoid copying the array.
* @see CharData#NUM_VALUE
 */
private static final char[][] numValue = new char[17][];
static
{
  // Rows are filled in ascending index order, one per CharData.NUM_VALUE
  // entry.
  for (int plane = 0; plane < numValue.length; plane++)
    numValue[plane] = String.zeroBasedStringValue(CharData.NUM_VALUE[plane]);
}

/**
 * Stores unicode uppercase attribute table.  Exploit package visibility
 * of String.value to avoid copying the array.
* @see CharData#UPPER
 */
private static final char[][] upper = new char[17][];
static
{
  // Rows are filled in ascending index order, one per CharData.UPPER entry.
  for (int plane = 0; plane < upper.length; plane++)
    upper[plane] = String.zeroBasedStringValue(CharData.UPPER[plane]);
}

/**
 * Stores unicode lowercase attribute table.  Exploit package visibility
 * of String.value to avoid copying the array.
* @see CharData#LOWER
 */
private static final char[][] lower = new char[17][];
static
{
  // Rows are filled in ascending index order, one per CharData.LOWER entry.
  for (int plane = 0; plane < lower.length; plane++)
    lower[plane] = String.zeroBasedStringValue(CharData.LOWER[plane]);
}

/**
 * Stores unicode direction attribute table.  Exploit package visibility
 * of String.value to avoid copying the array.
 * @see CharData#DIRECTION
 */
// Package visible for use by String.
2542: static final char[][] direction = 2543: new char[][]{ 2544: String.zeroBasedStringValue(CharData.DIRECTION[0]), 2545: String.zeroBasedStringValue(CharData.DIRECTION[1]), 2546: String.zeroBasedStringValue(CharData.DIRECTION[2]), 2547: String.zeroBasedStringValue(CharData.DIRECTION[3]), 2548: String.zeroBasedStringValue(CharData.DIRECTION[4]), 2549: String.zeroBasedStringValue(CharData.DIRECTION[5]), 2550: String.zeroBasedStringValue(CharData.DIRECTION[6]), 2551: String.zeroBasedStringValue(CharData.DIRECTION[7]), 2552: String.zeroBasedStringValue(CharData.DIRECTION[8]), 2553: String.zeroBasedStringValue(CharData.DIRECTION[9]), 2554: String.zeroBasedStringValue(CharData.DIRECTION[10]), 2555: String.zeroBasedStringValue(CharData.DIRECTION[11]), 2556: String.zeroBasedStringValue(CharData.DIRECTION[12]), 2557: String.zeroBasedStringValue(CharData.DIRECTION[13]), 2558: String.zeroBasedStringValue(CharData.DIRECTION[14]), 2559: String.zeroBasedStringValue(CharData.DIRECTION[15]), 2560: String.zeroBasedStringValue(CharData.DIRECTION[16])}; 2561: 2562: /** 2563: * Stores unicode titlecase table. Exploit package visibility of 2564: * String.value to avoid copying the array. 2565: * @see CharData#TITLE 2566: */ 2567: private static final char[] title = String.zeroBasedStringValue(CharData.TITLE); 2568: 2569: /** 2570: * Mask for grabbing the type out of the contents of data. 2571: * @see CharData#DATA 2572: */ 2573: private static final int TYPE_MASK = 0x1F; 2574: 2575: /** 2576: * Mask for grabbing the non-breaking space flag out of the contents of 2577: * data. 2578: * @see CharData#DATA 2579: */ 2580: private static final int NO_BREAK_MASK = 0x20; 2581: 2582: /** 2583: * Mask for grabbing the mirrored directionality flag out of the contents 2584: * of data. 2585: * @see CharData#DATA 2586: */ 2587: private static final int MIRROR_MASK = 0x40; 2588: 2589: /** 2590: * Grabs an attribute offset from the Unicode attribute database. 
The lower 2591: * 5 bits are the character type, the next 2 bits are flags, and the top 2592: * 9 bits are the offset into the attribute tables. 2593: * 2594: * @param codePoint the character to look up 2595: * @return the character's attribute offset and type 2596: * @see #TYPE_MASK 2597: * @see #NO_BREAK_MASK 2598: * @see #MIRROR_MASK 2599: * @see CharData#DATA 2600: * @see CharData#SHIFT 2601: */ 2602: // Package visible for use in String. 2603: static char readCodePoint(int codePoint) 2604: { 2605: int plane = codePoint >>> 16; 2606: char offset = (char) (codePoint & 0xffff); 2607: return data[plane][(char) (blocks[plane][offset >> CharData.SHIFT[plane]] + offset)]; 2608: } 2609: 2610: /** 2611: * Wraps up a character. 2612: * 2613: * @param value the character to wrap 2614: */ 2615: public Character(char value) 2616: { 2617: this.value = value; 2618: } 2619: 2620: /** 2621: * Returns the character which has been wrapped by this class. 2622: * 2623: * @return the character wrapped 2624: */ 2625: public char charValue() 2626: { 2627: return value; 2628: } 2629: 2630: /** 2631: * Returns the numerical value (unsigned) of the wrapped character. 2632: * Range of returned values: 0x0000-0xFFFF. 2633: * 2634: * @return the value of the wrapped character 2635: */ 2636: public int hashCode() 2637: { 2638: return value; 2639: } 2640: 2641: /** 2642: * Determines if an object is equal to this object. This is only true for 2643: * another Character object wrapping the same value. 2644: * 2645: * @param o object to compare 2646: * @return true if o is a Character with the same value 2647: */ 2648: public boolean equals(Object o) 2649: { 2650: return o instanceof Character && value == ((Character) o).value; 2651: } 2652: 2653: /** 2654: * Converts the wrapped character into a String. 
2655: * 2656: * @return a String containing one character -- the wrapped character 2657: * of this instance 2658: */ 2659: public String toString() 2660: { 2661: // Package constructor avoids an array copy. 2662: return new String(new char[] { value }, 0, 1, true); 2663: } 2664: 2665: /** 2666: * Returns a String of length 1 representing the specified character. 2667: * 2668: * @param ch the character to convert 2669: * @return a String containing the character 2670: * @since 1.4 2671: */ 2672: public static String toString(char ch) 2673: { 2674: // Package constructor avoids an array copy. 2675: return new String(new char[] { ch }, 0, 1, true); 2676: } 2677: 2678: /** 2679: * Determines if a character is a Unicode lowercase letter. For example, 2680: * <code>'a'</code> is lowercase. Returns true if getType() returns 2681: * LOWERCASE_LETTER. 2682: * <br> 2683: * lowercase = [Ll] 2684: * 2685: * @param ch character to test 2686: * @return true if ch is a Unicode lowercase letter, else false 2687: * @see #isUpperCase(char) 2688: * @see #isTitleCase(char) 2689: * @see #toLowerCase(char) 2690: * @see #getType(char) 2691: */ 2692: public static boolean isLowerCase(char ch) 2693: { 2694: return isLowerCase((int)ch); 2695: } 2696: 2697: /** 2698: * Determines if a character is a Unicode lowercase letter. For example, 2699: * <code>'a'</code> is lowercase. Returns true if getType() returns 2700: * LOWERCASE_LETTER. 2701: * <br> 2702: * lowercase = [Ll] 2703: * 2704: * @param codePoint character to test 2705: * @return true if ch is a Unicode lowercase letter, else false 2706: * @see #isUpperCase(char) 2707: * @see #isTitleCase(char) 2708: * @see #toLowerCase(char) 2709: * @see #getType(char) 2710: * 2711: * @since 1.5 2712: */ 2713: public static boolean isLowerCase(int codePoint) 2714: { 2715: return getType(codePoint) == LOWERCASE_LETTER; 2716: } 2717: 2718: /** 2719: * Determines if a character is a Unicode uppercase letter. 
For example, 2720: * <code>'A'</code> is uppercase. Returns true if getType() returns 2721: * UPPERCASE_LETTER. 2722: * <br> 2723: * uppercase = [Lu] 2724: * 2725: * @param ch character to test 2726: * @return true if ch is a Unicode uppercase letter, else false 2727: * @see #isLowerCase(char) 2728: * @see #isTitleCase(char) 2729: * @see #toUpperCase(char) 2730: * @see #getType(char) 2731: */ 2732: public static boolean isUpperCase(char ch) 2733: { 2734: return isUpperCase((int)ch); 2735: } 2736: 2737: /** 2738: * Determines if a character is a Unicode uppercase letter. For example, 2739: * <code>'A'</code> is uppercase. Returns true if getType() returns 2740: * UPPERCASE_LETTER. 2741: * <br> 2742: * uppercase = [Lu] 2743: * 2744: * @param codePoint character to test 2745: * @return true if ch is a Unicode uppercase letter, else false 2746: * @see #isLowerCase(char) 2747: * @see #isTitleCase(char) 2748: * @see #toUpperCase(char) 2749: * @see #getType(char) 2750: * 2751: * @since 1.5 2752: */ 2753: public static boolean isUpperCase(int codePoint) 2754: { 2755: return getType(codePoint) == UPPERCASE_LETTER; 2756: } 2757: 2758: /** 2759: * Determines if a character is a Unicode titlecase letter. For example, 2760: * the character "Lj" (Latin capital L with small letter j) is titlecase. 2761: * True if getType() returns TITLECASE_LETTER. 2762: * <br> 2763: * titlecase = [Lt] 2764: * 2765: * @param ch character to test 2766: * @return true if ch is a Unicode titlecase letter, else false 2767: * @see #isLowerCase(char) 2768: * @see #isUpperCase(char) 2769: * @see #toTitleCase(char) 2770: * @see #getType(char) 2771: */ 2772: public static boolean isTitleCase(char ch) 2773: { 2774: return isTitleCase((int)ch); 2775: } 2776: 2777: /** 2778: * Determines if a character is a Unicode titlecase letter. For example, 2779: * the character "Lj" (Latin capital L with small letter j) is titlecase. 2780: * True if getType() returns TITLECASE_LETTER. 
2781: * <br> 2782: * titlecase = [Lt] 2783: * 2784: * @param codePoint character to test 2785: * @return true if ch is a Unicode titlecase letter, else false 2786: * @see #isLowerCase(char) 2787: * @see #isUpperCase(char) 2788: * @see #toTitleCase(char) 2789: * @see #getType(char) 2790: * 2791: * @since 1.5 2792: */ 2793: public static boolean isTitleCase(int codePoint) 2794: { 2795: return getType(codePoint) == TITLECASE_LETTER; 2796: } 2797: 2798: 2799: /** 2800: * Determines if a character is a Unicode decimal digit. For example, 2801: * <code>'0'</code> is a digit. A character is a Unicode digit if 2802: * getType() returns DECIMAL_DIGIT_NUMBER. 2803: * <br> 2804: * Unicode decimal digit = [Nd] 2805: * 2806: * @param ch character to test 2807: * @return true if ch is a Unicode decimal digit, else false 2808: * @see #digit(char, int) 2809: * @see #forDigit(int, int) 2810: * @see #getType(char) 2811: */ 2812: public static boolean isDigit(char ch) 2813: { 2814: return isDigit((int)ch); 2815: } 2816: 2817: /** 2818: * Determines if a character is a Unicode decimal digit. For example, 2819: * <code>'0'</code> is a digit. A character is a Unicode digit if 2820: * getType() returns DECIMAL_DIGIT_NUMBER. 2821: * <br> 2822: * Unicode decimal digit = [Nd] 2823: * 2824: * @param codePoint character to test 2825: * @return true if ch is a Unicode decimal digit, else false 2826: * @see #digit(char, int) 2827: * @see #forDigit(int, int) 2828: * @see #getType(char) 2829: * 2830: * @since 1.5 2831: */ 2832: 2833: public static boolean isDigit(int codePoint) 2834: { 2835: return getType(codePoint) == DECIMAL_DIGIT_NUMBER; 2836: } 2837: 2838: /** 2839: * Determines if a character is part of the Unicode Standard. This is an 2840: * evolving standard, but covers every character in the data file. 
2841: * <br> 2842: * defined = not [Cn] 2843: * 2844: * @param ch character to test 2845: * @return true if ch is a Unicode character, else false 2846: * @see #isDigit(char) 2847: * @see #isLetter(char) 2848: * @see #isLetterOrDigit(char) 2849: * @see #isLowerCase(char) 2850: * @see #isTitleCase(char) 2851: * @see #isUpperCase(char) 2852: */ 2853: public static boolean isDefined(char ch) 2854: { 2855: return isDefined((int)ch); 2856: } 2857: 2858: /** 2859: * Determines if a character is part of the Unicode Standard. This is an 2860: * evolving standard, but covers every character in the data file. 2861: * <br> 2862: * defined = not [Cn] 2863: * 2864: * @param codePoint character to test 2865: * @return true if ch is a Unicode character, else false 2866: * @see #isDigit(char) 2867: * @see #isLetter(char) 2868: * @see #isLetterOrDigit(char) 2869: * @see #isLowerCase(char) 2870: * @see #isTitleCase(char) 2871: * @see #isUpperCase(char) 2872: * 2873: * @since 1.5 2874: */ 2875: public static boolean isDefined(int codePoint) 2876: { 2877: return getType(codePoint) != UNASSIGNED; 2878: } 2879: 2880: /** 2881: * Determines if a character is a Unicode letter. Not all letters have case, 2882: * so this may return true when isLowerCase and isUpperCase return false. 2883: * A character is a Unicode letter if getType() returns one of 2884: * UPPERCASE_LETTER, LOWERCASE_LETTER, TITLECASE_LETTER, MODIFIER_LETTER, 2885: * or OTHER_LETTER. 
2886: * <br> 2887: * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo] 2888: * 2889: * @param ch character to test 2890: * @return true if ch is a Unicode letter, else false 2891: * @see #isDigit(char) 2892: * @see #isJavaIdentifierStart(char) 2893: * @see #isJavaLetter(char) 2894: * @see #isJavaLetterOrDigit(char) 2895: * @see #isLetterOrDigit(char) 2896: * @see #isLowerCase(char) 2897: * @see #isTitleCase(char) 2898: * @see #isUnicodeIdentifierStart(char) 2899: * @see #isUpperCase(char) 2900: */ 2901: public static boolean isLetter(char ch) 2902: { 2903: return isLetter((int)ch); 2904: } 2905: 2906: /** 2907: * Determines if a character is a Unicode letter. Not all letters have case, 2908: * so this may return true when isLowerCase and isUpperCase return false. 2909: * A character is a Unicode letter if getType() returns one of 2910: * UPPERCASE_LETTER, LOWERCASE_LETTER, TITLECASE_LETTER, MODIFIER_LETTER, 2911: * or OTHER_LETTER. 2912: * <br> 2913: * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo] 2914: * 2915: * @param codePoint character to test 2916: * @return true if ch is a Unicode letter, else false 2917: * @see #isDigit(char) 2918: * @see #isJavaIdentifierStart(char) 2919: * @see #isJavaLetter(char) 2920: * @see #isJavaLetterOrDigit(char) 2921: * @see #isLetterOrDigit(char) 2922: * @see #isLowerCase(char) 2923: * @see #isTitleCase(char) 2924: * @see #isUnicodeIdentifierStart(char) 2925: * @see #isUpperCase(char) 2926: * 2927: * @since 1.5 2928: */ 2929: public static boolean isLetter(int codePoint) 2930: { 2931: return ((1 << getType(codePoint)) 2932: & ((1 << UPPERCASE_LETTER) 2933: | (1 << LOWERCASE_LETTER) 2934: | (1 << TITLECASE_LETTER) 2935: | (1 << MODIFIER_LETTER) 2936: | (1 << OTHER_LETTER))) != 0; 2937: } 2938: /** 2939: * Returns the index into the given CharSequence that is offset 2940: * <code>codePointOffset</code> code points from <code>index</code>. 
2941: * @param seq the CharSequence 2942: * @param index the start position in the CharSequence 2943: * @param codePointOffset the number of code points offset from the start 2944: * position 2945: * @return the index into the CharSequence that is codePointOffset code 2946: * points offset from index 2947: * 2948: * @throws NullPointerException if seq is null 2949: * @throws IndexOutOfBoundsException if index is negative or greater than the 2950: * length of the sequence. 2951: * @throws IndexOutOfBoundsException if codePointOffset is positive and the 2952: * subsequence from index to the end of seq has fewer than codePointOffset 2953: * code points 2954: * @throws IndexOutOfBoundsException if codePointOffset is negative and the 2955: * subsequence from the start of seq to index has fewer than 2956: * (-codePointOffset) code points 2957: * @since 1.5 2958: */ 2959: public static int offsetByCodePoints(CharSequence seq, 2960: int index, 2961: int codePointOffset) 2962: { 2963: int len = seq.length(); 2964: if (index < 0 || index > len) 2965: throw new IndexOutOfBoundsException(); 2966: 2967: int numToGo = codePointOffset; 2968: int offset = index; 2969: int adjust = 1; 2970: if (numToGo >= 0) 2971: { 2972: for (; numToGo > 0; offset++) 2973: { 2974: numToGo--; 2975: if (Character.isHighSurrogate(seq.charAt(offset)) 2976: && (offset + 1) < len 2977: && Character.isLowSurrogate(seq.charAt(offset + 1))) 2978: offset++; 2979: } 2980: return offset; 2981: } 2982: else 2983: { 2984: numToGo *= -1; 2985: for (; numToGo > 0;) 2986: { 2987: numToGo--; 2988: offset--; 2989: if (Character.isLowSurrogate(seq.charAt(offset)) 2990: && (offset - 1) >= 0 2991: && Character.isHighSurrogate(seq.charAt(offset - 1))) 2992: offset--; 2993: } 2994: return offset; 2995: } 2996: } 2997: 2998: /** 2999: * Returns the index into the given char subarray that is offset 3000: * <code>codePointOffset</code> code points from <code>index</code>. 
3001: * @param a the char array 3002: * @param start the start index of the subarray 3003: * @param count the length of the subarray 3004: * @param index the index to be offset 3005: * @param codePointOffset the number of code points offset from <code>index 3006: * </code> 3007: * @return the index into the char array 3008: * 3009: * @throws NullPointerException if a is null 3010: * @throws IndexOutOfBoundsException if start or count is negative or if 3011: * start + count is greater than the length of the array 3012: * @throws IndexOutOfBoundsException if index is less than start or larger 3013: * than start + count 3014: * @throws IndexOutOfBoundsException if codePointOffset is positive and the 3015: * subarray from index to start + count - 1 has fewer than codePointOffset 3016: * code points. 3017: * @throws IndexOutOfBoundsException if codePointOffset is negative and the 3018: * subarray from start to index - 1 has fewer than (-codePointOffset) code 3019: * points 3020: * 3021: * @since 1.5 3022: */ 3023: public static int offsetByCodePoints(char[] a, 3024: int start, 3025: int count, 3026: int index, 3027: int codePointOffset) 3028: { 3029: int len = a.length; 3030: int end = start + count; 3031: if (start < 0 || count < 0 || end > len || index < start || index > end) 3032: throw new IndexOutOfBoundsException(); 3033: 3034: int numToGo = codePointOffset; 3035: int offset = index; 3036: int adjust = 1; 3037: if (numToGo >= 0) 3038: { 3039: for (; numToGo > 0; offset++) 3040: { 3041: numToGo--; 3042: if (Character.isHighSurrogate(a[offset]) 3043: && (offset + 1) < len 3044: && Character.isLowSurrogate(a[offset + 1])) 3045: offset++; 3046: } 3047: return offset; 3048: } 3049: else 3050: { 3051: numToGo *= -1; 3052: for (; numToGo > 0;) 3053: { 3054: numToGo--; 3055: offset--; 3056: if (Character.isLowSurrogate(a[offset]) 3057: && (offset - 1) >= 0 3058: && Character.isHighSurrogate(a[offset - 1])) 3059: offset--; 3060: if (offset < start) 3061: throw new 
IndexOutOfBoundsException(); 3062: } 3063: return offset; 3064: } 3065: 3066: } 3067: 3068: /** 3069: * Returns the number of Unicode code points in the specified range of the 3070: * given CharSequence. The first char in the range is at position 3071: * beginIndex and the last one is at position endIndex - 1. Paired 3072: * surrogates (supplementary characters are represented by a pair of chars - 3073: * one from the high surrogates and one from the low surrogates) 3074: * count as just one code point. 3075: * @param seq the CharSequence to inspect 3076: * @param beginIndex the beginning of the range 3077: * @param endIndex the end of the range 3078: * @return the number of Unicode code points in the given range of the 3079: * sequence 3080: * @throws NullPointerException if seq is null 3081: * @throws IndexOutOfBoundsException if beginIndex is negative, endIndex is 3082: * larger than the length of seq, or if beginIndex is greater than endIndex. 3083: * @since 1.5 3084: */ 3085: public static int codePointCount(CharSequence seq, int beginIndex, 3086: int endIndex) 3087: { 3088: int len = seq.length(); 3089: if (beginIndex < 0 || endIndex > len || beginIndex > endIndex) 3090: throw new IndexOutOfBoundsException(); 3091: 3092: int count = 0; 3093: for (int i = beginIndex; i < endIndex; i++) 3094: { 3095: count++; 3096: // If there is a pairing, count it only once. 3097: if (isHighSurrogate(seq.charAt(i)) && (i + 1) < endIndex 3098: && isLowSurrogate(seq.charAt(i + 1))) 3099: i ++; 3100: } 3101: return count; 3102: } 3103: 3104: /** 3105: * Returns the number of Unicode code points in the specified range of the 3106: * given char array. The first char in the range is at position 3107: * offset and the length of the range is count. Paired surrogates 3108: * (supplementary characters are represented by a pair of chars - 3109: * one from the high surrogates and one from the low surrogates) 3110: * count as just one code point. 
3111: * @param a the char array to inspect 3112: * @param offset the beginning of the range 3113: * @param count the length of the range 3114: * @return the number of Unicode code points in the given range of the 3115: * array 3116: * @throws NullPointerException if a is null 3117: * @throws IndexOutOfBoundsException if offset or count is negative or if 3118: * offset + countendIndex is larger than the length of a. 3119: * @since 1.5 3120: */ 3121: public static int codePointCount(char[] a, int offset, 3122: int count) 3123: { 3124: int len = a.length; 3125: int end = offset + count; 3126: if (offset < 0 || count < 0 || end > len) 3127: throw new IndexOutOfBoundsException(); 3128: 3129: int counter = 0; 3130: for (int i = offset; i < end; i++) 3131: { 3132: counter++; 3133: // If there is a pairing, count it only once. 3134: if (isHighSurrogate(a[i]) && (i + 1) < end 3135: && isLowSurrogate(a[i + 1])) 3136: i ++; 3137: } 3138: return counter; 3139: } 3140: 3141: /** 3142: * Determines if a character is a Unicode letter or a Unicode digit. This 3143: * is the combination of isLetter and isDigit. 3144: * <br> 3145: * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd] 3146: * 3147: * @param ch character to test 3148: * @return true if ch is a Unicode letter or a Unicode digit, else false 3149: * @see #isDigit(char) 3150: * @see #isJavaIdentifierPart(char) 3151: * @see #isJavaLetter(char) 3152: * @see #isJavaLetterOrDigit(char) 3153: * @see #isLetter(char) 3154: * @see #isUnicodeIdentifierPart(char) 3155: */ 3156: public static boolean isLetterOrDigit(char ch) 3157: { 3158: return isLetterOrDigit((int)ch); 3159: } 3160: 3161: /** 3162: * Determines if a character is a Unicode letter or a Unicode digit. This 3163: * is the combination of isLetter and isDigit. 
3164: * <br> 3165: * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd] 3166: * 3167: * @param codePoint character to test 3168: * @return true if ch is a Unicode letter or a Unicode digit, else false 3169: * @see #isDigit(char) 3170: * @see #isJavaIdentifierPart(char) 3171: * @see #isJavaLetter(char) 3172: * @see #isJavaLetterOrDigit(char) 3173: * @see #isLetter(char) 3174: * @see #isUnicodeIdentifierPart(char) 3175: * 3176: * @since 1.5 3177: */ 3178: public static boolean isLetterOrDigit(int codePoint) 3179: { 3180: return ((1 << getType(codePoint)) 3181: & ((1 << UPPERCASE_LETTER) 3182: | (1 << LOWERCASE_LETTER) 3183: | (1 << TITLECASE_LETTER) 3184: | (1 << MODIFIER_LETTER) 3185: | (1 << OTHER_LETTER) 3186: | (1 << DECIMAL_DIGIT_NUMBER))) != 0; 3187: } 3188: 3189: /** 3190: * Determines if a character can start a Java identifier. This is the 3191: * combination of isLetter, any character where getType returns 3192: * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation 3193: * (like '_'). 3194: * 3195: * @param ch character to test 3196: * @return true if ch can start a Java identifier, else false 3197: * @deprecated Replaced by {@link #isJavaIdentifierStart(char)} 3198: * @see #isJavaLetterOrDigit(char) 3199: * @see #isJavaIdentifierStart(char) 3200: * @see #isJavaIdentifierPart(char) 3201: * @see #isLetter(char) 3202: * @see #isLetterOrDigit(char) 3203: * @see #isUnicodeIdentifierStart(char) 3204: */ 3205: public static boolean isJavaLetter(char ch) 3206: { 3207: return isJavaIdentifierStart(ch); 3208: } 3209: 3210: /** 3211: * Determines if a character can follow the first letter in 3212: * a Java identifier. This is the combination of isJavaLetter (isLetter, 3213: * type of LETTER_NUMBER, currency, connecting punctuation) and digit, 3214: * numeric letter (like Roman numerals), combining marks, non-spacing marks, 3215: * or isIdentifierIgnorable. 
3216: * 3217: * @param ch character to test 3218: * @return true if ch can follow the first letter in a Java identifier 3219: * @deprecated Replaced by {@link #isJavaIdentifierPart(char)} 3220: * @see #isJavaLetter(char) 3221: * @see #isJavaIdentifierStart(char) 3222: * @see #isJavaIdentifierPart(char) 3223: * @see #isLetter(char) 3224: * @see #isLetterOrDigit(char) 3225: * @see #isUnicodeIdentifierPart(char) 3226: * @see #isIdentifierIgnorable(char) 3227: */ 3228: public static boolean isJavaLetterOrDigit(char ch) 3229: { 3230: return isJavaIdentifierPart(ch); 3231: } 3232: 3233: /** 3234: * Determines if a character can start a Java identifier. This is the 3235: * combination of isLetter, any character where getType returns 3236: * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation 3237: * (like '_'). 3238: * <br> 3239: * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc] 3240: * 3241: * @param ch character to test 3242: * @return true if ch can start a Java identifier, else false 3243: * @see #isJavaIdentifierPart(char) 3244: * @see #isLetter(char) 3245: * @see #isUnicodeIdentifierStart(char) 3246: * @since 1.1 3247: */ 3248: public static boolean isJavaIdentifierStart(char ch) 3249: { 3250: return isJavaIdentifierStart((int)ch); 3251: } 3252: 3253: /** 3254: * Determines if a character can start a Java identifier. This is the 3255: * combination of isLetter, any character where getType returns 3256: * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation 3257: * (like '_'). 
3258: * <br> 3259: * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc] 3260: * 3261: * @param codePoint character to test 3262: * @return true if ch can start a Java identifier, else false 3263: * @see #isJavaIdentifierPart(char) 3264: * @see #isLetter(char) 3265: * @see #isUnicodeIdentifierStart(char) 3266: * @since 1.5 3267: */ 3268: public static boolean isJavaIdentifierStart(int codePoint) 3269: { 3270: return ((1 << getType(codePoint)) 3271: & ((1 << UPPERCASE_LETTER) 3272: | (1 << LOWERCASE_LETTER) 3273: | (1 << TITLECASE_LETTER) 3274: | (1 << MODIFIER_LETTER) 3275: | (1 << OTHER_LETTER) 3276: | (1 << LETTER_NUMBER) 3277: | (1 << CURRENCY_SYMBOL) 3278: | (1 << CONNECTOR_PUNCTUATION))) != 0; 3279: } 3280: 3281: /** 3282: * Determines if a character can follow the first letter in 3283: * a Java identifier. This is the combination of isJavaLetter (isLetter, 3284: * type of LETTER_NUMBER, currency, connecting punctuation) and digit, 3285: * numeric letter (like Roman numerals), combining marks, non-spacing marks, 3286: * or isIdentifierIgnorable. 3287: * <br> 3288: * Java identifier extender = 3289: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf] 3290: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 3291: * 3292: * @param ch character to test 3293: * @return true if ch can follow the first letter in a Java identifier 3294: * @see #isIdentifierIgnorable(char) 3295: * @see #isJavaIdentifierStart(char) 3296: * @see #isLetterOrDigit(char) 3297: * @see #isUnicodeIdentifierPart(char) 3298: * @since 1.1 3299: */ 3300: public static boolean isJavaIdentifierPart(char ch) 3301: { 3302: return isJavaIdentifierPart((int)ch); 3303: } 3304: 3305: /** 3306: * Determines if a character can follow the first letter in 3307: * a Java identifier. 
This is the combination of isJavaLetter (isLetter, 3308: * type of LETTER_NUMBER, currency, connecting punctuation) and digit, 3309: * numeric letter (like Roman numerals), combining marks, non-spacing marks, 3310: * or isIdentifierIgnorable. 3311: * <br> 3312: * Java identifier extender = 3313: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf] 3314: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 3315: * 3316: * @param codePoint character to test 3317: * @return true if ch can follow the first letter in a Java identifier 3318: * @see #isIdentifierIgnorable(char) 3319: * @see #isJavaIdentifierStart(char) 3320: * @see #isLetterOrDigit(char) 3321: * @see #isUnicodeIdentifierPart(char) 3322: * @since 1.5 3323: */ 3324: public static boolean isJavaIdentifierPart(int codePoint) 3325: { 3326: int category = getType(codePoint); 3327: return ((1 << category) 3328: & ((1 << UPPERCASE_LETTER) 3329: | (1 << LOWERCASE_LETTER) 3330: | (1 << TITLECASE_LETTER) 3331: | (1 << MODIFIER_LETTER) 3332: | (1 << OTHER_LETTER) 3333: | (1 << NON_SPACING_MARK) 3334: | (1 << COMBINING_SPACING_MARK) 3335: | (1 << DECIMAL_DIGIT_NUMBER) 3336: | (1 << LETTER_NUMBER) 3337: | (1 << CURRENCY_SYMBOL) 3338: | (1 << CONNECTOR_PUNCTUATION) 3339: | (1 << FORMAT))) != 0 3340: || (category == CONTROL && isIdentifierIgnorable(codePoint)); 3341: } 3342: 3343: /** 3344: * Determines if a character can start a Unicode identifier. Only 3345: * letters can start a Unicode identifier, but this includes characters 3346: * in LETTER_NUMBER. 
3347: * <br> 3348: * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl] 3349: * 3350: * @param ch character to test 3351: * @return true if ch can start a Unicode identifier, else false 3352: * @see #isJavaIdentifierStart(char) 3353: * @see #isLetter(char) 3354: * @see #isUnicodeIdentifierPart(char) 3355: * @since 1.1 3356: */ 3357: public static boolean isUnicodeIdentifierStart(char ch) 3358: { 3359: return isUnicodeIdentifierStart((int)ch); 3360: } 3361: 3362: /** 3363: * Determines if a character can start a Unicode identifier. Only 3364: * letters can start a Unicode identifier, but this includes characters 3365: * in LETTER_NUMBER. 3366: * <br> 3367: * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl] 3368: * 3369: * @param codePoint character to test 3370: * @return true if ch can start a Unicode identifier, else false 3371: * @see #isJavaIdentifierStart(char) 3372: * @see #isLetter(char) 3373: * @see #isUnicodeIdentifierPart(char) 3374: * @since 1.5 3375: */ 3376: public static boolean isUnicodeIdentifierStart(int codePoint) 3377: { 3378: return ((1 << getType(codePoint)) 3379: & ((1 << UPPERCASE_LETTER) 3380: | (1 << LOWERCASE_LETTER) 3381: | (1 << TITLECASE_LETTER) 3382: | (1 << MODIFIER_LETTER) 3383: | (1 << OTHER_LETTER) 3384: | (1 << LETTER_NUMBER))) != 0; 3385: } 3386: 3387: /** 3388: * Determines if a character can follow the first letter in 3389: * a Unicode identifier. This includes letters, connecting punctuation, 3390: * digits, numeric letters, combining marks, non-spacing marks, and 3391: * isIdentifierIgnorable. 
3392: * <br> 3393: * Unicode identifier extender = 3394: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]| 3395: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 3396: * 3397: * @param ch character to test 3398: * @return true if ch can follow the first letter in a Unicode identifier 3399: * @see #isIdentifierIgnorable(char) 3400: * @see #isJavaIdentifierPart(char) 3401: * @see #isLetterOrDigit(char) 3402: * @see #isUnicodeIdentifierStart(char) 3403: * @since 1.1 3404: */ 3405: public static boolean isUnicodeIdentifierPart(char ch) 3406: { 3407: return isUnicodeIdentifierPart((int)ch); 3408: } 3409: 3410: /** 3411: * Determines if a character can follow the first letter in 3412: * a Unicode identifier. This includes letters, connecting punctuation, 3413: * digits, numeric letters, combining marks, non-spacing marks, and 3414: * isIdentifierIgnorable. 3415: * <br> 3416: * Unicode identifier extender = 3417: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]| 3418: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 3419: * 3420: * @param codePoint character to test 3421: * @return true if ch can follow the first letter in a Unicode identifier 3422: * @see #isIdentifierIgnorable(char) 3423: * @see #isJavaIdentifierPart(char) 3424: * @see #isLetterOrDigit(char) 3425: * @see #isUnicodeIdentifierStart(char) 3426: * @since 1.5 3427: */ 3428: public static boolean isUnicodeIdentifierPart(int codePoint) 3429: { 3430: int category = getType(codePoint); 3431: return ((1 << category) 3432: & ((1 << UPPERCASE_LETTER) 3433: | (1 << LOWERCASE_LETTER) 3434: | (1 << TITLECASE_LETTER) 3435: | (1 << MODIFIER_LETTER) 3436: | (1 << OTHER_LETTER) 3437: | (1 << NON_SPACING_MARK) 3438: | (1 << COMBINING_SPACING_MARK) 3439: | (1 << DECIMAL_DIGIT_NUMBER) 3440: | (1 << LETTER_NUMBER) 3441: | (1 << CONNECTOR_PUNCTUATION) 3442: | (1 << FORMAT))) != 0 3443: || (category == CONTROL && isIdentifierIgnorable(codePoint)); 3444: } 3445: 3446: /** 3447: * Determines if a character is ignorable in 
a Unicode identifier. This 3448: * includes the non-whitespace ISO control characters (<code>'\u0000'</code> 3449: * through <code>'\u0008'</code>, <code>'\u000E'</code> through 3450: * <code>'\u001B'</code>, and <code>'\u007F'</code> through 3451: * <code>'\u009F'</code>), and FORMAT characters. 3452: * <br> 3453: * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B 3454: * |U+007F-U+009F 3455: * 3456: * @param ch character to test 3457: * @return true if ch is ignorable in a Unicode or Java identifier 3458: * @see #isJavaIdentifierPart(char) 3459: * @see #isUnicodeIdentifierPart(char) 3460: * @since 1.1 3461: */ 3462: public static boolean isIdentifierIgnorable(char ch) 3463: { 3464: return isIdentifierIgnorable((int)ch); 3465: } 3466: 3467: /** 3468: * Determines if a character is ignorable in a Unicode identifier. This 3469: * includes the non-whitespace ISO control characters (<code>'\u0000'</code> 3470: * through <code>'\u0008'</code>, <code>'\u000E'</code> through 3471: * <code>'\u001B'</code>, and <code>'\u007F'</code> through 3472: * <code>'\u009F'</code>), and FORMAT characters. 3473: * <br> 3474: * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B 3475: * |U+007F-U+009F 3476: * 3477: * @param codePoint character to test 3478: * @return true if ch is ignorable in a Unicode or Java identifier 3479: * @see #isJavaIdentifierPart(char) 3480: * @see #isUnicodeIdentifierPart(char) 3481: * @since 1.5 3482: */ 3483: public static boolean isIdentifierIgnorable(int codePoint) 3484: { 3485: if ((codePoint >= 0 && codePoint <= 0x0008) 3486: || (codePoint >= 0x000E && codePoint <= 0x001B) 3487: || (codePoint >= 0x007F && codePoint <= 0x009F) 3488: || getType(codePoint) == FORMAT) 3489: return true; 3490: return false; 3491: } 3492: 3493: /** 3494: * Converts a Unicode character into its lowercase equivalent mapping. 3495: * If a mapping does not exist, then the character passed is returned. 
3496: * Note that isLowerCase(toLowerCase(ch)) does not always return true. 3497: * 3498: * @param ch character to convert to lowercase 3499: * @return lowercase mapping of ch, or ch if lowercase mapping does 3500: * not exist 3501: * @see #isLowerCase(char) 3502: * @see #isUpperCase(char) 3503: * @see #toTitleCase(char) 3504: * @see #toUpperCase(char) 3505: */ 3506: public static char toLowerCase(char ch) 3507: { 3508: return (char) (lower[0][readCodePoint((int)ch) >>> 7] + ch); 3509: } 3510: 3511: /** 3512: * Converts a Unicode character into its lowercase equivalent mapping. 3513: * If a mapping does not exist, then the character passed is returned. 3514: * Note that isLowerCase(toLowerCase(ch)) does not always return true. 3515: * 3516: * @param codePoint character to convert to lowercase 3517: * @return lowercase mapping of ch, or ch if lowercase mapping does 3518: * not exist 3519: * @see #isLowerCase(char) 3520: * @see #isUpperCase(char) 3521: * @see #toTitleCase(char) 3522: * @see #toUpperCase(char) 3523: * 3524: * @since 1.5 3525: */ 3526: public static int toLowerCase(int codePoint) 3527: { 3528: // If the code point is unassigned or in one of the private use areas 3529: // then we delegate the call to the appropriate private static inner class. 3530: int plane = codePoint >>> 16; 3531: if (plane > 2 && plane < 14) 3532: return UnassignedCharacters.toLowerCase(codePoint); 3533: if (plane > 14) 3534: return PrivateUseCharacters.toLowerCase(codePoint); 3535: 3536: // The short value stored in lower[plane] is the signed difference between 3537: // codePoint and its lowercase conversion. 3538: return ((short)lower[plane][readCodePoint(codePoint) >>> 7]) + codePoint; 3539: } 3540: 3541: /** 3542: * Converts a Unicode character into its uppercase equivalent mapping. 3543: * If a mapping does not exist, then the character passed is returned. 3544: * Note that isUpperCase(toUpperCase(ch)) does not always return true. 
3545: * 3546: * @param ch character to convert to uppercase 3547: * @return uppercase mapping of ch, or ch if uppercase mapping does 3548: * not exist 3549: * @see #isLowerCase(char) 3550: * @see #isUpperCase(char) 3551: * @see #toLowerCase(char) 3552: * @see #toTitleCase(char) 3553: */ 3554: public static char toUpperCase(char ch) 3555: { 3556: return (char) (upper[0][readCodePoint((int)ch) >>> 7] + ch); 3557: } 3558: 3559: /** 3560: * Converts a Unicode character into its uppercase equivalent mapping. 3561: * If a mapping does not exist, then the character passed is returned. 3562: * Note that isUpperCase(toUpperCase(ch)) does not always return true. 3563: * 3564: * @param codePoint character to convert to uppercase 3565: * @return uppercase mapping of ch, or ch if uppercase mapping does 3566: * not exist 3567: * @see #isLowerCase(char) 3568: * @see #isUpperCase(char) 3569: * @see #toLowerCase(char) 3570: * @see #toTitleCase(char) 3571: * 3572: * @since 1.5 3573: */ 3574: public static int toUpperCase(int codePoint) 3575: { 3576: // If the code point is unassigned or in one of the private use areas 3577: // then we delegate the call to the appropriate private static inner class. 3578: int plane = codePoint >>> 16; 3579: if (plane > 2 && plane < 14) 3580: return UnassignedCharacters.toUpperCase(codePoint); 3581: if (plane > 14) 3582: return PrivateUseCharacters.toUpperCase(codePoint); 3583: 3584: // The short value stored in upper[plane] is the signed difference between 3585: // codePoint and its uppercase conversion. 3586: return ((short)upper[plane][readCodePoint(codePoint) >>> 7]) + codePoint; 3587: } 3588: 3589: /** 3590: * Converts a Unicode character into its titlecase equivalent mapping. 3591: * If a mapping does not exist, then the character passed is returned. 3592: * Note that isTitleCase(toTitleCase(ch)) does not always return true. 
3593: * 3594: * @param ch character to convert to titlecase 3595: * @return titlecase mapping of ch, or ch if titlecase mapping does 3596: * not exist 3597: * @see #isTitleCase(char) 3598: * @see #toLowerCase(char) 3599: * @see #toUpperCase(char) 3600: */ 3601: public static char toTitleCase(char ch) 3602: { 3603: // As title is short, it doesn't hurt to exhaustively iterate over it. 3604: for (int i = title.length - 2; i >= 0; i -= 2) 3605: if (title[i] == ch) 3606: return title[i + 1]; 3607: return toUpperCase(ch); 3608: } 3609: 3610: /** 3611: * Converts a Unicode character into its titlecase equivalent mapping. 3612: * If a mapping does not exist, then the character passed is returned. 3613: * Note that isTitleCase(toTitleCase(ch)) does not always return true. 3614: * 3615: * @param codePoint character to convert to titlecase 3616: * @return titlecase mapping of ch, or ch if titlecase mapping does 3617: * not exist 3618: * @see #isTitleCase(char) 3619: * @see #toLowerCase(char) 3620: * @see #toUpperCase(char) 3621: * 3622: * @since 1.5 3623: */ 3624: public static int toTitleCase(int codePoint) 3625: { 3626: // As of Unicode 4.0.0 no characters outside of plane 0 have 3627: // titlecase mappings that are different from their uppercase 3628: // mapping. 3629: if (codePoint < 0x10000) 3630: return (int) toTitleCase((char)codePoint); 3631: return toUpperCase(codePoint); 3632: } 3633: 3634: /** 3635: * Converts a character into a digit of the specified radix. If the radix 3636: * exceeds MIN_RADIX or MAX_RADIX, or if the result of getNumericValue(ch) 3637: * exceeds the radix, or if ch is not a decimal digit or in the case 3638: * insensitive set of 'a'-'z', the result is -1. 
3639: * <br> 3640: * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A 3641: * |U+FF21-U+FF3A|U+FF41-U+FF5A 3642: * 3643: * @param ch character to convert into a digit 3644: * @param radix radix in which ch is a digit 3645: * @return digit which ch represents in radix, or -1 not a valid digit 3646: * @see #MIN_RADIX 3647: * @see #MAX_RADIX 3648: * @see #forDigit(int, int) 3649: * @see #isDigit(char) 3650: * @see #getNumericValue(char) 3651: */ 3652: public static int digit(char ch, int radix) 3653: { 3654: if (radix < MIN_RADIX || radix > MAX_RADIX) 3655: return -1; 3656: char attr = readCodePoint((int)ch); 3657: if (((1 << (attr & TYPE_MASK)) 3658: & ((1 << UPPERCASE_LETTER) 3659: | (1 << LOWERCASE_LETTER) 3660: | (1 << DECIMAL_DIGIT_NUMBER))) != 0) 3661: { 3662: // Signedness doesn't matter; 0xffff vs. -1 are both rejected. 3663: int digit = numValue[0][attr >> 7]; 3664: return (digit < radix) ? digit : -1; 3665: } 3666: return -1; 3667: } 3668: 3669: /** 3670: * Converts a character into a digit of the specified radix. If the radix 3671: * exceeds MIN_RADIX or MAX_RADIX, or if the result of getNumericValue(ch) 3672: * exceeds the radix, or if ch is not a decimal digit or in the case 3673: * insensitive set of 'a'-'z', the result is -1. 
3674: * <br> 3675: * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A 3676: * |U+FF21-U+FF3A|U+FF41-U+FF5A 3677: * 3678: * @param codePoint character to convert into a digit 3679: * @param radix radix in which ch is a digit 3680: * @return digit which ch represents in radix, or -1 not a valid digit 3681: * @see #MIN_RADIX 3682: * @see #MAX_RADIX 3683: * @see #forDigit(int, int) 3684: * @see #isDigit(char) 3685: * @see #getNumericValue(char) 3686: */ 3687: public static int digit(int codePoint, int radix) 3688: { 3689: if (radix < MIN_RADIX || radix > MAX_RADIX) 3690: return -1; 3691: 3692: // If the code point is unassigned or in one of the private use areas 3693: // then we delegate the call to the appropriate private static inner class. 3694: int plane = codePoint >>> 16; 3695: if (plane > 2 && plane < 14) 3696: return UnassignedCharacters.digit(codePoint, radix); 3697: if (plane > 14) 3698: return PrivateUseCharacters.digit(codePoint, radix); 3699: char attr = readCodePoint(codePoint); 3700: if (((1 << (attr & TYPE_MASK)) 3701: & ((1 << UPPERCASE_LETTER) 3702: | (1 << LOWERCASE_LETTER) 3703: | (1 << DECIMAL_DIGIT_NUMBER))) != 0) 3704: { 3705: // Signedness doesn't matter; 0xffff vs. -1 are both rejected. 3706: int digit = numValue[plane][attr >> 7]; 3707: 3708: // If digit is less than or equal to -3 then the numerical value was 3709: // too large to fit into numValue and is stored in CharData.LARGENUMS. 3710: if (digit <= -3) 3711: digit = CharData.LARGENUMS[-digit - 3]; 3712: return (digit < radix) ? digit : -1; 3713: } 3714: return -1; 3715: } 3716: 3717: /** 3718: * Returns the Unicode numeric value property of a character. For example, 3719: * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50. 
3720: * 3721: * <p>This method also returns values for the letters A through Z, (not 3722: * specified by Unicode), in these ranges: <code>'\u0041'</code> 3723: * through <code>'\u005A'</code> (uppercase); <code>'\u0061'</code> 3724: * through <code>'\u007A'</code> (lowercase); and <code>'\uFF21'</code> 3725: * through <code>'\uFF3A'</code>, <code>'\uFF41'</code> through 3726: * <code>'\uFF5A'</code> (full width variants). 3727: * 3728: * <p>If the character lacks a numeric value property, -1 is returned. 3729: * If the character has a numeric value property which is not representable 3730: * as a nonnegative integer, such as a fraction, -2 is returned. 3731: * 3732: * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A 3733: * |U+FF21-U+FF3A|U+FF41-U+FF5A 3734: * 3735: * @param ch character from which the numeric value property will 3736: * be retrieved 3737: * @return the numeric value property of ch, or -1 if it does not exist, or 3738: * -2 if it is not representable as a nonnegative integer 3739: * @see #forDigit(int, int) 3740: * @see #digit(char, int) 3741: * @see #isDigit(char) 3742: * @since 1.1 3743: */ 3744: public static int getNumericValue(char ch) 3745: { 3746: // Treat numValue as signed. 3747: return (short) numValue[0][readCodePoint((int)ch) >> 7]; 3748: } 3749: 3750: /** 3751: * Returns the Unicode numeric value property of a character. For example, 3752: * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50. 3753: * 3754: * <p>This method also returns values for the letters A through Z, (not 3755: * specified by Unicode), in these ranges: <code>'\u0041'</code> 3756: * through <code>'\u005A'</code> (uppercase); <code>'\u0061'</code> 3757: * through <code>'\u007A'</code> (lowercase); and <code>'\uFF21'</code> 3758: * through <code>'\uFF3A'</code>, <code>'\uFF41'</code> through 3759: * <code>'\uFF5A'</code> (full width variants). 3760: * 3761: * <p>If the character lacks a numeric value property, -1 is returned. 
3762: * If the character has a numeric value property which is not representable 3763: * as a nonnegative integer, such as a fraction, -2 is returned. 3764: * 3765: * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A 3766: * |U+FF21-U+FF3A|U+FF41-U+FF5A 3767: * 3768: * @param codePoint character from which the numeric value property will 3769: * be retrieved 3770: * @return the numeric value property of ch, or -1 if it does not exist, or 3771: * -2 if it is not representable as a nonnegative integer 3772: * @see #forDigit(int, int) 3773: * @see #digit(char, int) 3774: * @see #isDigit(char) 3775: * @since 1.5 3776: */ 3777: public static int getNumericValue(int codePoint) 3778: { 3779: // If the code point is unassigned or in one of the private use areas 3780: // then we delegate the call to the appropriate private static inner class. 3781: int plane = codePoint >>> 16; 3782: if (plane > 2 && plane < 14) 3783: return UnassignedCharacters.getNumericValue(codePoint); 3784: if (plane > 14) 3785: return PrivateUseCharacters.getNumericValue(codePoint); 3786: 3787: // If the value N found in numValue[plane] is less than or equal to -3 3788: // then the numeric value was too big to fit into 16 bits and is 3789: // stored in CharData.LARGENUMS at offset (-N - 3). 3790: short num = (short)numValue[plane][readCodePoint(codePoint) >> 7]; 3791: if (num <= -3) 3792: return CharData.LARGENUMS[-num - 3]; 3793: return num; 3794: } 3795: 3796: /** 3797: * Determines if a character is a ISO-LATIN-1 space. This is only the five 3798: * characters <code>'\t'</code>, <code>'\n'</code>, <code>'\f'</code>, 3799: * <code>'\r'</code>, and <code>' '</code>. 
3800: * <br> 3801: * Java space = U+0020|U+0009|U+000A|U+000C|U+000D 3802: * 3803: * @param ch character to test 3804: * @return true if ch is a space, else false 3805: * @deprecated Replaced by {@link #isWhitespace(char)} 3806: * @see #isSpaceChar(char) 3807: * @see #isWhitespace(char) 3808: */ 3809: public static boolean isSpace(char ch) 3810: { 3811: // Performing the subtraction up front alleviates need to compare longs. 3812: return ch-- <= ' ' && ((1 << ch) 3813: & ((1 << (' ' - 1)) 3814: | (1 << ('\t' - 1)) 3815: | (1 << ('\n' - 1)) 3816: | (1 << ('\r' - 1)) 3817: | (1 << ('\f' - 1)))) != 0; 3818: } 3819: 3820: /** 3821: * Determines if a character is a Unicode space character. This includes 3822: * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR. 3823: * <br> 3824: * Unicode space = [Zs]|[Zp]|[Zl] 3825: * 3826: * @param ch character to test 3827: * @return true if ch is a Unicode space, else false 3828: * @see #isWhitespace(char) 3829: * @since 1.1 3830: */ 3831: public static boolean isSpaceChar(char ch) 3832: { 3833: return isSpaceChar((int)ch); 3834: } 3835: 3836: /** 3837: * Determines if a character is a Unicode space character. This includes 3838: * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR. 3839: * <br> 3840: * Unicode space = [Zs]|[Zp]|[Zl] 3841: * 3842: * @param codePoint character to test 3843: * @return true if ch is a Unicode space, else false 3844: * @see #isWhitespace(char) 3845: * @since 1.5 3846: */ 3847: public static boolean isSpaceChar(int codePoint) 3848: { 3849: return ((1 << getType(codePoint)) 3850: & ((1 << SPACE_SEPARATOR) 3851: | (1 << LINE_SEPARATOR) 3852: | (1 << PARAGRAPH_SEPARATOR))) != 0; 3853: } 3854: 3855: /** 3856: * Determines if a character is Java whitespace. 
This includes Unicode 3857: * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and 3858: * PARAGRAPH_SEPARATOR) except the non-breaking spaces 3859: * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>); 3860: * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>, 3861: * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>, 3862: * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>, 3863: * and <code>'\u001F'</code>. 3864: * <br> 3865: * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F 3866: * 3867: * @param ch character to test 3868: * @return true if ch is Java whitespace, else false 3869: * @see #isSpaceChar(char) 3870: * @since 1.1 3871: */ 3872: public static boolean isWhitespace(char ch) 3873: { 3874: return isWhitespace((int) ch); 3875: } 3876: 3877: /** 3878: * Determines if a character is Java whitespace. This includes Unicode 3879: * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and 3880: * PARAGRAPH_SEPARATOR) except the non-breaking spaces 3881: * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>); 3882: * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>, 3883: * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>, 3884: * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>, 3885: * and <code>'\u001F'</code>. 
3886: * <br> 3887: * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F 3888: * 3889: * @param codePoint character to test 3890: * @return true if ch is Java whitespace, else false 3891: * @see #isSpaceChar(char) 3892: * @since 1.5 3893: */ 3894: public static boolean isWhitespace(int codePoint) 3895: { 3896: int plane = codePoint >>> 16; 3897: if (plane > 2 && plane < 14) 3898: return UnassignedCharacters.isWhiteSpace(codePoint); 3899: if (plane > 14) 3900: return PrivateUseCharacters.isWhiteSpace(codePoint); 3901: 3902: int attr = readCodePoint(codePoint); 3903: return ((((1 << (attr & TYPE_MASK)) 3904: & ((1 << SPACE_SEPARATOR) 3905: | (1 << LINE_SEPARATOR) 3906: | (1 << PARAGRAPH_SEPARATOR))) != 0) 3907: && (attr & NO_BREAK_MASK) == 0) 3908: || (codePoint <= '\u001F' && ((1 << codePoint) 3909: & ((1 << '\t') 3910: | (1 << '\n') 3911: | (1 << '\u000B') 3912: | (1 << '\u000C') 3913: | (1 << '\r') 3914: | (1 << '\u001C') 3915: | (1 << '\u001D') 3916: | (1 << '\u001E') 3917: | (1 << '\u001F'))) != 0); 3918: } 3919: 3920: /** 3921: * Determines if a character has the ISO Control property. 3922: * <br> 3923: * ISO Control = [Cc] 3924: * 3925: * @param ch character to test 3926: * @return true if ch is an ISO Control character, else false 3927: * @see #isSpaceChar(char) 3928: * @see #isWhitespace(char) 3929: * @since 1.1 3930: */ 3931: public static boolean isISOControl(char ch) 3932: { 3933: return isISOControl((int)ch); 3934: } 3935: 3936: /** 3937: * Determines if the character is an ISO Control character. This is true 3938: * if the code point is in the range [0, 0x001F] or if it is in the range 3939: * [0x007F, 0x009F]. 
3940: * @param codePoint the character to check 3941: * @return true if the character is in one of the above ranges 3942: * 3943: * @since 1.5 3944: */ 3945: public static boolean isISOControl(int codePoint) 3946: { 3947: if ((codePoint >= 0 && codePoint <= 0x001F) 3948: || (codePoint >= 0x007F && codePoint <= 0x009F)) 3949: return true; 3950: return false; 3951: } 3952: 3953: /** 3954: * Returns the Unicode general category property of a character. 3955: * 3956: * @param ch character from which the general category property will 3957: * be retrieved 3958: * @return the character category property of ch as an integer 3959: * @see #UNASSIGNED 3960: * @see #UPPERCASE_LETTER 3961: * @see #LOWERCASE_LETTER 3962: * @see #TITLECASE_LETTER 3963: * @see #MODIFIER_LETTER 3964: * @see #OTHER_LETTER 3965: * @see #NON_SPACING_MARK 3966: * @see #ENCLOSING_MARK 3967: * @see #COMBINING_SPACING_MARK 3968: * @see #DECIMAL_DIGIT_NUMBER 3969: * @see #LETTER_NUMBER 3970: * @see #OTHER_NUMBER 3971: * @see #SPACE_SEPARATOR 3972: * @see #LINE_SEPARATOR 3973: * @see #PARAGRAPH_SEPARATOR 3974: * @see #CONTROL 3975: * @see #FORMAT 3976: * @see #PRIVATE_USE 3977: * @see #SURROGATE 3978: * @see #DASH_PUNCTUATION 3979: * @see #START_PUNCTUATION 3980: * @see #END_PUNCTUATION 3981: * @see #CONNECTOR_PUNCTUATION 3982: * @see #OTHER_PUNCTUATION 3983: * @see #MATH_SYMBOL 3984: * @see #CURRENCY_SYMBOL 3985: * @see #MODIFIER_SYMBOL 3986: * @see #INITIAL_QUOTE_PUNCTUATION 3987: * @see #FINAL_QUOTE_PUNCTUATION 3988: * @since 1.1 3989: */ 3990: public static int getType(char ch) 3991: { 3992: return getType((int)ch); 3993: } 3994: 3995: /** 3996: * Returns the Unicode general category property of a character. 
3997: * 3998: * @param codePoint character from which the general category property will 3999: * be retrieved 4000: * @return the character category property of ch as an integer 4001: * @see #UNASSIGNED 4002: * @see #UPPERCASE_LETTER 4003: * @see #LOWERCASE_LETTER 4004: * @see #TITLECASE_LETTER 4005: * @see #MODIFIER_LETTER 4006: * @see #OTHER_LETTER 4007: * @see #NON_SPACING_MARK 4008: * @see #ENCLOSING_MARK 4009: * @see #COMBINING_SPACING_MARK 4010: * @see #DECIMAL_DIGIT_NUMBER 4011: * @see #LETTER_NUMBER 4012: * @see #OTHER_NUMBER 4013: * @see #SPACE_SEPARATOR 4014: * @see #LINE_SEPARATOR 4015: * @see #PARAGRAPH_SEPARATOR 4016: * @see #CONTROL 4017: * @see #FORMAT 4018: * @see #PRIVATE_USE 4019: * @see #SURROGATE 4020: * @see #DASH_PUNCTUATION 4021: * @see #START_PUNCTUATION 4022: * @see #END_PUNCTUATION 4023: * @see #CONNECTOR_PUNCTUATION 4024: * @see #OTHER_PUNCTUATION 4025: * @see #MATH_SYMBOL 4026: * @see #CURRENCY_SYMBOL 4027: * @see #MODIFIER_SYMBOL 4028: * @see #INITIAL_QUOTE_PUNCTUATION 4029: * @see #FINAL_QUOTE_PUNCTUATION 4030: * 4031: * @since 1.5 4032: */ 4033: public static int getType(int codePoint) 4034: { 4035: // If the codePoint is unassigned or in one of the private use areas 4036: // then we delegate the call to the appropriate private static inner class. 4037: int plane = codePoint >>> 16; 4038: if (plane > 2 && plane < 14) 4039: return UnassignedCharacters.getType(codePoint); 4040: if (plane > 14) 4041: return PrivateUseCharacters.getType(codePoint); 4042: 4043: return readCodePoint(codePoint) & TYPE_MASK; 4044: } 4045: 4046: /** 4047: * Converts a digit into a character which represents that digit 4048: * in a specified radix. If the radix exceeds MIN_RADIX or MAX_RADIX, 4049: * or the digit exceeds the radix, then the null character <code>'\0'</code> 4050: * is returned. Otherwise the return value is in '0'-'9' and 'a'-'z'. 
4051: * <br> 4052: * return value boundary = U+0030-U+0039|U+0061-U+007A 4053: * 4054: * @param digit digit to be converted into a character 4055: * @param radix radix of digit 4056: * @return character representing digit in radix, or '\0' 4057: * @see #MIN_RADIX 4058: * @see #MAX_RADIX 4059: * @see #digit(char, int) 4060: */ 4061: public static char forDigit(int digit, int radix) 4062: { 4063: if (radix < MIN_RADIX || radix > MAX_RADIX 4064: || digit < 0 || digit >= radix) 4065: return '\0'; 4066: return Number.digits[digit]; 4067: } 4068: 4069: /** 4070: * Returns the Unicode directionality property of the character. This 4071: * is used in the visual ordering of text. 4072: * 4073: * @param ch the character to look up 4074: * @return the directionality constant, or DIRECTIONALITY_UNDEFINED 4075: * @see #DIRECTIONALITY_UNDEFINED 4076: * @see #DIRECTIONALITY_LEFT_TO_RIGHT 4077: * @see #DIRECTIONALITY_RIGHT_TO_LEFT 4078: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC 4079: * @see #DIRECTIONALITY_EUROPEAN_NUMBER 4080: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR 4081: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR 4082: * @see #DIRECTIONALITY_ARABIC_NUMBER 4083: * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR 4084: * @see #DIRECTIONALITY_NONSPACING_MARK 4085: * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL 4086: * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR 4087: * @see #DIRECTIONALITY_SEGMENT_SEPARATOR 4088: * @see #DIRECTIONALITY_WHITESPACE 4089: * @see #DIRECTIONALITY_OTHER_NEUTRALS 4090: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING 4091: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE 4092: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING 4093: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE 4094: * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT 4095: * @since 1.4 4096: */ 4097: public static byte getDirectionality(char ch) 4098: { 4099: // The result will correctly be signed. 
4100: return getDirectionality((int)ch); 4101: } 4102: 4103: 4104: /** 4105: * Returns the Unicode directionality property of the character. This 4106: * is used in the visual ordering of text. 4107: * 4108: * @param codePoint the character to look up 4109: * @return the directionality constant, or DIRECTIONALITY_UNDEFINED 4110: * @see #DIRECTIONALITY_UNDEFINED 4111: * @see #DIRECTIONALITY_LEFT_TO_RIGHT 4112: * @see #DIRECTIONALITY_RIGHT_TO_LEFT 4113: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC 4114: * @see #DIRECTIONALITY_EUROPEAN_NUMBER 4115: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR 4116: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR 4117: * @see #DIRECTIONALITY_ARABIC_NUMBER 4118: * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR 4119: * @see #DIRECTIONALITY_NONSPACING_MARK 4120: * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL 4121: * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR 4122: * @see #DIRECTIONALITY_SEGMENT_SEPARATOR 4123: * @see #DIRECTIONALITY_WHITESPACE 4124: * @see #DIRECTIONALITY_OTHER_NEUTRALS 4125: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING 4126: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE 4127: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING 4128: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE 4129: * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT 4130: * @since 1.5 4131: */ 4132: public static byte getDirectionality(int codePoint) 4133: { 4134: // If the code point is unassigned or in one of the private use areas 4135: // then we delegate the call to the appropriate private static inner class. 4136: int plane = codePoint >>> 16; 4137: if (plane > 2 && plane < 14) 4138: return UnassignedCharacters.getDirectionality(codePoint); 4139: if (plane > 14) 4140: return PrivateUseCharacters.getDirectionality(codePoint); 4141: 4142: // The result will correctly be signed. 4143: return (byte) (direction[plane][readCodePoint(codePoint) >> 7] >> 2); 4144: } 4145: 4146: /** 4147: * Determines whether the character is mirrored according to Unicode. 
For 4148: * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in 4149: * left-to-right text, but ')' in right-to-left text. 4150: * 4151: * @param ch the character to look up 4152: * @return true if the character is mirrored 4153: * @since 1.4 4154: */ 4155: public static boolean isMirrored(char ch) 4156: { 4157: return (readCodePoint((int)ch) & MIRROR_MASK) != 0; 4158: } 4159: 4160: /** 4161: * Determines whether the character is mirrored according to Unicode. For 4162: * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in 4163: * left-to-right text, but ')' in right-to-left text. 4164: * 4165: * @param codePoint the character to look up 4166: * @return true if the character is mirrored 4167: * @since 1.5 4168: */ 4169: public static boolean isMirrored(int codePoint) 4170: { 4171: // If the code point is unassigned or part of one of the private use areas 4172: // then we delegate the call to the appropriate private static inner class. 4173: int plane = codePoint >>> 16; 4174: if (plane > 2 && plane < 14) 4175: return UnassignedCharacters.isMirrored(codePoint); 4176: if (plane > 14) 4177: return PrivateUseCharacters.isMirrored(codePoint); 4178: 4179: return (readCodePoint(codePoint) & MIRROR_MASK) != 0; 4180: } 4181: 4182: /** 4183: * Compares another Character to this Character, numerically. 4184: * 4185: * @param anotherCharacter Character to compare with this Character 4186: * @return a negative integer if this Character is less than 4187: * anotherCharacter, zero if this Character is equal, and 4188: * a positive integer if this Character is greater 4189: * @throws NullPointerException if anotherCharacter is null 4190: * @since 1.2 4191: */ 4192: public int compareTo(Character anotherCharacter) 4193: { 4194: return value - anotherCharacter.value; 4195: } 4196: 4197: /** 4198: * Returns an <code>Character</code> object wrapping the value. 
4199: * In contrast to the <code>Character</code> constructor, this method 4200: * will cache some values. It is used by boxing conversion. 4201: * 4202: * @param val the value to wrap 4203: * @return the <code>Character</code> 4204: * 4205: * @since 1.5 4206: */ 4207: public static Character valueOf(char val) 4208: { 4209: if (val > MAX_CACHE) 4210: return new Character(val); 4211: synchronized (charCache) 4212: { 4213: if (charCache[val - MIN_VALUE] == null) 4214: charCache[val - MIN_VALUE] = new Character(val); 4215: return charCache[val - MIN_VALUE]; 4216: } 4217: } 4218: 4219: /** 4220: * Reverse the bytes in val. 4221: * @since 1.5 4222: */ 4223: public static char reverseBytes(char val) 4224: { 4225: return (char) (((val >> 8) & 0xff) | ((val << 8) & 0xff00)); 4226: } 4227: 4228: /** 4229: * Converts a unicode code point to a UTF-16 representation of that 4230: * code point. 4231: * 4232: * @param codePoint the unicode code point 4233: * 4234: * @return the UTF-16 representation of that code point 4235: * 4236: * @throws IllegalArgumentException if the code point is not a valid 4237: * unicode code point 4238: * 4239: * @since 1.5 4240: */ 4241: public static char[] toChars(int codePoint) 4242: { 4243: if (!isValidCodePoint(codePoint)) 4244: throw new IllegalArgumentException("Illegal Unicode code point : " 4245: + codePoint); 4246: char[] result = new char[charCount(codePoint)]; 4247: int ignore = toChars(codePoint, result, 0); 4248: return result; 4249: } 4250: 4251: /** 4252: * Converts a unicode code point to its UTF-16 representation. 
4253: * 4254: * @param codePoint the unicode code point 4255: * @param dst the target char array 4256: * @param dstIndex the start index for the target 4257: * 4258: * @return number of characters written to <code>dst</code> 4259: * 4260: * @throws IllegalArgumentException if <code>codePoint</code> is not a 4261: * valid unicode code point 4262: * @throws NullPointerException if <code>dst</code> is <code>null</code> 4263: * @throws IndexOutOfBoundsException if <code>dstIndex</code> is not valid 4264: * in <code>dst</code> or if the UTF-16 representation does not 4265: * fit into <code>dst</code> 4266: * 4267: * @since 1.5 4268: */ 4269: public static int toChars(int codePoint, char[] dst, int dstIndex) 4270: { 4271: if (!isValidCodePoint(codePoint)) 4272: { 4273: throw new IllegalArgumentException("not a valid code point: " 4274: + codePoint); 4275: } 4276: 4277: int result; 4278: if (isSupplementaryCodePoint(codePoint)) 4279: { 4280: // Write second char first to cause IndexOutOfBoundsException 4281: // immediately. 4282: final int cp2 = codePoint - 0x10000; 4283: dst[dstIndex + 1] = (char) ((cp2 % 0x400) + (int) MIN_LOW_SURROGATE); 4284: dst[dstIndex] = (char) ((cp2 / 0x400) + (int) MIN_HIGH_SURROGATE); 4285: result = 2; 4286: } 4287: else 4288: { 4289: dst[dstIndex] = (char) codePoint; 4290: result = 1; 4291: } 4292: return result; 4293: } 4294: 4295: /** 4296: * Return number of 16-bit characters required to represent the given 4297: * code point. 4298: * 4299: * @param codePoint a unicode code point 4300: * 4301: * @return 2 if codePoint >= 0x10000, 1 otherwise. 4302: * 4303: * @since 1.5 4304: */ 4305: public static int charCount(int codePoint) 4306: { 4307: return 4308: (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT) 4309: ? 2 4310: : 1; 4311: } 4312: 4313: /** 4314: * Determines whether the specified code point is 4315: * in the range 0x10000 .. 0x10FFFF, i.e. the character is within the Unicode 4316: * supplementary character range. 
4317: * 4318: * @param codePoint a Unicode code point 4319: * 4320: * @return <code>true</code> if code point is in supplementary range 4321: * 4322: * @since 1.5 4323: */ 4324: public static boolean isSupplementaryCodePoint(int codePoint) 4325: { 4326: return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT 4327: && codePoint <= MAX_CODE_POINT; 4328: } 4329: 4330: /** 4331: * Determines whether the specified code point is 4332: * in the range 0x0000 .. 0x10FFFF, i.e. it is a valid Unicode code point. 4333: * 4334: * @param codePoint a Unicode code point 4335: * 4336: * @return <code>true</code> if code point is valid 4337: * 4338: * @since 1.5 4339: */ 4340: public static boolean isValidCodePoint(int codePoint) 4341: { 4342: return codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT; 4343: } 4344: 4345: /** 4346: * Return true if the given character is a high surrogate. 4347: * @param ch the character 4348: * @return true if the character is a high surrogate character 4349: * 4350: * @since 1.5 4351: */ 4352: public static boolean isHighSurrogate(char ch) 4353: { 4354: return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE; 4355: } 4356: 4357: /** 4358: * Return true if the given character is a low surrogate. 4359: * @param ch the character 4360: * @return true if the character is a low surrogate character 4361: * 4362: * @since 1.5 4363: */ 4364: public static boolean isLowSurrogate(char ch) 4365: { 4366: return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE; 4367: } 4368: 4369: /** 4370: * Return true if the given characters compose a surrogate pair. 4371: * This is true if the first character is a high surrogate and the 4372: * second character is a low surrogate. 
4373: * @param ch1 the first character 4374: * @param ch2 the first character 4375: * @return true if the characters compose a surrogate pair 4376: * 4377: * @since 1.5 4378: */ 4379: public static boolean isSurrogatePair(char ch1, char ch2) 4380: { 4381: return isHighSurrogate(ch1) && isLowSurrogate(ch2); 4382: } 4383: 4384: /** 4385: * Given a valid surrogate pair, this returns the corresponding 4386: * code point. 4387: * @param high the high character of the pair 4388: * @param low the low character of the pair 4389: * @return the corresponding code point 4390: * 4391: * @since 1.5 4392: */ 4393: public static int toCodePoint(char high, char low) 4394: { 4395: return ((high - MIN_HIGH_SURROGATE) * 0x400) + 4396: (low - MIN_LOW_SURROGATE) + 0x10000; 4397: } 4398: 4399: /** 4400: * Get the code point at the specified index in the CharSequence. 4401: * This is like CharSequence#charAt(int), but if the character is 4402: * the start of a surrogate pair, and there is a following 4403: * character, and this character completes the pair, then the 4404: * corresponding supplementary code point is returned. Otherwise, 4405: * the character at the index is returned. 4406: * 4407: * @param sequence the CharSequence 4408: * @param index the index of the codepoint to get, starting at 0 4409: * @return the codepoint at the specified index 4410: * @throws IndexOutOfBoundsException if index is negative or >= length() 4411: * @since 1.5 4412: */ 4413: public static int codePointAt(CharSequence sequence, int index) 4414: { 4415: int len = sequence.length(); 4416: if (index < 0 || index >= len) 4417: throw new IndexOutOfBoundsException(); 4418: char high = sequence.charAt(index); 4419: if (! isHighSurrogate(high) || ++index >= len) 4420: return high; 4421: char low = sequence.charAt(index); 4422: if (! isLowSurrogate(low)) 4423: return high; 4424: return toCodePoint(high, low); 4425: } 4426: 4427: /** 4428: * Get the code point at the specified index in the CharSequence. 
4429: * If the character is the start of a surrogate pair, and there is a 4430: * following character, and this character completes the pair, then 4431: * the corresponding supplementary code point is returned. 4432: * Otherwise, the character at the index is returned. 4433: * 4434: * @param chars the character array in which to look 4435: * @param index the index of the codepoint to get, starting at 0 4436: * @return the codepoint at the specified index 4437: * @throws IndexOutOfBoundsException if index is negative or >= length() 4438: * @since 1.5 4439: */ 4440: public static int codePointAt(char[] chars, int index) 4441: { 4442: return codePointAt(chars, index, chars.length); 4443: } 4444: 4445: /** 4446: * Get the code point at the specified index in the CharSequence. 4447: * If the character is the start of a surrogate pair, and there is a 4448: * following character within the specified range, and this 4449: * character completes the pair, then the corresponding 4450: * supplementary code point is returned. Otherwise, the character 4451: * at the index is returned. 4452: * 4453: * @param chars the character array in which to look 4454: * @param index the index of the codepoint to get, starting at 0 4455: * @param limit the limit past which characters should not be examined 4456: * @return the codepoint at the specified index 4457: * @throws IndexOutOfBoundsException if index is negative or >= 4458: * limit, or if limit is negative or >= the length of the array 4459: * @since 1.5 4460: */ 4461: public static int codePointAt(char[] chars, int index, int limit) 4462: { 4463: if (index < 0 || index >= limit || limit < 0 || limit > chars.length) 4464: throw new IndexOutOfBoundsException(); 4465: char high = chars[index]; 4466: if (! isHighSurrogate(high) || ++index >= limit) 4467: return high; 4468: char low = chars[index]; 4469: if (! 
isLowSurrogate(low)) 4470: return high; 4471: return toCodePoint(high, low); 4472: } 4473: 4474: /** 4475: * Get the code point before the specified index. This is like 4476: * #codePointAt(char[], int), but checks the characters at 4477: * <code>index-1</code> and <code>index-2</code> to see if they form 4478: * a supplementary code point. If they do not, the character at 4479: * <code>index-1</code> is returned. 4480: * 4481: * @param chars the character array 4482: * @param index the index just past the codepoint to get, starting at 0 4483: * @return the codepoint at the specified index 4484: * @throws IndexOutOfBoundsException if index is negative or >= length() 4485: * @since 1.5 4486: */ 4487: public static int codePointBefore(char[] chars, int index) 4488: { 4489: return codePointBefore(chars, index, 1); 4490: } 4491: 4492: /** 4493: * Get the code point before the specified index. This is like 4494: * #codePointAt(char[], int), but checks the characters at 4495: * <code>index-1</code> and <code>index-2</code> to see if they form 4496: * a supplementary code point. If they do not, the character at 4497: * <code>index-1</code> is returned. The start parameter is used to 4498: * limit the range of the array which may be examined. 4499: * 4500: * @param chars the character array 4501: * @param index the index just past the codepoint to get, starting at 0 4502: * @param start the index before which characters should not be examined 4503: * @return the codepoint at the specified index 4504: * @throws IndexOutOfBoundsException if index is > start or > 4505: * the length of the array, or if limit is negative or >= the 4506: * length of the array 4507: * @since 1.5 4508: */ 4509: public static int codePointBefore(char[] chars, int index, int start) 4510: { 4511: if (index < start || index > chars.length 4512: || start < 0 || start >= chars.length) 4513: throw new IndexOutOfBoundsException(); 4514: --index; 4515: char low = chars[index]; 4516: if (! 
isLowSurrogate(low) || --index < start) 4517: return low; 4518: char high = chars[index]; 4519: if (! isHighSurrogate(high)) 4520: return low; 4521: return toCodePoint(high, low); 4522: } 4523: 4524: /** 4525: * Get the code point before the specified index. This is like 4526: * #codePointAt(CharSequence, int), but checks the characters at 4527: * <code>index-1</code> and <code>index-2</code> to see if they form 4528: * a supplementary code point. If they do not, the character at 4529: * <code>index-1</code> is returned. 4530: * 4531: * @param sequence the CharSequence 4532: * @param index the index just past the codepoint to get, starting at 0 4533: * @return the codepoint at the specified index 4534: * @throws IndexOutOfBoundsException if index is negative or >= length() 4535: * @since 1.5 4536: */ 4537: public static int codePointBefore(CharSequence sequence, int index) 4538: { 4539: int len = sequence.length(); 4540: if (index < 1 || index > len) 4541: throw new IndexOutOfBoundsException(); 4542: --index; 4543: char low = sequence.charAt(index); 4544: if (! isLowSurrogate(low) || --index < 0) 4545: return low; 4546: char high = sequence.charAt(index); 4547: if (! isHighSurrogate(high)) 4548: return low; 4549: return toCodePoint(high, low); 4550: } 4551: } // class Character
GNU Classpath (0.95) |