2021-01-30 07:51:39 +01:00
|
|
|
//
|
|
|
|
// Unicode.h
|
|
|
|
//
|
|
|
|
// Library: Foundation
|
|
|
|
// Package: Text
|
|
|
|
// Module: Unicode
|
|
|
|
//
|
|
|
|
// Definition of the Unicode class.
|
|
|
|
//
|
|
|
|
// Copyright (c) 2007, Applied Informatics Software Engineering GmbH.
|
|
|
|
// and Contributors.
|
|
|
|
//
|
|
|
|
// SPDX-License-Identifier: BSL-1.0
|
|
|
|
//
|
|
|
|
|
|
|
|
|
|
|
|
#ifndef Foundation_Unicode_INCLUDED
|
|
|
|
#define Foundation_Unicode_INCLUDED
|
|
|
|
|
|
|
|
|
|
|
|
#include "Poco/Foundation.h"
|
|
|
|
|
|
|
|
|
|
|
|
namespace Poco {
|
|
|
|
|
|
|
|
|
|
|
|
class Foundation_API Unicode
|
|
|
|
/// This class contains enumerations and static
|
|
|
|
/// utility functions for dealing with Unicode characters
|
|
|
|
/// and their properties.
|
|
|
|
///
|
|
|
|
/// For more information on Unicode, see <http://www.unicode.org>.
|
|
|
|
///
|
|
|
|
/// The implementation is based on the Unicode support
|
|
|
|
/// functions in PCRE.
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
// Implementation note: the following definitions must be kept
|
2023-03-23 19:19:11 +01:00
|
|
|
// in sync with those from pcre2_ucp.h (PCRE).
|
2021-01-30 07:51:39 +01:00
|
|
|
enum CharacterCategory
|
|
|
|
/// Unicode character categories.
|
|
|
|
{
|
|
|
|
UCP_OTHER,
|
|
|
|
UCP_LETTER,
|
|
|
|
UCP_MARK,
|
|
|
|
UCP_NUMBER,
|
|
|
|
UCP_PUNCTUATION,
|
|
|
|
UCP_SYMBOL,
|
|
|
|
UCP_SEPARATOR
|
|
|
|
};
|
|
|
|
|
|
|
|
enum CharacterType
|
|
|
|
/// Unicode character types.
|
|
|
|
{
|
|
|
|
UCP_CONTROL,
|
|
|
|
UCP_FORMAT,
|
|
|
|
UCP_UNASSIGNED,
|
|
|
|
UCP_PRIVATE_USE,
|
|
|
|
UCP_SURROGATE,
|
|
|
|
UCP_LOWER_CASE_LETTER,
|
|
|
|
UCP_MODIFIER_LETTER,
|
|
|
|
UCP_OTHER_LETTER,
|
|
|
|
UCP_TITLE_CASE_LETTER,
|
|
|
|
UCP_UPPER_CASE_LETTER,
|
|
|
|
UCP_SPACING_MARK,
|
|
|
|
UCP_ENCLOSING_MARK,
|
|
|
|
UCP_NON_SPACING_MARK,
|
|
|
|
UCP_DECIMAL_NUMBER,
|
|
|
|
UCP_LETTER_NUMBER,
|
|
|
|
UCP_OTHER_NUMBER,
|
|
|
|
UCP_CONNECTOR_PUNCTUATION,
|
|
|
|
UCP_DASH_PUNCTUATION,
|
|
|
|
UCP_CLOSE_PUNCTUATION,
|
|
|
|
UCP_FINAL_PUNCTUATION,
|
|
|
|
UCP_INITIAL_PUNCTUATION,
|
|
|
|
UCP_OTHER_PUNCTUATION,
|
|
|
|
UCP_OPEN_PUNCTUATION,
|
|
|
|
UCP_CURRENCY_SYMBOL,
|
|
|
|
UCP_MODIFIER_SYMBOL,
|
|
|
|
UCP_MATHEMATICAL_SYMBOL,
|
|
|
|
UCP_OTHER_SYMBOL,
|
|
|
|
UCP_LINE_SEPARATOR,
|
|
|
|
UCP_PARAGRAPH_SEPARATOR,
|
|
|
|
UCP_SPACE_SEPARATOR
|
|
|
|
};
|
2023-03-23 19:19:11 +01:00
|
|
|
|
2021-01-30 07:51:39 +01:00
|
|
|
enum Script
|
|
|
|
/// Unicode 7.0 script identifiers.
|
|
|
|
{
|
|
|
|
UCP_ARABIC,
|
|
|
|
UCP_ARMENIAN,
|
|
|
|
UCP_BENGALI,
|
|
|
|
UCP_BOPOMOFO,
|
|
|
|
UCP_BRAILLE,
|
|
|
|
UCP_BUGINESE,
|
|
|
|
UCP_BUHID,
|
|
|
|
UCP_CANADIAN_ABORIGINAL,
|
|
|
|
UCP_CHEROKEE,
|
|
|
|
UCP_COMMON,
|
|
|
|
UCP_COPTIC,
|
|
|
|
UCP_CYPRIOT,
|
|
|
|
UCP_CYRILLIC,
|
|
|
|
UCP_DESERET,
|
|
|
|
UCP_DEVANAGARI,
|
|
|
|
UCP_ETHIOPIC,
|
|
|
|
UCP_GEORGIAN,
|
|
|
|
UCP_GLAGOLITIC,
|
|
|
|
UCP_GOTHIC,
|
|
|
|
UCP_GREEK,
|
|
|
|
UCP_GUJARATI,
|
|
|
|
UCP_GURMUKHI,
|
|
|
|
UCP_HAN,
|
|
|
|
UCP_HANGUL,
|
|
|
|
UCP_HANUNOO,
|
|
|
|
UCP_HEBREW,
|
|
|
|
UCP_HIRAGANA,
|
|
|
|
UCP_INHERITED,
|
|
|
|
UCP_KANNADA,
|
|
|
|
UCP_KATAKANA,
|
|
|
|
UCP_KHAROSHTHI,
|
|
|
|
UCP_KHMER,
|
|
|
|
UCP_LAO,
|
|
|
|
UCP_LATIN,
|
|
|
|
UCP_LIMBU,
|
|
|
|
UCP_LINEAR_B,
|
|
|
|
UCP_MALAYALAM,
|
|
|
|
UCP_MONGOLIAN,
|
|
|
|
UCP_MYANMAR,
|
|
|
|
UCP_NEW_TAI_LUE,
|
|
|
|
UCP_OGHAM,
|
|
|
|
UCP_OLD_ITALIC,
|
|
|
|
UCP_OLD_PERSIAN,
|
|
|
|
UCP_ORIYA,
|
|
|
|
UCP_OSMANYA,
|
|
|
|
UCP_RUNIC,
|
|
|
|
UCP_SHAVIAN,
|
|
|
|
UCP_SINHALA,
|
|
|
|
UCP_SYLOTI_NAGRI,
|
|
|
|
UCP_SYRIAC,
|
|
|
|
UCP_TAGALOG,
|
|
|
|
UCP_TAGBANWA,
|
|
|
|
UCP_TAI_LE,
|
|
|
|
UCP_TAMIL,
|
|
|
|
UCP_TELUGU,
|
|
|
|
UCP_THAANA,
|
|
|
|
UCP_THAI,
|
|
|
|
UCP_TIBETAN,
|
|
|
|
UCP_TIFINAGH,
|
|
|
|
UCP_UGARITIC,
|
|
|
|
UCP_YI,
|
|
|
|
// Unicode 5.0
|
|
|
|
UCP_BALINESE,
|
|
|
|
UCP_CUNEIFORM,
|
|
|
|
UCP_NKO,
|
|
|
|
UCP_PHAGS_PA,
|
|
|
|
UCP_PHOENICIAN,
|
|
|
|
// Unicode 5.1
|
|
|
|
UCP_CARIAN,
|
|
|
|
UCP_CHAM,
|
|
|
|
UCP_KAYAH_LI,
|
|
|
|
UCP_LEPCHA,
|
|
|
|
UCP_LYCIAN,
|
|
|
|
UCP_LYDIAN,
|
|
|
|
UCP_OL_CHIKI,
|
|
|
|
UCP_REJANG,
|
|
|
|
UCP_SAURASHTRA,
|
|
|
|
UCP_SUNDANESE,
|
|
|
|
UCP_VAI,
|
|
|
|
// Unicode 5.2
|
|
|
|
UCP_AVESTAN,
|
|
|
|
UCP_BAMUM,
|
|
|
|
UCP_EGYPTIAN_HIEROGLYPHS,
|
|
|
|
UCP_IMPERIAL_ARAMAIC,
|
|
|
|
UCP_INSCRIPTIONAL_PAHLAVI,
|
|
|
|
UCP_INSCRIPTIONAL_PARTHIAN,
|
|
|
|
UCP_JAVANESE,
|
|
|
|
UCP_KAITHI,
|
|
|
|
UCP_LISU,
|
|
|
|
UCP_MEETEI_MAYEK,
|
|
|
|
UCP_OLD_SOUTH_ARABIAN,
|
|
|
|
UCP_OLD_TURKIC,
|
|
|
|
UCP_SAMARITAN,
|
|
|
|
UCP_TAI_THAM,
|
|
|
|
UCP_TAI_VIET,
|
|
|
|
// Unicode 6.0
|
|
|
|
UCP_BATAK,
|
|
|
|
UCP_BRAHMI,
|
|
|
|
UCP_MANDAIC,
|
|
|
|
// Unicode 6.1
|
|
|
|
UCP_CHAKMA,
|
|
|
|
UCP_MEROITIC_CURSIVE,
|
|
|
|
UCP_MEROITIC_HIEROGLYPHS,
|
|
|
|
UCP_MIAO,
|
|
|
|
UCP_SHARADA,
|
|
|
|
UCP_SORA_SOMPENG,
|
|
|
|
UCP_TAKRI,
|
|
|
|
// Unicode 7.0
|
|
|
|
UCP_BASSA_VAH,
|
|
|
|
UCP_CAUCASIAN_ALBANIAN,
|
|
|
|
UCP_DUPLOYAN,
|
|
|
|
UCP_ELBASAN,
|
|
|
|
UCP_GRANTHA,
|
|
|
|
UCP_KHOJKI,
|
|
|
|
UCP_KHUDAWADI,
|
|
|
|
UCP_LINEAR_A,
|
|
|
|
UCP_MAHAJANI,
|
|
|
|
UCP_MANICHAEAN,
|
|
|
|
UCP_MENDE_KIKAKUI,
|
|
|
|
UCP_MODI,
|
|
|
|
UCP_MRO,
|
|
|
|
UCP_NABATAEAN,
|
|
|
|
UCP_OLD_NORTH_ARABIAN,
|
|
|
|
UCP_OLD_PERMIC,
|
|
|
|
UCP_PAHAWH_HMONG,
|
|
|
|
UCP_PALMYRENE,
|
|
|
|
UCP_PSALTER_PAHLAVI,
|
|
|
|
UCP_PAU_CIN_HAU,
|
|
|
|
UCP_SIDDHAM,
|
|
|
|
UCP_TIRHUTA,
|
|
|
|
UCP_WARANG_CITI
|
|
|
|
};
|
2023-03-23 19:19:11 +01:00
|
|
|
|
2021-01-30 07:51:39 +01:00
|
|
|
enum
|
|
|
|
{
|
|
|
|
UCP_MAX_CODEPOINT = 0x10FFFF
|
|
|
|
};
|
2023-03-23 19:19:11 +01:00
|
|
|
|
2021-01-30 07:51:39 +01:00
|
|
|
struct CharacterProperties
|
|
|
|
/// This structure holds the character properties
|
|
|
|
/// of an Unicode character.
|
|
|
|
{
|
|
|
|
CharacterCategory category;
|
|
|
|
CharacterType type;
|
|
|
|
Script script;
|
|
|
|
};
|
|
|
|
|
|
|
|
static void properties(int ch, CharacterProperties& props);
|
|
|
|
/// Return the Unicode character properties for the
|
|
|
|
/// character with the given Unicode value.
|
2023-03-23 19:19:11 +01:00
|
|
|
|
2021-01-30 07:51:39 +01:00
|
|
|
static bool isSpace(int ch);
|
|
|
|
/// Returns true iff the given character is a separator.
|
2023-03-23 19:19:11 +01:00
|
|
|
|
2021-01-30 07:51:39 +01:00
|
|
|
static bool isDigit(int ch);
|
|
|
|
/// Returns true iff the given character is a numeric character.
|
2023-03-23 19:19:11 +01:00
|
|
|
|
2021-01-30 07:51:39 +01:00
|
|
|
static bool isPunct(int ch);
|
|
|
|
/// Returns true iff the given character is a punctuation character.
|
2023-03-23 19:19:11 +01:00
|
|
|
|
2021-01-30 07:51:39 +01:00
|
|
|
static bool isAlpha(int ch);
|
2023-03-23 19:19:11 +01:00
|
|
|
/// Returns true iff the given character is a letter.
|
|
|
|
|
2021-01-30 07:51:39 +01:00
|
|
|
static bool isLower(int ch);
|
|
|
|
/// Returns true iff the given character is a lowercase
|
|
|
|
/// character.
|
2023-03-23 19:19:11 +01:00
|
|
|
|
2021-01-30 07:51:39 +01:00
|
|
|
static bool isUpper(int ch);
|
|
|
|
/// Returns true iff the given character is an uppercase
|
|
|
|
/// character.
|
2023-03-23 19:19:11 +01:00
|
|
|
|
2021-01-30 07:51:39 +01:00
|
|
|
static int toLower(int ch);
|
|
|
|
/// If the given character is an uppercase character,
|
|
|
|
/// return its lowercase counterpart, otherwise return
|
|
|
|
/// the character.
|
|
|
|
|
|
|
|
static int toUpper(int ch);
|
|
|
|
/// If the given character is a lowercase character,
|
|
|
|
/// return its uppercase counterpart, otherwise return
|
|
|
|
/// the character.
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
//
|
|
|
|
// inlines
|
|
|
|
//
|
|
|
|
inline bool Unicode::isSpace(int ch)
|
|
|
|
{
|
|
|
|
CharacterProperties props;
|
|
|
|
properties(ch, props);
|
|
|
|
return props.category == UCP_SEPARATOR;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
inline bool Unicode::isDigit(int ch)
|
|
|
|
{
|
|
|
|
CharacterProperties props;
|
|
|
|
properties(ch, props);
|
|
|
|
return props.category == UCP_NUMBER;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
inline bool Unicode::isPunct(int ch)
|
|
|
|
{
|
|
|
|
CharacterProperties props;
|
|
|
|
properties(ch, props);
|
|
|
|
return props.category == UCP_PUNCTUATION;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
inline bool Unicode::isAlpha(int ch)
|
|
|
|
{
|
|
|
|
CharacterProperties props;
|
|
|
|
properties(ch, props);
|
|
|
|
return props.category == UCP_LETTER;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
inline bool Unicode::isLower(int ch)
|
|
|
|
{
|
|
|
|
CharacterProperties props;
|
|
|
|
properties(ch, props);
|
|
|
|
return props.category == UCP_LETTER && props.type == UCP_LOWER_CASE_LETTER;
|
|
|
|
}
|
|
|
|
|
2023-03-23 19:19:11 +01:00
|
|
|
|
2021-01-30 07:51:39 +01:00
|
|
|
inline bool Unicode::isUpper(int ch)
|
|
|
|
{
|
|
|
|
CharacterProperties props;
|
|
|
|
properties(ch, props);
|
|
|
|
return props.category == UCP_LETTER && props.type == UCP_UPPER_CASE_LETTER;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
} // namespace Poco
|
|
|
|
|
|
|
|
|
|
|
|
#endif // Foundation_Unicode_INCLUDED
|