#ifndef UTF_H
#define UTF_H

// Standard headers required by the declarations in this header.
#include <cstddef>
#include <cstdint>
#include <exception>
#include <sstream>
#include <string>

/**
 * @brief Exception type reporting a failed encoding conversion.
 * @details As a rule, an exception of this type means that the source string
 *          does not match the encoding that was specified for it.
 */
class bad_conversion : public std::exception
{
private:
    std::string _msg;       ///< Full text returned by what().
    char32_t    _badSymbol; ///< Value passed to the constructor as the offending symbol.

public:
    /**
     * @brief Builds the message "<msg> bad symbol: <numeric value of badData>".
     * @param msg     Description of the failure; must be a valid C string.
     * @param badData Symbol value that triggered the failure.
     */
    bad_conversion(const char* msg, char32_t badData)
        : _badSymbol(badData)
    {
        std::stringstream errStream;
        // Streaming a char32_t into a narrow stream is a deleted overload
        // since C++20; the explicit cast keeps the decimal output that
        // earlier standards produce via integral promotion.
        errStream << msg << " bad symbol: "
                  << static_cast<std::uint32_t>(_badSymbol);
        _msg = errStream.str();
    }

    ~bad_conversion() noexcept override = default;

    /// @return The message assembled in the constructor; valid while *this lives.
    const char* what() const noexcept override
    {
        return _msg.c_str();
    }
};

// Sizes, in bytes, of the byte-order-mark arrays declared below.
#define UTF8_BOM_SIZE 3
#define UTF16_BOM_SIZE 2
#define UTF32_BOM_SIZE 4

// Byte-order-mark tables; only declared here, the definitions live in
// another translation unit.
extern const unsigned char g_utf8_bom[UTF8_BOM_SIZE];
extern const unsigned char g_utf16le_bom[UTF16_BOM_SIZE];
extern const unsigned char g_utf16be_bom[UTF16_BOM_SIZE];
extern const unsigned char g_utf32le_bom[UTF32_BOM_SIZE];
extern const unsigned char g_utf32be_bom[UTF32_BOM_SIZE];

/// Byte order selector accepted by the UTF-16 / UTF-32 routines below.
enum byte_order
{
    BYTE_ORDER_LITTLE_ENDIAN,
    BYTE_ORDER_BIG_ENDIAN
};

byte_order current_byte_order();

// NOTE(review): these two functions are declared `inline`, but no definition
// is visible in this header. An inline function must be defined in every
// translation unit that odr-uses it - confirm the definitions are reachable
// by callers, or drop `inline` together with the implementation file.
inline char16_t invert_byte_order_16(char16_t val);
inline char32_t invert_byte_order_32(char32_t val);

std::size_t utf8_first_byte_mask(std::size_t symbolSize);

// BOM helpers; presumably each returns a pointer past a leading BOM when one
// is present - verify against the implementation.
const char* utf8_skip_bom(const char* str);
const char16_t* utf16_skip_bom(const char16_t* str);
const char32_t* utf32_skip_bom(const char32_t* str);

std::size_t significant_bits(std::uint32_t v);

void utf32_strcpy_with_convert_byteorder(const char32_t* src,
                                         char32_t* dst,
                                         std::size_t num,
                                         byte_order byteOrder = BYTE_ORDER_LITTLE_ENDIAN);

// Conversion routines. The exact meaning of `symbols` and `needUnescape` and
// the required size of `dst` are not visible from this header - see the
// implementation before relying on them.
void utf8_to_ucs4(const char* src, char32_t* dst, std::size_t symbols, bool needUnescape);
void ucs4_to_utf8(const char32_t* src, char* dst, std::size_t symbols);
void utf16_to_ucs4(const char16_t* src,
                   char32_t* dst,
                   std::size_t symbols,
                   byte_order byteOrder = BYTE_ORDER_LITTLE_ENDIAN);
void ucs4_to_utf16(const char32_t* src,
                   char16_t* dst,
                   std::size_t symbols,
                   byte_order byteOrder = BYTE_ORDER_LITTLE_ENDIAN);

// Length and size queries for the individual encodings.
std::size_t utf8_str_len(const char* str, bool needUnescape);
std::size_t utf8_get_symbol_size(const char* str, bool needUnescape);
std::size_t utf16_str_len(const char16_t* str, byte_order byteOrder = BYTE_ORDER_LITTLE_ENDIAN);
std::size_t utf16_get_symbol_size(const char16_t* str, byte_order byteOrder = BYTE_ORDER_LITTLE_ENDIAN);
std::size_t utf32_str_len(const char32_t* str);
std::size_t ucs4_get_utf8_symbol_size(const char32_t symbol);
std::size_t ucs4_get_utf8_str_bytes(const char32_t* str);
std::size_t ucs4_get_utf16_symbol_size(const char32_t symbol);
std::size_t ucs4_get_utf16_str_bytes(const char32_t* str);

#endif // UTF_H