cpputil/include/utf.h
2013-06-10 21:57:42 +04:00

80 lines
2.7 KiB
C++
Executable File

#ifndef UTF_H
#define UTF_H
#include <cstdint>
#include <string>
#include <exception>
#include <sstream>
/**
 * @brief Exception reporting a failed conversion between text encodings.
 * @details An exception of this type usually means that the source string
 * does not match the encoding it was declared to be in.
 */
class bad_conversion: public std::exception
{
private:
    std::string _msg;       ///< Complete text handed out by what().
    char32_t _badSymbol;    ///< Value passed to the constructor as the offending symbol.
public:
    /**
     * @brief Builds the message "<msg> bad symbol: <badData as a decimal number>".
     * @param msg     Description of the failure; a null pointer is treated as
     *                an empty description.
     * @param badData Value that could not be converted.
     */
    bad_conversion(const char* msg, char32_t badData): _badSymbol(badData)
    {
        std::stringstream errStream;
        // Inserting a null char* violates the stream's precondition and would
        // drop the rest of the message, so skip it.
        if (msg != nullptr)
        {
            errStream << msg;
        }
        // Convert explicitly: inserting a char32_t into a narrow stream only
        // worked through implicit promotion and is a deleted overload since
        // C++20. The printed text (decimal value) stays the same.
        errStream << " bad symbol: " << static_cast<std::uint_least32_t>(_badSymbol);
        _msg = errStream.str();
    }

    ~bad_conversion() noexcept override {}

    /**
     * @brief Returns the message assembled in the constructor.
     * @return Pointer to a null-terminated string owned by this object.
     */
    const char* what() const noexcept override
    {
        return _msg.c_str();
    }
};
// Lengths, in bytes, of the byte-order-mark (BOM) arrays declared below.
#define UTF8_BOM_SIZE 3
#define UTF16_BOM_SIZE 2
#define UTF32_BOM_SIZE 4
// BOM byte sequences, one array per encoding and byte order (le / be suffix).
// Only declared here: the definitions, and therefore the actual byte values,
// live in another translation unit.
extern const unsigned char g_utf8_bom[UTF8_BOM_SIZE];
extern const unsigned char g_utf16le_bom[UTF16_BOM_SIZE];
extern const unsigned char g_utf16be_bom[UTF16_BOM_SIZE];
extern const unsigned char g_utf32le_bom[UTF32_BOM_SIZE];
extern const unsigned char g_utf32be_bom[UTF32_BOM_SIZE];
/**
 * @brief Byte-order selector: the return type of current_byte_order() and the
 * type of the byteOrder parameters of the UTF-16 / UTF-32 routines in this header.
 */
enum byte_order
{
BYTE_ORDER_LITTLE_ENDIAN, ///< Little-endian; the default value of every byteOrder parameter in this header.
BYTE_ORDER_BIG_ENDIAN ///< Big-endian.
};
/**
 * @brief Returns a byte_order value.
 * @note Presumably the byte order of the machine the code is running on; the
 * definition is not visible in this header - TODO confirm.
 */
byte_order current_byte_order();
/**
 * @brief Takes and returns a 16-bit code unit; judging by the name it returns
 * val with its two bytes swapped - TODO confirm against the definition.
 * @note NOTE(review): declared inline, but no definition is visible in this
 * header. An inline function has to be defined in every translation unit that
 * uses it, so callers outside the defining source file may fail to link - verify.
 */
inline char16_t invert_byte_order_16(char16_t val);
/**
 * @brief Takes and returns a 32-bit code unit; judging by the name it returns
 * val with its four bytes reversed - TODO confirm against the definition.
 * @note NOTE(review): declared inline, but no definition is visible in this
 * header. An inline function has to be defined in every translation unit that
 * uses it, so callers outside the defining source file may fail to link - verify.
 */
inline char32_t invert_byte_order_32(char32_t val);
/**
 * @brief Maps a symbol size to a mask value.
 * @note Presumably the bit mask for the lead byte of a UTF-8 sequence that is
 * symbolSize bytes long; units and valid range are not visible here - TODO confirm.
 */
std::size_t utf8_first_byte_mask(std::size_t symbolSize);
/**
 * @brief Takes a pointer into a char string and returns a pointer of the same type.
 * @note Presumably returns str advanced past a leading UTF-8 BOM, or str
 * unchanged when there is none - TODO confirm against the definition.
 */
const char* utf8_skip_bom(const char* str);
/**
 * @brief char16_t counterpart of utf8_skip_bom().
 * @note Presumably skips a leading UTF-16 BOM; which byte orders are
 * recognised cannot be told from this declaration - TODO confirm.
 */
const char16_t* utf16_skip_bom(const char16_t* str);
/**
 * @brief char32_t counterpart of utf8_skip_bom().
 * @note Presumably skips a leading UTF-32 BOM; which byte orders are
 * recognised cannot be told from this declaration - TODO confirm.
 */
const char32_t* utf32_skip_bom(const char32_t* str);
/**
 * @brief Computes a count from a 32-bit unsigned value.
 * @note Presumably the number of bits needed to represent v (position of the
 * highest set bit); the result for v == 0 is not visible here - TODO confirm.
 */
std::size_t significant_bits(uint32_t v);
/**
 * @brief Writes to dst from src, both arrays of char32_t.
 * @param src       Source buffer (read-only).
 * @param dst       Destination buffer; no capacity is passed, so the caller
 *                  presumably has to provide room for num elements - TODO confirm.
 * @param num       Element count (presumably the number of char32_t values copied).
 * @param byteOrder Defaults to BYTE_ORDER_LITTLE_ENDIAN; whether it names the
 *                  source or the destination byte order is not visible here - verify.
 */
void utf32_strcpy_with_convert_byteorder(const char32_t* src, char32_t* dst, std::size_t num, byte_order byteOrder = BYTE_ORDER_LITTLE_ENDIAN);
/**
 * @brief Converts from a char (UTF-8, by the name) buffer into a char32_t buffer.
 * @param src     Source buffer (read-only).
 * @param dst     Destination buffer; its capacity is not passed in.
 * @param symbols Count of symbols; presumably the number of code points to
 *                convert, not a byte count - TODO confirm.
 * @note Presumably reports malformed input by throwing bad_conversion - verify
 * against the definition.
 */
void utf8_to_ucs4(const char* src, char32_t* dst, std::size_t symbols);
/**
 * @brief Converts from a char32_t buffer into a char (UTF-8, by the name) buffer.
 * @param src     Source buffer (read-only).
 * @param dst     Destination buffer; its capacity is not passed in, see
 *                ucs4_get_utf8_str_bytes() for what is presumably the sizing helper.
 * @param symbols Count of symbols (presumably code points read from src) - TODO confirm.
 */
void ucs4_to_utf8(const char32_t* src, char* dst, std::size_t symbols);
/**
 * @brief Converts from a char16_t (UTF-16, by the name) buffer into a char32_t buffer.
 * @param src       Source buffer (read-only).
 * @param dst       Destination buffer; its capacity is not passed in.
 * @param symbols   Count of symbols (presumably code points, not code units) - TODO confirm.
 * @param byteOrder Defaults to BYTE_ORDER_LITTLE_ENDIAN; presumably the byte
 *                  order of the UTF-16 data in src - verify.
 */
void utf16_to_ucs4(const char16_t* src, char32_t* dst, std::size_t symbols, byte_order byteOrder = BYTE_ORDER_LITTLE_ENDIAN);
/**
 * @brief Converts from a char32_t buffer into a char16_t (UTF-16, by the name) buffer.
 * @param src       Source buffer (read-only).
 * @param dst       Destination buffer; its capacity is not passed in, see
 *                  ucs4_get_utf16_str_bytes() for what is presumably the sizing helper.
 * @param symbols   Count of symbols (presumably code points read from src) - TODO confirm.
 * @param byteOrder Defaults to BYTE_ORDER_LITTLE_ENDIAN; presumably the byte
 *                  order of the UTF-16 data written to dst - verify.
 */
void ucs4_to_utf16(const char32_t* src, char16_t* dst, std::size_t symbols, byte_order byteOrder = BYTE_ORDER_LITTLE_ENDIAN);
/**
 * @brief Returns a length for a char (UTF-8, by the name) string.
 * @note Presumably the number of symbols (code points) before the terminating
 * zero rather than the number of bytes - TODO confirm.
 */
std::size_t utf8_str_len(const char* str);
/**
 * @brief Returns a size for the symbol that str points at.
 * @note Presumably the number of bytes of the UTF-8 sequence starting at str;
 * behaviour on an invalid lead byte is not visible here - verify.
 */
std::size_t utf8_get_symbol_size(const char* str);
/**
 * @brief Returns a length for a char16_t (UTF-16, by the name) string.
 * @param str       String to measure.
 * @param byteOrder Defaults to BYTE_ORDER_LITTLE_ENDIAN; presumably the byte order of str.
 * @note Presumably counts symbols (a surrogate pair as one) - TODO confirm.
 */
std::size_t utf16_str_len(const char16_t* str, byte_order byteOrder = BYTE_ORDER_LITTLE_ENDIAN);
/**
 * @brief Returns a size for the symbol that str points at.
 * @param str       Position inside a char16_t string.
 * @param byteOrder Defaults to BYTE_ORDER_LITTLE_ENDIAN; presumably the byte order of str.
 * @note The unit of the result (code units or bytes) is not visible here - TODO confirm.
 */
std::size_t utf16_get_symbol_size(const char16_t* str, byte_order byteOrder = BYTE_ORDER_LITTLE_ENDIAN);
/**
 * @brief Returns a length for a char32_t string.
 * @note Presumably the number of elements before the terminating zero - TODO confirm.
 */
std::size_t utf32_str_len(const char32_t* str);
/**
 * @brief Returns a size for a single char32_t symbol.
 * @note Presumably the number of bytes the symbol occupies when encoded as UTF-8 - TODO confirm.
 */
std::size_t ucs4_get_utf8_symbol_size(const char32_t symbol);
/**
 * @brief Returns a byte count for a char32_t string.
 * @note Presumably the UTF-8 encoded size of the whole string; whether a
 * terminator is included is not visible here - verify before sizing buffers.
 */
std::size_t ucs4_get_utf8_str_bytes(const char32_t* str);
/**
 * @brief Returns a size for a single char32_t symbol.
 * @note Presumably the space the symbol needs in UTF-16; the unit (code units
 * or bytes) is not visible here - TODO confirm.
 */
std::size_t ucs4_get_utf16_symbol_size(const char32_t symbol);
/**
 * @brief Returns a byte count for a char32_t string.
 * @note Presumably the UTF-16 encoded size of the whole string; whether a
 * terminator is included is not visible here - verify before sizing buffers.
 */
std::size_t ucs4_get_utf16_str_bytes(const char32_t* str);
#endif // UTF_H