Refactor string functions, add Unicode support
Refactored existing functions. Added Unicode support and UTF-8, UTF-16, and UTF-32 encoding/decoding.
This commit is contained in:
280
src/String/Unicode.c
Normal file
280
src/String/Unicode.c
Normal file
@@ -0,0 +1,280 @@
|
||||
#include <BH/String.h>
|
||||
#include <BH/Util.h>
|
||||
|
||||
#include "Inline/Unicode.h"
|
||||
|
||||
|
||||
uint32_t decodeUtf8(uint32_t *unit,
|
||||
unsigned char *state,
|
||||
unsigned char byte)
|
||||
{
|
||||
unsigned char type, bits;
|
||||
|
||||
type = lookupUtf8Type[(size_t)byte];
|
||||
bits = lookupUtf8Bits[(size_t)type];
|
||||
*unit = (*unit << bits) | ((0xFF >> (8 - bits)) & byte);
|
||||
|
||||
return *state = lookupUtf8State[(size_t)type + *state];
|
||||
}
|
||||
|
||||
|
||||
uint32_t BH_UnicodeLower(uint32_t unit)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
/* In convertable range */
|
||||
if (unit >= 0xFFFF)
|
||||
return unit;
|
||||
|
||||
/* Fasttrack and search lookup table */
|
||||
i = lookupLowerIndex[unit >> 8];
|
||||
while (i < sizeof(lookupLower) / sizeof(struct CaseMap))
|
||||
{
|
||||
if (lookupLower[i].from > unit)
|
||||
return unit;
|
||||
if (lookupLower[i].from == unit)
|
||||
return lookupLower[i].to;
|
||||
i++;
|
||||
}
|
||||
|
||||
return unit;
|
||||
}
|
||||
|
||||
|
||||
uint32_t BH_UnicodeUpper(uint32_t unit)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
/* In convertable range */
|
||||
if (unit >= 0xFFFF)
|
||||
return unit;
|
||||
|
||||
/* Fasttrack and search lookup table */
|
||||
i = lookupUpperIndex[unit >> 8];
|
||||
while (i < sizeof(lookupUpper) / sizeof(struct CaseMap))
|
||||
{
|
||||
if (lookupUpper[i].from > unit)
|
||||
return unit;
|
||||
if (lookupUpper[i].from == unit)
|
||||
return lookupUpper[i].to;
|
||||
i++;
|
||||
}
|
||||
|
||||
return unit;
|
||||
}
|
||||
|
||||
|
||||
size_t BH_UnicodeDecodeUtf8(const char *string,
|
||||
size_t size,
|
||||
uint32_t *unit)
|
||||
{
|
||||
unsigned char state;
|
||||
size_t i;
|
||||
|
||||
state = 0;
|
||||
*unit = 0;
|
||||
for (i = 0; i < size; i++)
|
||||
{
|
||||
switch (decodeUtf8(unit, &state, string[i]))
|
||||
{
|
||||
case UTF8_OK: return i + 1;
|
||||
case UTF8_ERROR: *unit = 0xFFFFFFFF; return (i > 0 ? i : i + 1);
|
||||
default: break;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Encode one code point as UTF-8.
 *
 * unit   - code point to encode.
 * string - output buffer; up to 4 bytes are written, no terminator is added.
 *
 * Returns the number of bytes written (1 to 4), or 0 when `unit` cannot be
 * encoded. Surrogate code points (U+D800..U+DFFF) and values above U+10FFFF
 * are rejected, because they have no valid UTF-8 representation. Nothing is
 * written to `string` when 0 is returned.
 */
size_t BH_UnicodeEncodeUtf8(uint32_t unit,
                            char *string)
{
    /* Only Unicode scalar values can be encoded */
    if (unit > 0x10FFFFul || (unit >= 0xD800ul && unit <= 0xDFFFul))
        return 0;

    if (unit < 0x80ul)
    {
        string[0] = (char)(unit & 0x7F);
        return 1;
    }

    if (unit < 0x800ul)
    {
        string[0] = (char)(0xC0 | (unit >> 6));
        string[1] = (char)(0x80 | (unit & 0x3F));
        return 2;
    }

    if (unit < 0x10000ul)
    {
        string[0] = (char)(0xE0 | (unit >> 12));
        string[1] = (char)(0x80 | ((unit >> 6) & 0x3F));
        string[2] = (char)(0x80 | (unit & 0x3F));
        return 3;
    }

    /* Remaining range is 0x10000..0x10FFFF */
    string[0] = (char)(0xF0 | (unit >> 18));
    string[1] = (char)(0x80 | ((unit >> 12) & 0x3F));
    string[2] = (char)(0x80 | ((unit >> 6) & 0x3F));
    string[3] = (char)(0x80 | (unit & 0x3F));
    return 4;
}
|
||||
|
||||
|
||||
static int classifyUtf16(uint16_t value)
|
||||
{
|
||||
if (value > 0xD7FF && value < 0xDC00)
|
||||
return UTF16_LOWSUR;
|
||||
else if (value > 0xDBFF && value < 0xE000)
|
||||
return UTF16_HIGHSUR;
|
||||
return UTF16_NORMAL;
|
||||
}
|
||||
|
||||
|
||||
size_t BH_UnicodeDecodeUtf16LE(const char *string,
|
||||
size_t size,
|
||||
uint32_t *unit)
|
||||
{
|
||||
uint16_t lower, upper;
|
||||
|
||||
if (size < 2)
|
||||
return 0;
|
||||
|
||||
upper = BH_Read16LEu(string);
|
||||
*unit = 0xFFFFFFFF;
|
||||
|
||||
if (classifyUtf16(upper) == UTF16_NORMAL)
|
||||
*unit = upper;
|
||||
else if (classifyUtf16(upper) == UTF16_LOWSUR)
|
||||
{
|
||||
if (size < 4)
|
||||
return 0;
|
||||
|
||||
lower = BH_Read16LEu(string + 2);
|
||||
if (classifyUtf16(lower) == UTF16_HIGHSUR)
|
||||
{
|
||||
*unit = (((upper & 0x3FF) << 10) | (lower & 0x3FF)) + 0x10000;
|
||||
return 4;
|
||||
}
|
||||
}
|
||||
|
||||
return 2;
|
||||
}
|
||||
|
||||
|
||||
size_t BH_UnicodeDecodeUtf16BE(const char *string,
|
||||
size_t size,
|
||||
uint32_t *unit)
|
||||
{
|
||||
uint16_t lower, upper;
|
||||
|
||||
if (size < 2)
|
||||
return 0;
|
||||
|
||||
upper = BH_Read16BEu(string);
|
||||
*unit = 0xFFFFFFFF;
|
||||
|
||||
if (classifyUtf16(upper) == UTF16_NORMAL)
|
||||
*unit = upper;
|
||||
else if (classifyUtf16(upper) == UTF16_LOWSUR)
|
||||
{
|
||||
if (size < 4)
|
||||
return 0;
|
||||
|
||||
lower = BH_Read16BEu(string + 2);
|
||||
if (classifyUtf16(lower) == UTF16_HIGHSUR)
|
||||
{
|
||||
*unit = (((upper & 0x3FF) << 10) | (lower & 0x3FF)) + 0x10000;
|
||||
return 4;
|
||||
}
|
||||
}
|
||||
|
||||
return 2;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Encode one code point as UTF-16, writing units with BH_Write16LEu.
 *
 * unit   - code point to encode.
 * string - output buffer; up to 4 bytes are written.
 *
 * Returns 2 when a single unit was written, 4 when a surrogate pair was
 * written, or 0 when `unit` cannot be encoded. Surrogate code points
 * (U+D800..U+DFFF) and values above U+10FFFF are rejected: the former would
 * produce an unpaired surrogate, and the latter do not fit into the 20 bits
 * a surrogate pair can carry (the first unit would spill out of the
 * 0xD800..0xDBFF range). Nothing is written when 0 is returned.
 */
size_t BH_UnicodeEncodeUtf16LE(uint32_t unit,
                               char *string)
{
    /* Only Unicode scalar values can be encoded */
    if (unit > 0x10FFFF || (unit >= 0xD800 && unit <= 0xDFFF))
        return 0;

    if (unit < 0x10000)
    {
        BH_Write16LEu(string, unit);
        return 2;
    }

    /* 0x10000..0x10FFFF: split the 20-bit offset across two units */
    unit -= 0x10000;
    BH_Write16LEu(string, 0xD800 | (unit >> 10));
    BH_Write16LEu(string + 2, 0xDC00 | (unit & 0x3FF));
    return 4;
}
|
||||
|
||||
|
||||
/*
 * Encode one code point as UTF-16, writing units with BH_Write16BEu.
 *
 * unit   - code point to encode.
 * string - output buffer; up to 4 bytes are written.
 *
 * Returns 2 when a single unit was written, 4 when a surrogate pair was
 * written, or 0 when `unit` cannot be encoded. Surrogate code points
 * (U+D800..U+DFFF) and values above U+10FFFF are rejected: the former would
 * produce an unpaired surrogate, and the latter do not fit into the 20 bits
 * a surrogate pair can carry (the first unit would spill out of the
 * 0xD800..0xDBFF range). Nothing is written when 0 is returned.
 */
size_t BH_UnicodeEncodeUtf16BE(uint32_t unit,
                               char *string)
{
    /* Only Unicode scalar values can be encoded */
    if (unit > 0x10FFFF || (unit >= 0xD800 && unit <= 0xDFFF))
        return 0;

    if (unit < 0x10000)
    {
        BH_Write16BEu(string, unit);
        return 2;
    }

    /* 0x10000..0x10FFFF: split the 20-bit offset across two units */
    unit -= 0x10000;
    BH_Write16BEu(string, 0xD800 | (unit >> 10));
    BH_Write16BEu(string + 2, 0xDC00 | (unit & 0x3FF));
    return 4;
}
|
||||
|
||||
|
||||
/*
 * Read one 32-bit unit from a buffer using BH_Read32LEu.
 *
 * string - input bytes.
 * size   - number of bytes available in `string`.
 * unit   - receives the value read; no range validation is performed here.
 *
 * Returns 4 on success, or 0 (leaving `*unit` untouched) when fewer than
 * 4 bytes are available.
 */
size_t BH_UnicodeDecodeUtf32LE(const char *string,
                               size_t size,
                               uint32_t *unit)
{
    if (size >= 4)
    {
        *unit = BH_Read32LEu(string);
        return 4;
    }

    return 0;
}
|
||||
|
||||
|
||||
/*
 * Read one 32-bit unit from a buffer using BH_Read32BEu.
 *
 * string - input bytes.
 * size   - number of bytes available in `string`.
 * unit   - receives the value read; no range validation is performed here.
 *
 * Returns 4 on success, or 0 (leaving `*unit` untouched) when fewer than
 * 4 bytes are available.
 */
size_t BH_UnicodeDecodeUtf32BE(const char *string,
                               size_t size,
                               uint32_t *unit)
{
    if (size >= 4)
    {
        *unit = BH_Read32BEu(string);
        return 4;
    }

    return 0;
}
|
||||
|
||||
|
||||
/*
 * Encode one code point as UTF-32, writing it with BH_Write32LEu.
 *
 * unit   - code point to encode.
 * string - output buffer; 4 bytes are written on success.
 *
 * Returns 4 on success, or 0 when `unit` is not a Unicode scalar value,
 * i.e. it is above U+10FFFF or lies in the surrogate range
 * U+D800..U+DFFF. Nothing is written when 0 is returned.
 */
size_t BH_UnicodeEncodeUtf32LE(uint32_t unit,
                               char *string)
{
    /* Beyond the end of the Unicode code space */
    if (unit > 0x10FFFF)
        return 0;

    /* Surrogate code points are reserved for UTF-16 */
    if (unit >= 0xD800 && unit <= 0xDFFF)
        return 0;

    BH_Write32LEu(string, unit);
    return 4;
}
|
||||
|
||||
|
||||
/*
 * Encode one code point as UTF-32, writing it with BH_Write32BEu.
 *
 * unit   - code point to encode.
 * string - output buffer; 4 bytes are written on success.
 *
 * Returns 4 on success, or 0 when `unit` is not a Unicode scalar value,
 * i.e. it is above U+10FFFF or lies in the surrogate range
 * U+D800..U+DFFF. Nothing is written when 0 is returned.
 */
size_t BH_UnicodeEncodeUtf32BE(uint32_t unit,
                               char *string)
{
    /* Beyond the end of the Unicode code space */
    if (unit > 0x10FFFF)
        return 0;

    /* Surrogate code points are reserved for UTF-16 */
    if (unit >= 0xD800 && unit <= 0xDFFF)
        return 0;

    BH_Write32BEu(string, unit);
    return 4;
}
|
||||
Reference in New Issue
Block a user