Initial commit
This commit is contained in:
68
.gitignore
vendored
Normal file
68
.gitignore
vendored
Normal file
@@ -0,0 +1,68 @@
|
||||
# ---> C
|
||||
# Prerequisites
|
||||
*.d
|
||||
|
||||
# Object files
|
||||
*.o
|
||||
*.ko
|
||||
*.obj
|
||||
*.elf
|
||||
|
||||
# Linker output
|
||||
*.ilk
|
||||
*.map
|
||||
*.exp
|
||||
|
||||
# Precompiled Headers
|
||||
*.gch
|
||||
*.pch
|
||||
|
||||
# Libraries
|
||||
*.lib
|
||||
*.a
|
||||
*.la
|
||||
*.lo
|
||||
|
||||
# Shared objects (inc. Windows DLLs)
|
||||
*.dll
|
||||
*.so
|
||||
*.so.*
|
||||
*.dylib
|
||||
|
||||
# Executables
|
||||
*.exe
|
||||
*.out
|
||||
*.app
|
||||
*.i*86
|
||||
*.x86_64
|
||||
*.hex
|
||||
|
||||
# Debug files
|
||||
*.dSYM/
|
||||
*.su
|
||||
*.idb
|
||||
*.pdb
|
||||
|
||||
# Kernel Module Compile Results
|
||||
*.mod*
|
||||
*.cmd
|
||||
.tmp_versions/
|
||||
modules.order
|
||||
Module.symvers
|
||||
Mkfile.old
|
||||
dkms.conf
|
||||
|
||||
# ---> CMake
|
||||
CMakeLists.txt.user
|
||||
CMakeCache.txt
|
||||
CMakeFiles
|
||||
CMakeScripts
|
||||
Testing
|
||||
Makefile
|
||||
cmake_install.cmake
|
||||
install_manifest.txt
|
||||
compile_commands.json
|
||||
CTestTestfile.cmake
|
||||
_deps
|
||||
CMakeUserPresets.json
|
||||
|
||||
29
CMakeLists.txt
Normal file
29
CMakeLists.txt
Normal file
@@ -0,0 +1,29 @@
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
project(CgeStr LANGUAGES C)
|
||||
|
||||
set(CMAKE_C_STANDARD 99)
|
||||
set(CMAKE_C_STANDARD_REQUIRED ON)
|
||||
|
||||
set(SOURCES
|
||||
Rune.c
|
||||
Str.c
|
||||
UCD.c
|
||||
Utf8.c
|
||||
Utf16.c
|
||||
)
|
||||
|
||||
set(HEADERS
|
||||
CgeStr.h
|
||||
)
|
||||
|
||||
add_library(CgeStr STATIC ${SOURCES} ${HEADERS})
|
||||
|
||||
target_include_directories(CgeStr PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
install(TARGETS CgeStr
|
||||
ARCHIVE DESTINATION lib
|
||||
LIBRARY DESTINATION lib
|
||||
RUNTIME DESTINATION bin
|
||||
)
|
||||
|
||||
install(FILES ${HEADERS} DESTINATION include)
|
||||
76
CgeStr.h
Normal file
76
CgeStr.h
Normal file
@@ -0,0 +1,76 @@
|
||||
#ifndef CGE_STR_H
|
||||
#define CGE_STR_H
|
||||
|
||||
#include <stddef.h>
#include <stdint.h>
|
||||
|
||||
/*
 * Pointer/length pair describing a byte string. The string functions in this
 * library decode the bytes as UTF-8 and never rely on a terminating NUL.
 */
typedef struct CgeStr {
|
||||
const char* data; /* first byte of the string */
|
||||
size_t size; /* length in bytes */
|
||||
} CgeStr;
|
||||
|
||||
/*
 * Brace initializer for a CgeStr built from a string literal (or char array):
 * the size is sizeof(s) - 1, i.e. the array length without its final byte.
 * Expands to a bare `{...}` list, so it is only usable where an initializer
 * is expected, e.g. `CgeStr s = CGE_STR_LIT("abc");`.
 */
#define CGE_STR_LIT(s) \
|
||||
{(s), sizeof(s) - 1}
|
||||
|
||||
typedef void (*CgeStrIterCb)(uint32_t rune, void* user);
|
||||
typedef void (*CgeStrWriteCb)(const char* data, size_t size, void* user);
|
||||
|
||||
/*
 * Unicode general categories, named after their two-letter abbreviations
 * (Lu, Ll, Lt, ...). The CgeRuneIs* predicates compare the result of
 * CgeRuneCategory() against these constants.
 * NOTE(review): the generated tables presumably depend on this exact order;
 * confirm against the generator before reordering.
 */
enum CgeCat {
|
||||
CGE_CAT_LU, CGE_CAT_LL, CGE_CAT_LT, CGE_CAT_LM, CGE_CAT_LO, CGE_CAT_MN,
|
||||
CGE_CAT_MC, CGE_CAT_ME, CGE_CAT_ND, CGE_CAT_NL, CGE_CAT_NO, CGE_CAT_PC,
|
||||
CGE_CAT_PD, CGE_CAT_PS, CGE_CAT_PE, CGE_CAT_PI, CGE_CAT_PF, CGE_CAT_PO,
|
||||
CGE_CAT_SM, CGE_CAT_SC, CGE_CAT_SK, CGE_CAT_SO, CGE_CAT_ZS, CGE_CAT_ZL,
|
||||
CGE_CAT_ZP, CGE_CAT_CC, CGE_CAT_CF, CGE_CAT_CS, CGE_CAT_CO, CGE_CAT_CN
|
||||
};
|
||||
|
||||
int CgeRuneCategory(uint32_t rune);
|
||||
uint32_t CgeRuneLower(uint32_t rune);
|
||||
uint32_t CgeRuneUpper(uint32_t rune);
|
||||
uint32_t CgeRuneTitle(uint32_t rune);
|
||||
uint32_t CgeRuneFold(uint32_t rune);
|
||||
size_t CgeRuneLowerFull(uint32_t rune, uint32_t* out);
|
||||
size_t CgeRuneUpperFull(uint32_t rune, uint32_t* out);
|
||||
size_t CgeRuneTitleFull(uint32_t rune, uint32_t* out);
|
||||
size_t CgeRuneFoldFull(uint32_t rune, uint32_t* out);
|
||||
|
||||
int CgeRuneIsControl(uint32_t rune);
|
||||
int CgeRuneIsDigit(uint32_t rune);
|
||||
int CgeRuneIsGraphic(uint32_t rune);
|
||||
int CgeRuneIsLetter(uint32_t rune);
|
||||
int CgeRuneIsLower(uint32_t rune);
|
||||
int CgeRuneIsMark(uint32_t rune);
|
||||
int CgeRuneIsNumber(uint32_t rune);
|
||||
int CgeRuneIsPrint(uint32_t rune);
|
||||
int CgeRuneIsPunct(uint32_t rune);
|
||||
int CgeRuneIsSpace(uint32_t rune);
|
||||
int CgeRuneIsSymbol(uint32_t rune);
|
||||
int CgeRuneIsTitle(uint32_t rune);
|
||||
int CgeRuneIsUpper(uint32_t rune);
|
||||
|
||||
int CgeUtf8Encode(uint32_t rune, char* data);
|
||||
int CgeUtf8EncodeLax(uint32_t rune, char* data);
|
||||
int CgeUtf8Decode(const char* data, size_t size, uint32_t* rune);
|
||||
int CgeUtf8DecodeLax(const char* data, size_t size, uint32_t* rune);
|
||||
|
||||
int CgeUtf16Encode(uint32_t rune, uint16_t* data);
|
||||
int CgeUtf16EncodeLax(uint32_t rune, uint16_t* data);
|
||||
int CgeUtf16Decode(const uint16_t* data, size_t size, uint32_t* rune);
|
||||
int CgeUtf16DecodeLax(const uint16_t* data, size_t size, uint32_t* rune);
|
||||
|
||||
void CgeStrIter(CgeStr str, CgeStrIterCb cb, void* user);
|
||||
void CgeStrToLower(CgeStr str, CgeStrWriteCb cb, void* user);
|
||||
void CgeStrToUpper(CgeStr str, CgeStrWriteCb cb, void* user);
|
||||
void CgeStrFold(CgeStr str, CgeStrWriteCb cb, void* user);
|
||||
int CgeStrCmp(CgeStr lhs, CgeStr rhs);
|
||||
int CgeStrICmp(CgeStr lhs, CgeStr rhs);
|
||||
size_t CgeStrIndexRune(CgeStr str, uint32_t rune);
|
||||
size_t CgeStrLastIndexRune(CgeStr str, uint32_t rune);
|
||||
size_t CgeStrIndexStr(CgeStr str, CgeStr substr);
|
||||
size_t CgeStrLastIndexStr(CgeStr str, CgeStr substr);
|
||||
int CgeStrHasPrefix(CgeStr str, CgeStr prefix);
|
||||
int CgeStrHasSuffix(CgeStr str, CgeStr suffix);
|
||||
CgeStr CgeStrTrimLeft(CgeStr str);
|
||||
CgeStr CgeStrTrimRight(CgeStr str);
|
||||
CgeStr CgeStrTrim(CgeStr str);
|
||||
CgeStr CgeStrSplit(CgeStr *s, uint32_t delim);
|
||||
|
||||
#endif /* CGE_STR_H */
|
||||
12
LICENSE
Normal file
12
LICENSE
Normal file
@@ -0,0 +1,12 @@
|
||||
Copyright (C) 2026 by blankhex me@blankhex.com
|
||||
|
||||
Permission to use, copy, modify, and/or distribute this software for any
|
||||
purpose with or without fee is hereby granted.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
|
||||
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
|
||||
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
|
||||
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
|
||||
OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
||||
PERFORMANCE OF THIS SOFTWARE.
|
||||
23
Makefile.mingw
Normal file
23
Makefile.mingw
Normal file
@@ -0,0 +1,23 @@
|
||||
# MinGW Makefile for CgeStr
|
||||
|
||||
CC = gcc
|
||||
AR = ar
|
||||
CFLAGS = -std=c99 -O2 -Wall -Wextra
|
||||
ARFLAGS = rcs
|
||||
TARGET = libCgeStr.a
|
||||
|
||||
SOURCES = Rune.c Str.c UCD.c Utf8.c Utf16.c
|
||||
OBJECTS = $(SOURCES:.c=.o)
|
||||
|
||||
.PHONY: all clean
|
||||
|
||||
all: $(TARGET)
|
||||
|
||||
$(TARGET): $(OBJECTS)
|
||||
$(AR) $(ARFLAGS) $@ $^
|
||||
|
||||
%.o: %.c CgeStr.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
clean:
|
||||
del $(OBJECTS) $(TARGET) 2>nul || exit 0
|
||||
33
Makefile.posix
Normal file
33
Makefile.posix
Normal file
@@ -0,0 +1,33 @@
|
||||
# POSIX Makefile for CgeStr
|
||||
|
||||
CC = gcc
|
||||
AR = ar
|
||||
CFLAGS = -std=c99 -O2 -Wall -Wextra -fPIC
|
||||
ARFLAGS = rcs
|
||||
TARGET = libCgeStr.a
|
||||
|
||||
SOURCES = Rune.c Str.c UCD.c Utf8.c Utf16.c
|
||||
OBJECTS = $(SOURCES:.c=.o)
|
||||
|
||||
# None of these targets produce a file of the same name; `uninstall` was
# missing, so a stray file called "uninstall" would have disabled the target.
.PHONY: all clean install uninstall
|
||||
|
||||
all: $(TARGET)
|
||||
|
||||
$(TARGET): $(OBJECTS)
|
||||
$(AR) $(ARFLAGS) $@ $^
|
||||
|
||||
%.o: %.c CgeStr.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
clean:
|
||||
rm -f $(OBJECTS) $(TARGET)
|
||||
|
||||
install: $(TARGET)
|
||||
cp $(TARGET) /usr/local/lib/
|
||||
cp CgeStr.h /usr/local/include/
|
||||
ldconfig || echo "Run ldconfig manually if needed"
|
||||
|
||||
uninstall:
|
||||
rm -f /usr/local/lib/libCgeStr.a
|
||||
rm -f /usr/local/include/CgeStr.h
|
||||
ldconfig || true
|
||||
23
Makefile.win32
Normal file
23
Makefile.win32
Normal file
@@ -0,0 +1,23 @@
|
||||
# Makefile.win32 for MSVC (NMake)
|
||||
# Usage: Open "x86 Native Tools Command Prompt", then:
|
||||
# nmake -f Makefile.win32
|
||||
|
||||
CC = cl
|
||||
LIB = lib
|
||||
CFLAGS = /c /nologo /W3 /O2
|
||||
LIBFLAGS = /nologo
|
||||
TARGET = CgeStr.lib
|
||||
|
||||
SOURCES = Rune.c Str.c UCD.c Utf8.c Utf16.c
|
||||
OBJECTS = $(SOURCES:.c=.obj)
|
||||
|
||||
$(TARGET): $(OBJECTS)
|
||||
$(LIB) $(LIBFLAGS) /OUT:$(TARGET) $(OBJECTS)
|
||||
|
||||
{.}.c{}.obj:
|
||||
$(CC) $(CFLAGS) /Fo$@ $<
|
||||
|
||||
clean:
|
||||
del $(OBJECTS) $(TARGET) 2>nul
|
||||
|
||||
.PHONY: clean
|
||||
56
README.md
Normal file
56
README.md
Normal file
@@ -0,0 +1,56 @@
|
||||
# CgeStr - Unicode String Library for C
|
||||
|
||||
A lightweight, dependency-free C library for UTF-8 string processing with full
|
||||
Unicode support.
|
||||
|
||||
## Features
|
||||
|
||||
- UTF-8 encoding and decoding
|
||||
- UTF-16 encoding and decoding
|
||||
- Unicode case mapping: lowercase, uppercase, titlecase, case folding
|
||||
- Full case mapping functions returning multiple runes when needed
|
||||
- Character classification: isControl, isDigit, isLetter, isSpace, and others
|
||||
- Unicode category lookup via `CgeRuneCategory`
|
||||
- Case-sensitive and case-insensitive string comparison
|
||||
- Substring and rune search
|
||||
- Prefix and suffix checking
|
||||
- String trimming (left, right, both)
|
||||
- String splitting by rune
|
||||
- Iteration over Unicode code points using a callback interface
|
||||
|
||||
## Build Systems
|
||||
|
||||
- CMake - supports Linux, macOS, Windows (MSVC, MinGW)
|
||||
- Makefile.posix - for GCC/Clang on POSIX systems
|
||||
- Makefile.mingw - for MinGW on Windows
|
||||
- Makefile.win32 - for MSVC with NMake
|
||||
|
||||
Builds a static library. No shared library or external dependencies.
|
||||
|
||||
## Usage Example
|
||||
|
||||
```c
|
||||
#include "CgeStr.h"
|
||||
#include <stdio.h>
|
||||
|
||||
void print_rune(uint32_t rune, void* user) {
|
||||
printf("U+%04X ", rune);
|
||||
}
|
||||
|
||||
int main() {
|
||||
CgeStr str = CGE_STR_LIT("Héllo, 世界!");
|
||||
CgeStrIter(str, print_rune, NULL);
|
||||
printf("\n");
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
## Portability
|
||||
|
||||
- Written in C89+stdint.h
|
||||
- No dynamic memory allocation
|
||||
- No external dependencies
|
||||
|
||||
## License
|
||||
|
||||
0BSD - a permissive license with no attribution required.
|
||||
176
Rune.c
Normal file
176
Rune.c
Normal file
@@ -0,0 +1,176 @@
|
||||
#include "CgeStr.h"
|
||||
|
||||
int CgeRuneIsControl(uint32_t rune) {
|
||||
switch (CgeRuneCategory(rune)) {
|
||||
case CGE_CAT_CC:
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int CgeRuneIsDigit(uint32_t rune) {
|
||||
switch (CgeRuneCategory(rune)) {
|
||||
case CGE_CAT_ND:
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int CgeRuneIsGraphic(uint32_t rune) {
|
||||
switch (CgeRuneCategory(rune)) {
|
||||
case CGE_CAT_LL:
|
||||
case CGE_CAT_LM:
|
||||
case CGE_CAT_LO:
|
||||
case CGE_CAT_LT:
|
||||
case CGE_CAT_LU:
|
||||
case CGE_CAT_MC:
|
||||
case CGE_CAT_ME:
|
||||
case CGE_CAT_MN:
|
||||
case CGE_CAT_ND:
|
||||
case CGE_CAT_NL:
|
||||
case CGE_CAT_NO:
|
||||
case CGE_CAT_PC:
|
||||
case CGE_CAT_PD:
|
||||
case CGE_CAT_PE:
|
||||
case CGE_CAT_PF:
|
||||
case CGE_CAT_PI:
|
||||
case CGE_CAT_PO:
|
||||
case CGE_CAT_PS:
|
||||
case CGE_CAT_SC:
|
||||
case CGE_CAT_SK:
|
||||
case CGE_CAT_SM:
|
||||
case CGE_CAT_SO:
|
||||
case CGE_CAT_ZS:
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int CgeRuneIsLetter(uint32_t rune) {
|
||||
switch (CgeRuneCategory(rune)) {
|
||||
case CGE_CAT_LL:
|
||||
case CGE_CAT_LM:
|
||||
case CGE_CAT_LO:
|
||||
case CGE_CAT_LT:
|
||||
case CGE_CAT_LU:
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int CgeRuneIsLower(uint32_t rune) {
|
||||
switch (CgeRuneCategory(rune)) {
|
||||
case CGE_CAT_LL:
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int CgeRuneIsMark(uint32_t rune) {
|
||||
switch (CgeRuneCategory(rune)) {
|
||||
case CGE_CAT_MC:
|
||||
case CGE_CAT_ME:
|
||||
case CGE_CAT_MN:
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int CgeRuneIsNumber(uint32_t rune) {
|
||||
switch (CgeRuneCategory(rune)) {
|
||||
case CGE_CAT_ND:
|
||||
case CGE_CAT_NL:
|
||||
case CGE_CAT_NO:
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int CgeRuneIsPrint(uint32_t rune) {
|
||||
switch (CgeRuneCategory(rune)) {
|
||||
case CGE_CAT_LL:
|
||||
case CGE_CAT_LM:
|
||||
case CGE_CAT_LO:
|
||||
case CGE_CAT_LT:
|
||||
case CGE_CAT_LU:
|
||||
case CGE_CAT_MC:
|
||||
case CGE_CAT_ME:
|
||||
case CGE_CAT_MN:
|
||||
case CGE_CAT_ND:
|
||||
case CGE_CAT_NL:
|
||||
case CGE_CAT_NO:
|
||||
case CGE_CAT_PC:
|
||||
case CGE_CAT_PD:
|
||||
case CGE_CAT_PE:
|
||||
case CGE_CAT_PF:
|
||||
case CGE_CAT_PI:
|
||||
case CGE_CAT_PO:
|
||||
case CGE_CAT_PS:
|
||||
case CGE_CAT_SC:
|
||||
case CGE_CAT_SK:
|
||||
case CGE_CAT_SM:
|
||||
case CGE_CAT_SO:
|
||||
return 1;
|
||||
}
|
||||
return rune == ' ';
|
||||
}
|
||||
|
||||
int CgeRuneIsPunct(uint32_t rune) {
|
||||
switch (CgeRuneCategory(rune)) {
|
||||
case CGE_CAT_PC:
|
||||
case CGE_CAT_PD:
|
||||
case CGE_CAT_PE:
|
||||
case CGE_CAT_PF:
|
||||
case CGE_CAT_PI:
|
||||
case CGE_CAT_PO:
|
||||
case CGE_CAT_PS:
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int CgeRuneIsSpace(uint32_t rune) {
|
||||
switch (CgeRuneCategory(rune)) {
|
||||
case CGE_CAT_ZL:
|
||||
case CGE_CAT_ZP:
|
||||
case CGE_CAT_ZS:
|
||||
return 1;
|
||||
}
|
||||
|
||||
switch (rune) {
|
||||
case '\f':
|
||||
case '\n':
|
||||
case '\r':
|
||||
case '\t':
|
||||
case '\v':
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int CgeRuneIsSymbol(uint32_t rune) {
|
||||
switch (CgeRuneCategory(rune)) {
|
||||
case CGE_CAT_SC:
|
||||
case CGE_CAT_SK:
|
||||
case CGE_CAT_SM:
|
||||
case CGE_CAT_SO:
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int CgeRuneIsTitle(uint32_t rune) {
|
||||
switch (CgeRuneCategory(rune)) {
|
||||
case CGE_CAT_LT:
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int CgeRuneIsUpper(uint32_t rune) {
|
||||
switch (CgeRuneCategory(rune)) {
|
||||
case CGE_CAT_LU:
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
307
Str.c
Normal file
307
Str.c
Normal file
@@ -0,0 +1,307 @@
|
||||
#include "CgeStr.h"
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#define INVALID_RUNE 0xFFFD
|
||||
#define MAX_UNI_STREAM 4
|
||||
|
||||
/*
 * Small fixed-capacity FIFO of runes, used as a ring buffer: runes are
 * written at `tail`, read from `head`, and both indices wrap at
 * MAX_UNI_STREAM.
 */
struct UniStream {
|
||||
uint32_t data[MAX_UNI_STREAM]; /* ring storage */
|
||||
size_t head; /* index of the next rune to read */
|
||||
size_t tail; /* index of the next free slot */
|
||||
size_t size; /* number of runes currently queued */
|
||||
};
|
||||
|
||||
static int uniStreamPut(struct UniStream* stream, uint32_t rune) {
|
||||
if (stream->size >= MAX_UNI_STREAM)
|
||||
return 0;
|
||||
|
||||
stream->data[stream->tail] = rune;
|
||||
stream->tail = (stream->tail + 1) & (MAX_UNI_STREAM - 1);
|
||||
stream->size++;
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int uniStreamGet(struct UniStream* stream, uint32_t* rune) {
|
||||
if (!stream->size)
|
||||
return 0;
|
||||
|
||||
*rune = stream->data[stream->head];
|
||||
stream->head = (stream->head + 1) & (MAX_UNI_STREAM - 1);
|
||||
stream->size--;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
 * Calls `cb(rune, user)` once per code point of `str`, in order.
 * Decoding is lax: each undecodable byte is reported as U+FFFD and skipped.
 */
void CgeStrIter(CgeStr str, CgeStrIterCb cb, void* user) {
    const char* const limit = str.data + str.size;
    const char* cursor;
    uint32_t decoded;
    int step;

    for (cursor = str.data; cursor < limit; cursor += step) {
        step = CgeUtf8DecodeLax(cursor, limit - cursor, &decoded);
        cb(decoded, user);
    }
}
|
||||
|
||||
/*
 * Lowercases `str` using the full (possibly one-to-many) case mapping and
 * streams the result to `cb`, one UTF-8 encoded rune per call. The buffer
 * handed to `cb` is a local scratch array that is reused for the next rune.
 */
void CgeStrToLower(CgeStr str, CgeStrWriteCb cb, void* user) {
    const char* const limit = str.data + str.size;
    const char* cursor = str.data;
    /* NOTE(review): assumes CgeRuneLowerFull writes at most 3 runes - confirm. */
    uint32_t decoded, lowered[3];
    char utf8[4];
    int produced, k, length;

    while (cursor < limit) {
        cursor += CgeUtf8DecodeLax(cursor, limit - cursor, &decoded);
        produced = (int)CgeRuneLowerFull(decoded, lowered);
        for (k = 0; k < produced; k++) {
            length = CgeUtf8EncodeLax(lowered[k], utf8);
            cb(utf8, length, user);
        }
    }
}
|
||||
|
||||
/*
 * Uppercases `str` using the full (possibly one-to-many) case mapping and
 * streams the result to `cb`, one UTF-8 encoded rune per call. The buffer
 * handed to `cb` is a local scratch array that is reused for the next rune.
 */
void CgeStrToUpper(CgeStr str, CgeStrWriteCb cb, void* user) {
    const char* const limit = str.data + str.size;
    const char* cursor = str.data;
    /* NOTE(review): assumes CgeRuneUpperFull writes at most 3 runes - confirm. */
    uint32_t decoded, raised[3];
    char utf8[4];
    int produced, k, length;

    while (cursor < limit) {
        cursor += CgeUtf8DecodeLax(cursor, limit - cursor, &decoded);
        produced = (int)CgeRuneUpperFull(decoded, raised);
        for (k = 0; k < produced; k++) {
            length = CgeUtf8EncodeLax(raised[k], utf8);
            cb(utf8, length, user);
        }
    }
}
|
||||
|
||||
/*
 * Case-folds `str` using the full (possibly one-to-many) folding and streams
 * the result to `cb`, one UTF-8 encoded rune per call. The buffer handed to
 * `cb` is a local scratch array that is reused for the next rune.
 */
void CgeStrFold(CgeStr str, CgeStrWriteCb cb, void* user) {
    const char* const limit = str.data + str.size;
    const char* cursor = str.data;
    /* NOTE(review): assumes CgeRuneFoldFull writes at most 3 runes - confirm. */
    uint32_t decoded, folded[3];
    char utf8[4];
    int produced, k, length;

    while (cursor < limit) {
        cursor += CgeUtf8DecodeLax(cursor, limit - cursor, &decoded);
        produced = (int)CgeRuneFoldFull(decoded, folded);
        for (k = 0; k < produced; k++) {
            length = CgeUtf8EncodeLax(folded[k], utf8);
            cb(utf8, length, user);
        }
    }
}
|
||||
|
||||
/*
 * Byte-wise comparison of two strings.
 * Returns -1, 0 or 1 when `lhs` sorts before, equal to, or after `rhs`.
 * When one string is a prefix of the other, the shorter one sorts first.
 */
int CgeStrCmp(CgeStr lhs, CgeStr rhs) {
    const size_t common = (lhs.size < rhs.size) ? lhs.size : rhs.size;
    int order;

    /*
     * Skip memcmp for a zero-length prefix: an empty CgeStr may carry a NULL
     * data pointer, and passing NULL to memcmp is undefined even for n == 0.
     */
    if (common != 0) {
        order = memcmp(lhs.data, rhs.data, common);
        if (order != 0)
            return (order < 0) ? -1 : 1;
    }

    if (lhs.size != rhs.size)
        return (lhs.size < rhs.size) ? -1 : 1;

    return 0;
}
|
||||
|
||||
int CgeStrICmp(CgeStr lhs, CgeStr rhs) {
|
||||
struct UniStream buf1 = {{0}, 0, 0, 0};
|
||||
struct UniStream buf2 = {{0}, 0, 0, 0};
|
||||
|
||||
const char* current1 = lhs.data;
|
||||
const char* current2 = rhs.data;
|
||||
const char* end1 = lhs.data + lhs.size;
|
||||
const char* end2 = rhs.data + rhs.size;
|
||||
|
||||
while (1) {
|
||||
uint32_t rune1, rune2;
|
||||
|
||||
if (!buf1.size && current1 < end1) {
|
||||
uint32_t scratch, folded[3];
|
||||
int i, count;
|
||||
|
||||
current1 += CgeUtf8DecodeLax(current1, end1 - current1, &scratch);
|
||||
count = (int)CgeRuneFoldFull(scratch, folded);
|
||||
for (i = 0; i < count; i++)
|
||||
uniStreamPut(&buf1, folded[i]);
|
||||
}
|
||||
|
||||
if (!buf2.size && current2 < end2) {
|
||||
uint32_t scratch, folded[3];
|
||||
int i, count;
|
||||
|
||||
current2 += CgeUtf8DecodeLax(current2, end2 - current2, &scratch);
|
||||
count = (int)CgeRuneFoldFull(scratch, folded);
|
||||
for (i = 0; i < count; i++)
|
||||
uniStreamPut(&buf2, folded[i]);
|
||||
}
|
||||
|
||||
if (!buf1.size && !buf2.size)
|
||||
return 0;
|
||||
|
||||
if (!uniStreamGet(&buf1, &rune1))
|
||||
return -1;
|
||||
|
||||
if (!uniStreamGet(&buf2, &rune2))
|
||||
return 1;
|
||||
|
||||
if (rune1 < rune2)
|
||||
return -1;
|
||||
if (rune1 > rune2)
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Returns the byte offset of the first occurrence of `rune` in `str`, or
 * (size_t)-1 when it does not occur. Decoding is lax, so an undecodable byte
 * compares equal to U+FFFD.
 */
size_t CgeStrIndexRune(CgeStr str, uint32_t rune) {
    size_t offset;
    uint32_t decoded;
    int step;

    for (offset = 0; offset < str.size; offset += step) {
        step = CgeUtf8DecodeLax(str.data + offset, str.size - offset, &decoded);
        if (decoded == rune)
            return offset;
    }
    return (size_t)-1;
}
|
||||
|
||||
/*
 * Returns the byte offset of the last occurrence of `rune` in `str`, or
 * (size_t)-1 when it does not occur.
 *
 * The string is walked backwards one code point at a time. Decoding is lax
 * and agrees with forward iteration: a byte that is not part of a
 * well-formed sequence counts as a single U+FFFD.
 *
 * Fixes the previous version, which decoded *after* the lead byte instead of
 * at it; for ordinary text that decoded zero bytes and never advanced.
 */
size_t CgeStrLastIndexRune(CgeStr str, uint32_t rune) {
    size_t end = str.size;

    while (end > 0) {
        size_t start = end - 1;
        size_t span;
        uint32_t r;

        /* Step back over continuation bytes (at most 3) to a candidate lead byte. */
        while (start > 0 && end - start < 4 &&
               ((unsigned char)str.data[start] & 0xC0) == 0x80)
            start--;

        span = end - start;
        if ((size_t)CgeUtf8DecodeLax(str.data + start, span, &r) != span) {
            /*
             * The bytes do not form exactly one rune ending at `end`, so the
             * final byte stands alone (it decodes to U+FFFD).
             */
            start = end - 1;
            CgeUtf8DecodeLax(str.data + start, 1, &r);
        }

        if (r == rune)
            return start;
        end = start;
    }
    return (size_t)-1;
}
|
||||
|
||||
/*
 * Returns the byte offset of the first occurrence of `substr` in `str`, or
 * (size_t)-1 when there is none. An empty `substr` matches at offset 0.
 */
size_t CgeStrIndexStr(CgeStr str, CgeStr substr) {
    size_t offset, last;

    if (substr.size == 0)
        return 0;
    if (substr.size > str.size)
        return (size_t)-1;

    last = str.size - substr.size; /* last offset where substr still fits */
    for (offset = 0; offset <= last; offset++) {
        if (memcmp(str.data + offset, substr.data, substr.size) == 0)
            return offset;
    }
    return (size_t)-1;
}
|
||||
|
||||
/*
 * Returns the byte offset of the last occurrence of `substr` in `str`, or
 * (size_t)-1 when there is none. An empty `substr` matches at `str.size`.
 */
size_t CgeStrLastIndexStr(CgeStr str, CgeStr substr) {
    size_t remaining;

    if (substr.size == 0)
        return str.size;
    if (substr.size > str.size)
        return (size_t)-1;

    /* `remaining` counts candidate offsets left; it is decremented before use. */
    remaining = str.size - substr.size + 1;
    while (remaining-- > 0) {
        if (memcmp(str.data + remaining, substr.data, substr.size) == 0)
            return remaining;
    }
    return (size_t)-1;
}
|
||||
|
||||
/*
 * Returns 1 when `str` starts with the bytes of `prefix`, otherwise 0.
 * An empty prefix always matches.
 */
int CgeStrHasPrefix(CgeStr str, CgeStr prefix) {
    if (prefix.size > str.size)
        return 0;

    /* Empty strings may carry NULL data; memcmp must not see NULL even for n == 0. */
    if (prefix.size == 0)
        return 1;

    return memcmp(str.data, prefix.data, prefix.size) == 0;
}
|
||||
|
||||
/*
 * Returns 1 when `str` ends with the bytes of `suffix`, otherwise 0.
 * An empty suffix always matches.
 */
int CgeStrHasSuffix(CgeStr str, CgeStr suffix) {
    if (suffix.size > str.size)
        return 0;

    /*
     * Empty strings may carry NULL data; avoid pointer arithmetic on it and
     * do not hand NULL to memcmp, which is undefined even for n == 0.
     */
    if (suffix.size == 0)
        return 1;

    return memcmp(str.data + (str.size - suffix.size), suffix.data, suffix.size) == 0;
}
|
||||
|
||||
/*
 * Returns `str` with leading whitespace runes (per CgeRuneIsSpace) removed.
 * The result points into the same bytes as the input.
 */
CgeStr CgeStrTrimLeft(CgeStr str) {
    uint32_t decoded;
    int step;

    while (str.size != 0) {
        step = CgeUtf8DecodeLax(str.data, str.size, &decoded);
        if (!CgeRuneIsSpace(decoded))
            return str;

        str.data += step;
        str.size -= step;
    }
    return str;
}
|
||||
|
||||
/*
 * Returns `str` with trailing whitespace runes (per CgeRuneIsSpace) removed.
 * The result points into the same bytes as the input.
 *
 * The tail is decoded the same way forward iteration would see it: a
 * trailing byte that does not complete a well-formed sequence is a lone
 * U+FFFD (not whitespace) and stops the trim. Previously such stray
 * continuation bytes were silently dropped together with a preceding space.
 */
CgeStr CgeStrTrimRight(CgeStr str) {
    while (str.size) {
        size_t start = str.size - 1;
        size_t span;
        uint32_t rune;

        /* Step back over continuation bytes (at most 3) to a candidate lead byte. */
        while (start > 0 && str.size - start < 4 &&
               ((unsigned char)str.data[start] & 0xC0) == 0x80)
            start--;

        span = str.size - start;
        if ((size_t)CgeUtf8DecodeLax(str.data + start, span, &rune) != span) {
            /* Not exactly one rune: the final byte stands alone. */
            start = str.size - 1;
            CgeUtf8DecodeLax(str.data + start, 1, &rune);
        }

        if (!CgeRuneIsSpace(rune))
            break;

        str.size = start;
    }
    return str;
}
|
||||
|
||||
/* Returns `str` with whitespace removed from both ends (left first, then right). */
CgeStr CgeStrTrim(CgeStr str) {
    CgeStr trimmed = CgeStrTrimLeft(str);

    trimmed = CgeStrTrimRight(trimmed);
    return trimmed;
}
|
||||
|
||||
/*
 * Splits off the part of `*s` before the first `delim` and returns it.
 * `*s` is advanced past the delimiter. When `delim` does not occur, the whole
 * string is returned and `s->size` is set to 0 (`s->data` is left unchanged).
 */
CgeStr CgeStrSplit(CgeStr *s, uint32_t delim) {
    CgeStr head = *s;
    const size_t at = CgeStrIndexRune(head, delim);
    uint32_t ignored;
    size_t consumed;

    if (at == (size_t)-1) {
        s->size = 0;
        return head;
    }

    /* Decode the delimiter again only to learn how many bytes it occupies. */
    consumed = at + CgeUtf8DecodeLax(head.data + at, head.size - at, &ignored);

    head.size = at;
    s->data += consumed;
    s->size -= consumed;
    return head;
}
|
||||
64
Utf16.c
Normal file
64
Utf16.c
Normal file
@@ -0,0 +1,64 @@
|
||||
#include "CgeStr.h"
|
||||
|
||||
#define INVALID_RUNE 0xFFFD
|
||||
|
||||
/*
 * Encodes `rune` as UTF-16 into `data` (room for 2 units required).
 * Returns the number of units written (1 or 2), or -1 without writing when
 * `rune` is a surrogate (U+D800..U+DFFF) or above U+10FFFF.
 */
int CgeUtf16Encode(uint32_t rune, uint16_t* data) {
    uint32_t offset;

    if (rune > 0x10FFFF)
        return -1;
    if (rune >= 0xD800 && rune <= 0xDFFF)
        return -1;

    if (rune < 0x10000) {
        data[0] = (uint16_t)rune;
        return 1;
    }

    /* Supplementary plane: split the 20-bit offset into a surrogate pair. */
    offset = rune - 0x10000;
    data[0] = (uint16_t)(0xD800 | (offset >> 10));
    data[1] = (uint16_t)(0xDC00 | (offset & 0x3FF));
    return 2;
}
|
||||
|
||||
int CgeUtf16EncodeLax(uint32_t rune, uint16_t* data) {
|
||||
int result;
|
||||
|
||||
result = CgeUtf16Encode(rune, data);
|
||||
if (result == -1)
|
||||
result = CgeUtf16Encode(INVALID_RUNE, data);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
 * Decodes one code point from the first `size` UTF-16 units at `data`.
 * Returns the number of units consumed (1 or 2) and stores the code point in
 * `*rune`; returns -1 (leaving `*rune` untouched) for empty input, a lone
 * trail surrogate, or a lead surrogate not followed by a trail surrogate.
 *
 * `data` is not read when `size` is 0 (the previous version read data[0]
 * before checking the size).
 */
int CgeUtf16Decode(const uint16_t* data, size_t size, uint32_t* rune) {
    uint16_t lead, trail;

    if (size == 0)
        return -1;

    lead = data[0];
    if (lead < 0xD800 || lead > 0xDFFF) {
        *rune = lead;
        return 1;
    }

    /* 0xDC00..0xDFFF is a trail surrogate and cannot start a sequence. */
    if (lead > 0xDBFF)
        return -1;
    if (size < 2)
        return -1;

    trail = data[1];
    if (trail < 0xDC00 || trail > 0xDFFF)
        return -1;

    *rune = 0x10000 + ((uint32_t)(lead & 0x3FF) << 10) + (uint32_t)(trail & 0x3FF);
    return 2;
}
|
||||
|
||||
int CgeUtf16DecodeLax(const uint16_t* data, size_t size, uint32_t* rune) {
|
||||
int result;
|
||||
|
||||
result = CgeUtf16Decode(data, size, rune);
|
||||
if (result == -1) {
|
||||
*rune = INVALID_RUNE;
|
||||
result = 1;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
101
Utf8.c
Normal file
101
Utf8.c
Normal file
@@ -0,0 +1,101 @@
|
||||
#include "CgeStr.h"
|
||||
|
||||
#define INVALID_RUNE 0xFFFD
|
||||
|
||||
/*
 * Encodes `rune` as UTF-8 into `data` (room for 4 bytes required).
 * Returns the number of bytes written (1..4), or -1 without writing when
 * `rune` is a surrogate (U+D800..U+DFFF) or above U+10FFFF.
 */
int CgeUtf8Encode(uint32_t rune, char* data) {
    uint32_t lead;
    int length, k;

    if (rune < 0x80) {
        data[0] = (char)rune;
        return 1;
    }

    if (rune < 0x800) {
        length = 2;
        lead = 0xC0;
    } else if (rune < 0x10000) {
        if (rune >= 0xD800 && rune <= 0xDFFF)
            return -1;
        length = 3;
        lead = 0xE0;
    } else if (rune <= 0x10FFFF) {
        length = 4;
        lead = 0xF0;
    } else {
        return -1;
    }

    /* Fill continuation bytes from the end, six payload bits at a time. */
    for (k = length - 1; k > 0; k--) {
        data[k] = (char)(0x80 | (rune & 0x3F));
        rune >>= 6;
    }
    /* Whatever is left of the payload goes into the lead byte. */
    data[0] = (char)(lead | rune);
    return length;
}
|
||||
|
||||
int CgeUtf8EncodeLax(uint32_t rune, char* data) {
|
||||
int result;
|
||||
|
||||
result = CgeUtf8Encode(rune, data);
|
||||
if (result == -1)
|
||||
result = CgeUtf8Encode(INVALID_RUNE, data);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
 * Decodes one code point from the first `size` bytes at `data`.
 * Returns the number of bytes consumed (1..4) and stores the code point in
 * `*rune`. Returns -1 for empty or truncated input, an invalid lead or
 * continuation byte, an overlong encoding, a surrogate, or a value above
 * U+10FFFF.
 *
 * `data` is not read when `size` is 0 (the previous version read data[0]
 * before checking the size), and `*rune` is only written on success.
 */
int CgeUtf8Decode(const char* data, size_t size, uint32_t* rune) {
    unsigned char byte;
    uint32_t value;
    int i, n;

    if (size == 0)
        return -1;

    byte = (unsigned char)data[0];
    if (byte < 0x80) {
        *rune = byte;
        return 1;
    }

    /* The lead byte fixes the sequence length and carries the top payload bits. */
    if ((byte & 0xE0) == 0xC0) {
        n = 2;
        value = byte & 0x1F;
    } else if ((byte & 0xF0) == 0xE0) {
        n = 3;
        value = byte & 0x0F;
    } else if ((byte & 0xF8) == 0xF0) {
        n = 4;
        value = byte & 0x07;
    } else {
        return -1;
    }

    if (size < (size_t)n)
        return -1;

    for (i = 1; i < n; i++) {
        byte = (unsigned char)data[i];
        if ((byte & 0xC0) != 0x80)
            return -1;
        value = (value << 6) | (byte & 0x3F);
    }

    /* Reject overlong forms: the value must need exactly n bytes. */
    if ((n == 2 && value < 0x80) ||
        (n == 3 && value < 0x800) ||
        (n == 4 && value < 0x10000)) {
        return -1;
    }

    if (value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))
        return -1;

    *rune = value;
    return n;
}
|
||||
|
||||
int CgeUtf8DecodeLax(const char* data, size_t size, uint32_t* rune) {
|
||||
int result;
|
||||
|
||||
result = CgeUtf8Decode(data, size, rune);
|
||||
if (result == -1) {
|
||||
*rune = INVALID_RUNE;
|
||||
result = 1;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
168
generator/Blocks.h
Normal file
168
generator/Blocks.h
Normal file
@@ -0,0 +1,168 @@
|
||||
#ifndef BLOCKS_H
|
||||
#define BLOCKS_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
#include "ValueList.h"
|
||||
|
||||
/*
 * One level of a multi-level lookup table under construction. Values are
 * staged in `data`; each time `capacity` values have accumulated the block is
 * handed to the value list and staging restarts (see blockInsert).
 */
struct BlockLevel {
|
||||
long* data; /* staging buffer holding `capacity` entries */
|
||||
size_t size; /* entries currently staged in `data` */
|
||||
size_t capacity; /* entries per block; lookups assume a power of two */
|
||||
struct ValueList* list; /* completed blocks of this level, in index order */
|
||||
};
|
||||
|
||||
/* Multi-level table: `levels[0]` is the top level, `levels[depth - 1]` the leaves. */
struct Blocks {
|
||||
struct BlockLevel* levels; /* array of `depth` levels, allocated by blockInit */
|
||||
size_t depth; /* number of levels */
|
||||
};
|
||||
|
||||
/*
 * Integer base-2 logarithm: the index of the highest set bit of `value`,
 * or -1 when `value` is 0.
 */
static int ilog2(unsigned long value) {
    int exponent;

    for (exponent = -1; value != 0; value >>= 1)
        exponent++;

    return exponent;
}
|
||||
|
||||
static long blockInsert(struct Blocks* blocks, long value, size_t depth) {
|
||||
struct BlockLevel* level = blocks->levels + depth;
|
||||
|
||||
if (blocks->depth - 1 != depth) {
|
||||
if ((value = blockInsert(blocks, value, depth + 1)) == -1)
|
||||
return -1;
|
||||
}
|
||||
|
||||
level->data[level->size++] = value;
|
||||
if (level->size >= level->capacity) {
|
||||
level->size = 0;
|
||||
|
||||
if (depth == 0 || (value = valueListFind(level->list, level->data, level->capacity)) == -1) {
|
||||
value = valueListIntern(&level->list, level->data, level->capacity);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
static long blockFindR(struct Blocks* blocks, long value, long offset, size_t depth) {
|
||||
struct BlockLevel* level = &blocks->levels[depth];
|
||||
struct ValueList* current;
|
||||
size_t i, bits = 0;
|
||||
|
||||
for (i = depth + 1; i < blocks->depth; i++) {
|
||||
bits += ilog2(blocks->levels[i].capacity);
|
||||
}
|
||||
|
||||
current = level->list;
|
||||
while (offset--) current = current->next;
|
||||
|
||||
offset = (value >> bits) & ((unsigned long)level->capacity - 1);
|
||||
offset = current->data[offset];
|
||||
|
||||
if (depth != blocks->depth - 1)
|
||||
return blockFindR(blocks, value, offset, depth + 1);
|
||||
|
||||
return offset;
|
||||
}
|
||||
|
||||
static long blockFind(struct Blocks* blocks, long value) {
|
||||
size_t i, bits = 0;
|
||||
long offset;
|
||||
|
||||
for (i = 0; i < blocks->depth; i++) {
|
||||
bits += ilog2(blocks->levels[i].capacity);
|
||||
}
|
||||
|
||||
offset = value >> bits;
|
||||
return blockFindR(blocks, value, offset, 0);
|
||||
}
|
||||
|
||||
static void blockInit(struct Blocks* blocks, size_t depth, ...) {
|
||||
va_list args;
|
||||
|
||||
blocks->depth = 0;
|
||||
if (!(blocks->levels = malloc(sizeof(struct BlockLevel) * depth)))
|
||||
abort();
|
||||
|
||||
va_start(args, depth);
|
||||
|
||||
while (blocks->depth < depth) {
|
||||
struct BlockLevel* level = &blocks->levels[blocks->depth];
|
||||
|
||||
level->list = NULL;
|
||||
level->size = 0;
|
||||
level->capacity = va_arg(args, int);
|
||||
if (!(level->data = malloc(level->capacity * sizeof(long))))
|
||||
abort();
|
||||
blocks->depth++;
|
||||
}
|
||||
|
||||
va_end(args);
|
||||
}
|
||||
|
||||
static void blockDump(struct Blocks* blocks, size_t depth, FILE* out,
|
||||
const char* name, const char* type) {
|
||||
struct BlockLevel* level = &blocks->levels[depth];
|
||||
struct ValueList* current;
|
||||
size_t i, j, printed = 0;
|
||||
|
||||
fprintf(out, "static const %s %s[] = {\n ", type, name);
|
||||
current = level->list;
|
||||
for (current = level->list; current; current = current->next) {
|
||||
for (i = 0; i < level->capacity; i++) {
|
||||
if (printed++ >= 15) {
|
||||
fprintf(out, "\n ");
|
||||
printed = 1;
|
||||
}
|
||||
fprintf(out, "%ld, ", current->data[i]);
|
||||
}
|
||||
}
|
||||
fprintf(out, "\n};\n\n");
|
||||
}
|
||||
|
||||
static void blockAccess(struct Blocks* blocks, size_t depth, FILE* out,
|
||||
const char* var, const char* arg, const char* name) {
|
||||
struct BlockLevel* level = &blocks->levels[depth];
|
||||
long i, bits = 0, offset, mask;
|
||||
|
||||
for (i = depth + 1; i < blocks->depth; i++) {
|
||||
bits += ilog2(blocks->levels[i].capacity);
|
||||
}
|
||||
|
||||
offset = ilog2(blocks->levels[depth].capacity);
|
||||
mask = level->capacity - 1;
|
||||
|
||||
fprintf(out, " %s = (long)%s", var, name);
|
||||
if (depth) {
|
||||
if (offset) {
|
||||
fprintf(out, "[(%s<<%ld)", var, offset);
|
||||
} else {
|
||||
fprintf(out, "[%s", var);
|
||||
}
|
||||
}
|
||||
else
|
||||
fprintf(out, "[");
|
||||
|
||||
if (mask || !depth) {
|
||||
if (depth)
|
||||
fprintf(out, "+");
|
||||
|
||||
if (mask)
|
||||
fprintf(out, "(");
|
||||
|
||||
if (bits)
|
||||
fprintf(out, "(%s>>%ld)", arg, (long)bits);
|
||||
else
|
||||
fprintf(out, "%s", arg);
|
||||
|
||||
if (depth)
|
||||
fprintf(out, "&%ld", mask);
|
||||
|
||||
if (mask)
|
||||
fprintf(out, ")");
|
||||
}
|
||||
|
||||
fprintf(out, "];\n");
|
||||
}
|
||||
|
||||
#endif /* BLOCKS_H */
|
||||
28
generator/README.md
Normal file
28
generator/README.md
Normal file
@@ -0,0 +1,28 @@
|
||||
# Generator
|
||||
|
||||
This utility should be used to regenerate tables based on the Unicode Character
|
||||
Database (UCD).
|
||||
|
||||
The current version of this utility builds tables and functions for the following
|
||||
properties:
|
||||
|
||||
- Case mappings for lower, upper, title cases (1:1 and 1:M)
|
||||
- Case folding (1:1 and 1:M)
|
||||
- General category
|
||||
|
||||
## Usage
|
||||
|
||||
Compile `Tables.c`
|
||||
|
||||
```
|
||||
gcc Tables.c -o Tables
|
||||
```
|
||||
|
||||
Download `UnicodeData.txt`, `CaseFolding.txt`, and `SpecialCasing.txt`, place them next to
|
||||
the compiled `Tables` program, run it, and redirect its output into `UCD.c`.
|
||||
|
||||
```
|
||||
Tables > ../UCD.c
|
||||
```
|
||||
|
||||
You can download the required files from [here](https://www.unicode.org/Public/UCD/latest/ucd)
|
||||
465
generator/Tables.c
Normal file
465
generator/Tables.c
Normal file
@@ -0,0 +1,465 @@
|
||||
#include <stdarg.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "Blocks.h"
|
||||
#include "TextProc.h"
|
||||
|
||||
#define MAX_LINE 512
|
||||
|
||||
/*
 * Case-mapping record for one code point, kept in a doubly linked list.
 * A value of 0 in any mapping slot means "no mapping recorded".
 */
struct CaseInfo {
|
||||
long rune; /* code point this record belongs to (lookup key in caseInfoGet) */
|
||||
struct { /* one-to-one mappings, stored as absolute code points */
|
||||
long lower;
|
||||
long upper;
|
||||
long title;
|
||||
long fold;
|
||||
} simple;
|
||||
struct { /* one-to-many mappings: zero-terminated code point sequences */
|
||||
long lower[4];
|
||||
long upper[4];
|
||||
long title[4];
|
||||
long fold[4];
|
||||
} full;
|
||||
struct CaseInfo* prev; /* previous list node, NULL at the head */
|
||||
struct CaseInfo* next; /* next list node, NULL at the tail */
|
||||
};
|
||||
|
||||
/*
 * Per-code-point callback used by entryProcess.
 *   rune   - code point being reported
 *   fill   - non-zero when no input line describes `rune` (gap filler call)
 *   fields - split columns of the most recently read input line
 *   size   - number of entries in `fields`
 * entryProcess keeps issuing filler calls after the end of input for as long
 * as the callback returns -1.
 */
typedef int (*EntryCb)(long rune, int fill, char** fields, size_t size);
|
||||
|
||||
/*
 * Reads a semicolon-separated UCD-style file and reports every code point in
 * 0..0x10FFFF to `cb` in ascending order of appearance.
 *
 *   in         - input stream, read line by line through processLine
 *   cb         - callback; called with fill=1 for code points that fall in a
 *                gap between two input lines, fill=0 for described ones
 *   globStart  - optional pattern marking the first line of a range pair
 *   globEnd    - optional pattern the following line must match
 *   globField  - column index the two patterns are matched against
 *   codeField  - column index holding the hexadecimal code point
 *   minFields  - lines with fewer columns are ignored
 *
 * When a line matches `globStart`, the next qualifying line must match
 * `globEnd`; every code point from the first to the second is then reported
 * with the columns of the closing line. The program aborts if the closing
 * line is missing.
 */
static void entryProcess(FILE* in, EntryCb cb, const char* globStart,
                         const char* globEnd, size_t globField,
                         size_t codeField, size_t minFields) {
    long code = 0, startCode, prevCode = -1;
    char line[MAX_LINE];
    char* fields[MAX_FIELDS] = {0};
    int emitted = 0;
    size_t columns;

    while ((columns = processLine(in, line, sizeof(line), fields))) {
        /* Skip blank/comment lines and lines too short to hold the code. */
        if (columns < minFields || columns <= codeField)
            continue;

        code = strtol(fields[codeField], NULL, 16);

        /* Report the unassigned gap in front of this line. */
        while (prevCode + 1 < code)
            emitted = cb(++prevCode, 1, fields, columns);

        /* `columns > globField` guarantees fields[globField] was filled in. */
        if (globStart && globEnd && columns > globField &&
            glob(globStart, fields[globField])) {
            startCode = code;

            /* Fetch the next qualifying line: it has to close the range. */
            do {
                columns = processLine(in, line, sizeof(line), fields);
            } while (columns && columns < minFields);

            if (!columns || columns <= globField || columns <= codeField ||
                !glob(globEnd, fields[globField])) {
                fprintf(stderr, "Abnormal input - can't find last element\n");
                abort();
            }

            code = strtol(fields[codeField], NULL, 16);
            while (startCode <= code) {
                emitted = cb(startCode, 0, fields, columns);
                startCode++;
            }
        } else {
            emitted = cb(code, 0, fields, columns);
        }
        prevCode = code;
    }

    /* Pad the tail of the code space. */
    while (prevCode + 1 < 0x110000)
        emitted = cb(++prevCode, 1, fields, columns);

    /* Keep feeding filler entries while the callback still answers -1. */
    while (emitted == -1)
        emitted = cb(code++, 1, fields, columns);
}
|
||||
|
||||
static struct CaseInfo* caseInfoSort(struct CaseInfo* head) {
|
||||
struct CaseInfo* current;
|
||||
struct CaseInfo* next;
|
||||
int swapped;
|
||||
|
||||
if (!head || !head->next)
|
||||
return head;
|
||||
|
||||
do {
|
||||
swapped = 0;
|
||||
current = head;
|
||||
while (current->next) {
|
||||
next = current->next;
|
||||
if (current->rune > next->rune) {
|
||||
swapped = 1;
|
||||
|
||||
if (current->prev)
|
||||
current->prev->next = next;
|
||||
if (next->next)
|
||||
next->next->prev = current;
|
||||
|
||||
current->next = next->next;
|
||||
next->prev = current->prev;
|
||||
current->prev = next;
|
||||
next->next = current;
|
||||
|
||||
if (current == head)
|
||||
head = next;
|
||||
} else
|
||||
current = current->next;
|
||||
}
|
||||
} while (swapped);
|
||||
|
||||
return head;
|
||||
}
|
||||
|
||||
static struct CaseInfo* caseInfoGet(struct CaseInfo** head, long rune) {
|
||||
struct CaseInfo* current = *head;
|
||||
struct CaseInfo* node;
|
||||
|
||||
while (current != NULL) {
|
||||
if (current->rune == rune)
|
||||
return current;
|
||||
current = current->next;
|
||||
}
|
||||
|
||||
if (!(node = malloc(sizeof(*node))))
|
||||
return NULL;
|
||||
|
||||
memset(node, 0, sizeof(*node));
|
||||
node->rune = rune;
|
||||
node->next = *head;
|
||||
node->prev = NULL;
|
||||
if (*head)
|
||||
(*head)->prev = node;
|
||||
*head = node;
|
||||
return node;
|
||||
}
|
||||
|
||||
/*
 * Maps a general-category abbreviation to its position in the category
 * table. Only the first two characters of `name` are compared. Unknown
 * names yield the index of "Cn", which is the last table entry.
 */
static size_t categoryClassify(const char* name) {
    static const char* const known[] = {
        "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Mc", "Me", "Nd", "Nl",
        "No", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc",
        "Sk", "So", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn",
    };
    const size_t count = sizeof(known) / sizeof(known[0]);
    size_t i;

    for (i = 0; i < count; i++) {
        if (strncmp(known[i], name, 2) == 0)
            return i;
    }

    /* Fallback: "Cn" closes the table. */
    return count - 1;
}
|
||||
|
||||
/* Input file currently being parsed (reassigned in main for each UCD file). */
FILE* in;
|
||||
/* Destination of the generated source (main points it at stdout). */
FILE* out;
|
||||
|
||||
/* General-category table, filled by entryUnicodeData. */
struct Blocks categoryBlocks;
|
||||
/* Head of the list of code points that have any case mapping. */
struct CaseInfo* caseInfo = NULL;
|
||||
|
||||
static int entryUnicodeData(long rune, int fill, char** fields, size_t size) {
|
||||
long lowercase, uppercase, titlecase;
|
||||
struct CaseInfo* node;
|
||||
|
||||
if (fill) {
|
||||
return blockInsert(&categoryBlocks, categoryClassify("Cn"), 0);
|
||||
} else {
|
||||
lowercase = strtol(fields[13], NULL, 16);
|
||||
uppercase = strtol(fields[12], NULL, 16);
|
||||
titlecase = strtol(fields[14], NULL, 16);
|
||||
|
||||
if (lowercase || uppercase || titlecase) {
|
||||
node = caseInfoGet(&caseInfo, rune);
|
||||
node->simple.lower = lowercase;
|
||||
node->simple.upper = uppercase;
|
||||
node->simple.title = titlecase;
|
||||
}
|
||||
|
||||
return blockInsert(&categoryBlocks, categoryClassify(fields[2]), 0);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Parses whitespace-separated hexadecimal numbers from `field` into `array`
 * and terminates the sequence with a 0 entry.
 *
 * `array` must have room for 4 elements (the size of the `full` mapping
 * arrays in struct CaseInfo). At most 3 values are stored so that the
 * terminator always fits; any further input is ignored instead of being
 * written past the end of the destination. Parsing stops at the first token
 * that does not yield a non-zero number.
 */
static void arrayParseFromStr(const char* field, long* array) {
    enum { MAPPING_SLOTS = 4 };
    const char* cursor = field;
    char* end;
    size_t written = 0;

    while (written < MAPPING_SLOTS - 1) {
        long value = strtol(cursor, &end, 16);
        if (!value)
            break;
        array[written++] = value;
        cursor = end;
    }

    array[written] = 0;
}
|
||||
|
||||
static int entryCaseFolding(long rune, int fill, char** fields, size_t size) {
|
||||
struct CaseInfo* node;
|
||||
|
||||
if (fill || !strcmp("T", fields[1]))
|
||||
return 1;
|
||||
|
||||
node = caseInfoGet(&caseInfo, rune);
|
||||
if (strcmp("F", fields[1])) {
|
||||
node->simple.fold = strtol(fields[2], NULL, 16);
|
||||
} else {
|
||||
arrayParseFromStr(fields[2], node->full.fold);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int entrySpecialCasing(long rune, int fill, char** fields, size_t size) {
|
||||
struct CaseInfo* node;
|
||||
|
||||
if (fill || strcmp("", fields[4]))
|
||||
return 1;
|
||||
|
||||
node = caseInfoGet(&caseInfo, rune);
|
||||
arrayParseFromStr(fields[1], node->full.lower);
|
||||
arrayParseFromStr(fields[3], node->full.upper);
|
||||
arrayParseFromStr(fields[2], node->full.title);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
 * Clears a mapping that consists of exactly one entry (first slot set,
 * second slot zero). Longer and empty mappings are left untouched.
 */
static void mappingRemoveSingle(long* array) {
    const int isSingle = (array[0] != 0) && (array[1] == 0);

    if (isSingle) {
        array[0] = 0;
    }
}
|
||||
|
||||
static void caseInfoReduce(void) {
|
||||
struct CaseInfo* current = caseInfo;
|
||||
while (current) {
|
||||
if (!current->simple.title && current->simple.upper)
|
||||
current->simple.title = current->simple.upper;
|
||||
if (!current->full.title[0] && current->full.upper[0])
|
||||
memcpy(current->full.title, current->full.upper, 4 * sizeof(long));
|
||||
|
||||
mappingRemoveSingle(current->full.lower);
|
||||
mappingRemoveSingle(current->full.upper);
|
||||
mappingRemoveSingle(current->full.title);
|
||||
mappingRemoveSingle(current->full.fold);
|
||||
current = current->next;
|
||||
}
|
||||
}
|
||||
|
||||
/* Simple mapping tables; blocksBuild stores (target - rune), 0 = no mapping. */
struct Blocks lowerBlocks, upperBlocks, titleBlocks, foldBlocks;
|
||||
/* Full mapping tables; blocksBuild stores a longIndexData row index, -1 = none. */
struct Blocks lowerFullBlocks, upperFullBlocks, titleFullBlocks, foldFullBlocks;
|
||||
|
||||
/* Pool of distinct multi-rune mappings, one row per mapping (see longIndexGet). */
long longIndexData[1024][4];
|
||||
/* Number of rows of longIndexData currently in use. */
size_t longIndexSize = 0;
|
||||
|
||||
static long longIndexGet(long* array) {
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < longIndexSize; i++) {
|
||||
if (!memcmp(array, longIndexData[i], 4 * sizeof(long)))
|
||||
return i;
|
||||
}
|
||||
|
||||
memcpy(longIndexData[longIndexSize], array, 4 * sizeof(long));
|
||||
return longIndexSize++;
|
||||
}
|
||||
|
||||
static void blocksBuild(void) {
|
||||
struct CaseInfo* current = caseInfo;
|
||||
int emitted;
|
||||
long last = -1;
|
||||
|
||||
blockInit(&lowerBlocks, 4, 1, 64, 16, 1);
|
||||
blockInit(&upperBlocks, 4, 1, 64, 16, 1);
|
||||
blockInit(&titleBlocks, 4, 1, 64, 16, 1);
|
||||
blockInit(&foldBlocks, 4, 1, 64, 16, 1);
|
||||
blockInit(&lowerFullBlocks, 4, 1, 64, 32, 1);
|
||||
blockInit(&upperFullBlocks, 4, 1, 64, 32, 1);
|
||||
blockInit(&titleFullBlocks, 4, 1, 64, 32, 1);
|
||||
blockInit(&foldFullBlocks, 4, 1, 64, 32, 1);
|
||||
|
||||
while (current) {
|
||||
while (last + 1 < current->rune) {
|
||||
blockInsert(&lowerBlocks, 0, 0);
|
||||
blockInsert(&upperBlocks, 0, 0);
|
||||
blockInsert(&titleBlocks, 0, 0);
|
||||
blockInsert(&foldBlocks, 0, 0);
|
||||
blockInsert(&lowerFullBlocks, -1, 0);
|
||||
blockInsert(&upperFullBlocks, -1, 0);
|
||||
blockInsert(&titleFullBlocks, -1, 0);
|
||||
blockInsert(&foldFullBlocks, -1, 0);
|
||||
last++;
|
||||
}
|
||||
|
||||
if (current->simple.lower)
|
||||
blockInsert(&lowerBlocks, current->simple.lower - current->rune, 0);
|
||||
else
|
||||
blockInsert(&lowerBlocks, 0, 0);
|
||||
|
||||
if (current->simple.upper)
|
||||
blockInsert(&upperBlocks, current->simple.upper - current->rune, 0);
|
||||
else
|
||||
blockInsert(&upperBlocks, 0, 0);
|
||||
|
||||
if (current->simple.title)
|
||||
blockInsert(&titleBlocks, current->simple.title - current->rune, 0);
|
||||
else
|
||||
blockInsert(&titleBlocks, 0, 0);
|
||||
|
||||
if (current->simple.fold)
|
||||
blockInsert(&foldBlocks, current->simple.fold - current->rune, 0);
|
||||
else
|
||||
blockInsert(&foldBlocks, 0, 0);
|
||||
|
||||
if (current->full.lower[0])
|
||||
blockInsert(&lowerFullBlocks, longIndexGet(current->full.lower), 0);
|
||||
else
|
||||
blockInsert(&lowerFullBlocks, -1, 0);
|
||||
|
||||
if (current->full.upper[0])
|
||||
blockInsert(&upperFullBlocks, longIndexGet(current->full.upper), 0);
|
||||
else
|
||||
blockInsert(&upperFullBlocks, -1, 0);
|
||||
|
||||
if (current->full.title[0])
|
||||
blockInsert(&titleFullBlocks, longIndexGet(current->full.title), 0);
|
||||
else
|
||||
blockInsert(&titleFullBlocks, -1, 0);
|
||||
|
||||
if (current->full.fold[0])
|
||||
emitted = blockInsert(&foldFullBlocks, longIndexGet(current->full.fold), 0);
|
||||
else
|
||||
emitted = blockInsert(&foldFullBlocks, -1, 0);
|
||||
|
||||
last = current->rune;
|
||||
current = current->next;
|
||||
}
|
||||
|
||||
while (last + 1 < 0x110000 || emitted == -1) {
|
||||
blockInsert(&lowerBlocks, 0, 0);
|
||||
blockInsert(&upperBlocks, 0, 0);
|
||||
blockInsert(&titleBlocks, 0, 0);
|
||||
blockInsert(&foldBlocks, 0, 0);
|
||||
blockInsert(&lowerFullBlocks, -1, 0);
|
||||
blockInsert(&upperFullBlocks, -1, 0);
|
||||
blockInsert(&titleFullBlocks, -1, 0);
|
||||
emitted = blockInsert(&foldFullBlocks, -1, 0);
|
||||
last++;
|
||||
}
|
||||
}
|
||||
|
||||
static void outputCode(void) {
|
||||
#define DUMP(NAME, BLOCK, TYPE1, TYPE2, TYPE3, TYPE4) \
|
||||
blockDump(&BLOCK, 0, out, NAME "1", TYPE1); \
|
||||
blockDump(&BLOCK, 1, out, NAME "2", TYPE2); \
|
||||
blockDump(&BLOCK, 2, out, NAME "3", TYPE3); \
|
||||
blockDump(&BLOCK, 3, out, NAME "4", TYPE4)
|
||||
|
||||
fprintf(out, "/* Auto-generated case mapping tables */\n\n");
|
||||
fprintf(out, "#include <stdint.h>\n\n");
|
||||
|
||||
DUMP("cat", categoryBlocks, "uint8_t", "uint16_t", "uint16_t", "uint8_t");
|
||||
DUMP("low", lowerBlocks, "uint8_t", "uint8_t", "uint8_t", "uint32_t");
|
||||
DUMP("upp", upperBlocks, "uint8_t", "uint8_t", "uint8_t", "uint32_t");
|
||||
DUMP("tit", titleBlocks, "uint8_t", "uint8_t", "uint8_t", "uint32_t");
|
||||
DUMP("fod", foldBlocks, "uint8_t", "uint8_t", "uint8_t", "uint32_t");
|
||||
DUMP("lfx", lowerFullBlocks, "uint8_t", "uint8_t", "uint8_t", "uint32_t");
|
||||
DUMP("ufx", upperFullBlocks, "uint8_t", "uint8_t", "uint8_t", "uint32_t");
|
||||
DUMP("tfx", titleFullBlocks, "uint8_t", "uint8_t", "uint8_t", "uint32_t");
|
||||
DUMP("ffx", foldFullBlocks, "uint8_t", "uint8_t", "uint8_t", "uint32_t");
|
||||
|
||||
fprintf(out, "static const int32_t case_data[][3] = {");
|
||||
{
|
||||
size_t i;
|
||||
for (i = 0; i < longIndexSize; ++i) {
|
||||
if (i % 4 == 0)
|
||||
fprintf(out, "\n ");
|
||||
fprintf(out, "{%ld, %ld, %ld}, ",
|
||||
longIndexData[i][0], longIndexData[i][1],
|
||||
longIndexData[i][2]);
|
||||
}
|
||||
}
|
||||
fprintf(out, "\n};\n\n");
|
||||
|
||||
#define EMIT_SIMPLE(FUNC, BLOCKS, BASE) do { \
|
||||
fprintf(out, "uint32_t CgeRune" #FUNC "(uint32_t r){\n"); \
|
||||
fprintf(out, " long t;\n if(r>1114111ul)return r;\n"); \
|
||||
blockAccess(&BLOCKS, 0, out, "t", "r", BASE "1"); \
|
||||
blockAccess(&BLOCKS, 1, out, "t", "r", BASE "2"); \
|
||||
blockAccess(&BLOCKS, 2, out, "t", "r", BASE "3"); \
|
||||
blockAccess(&BLOCKS, 3, out, "t", "r", BASE "4"); \
|
||||
fprintf(out, " return t?t+r:r;\n}\n\n"); \
|
||||
} while(0)
|
||||
|
||||
#define EMIT_FULL(FUNC, SIMPLE, FULL_BLOCKS, FULL_BASE, SIMPLE_FUNC) do { \
|
||||
fprintf(out, "size_t CgeRune" #FUNC "Full(uint32_t r, uint32_t* out){\n"); \
|
||||
fprintf(out, " long t;\n if(r>1114111ul){\n *out=r;\n return 1;\n }\n"); \
|
||||
blockAccess(&FULL_BLOCKS, 0, out, "t", "r", FULL_BASE "1"); \
|
||||
blockAccess(&FULL_BLOCKS, 1, out, "t", "r", FULL_BASE "2"); \
|
||||
blockAccess(&FULL_BLOCKS, 2, out, "t", "r", FULL_BASE "3"); \
|
||||
blockAccess(&FULL_BLOCKS, 3, out, "t", "r", FULL_BASE "4"); \
|
||||
fprintf(out, " if(t>=0){\n"); \
|
||||
fprintf(out, " const int32_t* p=case_data[t];\n"); \
|
||||
fprintf(out, " size_t i=0;\n"); \
|
||||
fprintf(out, " while(p[i] && i<3){out[i]=p[i];i++;}\n"); \
|
||||
fprintf(out, " return i;\n }\n"); \
|
||||
fprintf(out, " *out=CgeRune" #SIMPLE "(r);\n return 1;\n}\n\n"); \
|
||||
} while(0)
|
||||
|
||||
fprintf(out, "int CgeRuneCategory(uint32_t r){\n");
|
||||
fprintf(out, " long t;\n if(r>1114111ul)return %d;\n", (int)categoryClassify("Cn"));
|
||||
blockAccess(&categoryBlocks, 0, out, "t", "r", "cat1");
|
||||
blockAccess(&categoryBlocks, 1, out, "t", "r", "cat2");
|
||||
blockAccess(&categoryBlocks, 2, out, "t", "r", "cat3");
|
||||
blockAccess(&categoryBlocks, 3, out, "t", "r", "cat4");
|
||||
fprintf(out, " return t;\n}\n\n"); \
|
||||
|
||||
EMIT_SIMPLE(Lower, lowerBlocks, "low");
|
||||
EMIT_SIMPLE(Upper, upperBlocks, "upp");
|
||||
EMIT_SIMPLE(Title, titleBlocks, "tit");
|
||||
EMIT_SIMPLE(Fold, foldBlocks, "fod");
|
||||
|
||||
EMIT_FULL(Lower, Lower, lowerFullBlocks, "lfx", Lower);
|
||||
EMIT_FULL(Upper, Upper, upperFullBlocks, "ufx", Upper);
|
||||
EMIT_FULL(Title, Title, titleFullBlocks, "tfx", Title);
|
||||
EMIT_FULL(Fold, Fold, foldFullBlocks, "ffx", Fold);
|
||||
}
|
||||
|
||||
#undef DUMP
|
||||
#undef EMIT_SIMPLE
|
||||
#undef EMIT_FULL
|
||||
|
||||
int main() {
|
||||
if (!(in = fopen("UnicodeData.txt", "r"))) {
|
||||
fprintf(stderr, "UnicodeData.txt not found. Download it from:\n");
|
||||
fprintf(stderr, "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt\n");
|
||||
return -1;
|
||||
}
|
||||
out = stdout;
|
||||
|
||||
fprintf(stderr, "Processing UnicodeData.txt\n");
|
||||
blockInit(&categoryBlocks, 4, 1, 16, 8, 8);
|
||||
entryProcess(in, entryUnicodeData, "<*, First>", "<*, Last>", 1, 0, 15);
|
||||
fclose(in);
|
||||
|
||||
if (!(in = fopen("CaseFolding.txt", "r"))) {
|
||||
fprintf(stderr, "CaseFolding.txt not found. Download it from:\n");
|
||||
fprintf(stderr, "https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt\n");
|
||||
return -1;
|
||||
}
|
||||
fprintf(stderr, "Processing CaseFolding.txt\n");
|
||||
entryProcess(in, entryCaseFolding, NULL, NULL, 0, 0, 3);
|
||||
fclose(in);
|
||||
|
||||
fprintf(stderr, "Processing SpecialCasing.txt\n");
|
||||
if (!(in = fopen("SpecialCasing.txt", "r"))) {
|
||||
fprintf(stderr, "SpecialCasing.txt not found. Download it from:\n");
|
||||
fprintf(stderr, "https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt\n");
|
||||
return -1;
|
||||
}
|
||||
entryProcess(in, entrySpecialCasing, NULL, NULL, 0, 0, 4);
|
||||
fclose(in);
|
||||
|
||||
caseInfo = caseInfoSort(caseInfo);
|
||||
caseInfoReduce();
|
||||
blocksBuild();
|
||||
outputCode();
|
||||
|
||||
return 0;
|
||||
}
|
||||
76
generator/TextProc.h
Normal file
76
generator/TextProc.h
Normal file
@@ -0,0 +1,76 @@
|
||||
#ifndef TEXTPROC_H
|
||||
#define TEXTPROC_H
|
||||
|
||||
#include <ctype.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#define MAX_FIELDS 16
|
||||
|
||||
/*
 * Matches `text` against a wildcard `pattern`.
 *   '*' matches any run of characters (including none),
 *   '?' matches exactly one character,
 *   every other character matches itself.
 * Returns 1 when the whole text matches the whole pattern, 0 otherwise.
 *
 * The '*' test comes first so that a wildcard is never consumed as a literal
 * when the text happens to contain a '*' at the same position.
 */
static int glob(const char* pattern, const char* text) {
    const char* star = NULL;    /* pattern position right after the last '*' */
    const char* restart = text; /* text position the last '*' extends to */

    while (*text) {
        if (*pattern == '*') {
            star = ++pattern;
            restart = text;
        } else if (*pattern == *text || *pattern == '?') {
            pattern++;
            text++;
        } else if (star) {
            /* Mismatch: let the last '*' swallow one more character. */
            pattern = star;
            text = ++restart;
        } else {
            return 0;
        }
    }

    /* Trailing stars match the empty remainder. */
    while (*pattern == '*')
        pattern++;
    return (*pattern == '\0');
}
|
||||
|
||||
/*
 * Returns a pointer to the first non-whitespace character of `line`
 * (or to its terminator when the line is all whitespace). The buffer is not
 * modified.
 */
static char* trimLeft(char* line) {
    /* isspace requires a value representable as unsigned char. */
    while (*line != '\0' && isspace((unsigned char)*line))
        ++line;
    return line;
}
|
||||
|
||||
/*
 * Removes trailing whitespace from `line` in place by writing a terminator
 * right after its last non-whitespace character.
 */
static void trimRight(char* line) {
    char* end = line;

    for (; *line != '\0'; ++line) {
        /* isspace requires a value representable as unsigned char. */
        if (!isspace((unsigned char)*line))
            end = line + 1;
    }
    *end = '\0';
}
|
||||
|
||||
/*
 * Cuts `line` at its first '#' character, if it has one, by overwriting
 * that character with a terminator.
 */
static void trimComment(char* line) {
    for (; *line != '\0'; ++line) {
        if (*line == '#') {
            *line = '\0';
            return;
        }
    }
}
|
||||
|
||||
/*
 * Strips whitespace from both ends of `line`: the tail is cut in place
 * first, then a pointer to the first remaining non-whitespace character is
 * returned.
 */
static char* trim(char* line) {
    char* start;

    trimRight(line);
    start = trimLeft(line);
    return start;
}
|
||||
|
||||
static size_t fieldParse(char* line, char** fields, char separator) {
|
||||
size_t index = 0;
|
||||
|
||||
do {
|
||||
fields[index] = line;
|
||||
if ((line = strchr(line, separator)))
|
||||
*(line++) = '\0';
|
||||
fields[index] = trim(fields[index]);
|
||||
index++;
|
||||
} while (line && index < MAX_FIELDS);
|
||||
|
||||
return index;
|
||||
}
|
||||
|
||||
/*
 * Reads the next line from `in` into `line` (at most `size` bytes), drops a
 * trailing '#' comment and splits the rest at ';' into `fields`.
 * Returns the number of fields, or 0 when nothing more can be read.
 */
static int processLine(FILE* in, char* line, size_t size, char** fields) {
    size_t count;

    if (fgets(line, size, in) == NULL)
        return 0;

    trimComment(line);
    count = fieldParse(line, fields, ';');
    return count;
}
|
||||
|
||||
#endif /* TEXTPROC_H */
|
||||
41
generator/ValueList.h
Normal file
41
generator/ValueList.h
Normal file
@@ -0,0 +1,41 @@
|
||||
#ifndef VALUELIST_H
|
||||
#define VALUELIST_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/* Singly linked list node holding one heap-allocated array of longs. */
struct ValueList {
|
||||
long* data; /* array of `size` values (allocated in valueListIntern) */
|
||||
size_t size; /* number of elements in `data` */
|
||||
struct ValueList* next; /* following node, NULL at the end of the list */
|
||||
};
|
||||
|
||||
static long valueListFind(struct ValueList* list, long* data, size_t size) {
|
||||
long index = 0;
|
||||
|
||||
while (list) {
|
||||
if (list->size == size && !memcmp(list->data, data, size * sizeof(long)))
|
||||
return index;
|
||||
index++, list = list->next;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
static long valueListIntern(struct ValueList** list, long* data, size_t size) {
|
||||
long index = 0;
|
||||
|
||||
while (*list) ++index, list = &(*list)->next;
|
||||
if (!(*list = malloc(sizeof(struct ValueList))))
|
||||
abort();
|
||||
if (!((*list)->data = malloc(size * sizeof(long))))
|
||||
abort();
|
||||
|
||||
memcpy((*list)->data, data, size * sizeof(long));
|
||||
(*list)->size = size;
|
||||
(*list)->next = NULL;
|
||||
|
||||
return index;
|
||||
}
|
||||
|
||||
#endif /* VALUELIST_H */
|
||||
Reference in New Issue
Block a user