466 lines
15 KiB
C
466 lines
15 KiB
C
#include <stdarg.h>
|
|
#include <stddef.h>
|
|
#include <stdint.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#include "Blocks.h"
|
|
#include "TextProc.h"
|
|
|
|
#define MAX_LINE 512
|
|
|
|
/*
 * Case-mapping record for a single code point.
 *
 * Records are chained into a doubly linked list. A freshly created record is
 * zero-filled, and a zero in any mapping slot is treated throughout this file
 * as "no mapping recorded".
 */
struct CaseInfo {
    /* Code point this record describes. */
    long rune;

    /* Mappings that consist of exactly one code point. */
    struct {
        long lower, upper, title, fold;
    } simple;

    /*
     * Mappings that may expand to several code points. Each array is filled
     * from index 0 and is terminated by a zero element.
     */
    struct {
        long lower[4], upper[4], title[4], fold[4];
    } full;

    /* Neighbours in the doubly linked list (NULL at either end). */
    struct CaseInfo *prev, *next;
};
|
|
|
|
/*
 * Callback invoked by entryProcess for every code point it reports.
 *
 *   rune   - code point being reported
 *   fill   - non-zero when the call is a filler (a gap between data lines or
 *            trailing padding), 0 when the call comes from a parsed data line
 *   fields - field pointers of the most recently parsed line; filler calls
 *            receive whatever the last parse left there
 *   size   - number of fields reported for that line
 *
 * entryProcess keeps issuing filler calls for as long as the most recent
 * return value is -1; other return values are not interpreted by it.
 */
typedef int (*EntryCb)(long rune, int fill, char** fields, size_t size);
|
|
|
|
/*
 * Walk the data lines of `in` and report code points to `cb` in ascending
 * order, so that `cb` is called once per code point from 0 to 0x10FFFF.
 *
 *   in         - stream to read
 *   cb         - receives every code point (see EntryCb)
 *   globStart  - pattern marking the first line of a two-line range, or NULL
 *   globEnd    - pattern the matching last line of the range must satisfy
 *   globField  - index of the field the two patterns are matched against
 *   codeField  - index of the field holding the hexadecimal code point
 *   minFields  - lines with fewer fields than this are skipped
 *
 * Code points missing between data lines, and after the last data line up to
 * 0x10FFFF, are reported with fill = 1. Every code point inside a
 * globStart/globEnd range is reported with fill = 0 using the fields of the
 * range's last line. Finally, filler calls continue while `cb` keeps returning
 * -1 (presumably "current block not complete yet" - confirm against
 * blockInsert).
 *
 * processLine is used as "returns the number of fields stored in `fields`,
 * 0 when there is nothing more to read" - that is how the loops below treat it.
 *
 * Aborts the program when a range start is not followed by a matching range
 * end line.
 */
static void entryProcess(FILE* in, EntryCb cb, const char* globStart,
                         const char* globEnd, size_t globField,
                         size_t codeField, size_t minFields) {
    /* code starts at 0 so the padding loop at the end never reads an
     * indeterminate value when the input holds no usable line. */
    long code = 0, startCode, prevCode = -1;
    char line[MAX_LINE];
    char* fields[MAX_FIELDS];
    int emitted = 0;
    size_t columns;

    while ((columns = processLine(in, line, sizeof(line), fields))) {
        if (columns < minFields)
            continue;

        code = strtol(fields[codeField], NULL, 16);

        /* Report every code point skipped since the previous data line. */
        while (prevCode + 1 < code)
            emitted = cb(++prevCode, 1, fields, columns);

        /* fields[globField] only exists when columns > globField. */
        if (globStart && globEnd && columns > globField &&
            glob(globStart, fields[globField])) {
            startCode = code;

            /* Advance to the next usable line; it has to close the range. */
            do {
                columns = processLine(in, line, sizeof(line), fields);
            } while (columns && columns < minFields);

            /* Running out of input here would leave `fields` stale, so it is
             * reported the same way as a non-matching line. */
            if (!columns || columns <= globField ||
                !glob(globEnd, fields[globField])) {
                fprintf(stderr, "Abnormal input - can't find last element\n");
                abort();
            }

            code = strtol(fields[codeField], NULL, 16);
            while (startCode <= code) {
                emitted = cb(startCode, 0, fields, columns);
                startCode++;
            }
        } else {
            emitted = cb(code, 0, fields, columns);
        }
        prevCode = code;
    }

    /* Pad up to the last valid code point. */
    while (prevCode + 1 < 0x110000)
        emitted = cb(++prevCode, 1, fields, columns);

    /* Keep padding for as long as the callback asks for it with -1. */
    while (emitted == -1)
        emitted = cb(code++, 1, fields, columns);
}
|
|
|
|
static struct CaseInfo* caseInfoSort(struct CaseInfo* head) {
|
|
struct CaseInfo* current;
|
|
struct CaseInfo* next;
|
|
int swapped;
|
|
|
|
if (!head || !head->next)
|
|
return head;
|
|
|
|
do {
|
|
swapped = 0;
|
|
current = head;
|
|
while (current->next) {
|
|
next = current->next;
|
|
if (current->rune > next->rune) {
|
|
swapped = 1;
|
|
|
|
if (current->prev)
|
|
current->prev->next = next;
|
|
if (next->next)
|
|
next->next->prev = current;
|
|
|
|
current->next = next->next;
|
|
next->prev = current->prev;
|
|
current->prev = next;
|
|
next->next = current;
|
|
|
|
if (current == head)
|
|
head = next;
|
|
} else
|
|
current = current->next;
|
|
}
|
|
} while (swapped);
|
|
|
|
return head;
|
|
}
|
|
|
|
static struct CaseInfo* caseInfoGet(struct CaseInfo** head, long rune) {
|
|
struct CaseInfo* current = *head;
|
|
struct CaseInfo* node;
|
|
|
|
while (current != NULL) {
|
|
if (current->rune == rune)
|
|
return current;
|
|
current = current->next;
|
|
}
|
|
|
|
if (!(node = malloc(sizeof(*node))))
|
|
return NULL;
|
|
|
|
memset(node, 0, sizeof(*node));
|
|
node->rune = rune;
|
|
node->next = *head;
|
|
node->prev = NULL;
|
|
if (*head)
|
|
(*head)->prev = node;
|
|
*head = node;
|
|
return node;
|
|
}
|
|
|
|
/*
 * Map a general-category abbreviation to its position in the table below.
 *
 * Only the first two characters of `name` are compared. A name that matches
 * no entry is classified the same way as "Cn".
 */
static size_t categoryClassify(const char* name) {
    static const char *table[] = {
        "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Mc", "Me", "Nd", "Nl",
        "No", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc",
        "Sk", "So", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn",
    };
    const size_t count = sizeof(table) / sizeof(table[0]);
    size_t slot;

    for (slot = 0; slot < count; ++slot) {
        if (strncmp(table[slot], name, 2) == 0)
            return slot;
    }

    /* Unknown abbreviation: fall back to the slot of "Cn". */
    return categoryClassify("Cn");
}
|
|
|
|
FILE* in;
|
|
FILE* out;
|
|
|
|
struct Blocks categoryBlocks;
|
|
struct CaseInfo* caseInfo = NULL;
|
|
|
|
static int entryUnicodeData(long rune, int fill, char** fields, size_t size) {
|
|
long lowercase, uppercase, titlecase;
|
|
struct CaseInfo* node;
|
|
|
|
if (fill) {
|
|
return blockInsert(&categoryBlocks, categoryClassify("Cn"), 0);
|
|
} else {
|
|
lowercase = strtol(fields[13], NULL, 16);
|
|
uppercase = strtol(fields[12], NULL, 16);
|
|
titlecase = strtol(fields[14], NULL, 16);
|
|
|
|
if (lowercase || uppercase || titlecase) {
|
|
node = caseInfoGet(&caseInfo, rune);
|
|
node->simple.lower = lowercase;
|
|
node->simple.upper = uppercase;
|
|
node->simple.title = titlecase;
|
|
}
|
|
|
|
return blockInsert(&categoryBlocks, categoryClassify(fields[2]), 0);
|
|
}
|
|
}
|
|
|
|
/*
 * Parse a whitespace-separated list of hexadecimal code points from `field`
 * into `array`.
 *
 *   field - text to parse
 *   array - destination with room for 4 elements (the size of the mapping
 *           arrays in struct CaseInfo)
 *
 * All 4 elements are cleared first. At most 3 values are stored, so the last
 * element always stays 0 and acts as terminator; any further values in
 * `field` are ignored instead of being written past the end of the array.
 * Parsing stops at the first item that is not a hexadecimal number or whose
 * value is 0.
 */
static void arrayParseFromStr(const char* field, long* array) {
    enum { SLOTS = 4 };
    const char* cursor = field;
    char* end;
    size_t written;

    for (written = 0; written < SLOTS; written++)
        array[written] = 0;

    for (written = 0; written < SLOTS - 1; written++) {
        /* strtol yields 0 both for a literal zero and when no digits could
         * be converted; either way the list ends here. */
        long value = strtol(cursor, &end, 16);

        if (!value)
            break;
        array[written] = value;
        cursor = end;
    }
}
|
|
|
|
static int entryCaseFolding(long rune, int fill, char** fields, size_t size) {
|
|
struct CaseInfo* node;
|
|
|
|
if (fill || !strcmp("T", fields[1]))
|
|
return 1;
|
|
|
|
node = caseInfoGet(&caseInfo, rune);
|
|
if (strcmp("F", fields[1])) {
|
|
node->simple.fold = strtol(fields[2], NULL, 16);
|
|
} else {
|
|
arrayParseFromStr(fields[2], node->full.fold);
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
static int entrySpecialCasing(long rune, int fill, char** fields, size_t size) {
|
|
struct CaseInfo* node;
|
|
|
|
if (fill || strcmp("", fields[4]))
|
|
return 1;
|
|
|
|
node = caseInfoGet(&caseInfo, rune);
|
|
arrayParseFromStr(fields[1], node->full.lower);
|
|
arrayParseFromStr(fields[3], node->full.upper);
|
|
arrayParseFromStr(fields[2], node->full.title);
|
|
|
|
return 1;
|
|
}
|
|
|
|
/*
 * Clear a mapping that consists of exactly one code point.
 *
 * `array` must have at least two readable elements. When the first element is
 * set and the second is 0 the first is reset to 0; in every other case the
 * array is left untouched.
 */
static void mappingRemoveSingle(long* array) {
    /* Nothing stored, or more than one code point stored: keep as is. */
    if (array[0] == 0 || array[1] != 0)
        return;

    array[0] = 0;
}
|
|
|
|
static void caseInfoReduce(void) {
|
|
struct CaseInfo* current = caseInfo;
|
|
while (current) {
|
|
if (!current->simple.title && current->simple.upper)
|
|
current->simple.title = current->simple.upper;
|
|
if (!current->full.title[0] && current->full.upper[0])
|
|
memcpy(current->full.title, current->full.upper, 4 * sizeof(long));
|
|
|
|
mappingRemoveSingle(current->full.lower);
|
|
mappingRemoveSingle(current->full.upper);
|
|
mappingRemoveSingle(current->full.title);
|
|
mappingRemoveSingle(current->full.fold);
|
|
current = current->next;
|
|
}
|
|
}
|
|
|
|
struct Blocks lowerBlocks, upperBlocks, titleBlocks, foldBlocks;
|
|
struct Blocks lowerFullBlocks, upperFullBlocks, titleFullBlocks, foldFullBlocks;
|
|
|
|
/*
 * Pool of distinct full mappings. Each row holds one 4-element mapping array;
 * longIndexSize is the number of rows currently in use.
 */
long longIndexData[1024][4];
size_t longIndexSize = 0;

/*
 * Return the row index of the 4-element mapping `array` in longIndexData,
 * appending a copy of it as a new row when no identical row exists yet.
 *
 * Rows are compared over all 4 elements, so elements after the terminator
 * take part in the comparison as well.
 *
 * Aborts the program when a new row is needed but the pool is full.
 */
static long longIndexGet(long* array) {
    const size_t capacity = sizeof(longIndexData) / sizeof(longIndexData[0]);
    size_t i;

    for (i = 0; i < longIndexSize; i++) {
        if (!memcmp(array, longIndexData[i], sizeof(longIndexData[i])))
            return (long)i;
    }

    /* Refuse to write past the end of the pool. */
    if (longIndexSize >= capacity) {
        fprintf(stderr, "Too many distinct full case mappings (limit %lu)\n",
                (unsigned long)capacity);
        abort();
    }

    memcpy(longIndexData[longIndexSize], array,
           sizeof(longIndexData[longIndexSize]));
    return (long)longIndexSize++;
}
|
|
|
|
static void blocksBuild(void) {
|
|
struct CaseInfo* current = caseInfo;
|
|
int emitted;
|
|
long last = -1;
|
|
|
|
blockInit(&lowerBlocks, 4, 1, 64, 16, 1);
|
|
blockInit(&upperBlocks, 4, 1, 64, 16, 1);
|
|
blockInit(&titleBlocks, 4, 1, 64, 16, 1);
|
|
blockInit(&foldBlocks, 4, 1, 64, 16, 1);
|
|
blockInit(&lowerFullBlocks, 4, 1, 64, 32, 1);
|
|
blockInit(&upperFullBlocks, 4, 1, 64, 32, 1);
|
|
blockInit(&titleFullBlocks, 4, 1, 64, 32, 1);
|
|
blockInit(&foldFullBlocks, 4, 1, 64, 32, 1);
|
|
|
|
while (current) {
|
|
while (last + 1 < current->rune) {
|
|
blockInsert(&lowerBlocks, 0, 0);
|
|
blockInsert(&upperBlocks, 0, 0);
|
|
blockInsert(&titleBlocks, 0, 0);
|
|
blockInsert(&foldBlocks, 0, 0);
|
|
blockInsert(&lowerFullBlocks, -1, 0);
|
|
blockInsert(&upperFullBlocks, -1, 0);
|
|
blockInsert(&titleFullBlocks, -1, 0);
|
|
blockInsert(&foldFullBlocks, -1, 0);
|
|
last++;
|
|
}
|
|
|
|
if (current->simple.lower)
|
|
blockInsert(&lowerBlocks, current->simple.lower - current->rune, 0);
|
|
else
|
|
blockInsert(&lowerBlocks, 0, 0);
|
|
|
|
if (current->simple.upper)
|
|
blockInsert(&upperBlocks, current->simple.upper - current->rune, 0);
|
|
else
|
|
blockInsert(&upperBlocks, 0, 0);
|
|
|
|
if (current->simple.title)
|
|
blockInsert(&titleBlocks, current->simple.title - current->rune, 0);
|
|
else
|
|
blockInsert(&titleBlocks, 0, 0);
|
|
|
|
if (current->simple.fold)
|
|
blockInsert(&foldBlocks, current->simple.fold - current->rune, 0);
|
|
else
|
|
blockInsert(&foldBlocks, 0, 0);
|
|
|
|
if (current->full.lower[0])
|
|
blockInsert(&lowerFullBlocks, longIndexGet(current->full.lower), 0);
|
|
else
|
|
blockInsert(&lowerFullBlocks, -1, 0);
|
|
|
|
if (current->full.upper[0])
|
|
blockInsert(&upperFullBlocks, longIndexGet(current->full.upper), 0);
|
|
else
|
|
blockInsert(&upperFullBlocks, -1, 0);
|
|
|
|
if (current->full.title[0])
|
|
blockInsert(&titleFullBlocks, longIndexGet(current->full.title), 0);
|
|
else
|
|
blockInsert(&titleFullBlocks, -1, 0);
|
|
|
|
if (current->full.fold[0])
|
|
emitted = blockInsert(&foldFullBlocks, longIndexGet(current->full.fold), 0);
|
|
else
|
|
emitted = blockInsert(&foldFullBlocks, -1, 0);
|
|
|
|
last = current->rune;
|
|
current = current->next;
|
|
}
|
|
|
|
while (last + 1 < 0x110000 || emitted == -1) {
|
|
blockInsert(&lowerBlocks, 0, 0);
|
|
blockInsert(&upperBlocks, 0, 0);
|
|
blockInsert(&titleBlocks, 0, 0);
|
|
blockInsert(&foldBlocks, 0, 0);
|
|
blockInsert(&lowerFullBlocks, -1, 0);
|
|
blockInsert(&upperFullBlocks, -1, 0);
|
|
blockInsert(&titleFullBlocks, -1, 0);
|
|
emitted = blockInsert(&foldFullBlocks, -1, 0);
|
|
last++;
|
|
}
|
|
}
|
|
|
|
static void outputCode(void) {
|
|
#define DUMP(NAME, BLOCK, TYPE1, TYPE2, TYPE3, TYPE4) \
|
|
blockDump(&BLOCK, 0, out, NAME "1", TYPE1); \
|
|
blockDump(&BLOCK, 1, out, NAME "2", TYPE2); \
|
|
blockDump(&BLOCK, 2, out, NAME "3", TYPE3); \
|
|
blockDump(&BLOCK, 3, out, NAME "4", TYPE4)
|
|
|
|
fprintf(out, "/* Auto-generated case mapping tables */\n\n");
|
|
fprintf(out, "#include <stdint.h>\n\n");
|
|
|
|
DUMP("cat", categoryBlocks, "uint8_t", "uint16_t", "uint16_t", "uint8_t");
|
|
DUMP("low", lowerBlocks, "uint8_t", "uint8_t", "uint8_t", "uint32_t");
|
|
DUMP("upp", upperBlocks, "uint8_t", "uint8_t", "uint8_t", "uint32_t");
|
|
DUMP("tit", titleBlocks, "uint8_t", "uint8_t", "uint8_t", "uint32_t");
|
|
DUMP("fod", foldBlocks, "uint8_t", "uint8_t", "uint8_t", "uint32_t");
|
|
DUMP("lfx", lowerFullBlocks, "uint8_t", "uint8_t", "uint8_t", "uint32_t");
|
|
DUMP("ufx", upperFullBlocks, "uint8_t", "uint8_t", "uint8_t", "uint32_t");
|
|
DUMP("tfx", titleFullBlocks, "uint8_t", "uint8_t", "uint8_t", "uint32_t");
|
|
DUMP("ffx", foldFullBlocks, "uint8_t", "uint8_t", "uint8_t", "uint32_t");
|
|
|
|
fprintf(out, "static const int32_t case_data[][3] = {");
|
|
{
|
|
size_t i;
|
|
for (i = 0; i < longIndexSize; ++i) {
|
|
if (i % 4 == 0)
|
|
fprintf(out, "\n ");
|
|
fprintf(out, "{%ld, %ld, %ld}, ",
|
|
longIndexData[i][0], longIndexData[i][1],
|
|
longIndexData[i][2]);
|
|
}
|
|
}
|
|
fprintf(out, "\n};\n\n");
|
|
|
|
#define EMIT_SIMPLE(FUNC, BLOCKS, BASE) do { \
|
|
fprintf(out, "uint32_t CgeRune" #FUNC "(uint32_t r){\n"); \
|
|
fprintf(out, " long t;\n if(r>1114111ul)return r;\n"); \
|
|
blockAccess(&BLOCKS, 0, out, "t", "r", BASE "1"); \
|
|
blockAccess(&BLOCKS, 1, out, "t", "r", BASE "2"); \
|
|
blockAccess(&BLOCKS, 2, out, "t", "r", BASE "3"); \
|
|
blockAccess(&BLOCKS, 3, out, "t", "r", BASE "4"); \
|
|
fprintf(out, " return t?t+r:r;\n}\n\n"); \
|
|
} while(0)
|
|
|
|
#define EMIT_FULL(FUNC, SIMPLE, FULL_BLOCKS, FULL_BASE, SIMPLE_FUNC) do { \
|
|
fprintf(out, "size_t CgeRune" #FUNC "Full(uint32_t r, uint32_t* out){\n"); \
|
|
fprintf(out, " long t;\n if(r>1114111ul){\n *out=r;\n return 1;\n }\n"); \
|
|
blockAccess(&FULL_BLOCKS, 0, out, "t", "r", FULL_BASE "1"); \
|
|
blockAccess(&FULL_BLOCKS, 1, out, "t", "r", FULL_BASE "2"); \
|
|
blockAccess(&FULL_BLOCKS, 2, out, "t", "r", FULL_BASE "3"); \
|
|
blockAccess(&FULL_BLOCKS, 3, out, "t", "r", FULL_BASE "4"); \
|
|
fprintf(out, " if(t>=0){\n"); \
|
|
fprintf(out, " const int32_t* p=case_data[t];\n"); \
|
|
fprintf(out, " size_t i=0;\n"); \
|
|
fprintf(out, " while(p[i] && i<3){out[i]=p[i];i++;}\n"); \
|
|
fprintf(out, " return i;\n }\n"); \
|
|
fprintf(out, " *out=CgeRune" #SIMPLE "(r);\n return 1;\n}\n\n"); \
|
|
} while(0)
|
|
|
|
fprintf(out, "int CgeRuneCategory(uint32_t r){\n");
|
|
fprintf(out, " long t;\n if(r>1114111ul)return %d;\n", (int)categoryClassify("Cn"));
|
|
blockAccess(&categoryBlocks, 0, out, "t", "r", "cat1");
|
|
blockAccess(&categoryBlocks, 1, out, "t", "r", "cat2");
|
|
blockAccess(&categoryBlocks, 2, out, "t", "r", "cat3");
|
|
blockAccess(&categoryBlocks, 3, out, "t", "r", "cat4");
|
|
fprintf(out, " return t;\n}\n\n"); \
|
|
|
|
EMIT_SIMPLE(Lower, lowerBlocks, "low");
|
|
EMIT_SIMPLE(Upper, upperBlocks, "upp");
|
|
EMIT_SIMPLE(Title, titleBlocks, "tit");
|
|
EMIT_SIMPLE(Fold, foldBlocks, "fod");
|
|
|
|
EMIT_FULL(Lower, Lower, lowerFullBlocks, "lfx", Lower);
|
|
EMIT_FULL(Upper, Upper, upperFullBlocks, "ufx", Upper);
|
|
EMIT_FULL(Title, Title, titleFullBlocks, "tfx", Title);
|
|
EMIT_FULL(Fold, Fold, foldFullBlocks, "ffx", Fold);
|
|
}
|
|
|
|
#undef DUMP
|
|
#undef EMIT_SIMPLE
|
|
#undef EMIT_FULL
|
|
|
|
int main() {
|
|
if (!(in = fopen("UnicodeData.txt", "r"))) {
|
|
fprintf(stderr, "UnicodeData.txt not found. Download it from:\n");
|
|
fprintf(stderr, "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt\n");
|
|
return -1;
|
|
}
|
|
out = stdout;
|
|
|
|
fprintf(stderr, "Processing UnicodeData.txt\n");
|
|
blockInit(&categoryBlocks, 4, 1, 16, 8, 8);
|
|
entryProcess(in, entryUnicodeData, "<*, First>", "<*, Last>", 1, 0, 15);
|
|
fclose(in);
|
|
|
|
if (!(in = fopen("CaseFolding.txt", "r"))) {
|
|
fprintf(stderr, "CaseFolding.txt not found. Download it from:\n");
|
|
fprintf(stderr, "https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt\n");
|
|
return -1;
|
|
}
|
|
fprintf(stderr, "Processing CaseFolding.txt\n");
|
|
entryProcess(in, entryCaseFolding, NULL, NULL, 0, 0, 3);
|
|
fclose(in);
|
|
|
|
fprintf(stderr, "Processing SpecialCasing.txt\n");
|
|
if (!(in = fopen("SpecialCasing.txt", "r"))) {
|
|
fprintf(stderr, "SpecialCasing.txt not found. Download it from:\n");
|
|
fprintf(stderr, "https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt\n");
|
|
return -1;
|
|
}
|
|
entryProcess(in, entrySpecialCasing, NULL, NULL, 0, 0, 4);
|
|
fclose(in);
|
|
|
|
caseInfo = caseInfoSort(caseInfo);
|
|
caseInfoReduce();
|
|
blocksBuild();
|
|
outputCode();
|
|
|
|
return 0;
|
|
}
|