15 #include <unicode/utf8.h>
16 #include <unicode/utf.h>
17 #include <unicode/uchar.h>
18 #include <unicode/unorm2.h>
19 #include <unicode/ustring.h>
24 static const afw_utf8_t impl_utf8_null = { NULL, 0 };
26 #define IMPL_WHITESPACE(c) \
27 ((c) == 0x20 || (c) == 0x09 || (c) == 0x0d || (c) == 0x0a)
46 if (*offset != soffset && len != slen) {
54 U8_NEXT((
const uint8_t *)s, soffset, slen, cp);
71 U8_APPEND((
afw_octet_t *)utf8_z, i, 4, cp, isError);
73 return (isError == FALSE);
88 afw_utf8_nfc_option_t option,
94 UNormalizationCheckResult is_nfc;
99 int32_t input_utf16_length;
101 int32_t output_utf16_length;
103 const UNormalizer2* nfc;
106 if (!s || len == 0) {
112 return &afw_s_a_empty_string;
133 "ICU implementation restrict - len to large or negative", xctx);
138 length = (int32_t)len;
140 for (i = 0; i < length;) {
141 U8_NEXT(
string, i, length, c);
151 return &impl_utf8_null;
159 if (c < 0x0300)
continue;
162 is_nfc = u_getIntPropertyValue(c, UCHAR_NFC_QUICK_CHECK);
163 if (is_nfc != UNORM_YES)
break;
182 if (is_nfc == UNORM_YES) {
185 else if (is_nfc == UNORM_NO) {
186 return &impl_utf8_null;
195 else if (is_nfc == UNORM_YES) {
201 new_result->len = len;
210 memcpy(result_s, s, len);
211 new_result->s = result_s;
212 new_result->len = len;
228 errorCode = U_ZERO_ERROR;
230 length = (int32_t)len;
232 input_utf16 = malloc(length *
sizeof(UChar));
233 input_utf16 = u_strFromUTF8Lenient(input_utf16,
234 length, &input_utf16_length,
string, length,
236 if (U_FAILURE(errorCode)) {
238 "u_strFromUTF8Lenient() failed", xctx);
242 nfc = unorm2_getNFCInstance(&errorCode);
243 if (U_FAILURE(errorCode)) {
245 "unorm2_getNFCInstance() failed", xctx);
249 if (is_nfc == UNORM_MAYBE) {
252 if (unorm2_isNormalized(nfc, input_utf16,
253 input_utf16_length, &errorCode))
278 new_result->len = len;
287 memcpy(result_s, s, len);
288 new_result->s = result_s;
289 new_result->len = len;
304 result = &impl_utf8_null;
310 output_utf16_length = unorm2_normalize(nfc,
311 input_utf16, input_utf16_length,
312 NULL, 0, &errorCode);
313 if (errorCode != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(errorCode)) {
315 "unorm2_normalize() preflight failed", xctx);
317 errorCode = U_ZERO_ERROR;
318 output_utf16 = malloc(output_utf16_length * 2);
319 output_utf16_length = unorm2_normalize(nfc,
320 input_utf16, input_utf16_length,
321 output_utf16, output_utf16_length, &errorCode);
322 if (!U_SUCCESS(errorCode)) {
324 "unorm2_normalize() failed", xctx);
328 u_strToUTF8(NULL, 0, &length, output_utf16, output_utf16_length,
330 if (errorCode != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(errorCode)) {
332 "u_strToUTF8() preflight failed", xctx);
334 errorCode = U_ZERO_ERROR;
337 new_result->s = result_s;
338 new_result->len = length;
339 u_strToUTF8(result_s, length, &length, output_utf16,
340 output_utf16_length, &errorCode);
341 if (!U_SUCCESS(errorCode)) {
343 "u_strToUTF8() failed", xctx);
350 if (input_utf16) free(input_utf16);
351 if (output_utf16) free(output_utf16);
375 if (!s || s_len == 0) {
382 memcpy(s_z, s, s_len);
412 va_start(strings, xctx);
427 va_list original_strings;
429 va_copy(original_strings,strings);
434 while ((
string = va_arg(strings,
afw_utf8_t *))) {
435 result->len +=
string->len;
439 if (result->len > 0) {
443 while ((
string = va_arg(original_strings,
afw_utf8_t *))) {
444 if (string->len > 0) {
445 memcpy(c, string->s, string->len);
466 va_start(arg, format);
496 return (string->len >= starts_with->len &&
497 memcmp(string->s, starts_with->s, starts_with->len) == 0);
506 return (string->len >= len &&
507 memcmp(string->s, starts_with_z, len) == 0);
516 return (string->len >= ends_with->len &&
517 memcmp(string->s + (string->len - ends_with->len),
518 ends_with->s, ends_with->len) == 0);
528 return (string->len >= len &&
529 memcmp(string->s + (string->len - len), ends_with_z, len) == 0);
557 for (c = s1->s, len = s1->len;
558 s2->len <= len; c++, len--)
560 if (memcmp(c, s2->s, s2->len) == 0) {
594 if (s1->len == s2->len) {
595 return memcmp(s1->s, s2->s, s1->len);
597 else if (s1->len < s2->len) {
598 result = memcmp(s1->s, s2->s, s1->len);
605 result = memcmp(s1->s, s2->s, s2->len);
619 int32_t i1, i2, len1, len2;
627 "ICU implementation restrict - len to large or negative", xctx);
630 len1 = (int32_t) s->len;
631 cs1 = (
const uint8_t *)s->s;
637 for (i1 = 0, len2 = 0; i1 < len1; )
639 U8_NEXT_UNSAFE(cs1, i1, c);
640 len2 += U8_LENGTH(c);
649 for (i1 = 0, i2 = 0; i1 < len1;) {
650 U8_NEXT_UNSAFE(cs1, i1, c);
652 U8_APPEND_UNSAFE(cs2, i2, c);
669 if (s->len == 0)
return s;
674 for (c = s->s; len > 0 && IMPL_WHITESPACE(*c); len--, c++);
678 for (c = s->s + s->len - 1; len > 0 && IMPL_WHITESPACE(*c); len--, c--);
683 for (c = start, start_fix = NULL; len > 0; len--, c++) {
684 if (IMPL_WHITESPACE(*c)) {
685 if (start_fix)
break;
687 if (*c != 0x20)
break;
695 if (!start_fix && start == s->s && end == s->s + s->len)
return s;
699 result->len = result_len;
708 new_len = result->len;
709 len = end - start_fix;
710 for (c = start_fix, last_ws =
false; len > 0; len--, c++)
712 if (IMPL_WHITESPACE(*c)) {
713 if (last_ws) new_len--;
726 result->len = new_len;
727 for (last_ws =
false; len > 0; c++, len--) {
728 if (IMPL_WHITESPACE(*c)) {
729 if (!last_ws) *new_c++ = 0x20;
747 const uint8_t *cs1, *cs2;
755 "ICU implementation restrict - len to large or negative", xctx);
758 cs1 = (
const uint8_t *)s1->s;
759 cs2 = (
const uint8_t *)s2->s;
763 for (i = 0; i < len;) {
768 U8_NEXT_UNSAFE(cs1, i2, c1);
769 U8_NEXT_UNSAFE(cs2, i, c2);
772 if (c1 == c2)
continue;
773 result = (int)(c1 > c2) ? 1 : -1;
777 if (result == 0 && s1->len != s2->len) {
778 result = (s1->len > s2->len) ? 1 : -1;
790 return (s1->len == len &&
791 (s1->len == 0 || memcmp(s1->s, s2_z, len) == 0));
805 va_copy(original_ap, ap);
810 sz += strlen((
const char *)s);
817 while ((*s2++ = *s++));
834 result = impl_u8z_concat_v(p, xctx, ap);
873 if (!pointers)
return NULL;
876 for (in = pointers, count = 0; *in; in++, count++);
879 if (count == 0 && !NULL_terminate)
return NULL;
882 ((NULL_terminate) ? count + 1 : count) *
sizeof(
afw_utf8_t *),
886 for (in = pointers; count > 0; count--, in++, out++) {
888 if ((*in)->len > 0) {
889 (*out)->len = (*in)->len;
892 memcpy(s, (*in)->s, (*in)->len);
896 if (NULL_terminate) {
917 if (!strings || *strings == NULL)
return &afw_s_a_empty_string;
920 for (count = 0, c = strings; *c; count++, c++)
926 len += (count - 1) * separator->len;
929 if (len == 0)
return &afw_s_a_empty_string;
936 for (n = 1, c = strings; *c; n++, c++) {
937 memcpy(s, (*c)->s, (*c)->len);
939 if (n < count && separator) {
940 memcpy(s, separator->s, separator->len);
962 if (!strings || *strings == NULL)
return "";
965 for (count = 0, c = strings; *c; count++, c++)
971 len += (count - 1) * separator->len;
974 if (len == 1)
return "";
978 for (n = 1, c = strings; *c; n++, c++) {
979 memcpy(o, (*c)->s, (*c)->len);
981 if (n < count && separator) {
982 memcpy(o, separator->s, separator->len);
1005 if (!strings_z || *strings_z == 0)
return "";
1008 for (count = 0, c_z = strings_z; *c_z; count++, c_z++)
1010 len += strlen(*c_z);
1014 len += (count - 1) * separator->len;
1017 if (len == 1)
return "";
1021 for (n = 1, c_z = strings_z; *c_z; n++, c_z++) {
1022 memcpy(o, *c_z, strlen(*c_z));
1024 if (n < count && separator) {
1025 memcpy(o, separator->s, separator->len);
1026 o += separator->len;
1043 for (c = result; *c; c++) {
1044 if (*c ==
'/' || *c ==
'\\') {
1072 end = (
const afw_octet_t *)s->s + (offset <= s->len ? offset : s->len);
1074 newlines = line_offset = 0,
1076 end = c + (offset <= s->len ? offset : s->len);
1084 else if (*c ==
'\t') {
1085 line_offset = (line_offset + tab_size) % tab_size * tab_size;
1087 else if ((*c < 128 || *c >= 0b11000000) && *c !=
'\r') {
1092 if (newlines == 0) {
1094 *column_number = line_offset + 1;
1098 *line_number = newlines + 1;
1099 *column_number = line_offset + 1;
1122 *number_of_lines = 1;
1123 *max_column_number = 0;
1125 for (offset = 0, column_number=1;;) {
1131 column_number += tab_size;
1134 *number_of_lines += 1;
1140 if (*max_column_number < column_number) {
1141 *max_column_number = column_number;
1161 for (count = 1, sz = s->len, c = s->s; sz > 0; sz--, c++) {
1162 if (*c ==
',') count++;
1166 for (sz = s->len, b = c = s->s, v = result; ; sz--, c++) {
1167 if (sz <= 0 || *c ==
',' || *c ==
';') {
1172 for (; sz > 0 && *c !=
','; sz--, c++);
AFW_DEFINE(const afw_object_t *)
#define AFW_DEFINE_ELLIPSIS(type)
Define a public afw function with variable arguments.
#define AFW_DECLARE(type)
Declare a public afw function.
Adaptive Framework Core Internal.
afw_compile_code_point_is_EOL(afw_code_point_t cp)
Determine if codepoint matches AFW EOL production.
#define AFW_UTF8_Z_LEN
String is NUL (0) terminated.
afw_int32_t afw_code_point_t
Unicode code point.
#define AFW_INT32_MAX
Max int32.
afw_utf8_octet_t afw_utf8_z_t
NFC normalized UTF-8 null terminated string.
char afw_utf8_octet_t
8 bits of utf-8 codepoint.
apr_size_t afw_size_t
size_t.
unsigned char afw_octet_t
8 bits (unsigned).
#define AFW_FINALLY
Always executed regardless of error.
#define AFW_THROW_MEMORY_ERROR(xctx)
#define AFW_THROW_ERROR_RV_Z(code, rv_source_id, rv, message_z, xctx)
Macro used to set error and rv in xctx and throw it.
#define AFW_ENDTRY
Ends an AFW try block.
#define AFW_TRY
Begin an AFW TRY block.
#define AFW_THROW_ERROR_Z(code, message_z, xctx)
Macro used to set error and 0 rv in xctx and throw it.
#define afw_pool_malloc(instance, size, xctx)
Call method malloc of interface afw_pool.
#define afw_pool_calloc(instance, size, xctx)
Call method calloc of interface afw_pool.
#define afw_pool_get_apr_pool(instance)
Call method get_apr_pool of interface afw_pool.
#define afw_pool_calloc_type(instance, type, xctx)
Macro to allocate cleared memory to hold type in pool.
afw_int32_t afw_safe_cast_size_to_int32(afw_size_t size, afw_xctx_t *xctx)
Safely cast afw_size_t to afw_int32_t.
const afw_utf8_t * afw_utf8_concat(const afw_pool_t *p, afw_xctx_t *xctx,...)
Concatenate strings with result in specified pool.
afw_boolean_t afw_utf8_equal_utf8_z(const afw_utf8_t *s1, const afw_utf8_z_t *s2_z)
Check to see if a string equals a utf8_z string.
int afw_utf8_compare(const afw_utf8_t *s1, const afw_utf8_t *s2)
Compare two strings.
afw_utf8_nfc(const afw_utf8_octet_t *s, afw_size_t len, afw_utf8_nfc_option_t option, const afw_pool_t *p, afw_xctx_t *xctx)
UTF-8 NFC support function.
afw_utf8_ends_with(const afw_utf8_t *string, const afw_utf8_t *ends_with)
Check to see if a string ends with another string.
afw_boolean_t afw_utf8_starts_with_z(const afw_utf8_t *string, const afw_utf8_z_t *starts_with_z)
Check to see if a string starts with a utf8_z string.
const afw_utf8_t * afw_utf8_from_encoding(const afw_utf8_t *from_encoding, const char **from, afw_size_t *from_size, const afw_pool_t *p, afw_xctx_t *xctx)
Convert character encoding to a utf-8 in specified pool.
afw_utf8_ends_with_z(const afw_utf8_t *string, const afw_utf8_z_t *ends_with_z)
Check to see if a string ends with a utf8_z string.
afw_utf8_z_printf_v(const afw_utf8_z_t *format_z, va_list ap, const afw_pool_t *p, afw_xctx_t *xctx)
afw_utf8_printf_v(const afw_utf8_z_t *format, va_list arg, const afw_pool_t *p, afw_xctx_t *xctx)
Create a utf-8 string using a c format string in specified pool.
afw_boolean_t afw_utf8_equal(const afw_utf8_t *s1, const afw_utf8_t *s2)
Check to see if a string equals another string.
afw_utf8_from_code_point(afw_utf8_octet_t utf8_z[5], afw_code_point_t cp, afw_xctx_t *xctx)
Convert a code point to utf8.
afw_utf8_parse_csv(const afw_utf8_t *s, const afw_pool_t *p, afw_xctx_t *xctx)
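Parse a comma-separated string into its values. (The original description here, "Check to see if a string equals a utf8_z string.", appears to be duplicated from afw_utf8_equal_utf8_z; confirm against the header.)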
afw_utf8_clone_pointer_array(afw_size_t count, const afw_utf8_t *const *pointers, afw_boolean_t NULL_terminate, const afw_pool_t *p, afw_xctx_t *xctx)
Clone a pointer array of utf-8 to specified pool.
afw_utf8_line_column_of_offset(afw_size_t *line_number, afw_size_t *column_number, const afw_utf8_t *s, afw_size_t offset, int tab_size, afw_xctx_t *xctx)
Determine the line and column of an offset in a string.
int afw_utf8_compare_ignore_case(const afw_utf8_t *s1, const afw_utf8_t *s2, afw_xctx_t *xctx)
Compare two strings ignoring case.
afw_utf8_array_to_utf8_z_with_separator(const afw_utf8_t *const *strings, const afw_utf8_t *separator, const afw_pool_t *p, afw_xctx_t *xctx)
Concat array of utf-8 with optional separator to specified pool.
afw_utf8_z_create(const afw_utf8_octet_t *s, afw_size_t len, const afw_pool_t *p, afw_xctx_t *xctx)
Create a NFC Normalized zero terminated UTF-8 string in specified pool.
afw_utf8_line_count_and_max_column(afw_size_t *number_of_lines, afw_size_t *max_column_number, const afw_utf8_t *s, int tab_size, afw_xctx_t *xctx)
Determine the line count and maximum column in a string.
const afw_utf8_z_t * afw_utf8_z_concat(const afw_pool_t *p, afw_xctx_t *xctx,...)
afw_utf8_printf(const afw_pool_t *p, afw_xctx_t *xctx, const afw_utf8_z_t *format,...)
Create a utf-8 string using a c format string in specified pool.
afw_utf8_contains(const afw_utf8_t *s1, const afw_utf8_t *s2)
Check to see if a string contains another string.
const afw_utf8_t * afw_utf8_normalize_space(const afw_utf8_t *s, const afw_pool_t *p, afw_xctx_t *xctx)
Create a utf-8 string with spaces normalized in specified pool.
afw_utf8_z_array_to_utf8_z_with_separator(const afw_utf8_z_t *const *strings_z, const afw_utf8_t *separator, const afw_pool_t *p, afw_xctx_t *xctx)
Concat array of utf-8 with optional separator to specified pool.
const afw_utf8_t * afw_utf8_to_lower(const afw_utf8_t *s, const afw_pool_t *p, afw_xctx_t *xctx)
Convert utf-8 string to lower case in specified pool.
afw_boolean_t afw_utf8_starts_with(const afw_utf8_t *string, const afw_utf8_t *starts_with)
Check to see if a string starts with another string.
afw_utf8_next_code_point(const afw_utf8_octet_t *s, afw_size_t *offset, afw_size_t len, afw_xctx_t *xctx)
Get next codepoint in utf-8.
const afw_utf8_t * afw_utf8_concat_v(const afw_pool_t *p, afw_xctx_t *xctx, va_list strings)
Concatenate strings with result in specified pool.
afw_utf8_array_to_utf8_with_separator(const afw_utf8_t *const *strings, const afw_utf8_t *separator, const afw_pool_t *p, afw_xctx_t *xctx)
Concat array of utf-8 with optional separator to specified pool.
#define afw_utf8_create(s, len, p, xctx)
Create utf-8 string without copy unless necessary in pool specified.
afw_utf8_z_source_file(const afw_utf8_z_t *source_z)
Returns value of source_z after last '/' or '\'.
@ afw_utf8_nfc_option_is_valid
Only check that input is valid UTF-8.
@ afw_utf8_nfc_option_create
If s is already normalized, use it directly for result->s.
@ afw_utf8_nfc_option_create_copy
If s is already normalized, make copy for result->s.
@ afw_utf8_nfc_option_is_nfc
Only check that input is UTF-8 NFC normalized.
Interface afw_pool public struct.
NFC normalized UTF-8 string.
Interface afw_xctx public struct.