Adaptive Framework  0.9.0
All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Macros Modules Pages
afw_utf8.c
Go to the documentation of this file.
1 // See the 'COPYING' file in the project root for licensing information.
2 /*
3  * AFW - String Functions
4  *
5  * Copyright (c) 2010-2023 Clemson University
6  *
7  */
8 
14 #include "afw_internal.h"
15 #include <unicode/utf8.h>
16 #include <unicode/utf.h>
17 #include <unicode/uchar.h>
18 #include <unicode/unorm2.h>
19 #include <unicode/ustring.h>
20 
21 static const afw_utf8_z_t * impl_z_empty = "";
22 
23 /* afw_utf8_t with 0 len and null s. */
24 static const afw_utf8_t impl_utf8_null = { NULL, 0 };
25 
26 #define IMPL_WHITESPACE(c) \
27 ((c) == 0x20 || (c) == 0x09 || (c) == 0x0d || (c) == 0x0a)
28 
29 /* Get next codepoint. */
32  afw_size_t len, afw_xctx_t *xctx)
33 {
34  UChar32 cp;
35  int32_t slen;
36  int32_t soffset;
37 
38  /* If len is AFW_UTF8_Z_LEN, set it to strlen(s). */
39  if (len == AFW_UTF8_Z_LEN) {
40  len = strlen(s);
41  }
42 
43  /* ICU only supports int32 length. */
44  soffset = afw_safe_cast_size_to_int32(*offset, xctx);
45  slen = afw_safe_cast_size_to_int32(len, xctx);
46  if (*offset != soffset && len != slen) {
47  AFW_THROW_ERROR_Z(general, "offset or len exceeds icu max", xctx);
48  }
49 
50  if (*offset >= len) {
51  cp = -1;
52  }
53  else {
54  U8_NEXT((const uint8_t *)s, soffset, slen, cp);
55  *offset = soffset;
56  }
57 
58  return cp;
59 }
60 
61 
62 /* Convert a code point to utf8. */
65  afw_xctx_t *xctx)
66 {
67  UBool isError;
68  int32_t i = 0;
69 
70  isError = FALSE;
71  U8_APPEND((afw_octet_t *)utf8_z, i, 4, cp, isError);
72  utf8_z[i] = 0;
73  return (isError == FALSE);
74 }
75 
76 
77 /*
78  * UTF-8 NFC support function.
79  *
80  * IMPORTANT: If option is afw_utf8_nfc_option_is_valid, p must not be used. Macro
81  * afw_utf8_is_valid() calls this using xctx's pool, which can be a
82  * problem.
83  *
84  */
85 AFW_DEFINE(const afw_utf8_t *)
87  const afw_utf8_octet_t *s, afw_size_t len,
88  afw_utf8_nfc_option_t option,
89  const afw_pool_t *p, afw_xctx_t *xctx)
90 {
91  UChar32 c;
92  int32_t i, length;
93  const afw_utf8_octet_t *string;
94  UNormalizationCheckResult is_nfc;
95  const afw_utf8_t *result;
96  afw_utf8_t *new_result;
97  afw_utf8_octet_t *result_s;
98  UErrorCode errorCode;
99  int32_t input_utf16_length;
100  UChar *input_utf16;
101  int32_t output_utf16_length;
102  UChar *output_utf16;
103  const UNormalizer2* nfc;
104 
105  /* If s is NULL or length is 0, return NULL or empty string. */
106  if (!s || len == 0) {
107  if (option == afw_utf8_nfc_option_is_valid ||
108  option == afw_utf8_nfc_option_is_nfc)
109  {
110  return NULL;
111  }
112  return &afw_s_a_empty_string;
113  }
114 
115  /* If len is AFW_UTF8_Z_LEN, set it to strlen(s). */
116  if (len == AFW_UTF8_Z_LEN) {
117  len = strlen(s);
118  }
119 
120  /* If s starts with BOM, skip it. */
121  if (len >= 3 &&
122  (afw_octet_t)s[0] == 0xEF &&
123  (afw_octet_t)s[1] == 0xBB &&
124  (afw_octet_t)s[2] == 0xBF)
125  {
126  s += 3;
127  len -= 3;
128  }
129 
130  /* ICU only supports 32 bit non-negative lengths. */
131  if (len > AFW_INT32_MAX) {
132  AFW_THROW_ERROR_Z(general,
133  "ICU implementation restrict - len to large or negative", xctx);
134  }
135 
136  /* Do a fast check to determine if further normalization is required. */
137  string = s;
138  length = (int32_t)len;
139  is_nfc = UNORM_YES;
140  for (i = 0; i < length;) {
141  U8_NEXT(string, i, length, c);
142 
143  /* If codepoint is invalid ... */
144  if (c < 0) {
145  /*
146  * If "is" option, return non-NULL pointer to indicate not valid.
147  */
148  if (option == afw_utf8_nfc_option_is_valid ||
149  option == afw_utf8_nfc_option_is_nfc)
150  {
151  return &impl_utf8_null;
152  }
153 
154  /* For other options, throw an error. */
155  AFW_THROW_ERROR_Z(general, "Not valid UTF-8", xctx);
156  }
157 
158  /* If c < 0x0300, c is NFC normalized. */
159  if (c < 0x0300) continue;
160 
161  /* If NFC_QUICK_CHECK is not UNORM_YES, break. */
162  is_nfc = u_getIntPropertyValue(c, UCHAR_NFC_QUICK_CHECK);
163  if (is_nfc != UNORM_YES) break;
164  }
165 
166  /*
167  * If only checking for valid utf8, return NULL to indicate input is valid
168  * utf8.
169  *
170  * NOTE: No code before this point should access p. See IMPORTANT note
171  * above.
172  */
173  if (option == afw_utf8_nfc_option_is_valid) {
174  return NULL;
175  }
176 
177  /*
178  * If only checking for NFC normalize and know for sure already, return
179  * return NULL or non-NULL result.
180  */
181  if (option == afw_utf8_nfc_option_is_nfc) {
182  if (is_nfc == UNORM_YES) {
183  return NULL;
184  }
185  else if (is_nfc == UNORM_NO) {
186  return &impl_utf8_null;
187  }
188  /* is_nfc == UNORM_MAYBE will fall through. */
189  }
190 
191  /*
192  * If fast check indicates NFC normalized already and is a create option,
193  * return result without further processing.
194  */
195  else if (is_nfc == UNORM_YES) {
196 
197  /* If not option create, return result using input s. */
198  if (option == afw_utf8_nfc_option_create) {
199  new_result = afw_pool_calloc_type(p, afw_utf8_t, xctx);
200  new_result->s = s;
201  new_result->len = len;
202  return new_result;
203  }
204 
205  /* If not option create_copy, return result using copy of input s. */
206  else if (option == afw_utf8_nfc_option_create_copy) {
207  new_result = afw_pool_calloc_type(p, afw_utf8_t, xctx);
208  if (len > 0) {
209  result_s = afw_pool_calloc(p, len, xctx);
210  memcpy(result_s, s, len);
211  new_result->s = result_s;
212  new_result->len = len;
213  }
214  return new_result;
215  }
216  }
217 
218  /*
219  * At this point, will need to convert to UTF-16 to use ICU functions.
220  *
221  * Use AFW_TRY to make sure malloc memory is freed.
222  */
223  AFW_TRY {
224 
225  result = NULL;
226 
227  /* Convert utf-8 to UChar[] */
228  errorCode = U_ZERO_ERROR;
229  string = s;
230  length = (int32_t)len;
231  output_utf16 = NULL;
232  input_utf16 = malloc(length * sizeof(UChar));
233  input_utf16 = u_strFromUTF8Lenient(input_utf16,
234  length, &input_utf16_length, string, length,
235  &errorCode);
236  if (U_FAILURE(errorCode)) {
237  AFW_THROW_ERROR_RV_Z(general, icu, errorCode,
238  "u_strFromUTF8Lenient() failed", xctx);
239  }
240 
241  /* Get nfc normalizer. */
242  nfc = unorm2_getNFCInstance(&errorCode);
243  if (U_FAILURE(errorCode)) {
244  AFW_THROW_ERROR_RV_Z(general, icu, errorCode,
245  "unorm2_getNFCInstance() failed", xctx);
246  }
247 
248  /* If is_nfc == UNORM_MAYBE, check to see if already normalized. */
249  if (is_nfc == UNORM_MAYBE) {
250 
251  /* If normalized already, set result. */
252  if (unorm2_isNormalized(nfc, input_utf16,
253  input_utf16_length, &errorCode))
254  {
255 
256  /*
257  * If just checking for normalization, return NULL to indicate
258  * normalized.
259  */
260  if (option == afw_utf8_nfc_option_is_nfc) {
261  result = NULL;
262  break;
263  }
264 
265  /*
266  * If length is 0, return NULL. Note: probably can't
267  * get to this point in code.
268  */
269  if (len == 0) {
270  result = NULL;
271  break;
272  }
273 
274  /* If option create, result uses input s. */
275  if (option == afw_utf8_nfc_option_create) {
276  new_result = afw_pool_calloc_type(p, afw_utf8_t, xctx);
277  new_result->s = s;
278  new_result->len = len;
279  result = new_result;
280  break;
281  }
282 
283  /* If not option create_copy, return result using copy of input s. */
284  if (option == afw_utf8_nfc_option_create_copy) {
285  new_result = afw_pool_calloc_type(p, afw_utf8_t, xctx);
286  result_s = afw_pool_calloc(p, len, xctx);
287  memcpy(result_s, s, len);
288  new_result->s = result_s;
289  new_result->len = len;
290  result = new_result;
291  break;
292  }
293 
294  /* Invalid option. */
295  AFW_THROW_ERROR_Z(general, "Invalid afw_utf8_nfc() option",
296  xctx);
297  }
298 
299  /*
300  * If not normalized and just checking normalization, result is
301  * a non-NULL pointer.
302  */
303  else if (option == afw_utf8_nfc_option_is_nfc) {
304  result = &impl_utf8_null;
305  break;
306  }
307  }
308 
309  /* At this point normalization is required. */
310  output_utf16_length = unorm2_normalize(nfc,
311  input_utf16, input_utf16_length,
312  NULL, 0, &errorCode);
313  if (errorCode != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(errorCode)) {
314  AFW_THROW_ERROR_RV_Z(general, icu, errorCode,
315  "unorm2_normalize() preflight failed", xctx);
316  }
317  errorCode = U_ZERO_ERROR;
318  output_utf16 = malloc(output_utf16_length * 2);
319  output_utf16_length = unorm2_normalize(nfc,
320  input_utf16, input_utf16_length,
321  output_utf16, output_utf16_length, &errorCode);
322  if (!U_SUCCESS(errorCode)) {
323  AFW_THROW_ERROR_RV_Z(general, icu, errorCode,
324  "unorm2_normalize() failed", xctx);
325  }
326 
327  /* Convert normalized result to utf-8 and set result. */
328  u_strToUTF8(NULL, 0, &length, output_utf16, output_utf16_length,
329  &errorCode);
330  if (errorCode != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(errorCode)) {
331  AFW_THROW_ERROR_RV_Z(general, icu, errorCode,
332  "u_strToUTF8() preflight failed", xctx);
333  }
334  errorCode = U_ZERO_ERROR;
335  new_result = afw_pool_calloc_type(p, afw_utf8_t, xctx);
336  result_s = afw_pool_calloc(p, length, xctx);
337  new_result->s = result_s;
338  new_result->len = length;
339  u_strToUTF8(result_s, length, &length, output_utf16,
340  output_utf16_length, &errorCode);
341  if (!U_SUCCESS(errorCode)) {
342  AFW_THROW_ERROR_RV_Z(general, icu, errorCode,
343  "u_strToUTF8() failed", xctx);
344  }
345  result = new_result;
346  }
347 
348  /* Always free malloced memory. */
349  AFW_FINALLY{
350  if (input_utf16) free(input_utf16);
351  if (output_utf16) free(output_utf16);
352  }
353 
354  AFW_ENDTRY;
355 
356  /* Return result. */
357  return result;
358 }
359 
360 
361 /*
362  * Create a NFC normalized zero terminated UTF-8 string in specified
363  * pool.
364  */
365 AFW_DEFINE(const afw_utf8_z_t *)
367  const afw_utf8_octet_t *s, afw_size_t len, const afw_pool_t *p, afw_xctx_t *xctx)
368 {
369  const afw_utf8_t *temp;
370  afw_size_t s_len;
371  afw_utf8_z_t *s_z;
372 
373  /* Determine len and if 0, return empty string. */
374  s_len = (len == AFW_UTF8_Z_LEN) ? strlen(s) : len;
375  if (!s || s_len == 0) {
376  return impl_z_empty;
377  }
378 
379  /* Allocate memory for string including the NUL terminator. */
380  s_z = afw_pool_malloc(p, s_len + 1, xctx);
381 
382  memcpy(s_z, s, s_len);
383  s_z[s_len] = 0;
384 
385  /* Give afw_utf8_nfc a chance to normalize result. */
386  temp = afw_utf8_nfc(s_z, s_len + 1, afw_utf8_nfc_option_create,
387  p, xctx);
388 
389  /* Return just s part of temp. */
390  return temp->s;
391 }
392 
393 
394 /* Convert character encoding to a utf-8 in specified pool. */
396  const afw_utf8_t * from_encoding,
397  const char* * from, afw_size_t * from_size,
398  const afw_pool_t *p, afw_xctx_t *xctx)
399 {
400  AFW_THROW_ERROR_Z(general, "Not implemented", xctx);
401 }
402 
403 
404 /* Concatenate strings in specifed pool. */
406  afw_xctx_t *xctx, ...)
407 {
408  va_list strings;
409  const afw_utf8_t *result;
410 
411  /* Calculate size needed to hold concatenated strings. */
412  va_start(strings, xctx);
413  result = afw_utf8_concat_v(p, xctx, strings);
414  va_end(strings);
415 
416  /* Return result. */
417  return result;
418 }
419 
420 /* Concatenate strings from a va_list in specified pool. */
422  const afw_pool_t *p, afw_xctx_t *xctx, va_list strings)
423 {
424  afw_utf8_t *string;
425  afw_utf8_octet_t *c;
426  afw_utf8_t *result;
427  va_list original_strings;
428 
429  va_copy(original_strings,strings);
430  /* Allocate memory for afw_utf8_t result. */
431  result = afw_pool_calloc_type(p, afw_utf8_t, xctx);
432 
433  /* Calculate size needed to hold concatenated strings. */
434  while ((string = va_arg(strings, afw_utf8_t *))) {
435  result->len += string->len;
436  }
437 
438  /* Allocate memory and concatenate strings. */
439  if (result->len > 0) {
441  result->len, xctx);
442  result->s = c;
443  while ((string = va_arg(original_strings, afw_utf8_t *))) {
444  if (string->len > 0) {
445  memcpy(c, string->s, string->len);
446  c += string->len;
447  }
448  }
449  }
450 
451  /* Return result. */
452  return result;
453 }
454 
455 
456 
457 /* Create a string using a c format string. */
460  const afw_pool_t *p, afw_xctx_t *xctx, const afw_utf8_z_t *format, ...)
461 {
462  va_list arg;
463  const afw_utf8_z_t *s;
464 
465  /* Use apr_pvsprint() to produce c string. */
466  va_start(arg, format);
467  s = apr_pvsprintf(afw_pool_get_apr_pool(p), format, arg);
468  va_end(arg);
469 
470  /* Make an afw_string from result of apr_pvsprintf() and return it. */
471  return afw_utf8_create(s, AFW_UTF8_Z_LEN, p, xctx);
472 }
473 
474 
475 /* Create a string using a c format string. */
476 AFW_DEFINE(const afw_utf8_t *)
478  const afw_utf8_z_t *format, va_list arg,
479  const afw_pool_t *p, afw_xctx_t *xctx)
480 {
481  const afw_utf8_z_t *s;
482 
483  /* Use apr_pvsprint() to produce c string. */
484  s = apr_pvsprintf(afw_pool_get_apr_pool(p), format, arg);
485 
486  /* Make an afw_string from result of apr_pvsprintf() and return it. */
487  return afw_utf8_create(s, AFW_UTF8_Z_LEN, p, xctx);
488 }
489 
490 
491 
492 /* Check to see if a string starts with another string. */
494  const afw_utf8_t *string, const afw_utf8_t *starts_with)
495 {
496  return (string->len >= starts_with->len &&
497  memcmp(string->s, starts_with->s, starts_with->len) == 0);
498 }
499 
500 
501 /* Check to see if a string starts with a utf8_z string. */
503  const afw_utf8_t *string, const afw_utf8_z_t *starts_with_z)
504 {
505  afw_size_t len = strlen(starts_with_z);
506  return (string->len >= len &&
507  memcmp(string->s, starts_with_z, len) == 0);
508 }
509 
510 
511 /* Check to see if a string ends with another string. */
514  const afw_utf8_t *string, const afw_utf8_t *ends_with)
515 {
516  return (string->len >= ends_with->len &&
517  memcmp(string->s + (string->len - ends_with->len),
518  ends_with->s, ends_with->len) == 0);
519 }
520 
521 
522 /* Check to see if a string ends with a utf8_z string. */
525  const afw_utf8_t *string, const afw_utf8_z_t *ends_with_z)
526 {
527  afw_size_t len = strlen(ends_with_z);
528  return (string->len >= len &&
529  memcmp(string->s + (string->len - len), ends_with_z, len) == 0);
530 }
531 
532 
533 /* Check to see if a string equals another string. */
535  const afw_utf8_t *s1, const afw_utf8_t *s2)
536 {
537  return afw_utf8_compare(s1, s2) == 0;
538 }
539 
540 
541 /* Check to see if a string contains another string. */
544  const afw_utf8_t *s1, const afw_utf8_t *s2)
545 {
546  const afw_utf8_octet_t *c;
547  afw_size_t len;
548 
549  if (!s1) {
550  return false;
551  }
552 
553  if (!s2) {
554  return true;
555  }
556 
557  for (c = s1->s, len = s1->len;
558  s2->len <= len; c++, len--)
559  {
560  if (memcmp(c, s2->s, s2->len) == 0) {
561  return true;
562  }
563  }
564 
565  return false;
566 }
567 
568 
569 /* Compare two strings. */
571  const afw_utf8_t *s1, const afw_utf8_t *s2)
572 {
573  /*
574  UErrorCode status = U_ZERO_ERROR;
575  UCharIterator sIter, tIter;
576 
577  uiter_setUTF8(&sIter, s1->s, s1->len);
578  uiter_setUTF8(&tIter, s2->s, s2->len);
579  return ucol_strcollIter(myCollation, &sIter, &tIter, &status);
580  */
581 
582  int result;
583 
584  if (!s1 || !s2) {
585  if (!s1 && !s2) {
586  return 0;
587  }
588  if (s1) {
589  return 1;
590  }
591  return -1;
592  }
593 
594  if (s1->len == s2->len) {
595  return memcmp(s1->s, s2->s, s1->len);
596  }
597  else if (s1->len < s2->len) {
598  result = memcmp(s1->s, s2->s, s1->len);
599  if (result == 0) {
600  return -1;
601  }
602  return result;
603  }
604  else {
605  result = memcmp(s1->s, s2->s, s2->len);
606  if (result == 0) {
607  return 1;
608  }
609  return result;
610  }
611 }
612 
613 
614 /* Convert utf-8 string to lower case in specified pool. */
616  const afw_utf8_t *s, const afw_pool_t *p, afw_xctx_t *xctx)
617 {
618  UChar32 c;
619  int32_t i1, i2, len1, len2;
620  const uint8_t *cs1;
621  uint8_t *cs2;
622  afw_utf8_t *result;
623 
624  /* ICU only supports 32 bit non-negative lengths. */
625  if (s->len > AFW_INT32_MAX) {
626  AFW_THROW_ERROR_Z(general,
627  "ICU implementation restrict - len to large or negative", xctx);
628  }
629 
630  len1 = (int32_t) s->len;
631  cs1 = (const uint8_t *)s->s;
632 
633  /*
634  * Not sure if this is necessary, but if bytes to represent lower case is
635  * different than upper, this will get output length right.
636  */
637  for (i1 = 0, len2 = 0; i1 < len1; )
638  {
639  U8_NEXT_UNSAFE(cs1, i1, c);
640  len2 += U8_LENGTH(c);
641  }
642 
643  cs2 = (len2 > 0) ? afw_pool_calloc(p, len2, xctx) : NULL;
644 
645  result = afw_pool_calloc_type(p, afw_utf8_t, xctx);
646  result->s = (const afw_utf8_octet_t *)cs2;
647  result->len = len2;
648 
649  for (i1 = 0, i2 = 0; i1 < len1;) {
650  U8_NEXT_UNSAFE(cs1, i1, c);
651  c = u_tolower(c);
652  U8_APPEND_UNSAFE(cs2, i2, c);
653  }
654 
655  return result;
656 }
657 
658 /* Create a utf-8 string with whitespace normalized in specified pool. */
660  const afw_utf8_t *s, const afw_pool_t *p, afw_xctx_t *xctx)
661 {
662  const afw_utf8_octet_t *c, *start, *end, *start_fix;
663  afw_utf8_octet_t *new_c;
664  afw_size_t len, result_len, new_len;
665  afw_utf8_t *result;
666  afw_boolean_t last_ws;
667 
668  /* If len is 0, just return input s. */
669  if (s->len == 0) return s;
670 
671  /* Set start after leading whitespace. */
672  len = s->len;
673  c = s->s;
674  for (c = s->s; len > 0 && IMPL_WHITESPACE(*c); len--, c++);
675  start = c;
676 
677  /* Set end past last char before trailing whitespace. */
678  for (c = s->s + s->len - 1; len > 0 && IMPL_WHITESPACE(*c); len--, c--);
679  end = c + 1;
680 
681  /* Check for whitespace sequences between start and end. */
682  result_len = len;
683  for (c = start, start_fix = NULL; len > 0; len--, c++) {
684  if (IMPL_WHITESPACE(*c)) {
685  if (start_fix) break;
686  start_fix = c;
687  if (*c != 0x20) break;
688  }
689  else {
690  start_fix = NULL;
691  }
692  }
693 
694  /* If no leading, trailing, or embedded whitespace sequences, return s. */
695  if (!start_fix && start == s->s && end == s->s + s->len) return s;
696 
697  /* Make a new afw_utf_t and set to not include leading and trailing ws. */
698  result = afw_pool_calloc_type(p, afw_utf8_t, xctx);
699  result->len = result_len;
700  result->s = start;
701 
702  /* If there are no embedded sequences, return this. */
703  if (!start_fix) {
704  return result;
705  }
706 
707  /* Figure out how long new string will be with sequences removed. */
708  new_len = result->len;
709  len = end - start_fix;
710  for (c = start_fix, last_ws = false; len > 0; len--, c++)
711  {
712  if (IMPL_WHITESPACE(*c)) {
713  if (last_ws) new_len--;
714  last_ws = true;
715  }
716  else {
717  last_ws = false;
718  }
719  }
720 
721  /* Return new string populated with sequences replaced with 0x20. */
722  c = result->s;
723  new_c = afw_pool_malloc(p, new_len, xctx);
724  result->s = new_c;
725  len = result->len;
726  result->len = new_len;
727  for (last_ws = false; len > 0; c++, len--) {
728  if (IMPL_WHITESPACE(*c)) {
729  if (!last_ws) *new_c++ = 0x20;
730  last_ws = true;
731  }
732  else {
733  *new_c++ = *c;
734  last_ws = false;
735  }
736  }
737  return result;
738 }
739 
740 
741 /* Compare two strings ignoring case. */
743  const afw_utf8_t *s1, const afw_utf8_t *s2, afw_xctx_t *xctx)
744 {
745  UChar32 c1, c2;
746  int32_t i, len, i2;
747  const uint8_t *cs1, *cs2;
748  int result;
749 
750  /* ICU only supports 32 bit non-negative lengths. */
751  if (s1->len > AFW_INT32_MAX ||
752  s2->len > AFW_INT32_MAX)
753  {
754  AFW_THROW_ERROR_Z(general,
755  "ICU implementation restrict - len to large or negative", xctx);
756  }
757 
758  cs1 = (const uint8_t *)s1->s;
759  cs2 = (const uint8_t *)s2->s;
760  len = afw_safe_cast_size_to_int32((s1->len <= s2->len) ? s1->len : s2->len,
761  xctx);
762  result = 0;
763  for (i = 0; i < len;) {
764  /* U8_NEXT_UNSAFE increments i. Don't use i in first call so offset
765  will be correct for both the 'for' loop and pointing to the
766  correct character in both strings. */
767  i2 = i;
768  U8_NEXT_UNSAFE(cs1, i2, c1);
769  U8_NEXT_UNSAFE(cs2, i, c2);
770  c1 = u_tolower(c1);
771  c2 = u_tolower(c2);
772  if (c1 == c2) continue;
773  result = (int)(c1 > c2) ? 1 : -1;
774  break;
775  }
776 
777  if (result == 0 && s1->len != s2->len) {
778  result = (s1->len > s2->len) ? 1 : -1;
779  }
780 
781  return result;
782 }
783 
784 
785 /* Check to see if a string equals a utf8_z string. */
787  const afw_utf8_t *s1, const afw_utf8_z_t *s2_z)
788 {
789  afw_size_t len = (s2_z) ? strlen(s2_z) : 0;
790  return (s1->len == len &&
791  (s1->len == 0 || memcmp(s1->s, s2_z, len) == 0));
792 }
793 
794 
795 /* Concatenate zero terminated UTF8 strings. */
796 static const afw_utf8_z_t * impl_u8z_concat_v(
797  const afw_pool_t *p, afw_xctx_t *xctx, va_list ap)
798 {
799  afw_size_t sz;
800  afw_utf8_z_t *s;
801  afw_utf8_z_t *s2;
802  afw_utf8_z_t *result;
803  va_list original_ap;
804 
805  va_copy(original_ap, ap);
806 
807  /* Calculate size needed to hold concatenated strings. */
808  sz = 1;
809  while ((s = va_arg(ap, afw_utf8_z_t *))) {
810  sz += strlen((const char *)s);
811  }
812 
813  /* Allocate memory and concatenate strings. */
814  s2 = (afw_utf8_z_t *)afw_pool_malloc(p, sz, xctx);
815  result = s2;
816  while ((s = va_arg(original_ap, afw_utf8_z_t *))) {
817  while ((*s2++ = *s++));
818  s2--;
819  }
820 
821  /* Return result. */
822  return result;
823 }
824 
825 /* Concatenate zero terminated UTF8 strings. */
827  const afw_pool_t *p, afw_xctx_t *xctx, ...)
828 {
829  va_list ap;
830  const afw_utf8_z_t *result;
831 
832  /* Calculate size needed to hold concatenated strings. */
833  va_start(ap, xctx);
834  result = impl_u8z_concat_v(p, xctx, ap);
835  va_end(ap);
836 
837  /* Return result. */
838  return result;
839 }
840 
841 
842 /* Create a utf8_z string using a c format string and va_list in specified pool. */
843 AFW_DEFINE(const afw_utf8_z_t *)
845  const afw_utf8_z_t *format_z, va_list ap,
846  const afw_pool_t *p,
847  afw_xctx_t *xctx)
848 {
849  afw_utf8_z_t *result;
850 
851  result = apr_pvsprintf(afw_pool_get_apr_pool(p), format_z, ap);
852 
853  if (!result) {
855  }
856 
857  return result;
858 }
859 
860 /* Clone a pointer array of utf-8 to specified pool. */
861 AFW_DEFINE(const afw_utf8_t * const *)
863  afw_size_t count,
864  const afw_utf8_t * const * pointers,
865  afw_boolean_t NULL_terminate,
866  const afw_pool_t *p, afw_xctx_t *xctx)
867 {
868  const afw_utf8_t * const *in;
869  afw_utf8_t * *out;
870  afw_utf8_octet_t *s;
871  const afw_utf8_t * const *result;
872 
873  if (!pointers) return NULL;
874 
875  if (count == -1) {
876  for (in = pointers, count = 0; *in; in++, count++);
877  }
878 
879  if (count == 0 && !NULL_terminate) return NULL;
880 
881  out = afw_pool_malloc(p,
882  ((NULL_terminate) ? count + 1 : count) * sizeof(afw_utf8_t *),
883  xctx);
884  result = (const afw_utf8_t * const *)out;
885 
886  for (in = pointers; count > 0; count--, in++, out++) {
887  *out = afw_pool_calloc_type(p, afw_utf8_t, xctx);
888  if ((*in)->len > 0) {
889  (*out)->len = (*in)->len;
890  s = afw_pool_malloc(p, (*in)->len, xctx);
891  (*out)->s = s;
892  memcpy(s, (*in)->s, (*in)->len);
893  }
894  }
895 
896  if (NULL_terminate) {
897  *out = NULL;
898  }
899 
900  return result;
901 }
902 
903 
904 /* Concat array of utf-8 with optional separator to specified pool. */
905 AFW_DEFINE(const afw_utf8_t *)
907  const afw_utf8_t * const * strings,
908  const afw_utf8_t * separator,
909  const afw_pool_t *p, afw_xctx_t *xctx)
910 {
911  afw_size_t n, count;
912  afw_size_t len;
913  const afw_utf8_t * const * c;
914  afw_utf8_t * result;
915  afw_utf8_octet_t * s;
916 
917  if (!strings || *strings == NULL) return &afw_s_a_empty_string;
918 
919  len = 0;
920  for (count = 0, c = strings; *c; count++, c++)
921  {
922  len += (*c)->len;
923  }
924 
925  if (separator) {
926  len += (count - 1) * separator->len;
927  }
928 
929  if (len == 0) return &afw_s_a_empty_string;
930 
931  s = afw_pool_malloc(p, len, xctx);
932  result = afw_pool_calloc_type(p, afw_utf8_t, xctx);
933  result->s = s;
934  result->len = len;
935 
936  for (n = 1, c = strings; *c; n++, c++) {
937  memcpy(s, (*c)->s, (*c)->len);
938  s += (*c)->len;
939  if (n < count && separator) {
940  memcpy(s, separator->s, separator->len);
941  s += separator->len;
942  }
943  }
944 
945  return result;
946 }
947 
948 
949 /* Concat array of utf-8 with optional separator to specified pool. */
950 AFW_DEFINE(const afw_utf8_z_t *)
952  const afw_utf8_t * const * strings,
953  const afw_utf8_t * separator,
954  const afw_pool_t *p, afw_xctx_t *xctx)
955 {
956  afw_size_t n, count;
957  afw_size_t len;
958  const afw_utf8_t * const * c;
959  const afw_utf8_z_t * result;
960  afw_utf8_z_t * o;
961 
962  if (!strings || *strings == NULL) return "";
963 
964  len = 1;
965  for (count = 0, c = strings; *c; count++, c++)
966  {
967  len += (*c)->len;
968  }
969 
970  if (separator) {
971  len += (count - 1) * separator->len;
972  }
973 
974  if (len == 1) return "";
975 
976  o = afw_pool_malloc(p, len, xctx);
977  result = o;
978  for (n = 1, c = strings; *c; n++, c++) {
979  memcpy(o, (*c)->s, (*c)->len);
980  o += (*c)->len;
981  if (n < count && separator) {
982  memcpy(o, separator->s, separator->len);
983  o += separator->len;
984  }
985  }
986  *o = 0;
987 
988  return result;
989 }
990 
991 
992 /* Concat array of utf-8 with optional separator to specified pool. */
993 AFW_DEFINE(const afw_utf8_z_t *)
995  const afw_utf8_z_t * const * strings_z,
996  const afw_utf8_t * separator,
997  const afw_pool_t *p, afw_xctx_t *xctx)
998 {
999  afw_size_t n, count;
1000  afw_size_t len;
1001  const afw_utf8_z_t * const * c_z;
1002  const afw_utf8_z_t * result;
1003  afw_utf8_z_t * o;
1004 
1005  if (!strings_z || *strings_z == 0) return "";
1006 
1007  len = 1;
1008  for (count = 0, c_z = strings_z; *c_z; count++, c_z++)
1009  {
1010  len += strlen(*c_z);
1011  }
1012 
1013  if (separator) {
1014  len += (count - 1) * separator->len;
1015  }
1016 
1017  if (len == 1) return "";
1018 
1019  o = afw_pool_malloc(p, len, xctx);
1020  result = o;
1021  for (n = 1, c_z = strings_z; *c_z; n++, c_z++) {
1022  memcpy(o, *c_z, strlen(*c_z));
1023  o += strlen(*c_z);
1024  if (n < count && separator) {
1025  memcpy(o, separator->s, separator->len);
1026  o += separator->len;
1027  }
1028  }
1029  *o = 0;
1030 
1031  return result;
1032 }
1033 
1034 
1035 /* Returns the part of source_z after the last '/' or '\'. */
1036 AFW_DEFINE(const afw_utf8_z_t *)
1038  const afw_utf8_z_t *result;
1039  const afw_utf8_z_t *c;
1040 
1041  result = source_z;
1042  if (result) {
1043  for (c = result; *c; c++) {
1044  if (*c == '/' || *c == '\\') {
1045  result = c + 1;
1046  }
1047  }
1048  }
1049 
1050  return result;
1051 }
1052 
1053 
1054 
1055 /* Determine the line and column of an offset in a string. */
1059  afw_size_t *line_number,
1060  afw_size_t *column_number,
1061  const afw_utf8_t *s,
1062  afw_size_t offset,
1063  int tab_size,
1064  afw_xctx_t *xctx)
1065 {
1066  afw_size_t newlines;
1067  afw_size_t line_offset;
1068  const afw_octet_t *c;
1069  const afw_octet_t *end;
1070  afw_boolean_t result;
1071 
1072  end = (const afw_octet_t *)s->s + (offset <= s->len ? offset : s->len);
1073  for (
1074  newlines = line_offset = 0,
1075  c = (const afw_octet_t *)s->s,
1076  end = c + (offset <= s->len ? offset : s->len);
1077  c < end;
1078  c++)
1079  {
1080  if (*c == '\n') {
1081  newlines++;
1082  line_offset = 0;
1083  }
1084  else if (*c == '\t') {
1085  line_offset = (line_offset + tab_size) % tab_size * tab_size;
1086  }
1087  else if ((*c < 128 || *c >= 0b11000000) && *c != '\r') {
1088  line_offset++;
1089  }
1090  }
1091 
1092  if (newlines == 0) {
1093  *line_number = 1;
1094  *column_number = line_offset + 1;
1095  result = false;
1096  }
1097  else {
1098  *line_number = newlines + 1;
1099  *column_number = line_offset + 1;
1100  result = true;
1101  }
1102 
1103  return result;
1104 }
1105 
1106 
1107 
1108 /* Determine the number of lines and the maximum column number in a string. */
1110 AFW_DEFINE(void)
1112  afw_size_t *number_of_lines,
1113  afw_size_t *max_column_number,
1114  const afw_utf8_t *s,
1115  int tab_size,
1116  afw_xctx_t *xctx)
1117 {
1118  afw_size_t column_number;
1119  afw_code_point_t cp;
1120  afw_size_t offset;
1121 
1122  *number_of_lines = 1;
1123  *max_column_number = 0;
1124 
1125  for (offset = 0, column_number=1;;) {
1126  cp = afw_utf8_next_code_point(s->s, &offset, s->len, xctx);
1127  if (cp < 0) {
1128  break;
1129  }
1130  if (cp == '\t') {
1131  column_number += tab_size;
1132  }
1133  else if (afw_compile_code_point_is_EOL(cp)) {
1134  *number_of_lines += 1;
1135  column_number = 1;
1136  }
1137  else {
1138  column_number++;
1139  }
1140  if (*max_column_number < column_number) {
1141  *max_column_number = column_number;
1142  }
1143  }
1144 }
1145 
1146 
1147 
1148 AFW_DEFINE(const afw_utf8_t * const *)
1150  const afw_utf8_t *s,
1151  const afw_pool_t *p,
1152  afw_xctx_t *xctx)
1153 {
1154  afw_size_t count;
1155  afw_size_t sz;
1156  const afw_utf8_octet_t *c;
1157  const afw_utf8_octet_t *b;
1158  const afw_utf8_t **result;
1159  const afw_utf8_t **v;
1160 
1161  for (count = 1, sz = s->len, c = s->s; sz > 0; sz--, c++) {
1162  if (*c == ',') count++;
1163  }
1164 
1165  result = afw_pool_calloc(p, sizeof(afw_utf8_t *)*(count+1), xctx);
1166  for (sz = s->len, b = c = s->s, v = result; ; sz--, c++) {
1167  if (sz <= 0 || *c == ',' || *c == ';') {
1168  *v = afw_utf8_create(b, c - b, p, xctx);
1169  v++;
1170  if (sz <= 0) break;
1171  if (*c == ';') {
1172  for (; sz > 0 && *c != ','; sz--, c++);
1173  if (sz <= 0) break;
1174  }
1175  b = c + 1;
1176  }
1177  }
1178 
1179  *v = NULL;
1180  return result;
1181 }
1182 
1183 
AFW_DEFINE(const afw_object_t *)
#define AFW_DEFINE_ELLIPSIS(type)
Define a public afw function with variable arguments.
#define AFW_DECLARE(type)
Declare a public afw function.
Adaptive Framework Core Internal.
afw_compile_code_point_is_EOL(afw_code_point_t cp)
Determine if codepoint matches AFW EOL production.
#define AFW_UTF8_Z_LEN
String is NUL (0) terminated.
Definition: afw_common.h:266
_Bool afw_boolean_t
Definition: afw_common.h:373
afw_int32_t afw_code_point_t
Unicode code point.
Definition: afw_common.h:205
#define AFW_INT32_MAX
Max int32.
Definition: afw_common.h:169
afw_utf8_octet_t afw_utf8_z_t
NFC normalized UTF-8 null terminated string.
Definition: afw_common.h:523
char afw_utf8_octet_t
8 bits of utf-8 codepoint.
Definition: afw_common.h:236
apr_size_t afw_size_t
size_t.
Definition: afw_common.h:151
unsigned char afw_octet_t
8 bits (unsigned).
Definition: afw_common.h:211
#define AFW_FINALLY
Always executed regardless of error.
Definition: afw_error.h:702
#define AFW_THROW_MEMORY_ERROR(xctx)
Definition: afw_error.h:499
#define AFW_THROW_ERROR_RV_Z(code, rv_source_id, rv, message_z, xctx)
Macro used to set error and rv in xctx and throw it.
Definition: afw_error.h:301
#define AFW_ENDTRY
Ends an AFW try block.
Definition: afw_error.h:727
#define AFW_TRY
Begin an AFW TRY block.
Definition: afw_error.h:634
#define AFW_THROW_ERROR_Z(code, message_z, xctx)
Macro used to set error and 0 rv in xctx and throw it.
Definition: afw_error.h:283
#define afw_pool_malloc(instance, size, xctx)
Call method malloc of interface afw_pool.
#define afw_pool_calloc(instance, size, xctx)
Call method calloc of interface afw_pool.
#define afw_pool_get_apr_pool(instance)
Call method get_apr_pool of interface afw_pool.
#define afw_pool_calloc_type(instance, type, xctx)
Macro to allocate cleared memory to hold type in pool.
Definition: afw_pool.h:167
afw_int32_t afw_safe_cast_size_to_int32(afw_size_t size, afw_xctx_t *xctx)
Safely cast afw_size_t to afw_int32_t.
const afw_utf8_t * afw_utf8_concat(const afw_pool_t *p, afw_xctx_t *xctx,...)
Concatenate strings with result in specified pool.
afw_boolean_t afw_utf8_equal_utf8_z(const afw_utf8_t *s1, const afw_utf8_z_t *s2_z)
Check to see if a string equals a utf8_z string.
int afw_utf8_compare(const afw_utf8_t *s1, const afw_utf8_t *s2)
Compare two strings.
afw_utf8_nfc(const afw_utf8_octet_t *s, afw_size_t len, afw_utf8_nfc_option_t option, const afw_pool_t *p, afw_xctx_t *xctx)
UTF-8 NFC support function.
Definition: afw_utf8.c:86
afw_utf8_ends_with(const afw_utf8_t *string, const afw_utf8_t *ends_with)
Check to see if a string ends with another string.
Definition: afw_utf8.c:513
afw_boolean_t afw_utf8_starts_with_z(const afw_utf8_t *string, const afw_utf8_z_t *starts_with_z)
Check to see if a string starts with a utf8_z string.
const afw_utf8_t * afw_utf8_from_encoding(const afw_utf8_t *from_encoding, const char **from, afw_size_t *from_size, const afw_pool_t *p, afw_xctx_t *xctx)
Convert character encoding to a utf-8 in specified pool.
Definition: afw_utf8.c:395
afw_utf8_ends_with_z(const afw_utf8_t *string, const afw_utf8_z_t *ends_with_z)
Check to see if a string ends with a utf8_z string.
Definition: afw_utf8.c:524
afw_utf8_z_printf_v(const afw_utf8_z_t *format_z, va_list ap, const afw_pool_t *p, afw_xctx_t *xctx)
Definition: afw_utf8.c:844
afw_utf8_printf_v(const afw_utf8_z_t *format, va_list arg, const afw_pool_t *p, afw_xctx_t *xctx)
Create a utf-8 string using a c format string in specified pool.
Definition: afw_utf8.c:477
afw_boolean_t afw_utf8_equal(const afw_utf8_t *s1, const afw_utf8_t *s2)
Check to see if a string equals another string.
afw_utf8_from_code_point(afw_utf8_octet_t utf8_z[5], afw_code_point_t cp, afw_xctx_t *xctx)
Convert a code point to utf8.
Definition: afw_utf8.c:64
afw_utf8_parse_csv(const afw_utf8_t *s, const afw_pool_t *p, afw_xctx_t *xctx)
Parse a comma-separated string into a NULL-terminated list of utf-8 strings in specified pool.
Definition: afw_utf8.c:1149
afw_utf8_clone_pointer_array(afw_size_t count, const afw_utf8_t *const *pointers, afw_boolean_t NULL_terminate, const afw_pool_t *p, afw_xctx_t *xctx)
Clone a pointer array of utf-8 to specified pool.
Definition: afw_utf8.c:862
afw_utf8_line_column_of_offset(afw_size_t *line_number, afw_size_t *column_number, const afw_utf8_t *s, afw_size_t offset, int tab_size, afw_xctx_t *xctx)
Determine the line and column of an offset in a string.
Definition: afw_utf8.c:1058
int afw_utf8_compare_ignore_case(const afw_utf8_t *s1, const afw_utf8_t *s2, afw_xctx_t *xctx)
Compare two strings ignoring case.
afw_utf8_array_to_utf8_z_with_separator(const afw_utf8_t *const *strings, const afw_utf8_t *separator, const afw_pool_t *p, afw_xctx_t *xctx)
Concat array of utf-8 with optional separator to specified pool.
Definition: afw_utf8.c:951
afw_utf8_z_create(const afw_utf8_octet_t *s, afw_size_t len, const afw_pool_t *p, afw_xctx_t *xctx)
Create a NFC Normalized zero terminated UTF-8 string in specified pool.
Definition: afw_utf8.c:366
afw_utf8_line_count_and_max_column(afw_size_t *number_of_lines, afw_size_t *max_column_number, const afw_utf8_t *s, int tab_size, afw_xctx_t *xctx)
Determine the line count and maximum column in a string.
Definition: afw_utf8.c:1111
const afw_utf8_z_t * afw_utf8_z_concat(const afw_pool_t *p, afw_xctx_t *xctx,...)
afw_utf8_printf(const afw_pool_t *p, afw_xctx_t *xctx, const afw_utf8_z_t *format,...)
Create a utf-8 string using a c format string in specified pool.
Definition: afw_utf8.c:459
afw_utf8_contains(const afw_utf8_t *s1, const afw_utf8_t *s2)
Check to see if a string contains another string.
Definition: afw_utf8.c:543
const afw_utf8_t * afw_utf8_normalize_space(const afw_utf8_t *s, const afw_pool_t *p, afw_xctx_t *xctx)
Create a utf-8 string with spaces normalized in specified pool.
afw_utf8_z_array_to_utf8_z_with_separator(const afw_utf8_z_t *const *strings_z, const afw_utf8_t *separator, const afw_pool_t *p, afw_xctx_t *xctx)
Concat array of utf-8 with optional separator to specified pool.
Definition: afw_utf8.c:994
const afw_utf8_t * afw_utf8_to_lower(const afw_utf8_t *s, const afw_pool_t *p, afw_xctx_t *xctx)
Convert utf-8 string to lower case in specified pool.
afw_boolean_t afw_utf8_starts_with(const afw_utf8_t *string, const afw_utf8_t *starts_with)
Check to see if a string starts with another string.
afw_utf8_next_code_point(const afw_utf8_octet_t *s, afw_size_t *offset, afw_size_t len, afw_xctx_t *xctx)
Get next codepoint in utf-8.
Definition: afw_utf8.c:31
const afw_utf8_t * afw_utf8_concat_v(const afw_pool_t *p, afw_xctx_t *xctx, va_list strings)
Concatenate strings with result in specified pool.
afw_utf8_array_to_utf8_with_separator(const afw_utf8_t *const *strings, const afw_utf8_t *separator, const afw_pool_t *p, afw_xctx_t *xctx)
Concat array of utf-8 with optional separator to specified pool.
Definition: afw_utf8.c:906
#define afw_utf8_create(s, len, p, xctx)
Create utf-8 string without copy unless necessary in pool specified.
Definition: afw_utf8.h:239
afw_utf8_z_source_file(const afw_utf8_z_t *source_z)
Returns value of source_z after last '/' or '\'.
Definition: afw_utf8.c:1037
@ afw_utf8_nfc_option_is_valid
Only check that input is valid UTF-8.
Definition: afw_utf8.h:78
@ afw_utf8_nfc_option_create
If s is already normalized, use it directly for result->s.
Definition: afw_utf8.h:89
@ afw_utf8_nfc_option_create_copy
If s is already normalized, make copy for result->s.
Definition: afw_utf8.h:92
@ afw_utf8_nfc_option_is_nfc
Only check that input is UTF-8 NFC normalized.
Definition: afw_utf8.h:86
Interface afw_pool public struct.
NFC normalized UTF-8 string.
Definition: afw_common.h:545
Interface afw_xctx public struct.