[svn commit] r437 - in trunk: . gc gc/tests include/minor

jimb at red-bean.com jimb at red-bean.com
Sun Apr 24 19:50:56 CDT 2005


Author: jimb
Date: Sun Apr 24 19:50:55 2005
New Revision: 437

Modified:
   trunk/TODO
   trunk/gc/Makefile.am
   trunk/gc/characters.c
   trunk/gc/convert-text-wchar.c
   trunk/gc/numbers.c
   trunk/gc/strings.c
   trunk/gc/strings.h
   trunk/gc/tests/c-api-utf8.c
   trunk/gc/utf8.c
   trunk/include/minor/minor.h
   trunk/include/minor/unicode.h
Log:
Clarify policy for when to abort and when to return an exception.

Consistently return exceptions for character set conversions that
would lose information.

Move string creation and access functions out of Unicode/C conversion
page.

* gc/utf8.c (mn_get_utf8): Return NULL to indicate an error, rather
than setting CODE_POINT to MN_UNICODE_EOF.  Fix callers.
(mn_forward_utf8): Return NULL if we encounter ill-formed UTF-8, rather
than aborting.

* gc/strings.c (mn__conversion_exception): Rename this from
conversion_exception, and make it visible within libminor.

* Makefile.am (AM_ETAGSFLAGS): New variable.  List files missed by Automake.

Doc fixes.



Modified: trunk/TODO
==============================================================================
--- trunk/TODO	(original)
+++ trunk/TODO	Sun Apr 24 19:50:55 2005
@@ -21,6 +21,8 @@
 
 * Re-prioritize to-do items.
 
+* Why use 'assert' when we have 'check'?
+
 * tests for unicode-case.c
 
 * sketch new character set conversion interface; see how well we can do
@@ -34,8 +36,14 @@
 
 * Implement better Unicode case insensitivity in reader.
 
+* Don't store a terminating null character in strings.  More
+  sophisticated representations (shared, quick-concat) won't allow
+  that, and since the strings aren't in the C execution character set
+  anyway, we can't do the trick of passing them directly to system
+  calls.
+
 * Rename 'struct string' members 'size' and 'length' to 'byte_size'
-and 'char_length'.
+  and 'char_length'.
 
 * Should mn_get_utf8 and mn_put_utf8 return a count of code units
   consumed, instead of a new pointer?  Should they take a pointer by

Modified: trunk/gc/Makefile.am
==============================================================================
--- trunk/gc/Makefile.am	(original)
+++ trunk/gc/Makefile.am	Sun Apr 24 19:50:55 2005
@@ -5,6 +5,11 @@
 
 AM_CFLAGS = -Wall -Wstrict-prototypes -Wmissing-prototypes
 AM_CPPFLAGS = -I$(top_srcdir)/include
+AM_ETAGSFLAGS =					\
+	barriers-ia-32.c			\
+	barriers-amd64.c			\
+	convert-text-wchar.c			\
+	pause-posix-tls.c
 
 libminor_la_LDFLAGS = -version-info 0:0:0
 libminor_la_LIBADD = -lpthread $(LTLIBOBJS)

Modified: trunk/gc/characters.c
==============================================================================
--- trunk/gc/characters.c	(original)
+++ trunk/gc/characters.c	Sun Apr 24 19:50:55 2005
@@ -10,6 +10,8 @@
 #include "check.h"
 #include "heap.h"
 #include "threads.h"
+#include "strings.h"
+#include "excepts.h"
 
 
 /* Type predicates.  */
@@ -110,15 +112,17 @@
 mn_from_char (mn_call *c, int c_char)
 {
   mn_ref *result;
+  mn_unicode_int_t u = mn_char_to_unicode (c_char);
 
   mn__begin_incoherent (c);
   {
-    mn_unicode_int_t u = mn_char_to_unicode (c_char);
-
     if (u == MN_UNICODE_EOF)
-      abort ();
-
-    result = mn__make_local_ref (c, mn__tag_character ((mn_unicode_t) u));
+      {
+	mn__set_exception (c, mn__conversion_exception->obj);
+	result = NULL;
+      }
+    else
+      result = mn__make_local_ref (c, mn__tag_character ((mn_unicode_t) u));
   }
   mn__end_incoherent (c);
 
@@ -130,15 +134,17 @@
 mn_from_wchar (mn_call *c, wchar_t c_char)
 {
   mn_ref *result;
+  mn_unicode_int_t u = mn_wchar_to_unicode (c_char);
 
   mn__begin_incoherent (c);
   {
-    mn_unicode_int_t u = mn_wchar_to_unicode (c_char);
-
     if (u == MN_UNICODE_EOF)
-      abort ();
-
-    result = mn__make_local_ref (c, mn__tag_character ((mn_unicode_t) u));
+      {
+	mn__set_exception (c, mn__conversion_exception->obj);
+	result = NULL;
+      }
+    else
+      result = mn__make_local_ref (c, mn__tag_character ((mn_unicode_t) u));
   }
   mn__end_incoherent (c);
 
@@ -147,22 +153,24 @@
 
 
 
-/* Converting between Minor characters and Unicode scalar values.  */
+/* Converting between Minor characters and Unicode code points.  */
+
 
 mn_ref *
-mn_from_unicode (mn_call *c, mn_unicode_t scalar)
+mn_from_unicode (mn_call *c, mn_unicode_t code_point)
 {
   mn_ref *result;
 
   mn__begin_incoherent (c);
   {
-    result = mn__make_local_ref (c, mn__tag_character (scalar));
+    result = mn__make_local_ref (c, mn__tag_character (code_point));
   }
   mn__end_incoherent (c);
 
   return result;
 }
 
+
 mn_unicode_t
 mn_to_unicode (mn_call *c, mn_ref *ref)
 {

Modified: trunk/gc/convert-text-wchar.c
==============================================================================
--- trunk/gc/convert-text-wchar.c	(original)
+++ trunk/gc/convert-text-wchar.c	Sun Apr 24 19:50:55 2005
@@ -104,8 +104,7 @@
 	     doubling it in size will produce at least that much free
 	     space.  */
 	  result_size *= 2;
-	  result = mn_xrealloc (result, 
-				    result_size * sizeof (*result));
+	  result = mn_xrealloc (result, result_size * sizeof (*result));
 	  next = result + next_offset;
 	  end = result + result_size;
 	}
@@ -147,6 +146,11 @@
       size_t produced;
 
       mem = mn_get_utf8 (mem, &u);
+      if (! mem)
+	{
+	  mn_xfree (buf);
+	  return NULL;
+	}
 
       /* Do we have enough space for an arbitrary multi-byte character?  */
       if (buf_next >= buf_end - MB_CUR_MAX)

Modified: trunk/gc/numbers.c
==============================================================================
--- trunk/gc/numbers.c	(original)
+++ trunk/gc/numbers.c	Sun Apr 24 19:50:55 2005
@@ -135,6 +135,9 @@
       mn_unicode_t u;
 
       s = mn_get_utf8 (s, &u);
+      if (! s)
+	goto ill_formed;
+
       if (u == mn_unicode_number_sign)
 	{
 	  int new_radix = 0;
@@ -142,6 +145,8 @@
 	  if (state != prefix)
 	    goto ill_formed;
 	  s = mn_get_utf8 (s, &u);
+	  if (! s)
+	    goto ill_formed;
 	  u = mn_simple_case_fold (u);
 	  if (u == mn_unicode_latin_small_letter_b)
 	    new_radix = 2;

Modified: trunk/gc/strings.c
==============================================================================
--- trunk/gc/strings.c	(original)
+++ trunk/gc/strings.c	Sun Apr 24 19:50:55 2005
@@ -21,7 +21,7 @@
 #include "excepts.h"
 
 static mn_ref *string_label;
-static mn_ref *conversion_exception;
+mn_ref *mn__conversion_exception;
 
 bool
 mn__is_string (tagged_t obj)
@@ -39,16 +39,6 @@
 }
 
 
-static struct string *
-check_string (mn_ref *ref)
-{
-  if (! mn__is_string (ref->obj))
-    abort ();
-
-  return mn__untag_string (ref->obj);
-}
-
-
 /* Construct a string from the SIZE bytes of UTF-8 text at UTF8, which
    contain CHARS coded characters.  The resulting string is a copy of
    the text at UTF8; it does not point back at the memory it is
@@ -95,7 +85,7 @@
     }
   else
     {
-      mn__set_exception (c, conversion_exception->obj);
+      mn__set_exception (c, mn__conversion_exception->obj);
       return NULL;
     }
 }
@@ -125,23 +115,19 @@
 
 
 char *
-mn_string_to_str (mn_call *c, mn_ref *string)
+mn_string_to_mem (mn_call *c, mn_ref *string, size_t *length)
 {
   char *result;
 
   mn__begin_incoherent (c);
   {
-    struct string *s = check_string (string);
-    size_t result_size;
+    struct string *s = mn__untag_string (string->obj);
 
-    /* S->size includes the terminating null; including that in the
-       text passed to the conversion function ensures that the result
-       will be null-terminated as well.  */
-    result = mn_mem_from_utf8 (s->elements, s->size, &result_size);
-
-    if (result)
-      /* Resize the result down to the first null character.  */
-      result = mn_xrealloc (result, strlen (result) + 1);
+    /* S->size includes the terminating null, so leave that out of the
+       portion we convert.  */
+    result = mn_mem_from_utf8 (s->elements, s->size - 1, length);
+    if (! result)
+      mn__set_exception (c, mn__conversion_exception->obj);
   }
   mn__end_incoherent (c);
 
@@ -150,22 +136,26 @@
 
 
 char *
-mn_string_to_mem (mn_call *c, mn_ref *string, size_t *length)
+mn_string_to_str (mn_call *c, mn_ref *string)
 {
-  char *result;
-
-  mn__begin_incoherent (c);
-  {
-    struct string *s = check_string (string);
+  size_t length;
+  char *result = mn_string_to_mem (c, string, &length);
 
-    /* S->size includes the terminating null, so leave that out of the
-       portion we convert.  */
-    result = mn_mem_from_utf8 (s->elements, s->size - 1, length);
-
-    /* Our strings should always be well-formed UTF-8.  */
-    assert (result != NULL);
-  }
-  mn__end_incoherent (c);
+  if (result)
+    {
+      /* Find the first null character (if any), and resize the string
+	 to end after that.  */
+      char *null = memchr (result, '\0', length);
+
+      if (null)
+	result = mn_xrealloc (result, null - result + 1);
+      else
+	{
+	  /* We'll have to add our own terminating null.  */
+	  result = mn_xrealloc (result, length + 1);
+	  result[length] = '\0';
+	}
+    }
 
   return result;
 }
@@ -174,19 +164,17 @@
 mn_utf8_t *
 mn_string_to_utf8 (mn_call *c, mn_ref *string, size_t *size)
 {
-  char *result;
+  mn_utf8_t *result;
 
   mn__begin_incoherent (c);
   {
-    struct string *s = check_string (string);
-    size_t content_size = s->size - 1;
+    struct string *s = mn__untag_string (string->obj);
 
     /* S->size includes the terminating null, so leave that out of the
        portion we copy.  */
-    result = mn_xmalloc (content_size);
-    memcpy (result, s->elements, content_size);
-
-    *size = content_size;
+    *size = s->size - 1;
+    result = mn_xmalloc (*size);
+    memcpy (result, s->elements, *size);
   }
   mn__end_incoherent (c);
 
@@ -195,13 +183,13 @@
 
 
 mn_ref *
-mn_string_from_str (mn_call *c, const char *str)
+mn_string_from_mem (mn_call *c, const char *mem, size_t length)
 {
   mn_ref *result;
 
   mn__begin_incoherent (c);
   {
-    struct string *s = string_from_mem (c, str, strlen (str));
+    struct string *s = string_from_mem (c, mem, length);
 
     if (s)
       result = mn__make_local_ref (c, mn__tag_labeled_object (s));
@@ -216,23 +204,9 @@
 
 
 mn_ref *
-mn_string_from_mem (mn_call *c, const char *mem, size_t length)
+mn_string_from_str (mn_call *c, const char *str)
 {
-  mn_ref *result;
-
-  mn__begin_incoherent (c);
-  {
-    struct string *s = string_from_mem (c, mem, length);
-
-    if (s)
-      result = mn__make_local_ref (c, mn__tag_labeled_object (s));
-    else
-      /* string_from_mem has set the current exception.  */
-      result = NULL;
-  }
-  mn__end_incoherent (c);
-
-  return result;
+  return mn_string_from_mem (c, str, strlen (str));
 }
 
 
@@ -253,8 +227,10 @@
     if (s)
       result = mn__make_local_ref (c, mn__tag_labeled_object (s));
     else
-      /* string_from_mem has set the current exception.  */
-      result = NULL;
+      {
+	result = NULL;
+	mn__set_exception (c, mn__conversion_exception->obj);
+      }
   }
   mn__end_incoherent (c);
 
@@ -262,7 +238,6 @@
 }
 
 
-
 int
 mn_string_length (mn_call *c, mn_ref *string_ref)
 {
@@ -270,7 +245,7 @@
 
   mn__begin_incoherent (c);
   {
-    result = check_string (string_ref)->length;
+    result = mn__untag_string (string_ref->obj)->length;
   }
   mn__end_incoherent (c);
 
@@ -285,7 +260,7 @@
   
   mn__begin_incoherent (c);
   {
-    struct string *s = check_string (string_ref);
+    struct string *s = mn__untag_string (string_ref->obj);
     const mn_utf8_t *end = s->elements + s->size;
     const mn_utf8_t *elt;
     mn_unicode_t scalar;
@@ -300,7 +275,7 @@
     elt = mn_get_utf8 (elt, &scalar);
 
     /* Make sure there really was a well-formed character there.  */
-    check (elt <= end);
+    check (elt && elt <= end);
 
     result = mn__make_local_ref (c, mn__tag_character (scalar));
   }
@@ -309,6 +284,7 @@
   return result;
 }
 
+
 
 /* Initialization.  */
 
@@ -329,11 +305,11 @@
   /* Build an exception object to return when conversion to Unicode
      fails.  */
   {
-    static const char message[] = "C to Unicode conversion failed";
+    static const char message[] = "C / Unicode conversion failed";
     tagged_t ex = mn__string_from_mem (c, message, sizeof (message) - 1);
 
     check (ex != mn__unique_false ());
     
-    conversion_exception = mn__make_global_ref (ex);
+    mn__conversion_exception = mn__make_global_ref (ex);
   }
 }

Modified: trunk/gc/strings.h
==============================================================================
--- trunk/gc/strings.h	(original)
+++ trunk/gc/strings.h	Sun Apr 24 19:50:55 2005
@@ -15,7 +15,8 @@
 
   /* The size of the 'elements' array in bytes.
 
-     This size includes the terminating null character.  */
+     This size includes the terminating null character, so that the GC
+     can use it as the size of the variable portion of the object.  */
   size_t size;
 
   /* The length of the string, in coded characters.  (Thus, if the
@@ -49,6 +50,11 @@
 tagged_t mn__string_from_mem (mn_call *, const char *mem, size_t len);
 
 
+/* An exception indicating an error converting between Unicode and the
+   C execution character set.  */
+extern mn_ref *mn__conversion_exception;
+
+
 /* Initialize the strings module.  */
 void mn__gc_strings_init (mn_call *c);
 

Modified: trunk/gc/tests/c-api-utf8.c
==============================================================================
--- trunk/gc/tests/c-api-utf8.c	(original)
+++ trunk/gc/tests/c-api-utf8.c	Sun Apr 24 19:50:55 2005
@@ -53,7 +53,7 @@
       mn_unicode_t fetched;
       
       scan = mn_get_utf8 (scan, &fetched);
-      if (fetched != code)
+      if (!scan || fetched != code)
 	{
 	  fail ();
 	  break;

Modified: trunk/gc/utf8.c
==============================================================================
--- trunk/gc/utf8.c	(original)
+++ trunk/gc/utf8.c	Sun Apr 24 19:50:55 2005
@@ -4,103 +4,103 @@
 
 #include "minor/unicode.h"
 
-/* UTF-8 encodes Unicode scalar values as series of one to four code
+/* UTF-8 encodes Unicode code points as series of one to four code
    units (which are bytes) as shown below, where the 'v' bits are the
-   bits of the scalar value.  More significant bits appear in earlier
+   bits of the code point.  More significant bits appear in earlier
    code units.
 
-    === scalar value ==	   ======== UTF-8 bytes (in binary) ================
-       0x00 --     0x7f	   0vvvvvvv				(one byte)
-       0x80 --    0x7ff	   110vvvvv 10vvvvvv			(two bytes)
-      0x800 --   0xffff	   1110vvvv 10vvvvvv 10vvvvvv		(three bytes)
-    0x10000 -- 0x1fffff	   11110vvv 10vvvvvv 10vvvvvv 10vvvvvv	(four bytes)
+    === code point ==      ======== UTF-8 bytes (in binary) ================
+       0x00 --     0x7f    0vvvvvvv                             (one byte)
+       0x80 --    0x7ff    110vvvvv 10vvvvvv                    (two bytes)
+      0x800 --   0xffff    1110vvvv 10vvvvvv 10vvvvvv           (three bytes)
+    0x10000 -- 0x1fffff    11110vvv 10vvvvvv 10vvvvvv 10vvvvvv  (four bytes)
 
    So all ASCII characters are represented as a single code unit whose
    value is equal to the ASCII code --- in other words, ASCII is
    unchanged.
 
-   The scalar value 0x0430 is represented by the two-unit sequence:
+   The code point 0x0430 is represented by the two-unit sequence:
    11010000 10110000 = 0xd0 0xb0 (code units)
       =====   ======
-      10000   110000 -> 10000110000 = 0x0430 (scalar value)
+      10000   110000 -> 10000110000 = 0x0430 (code point)
 
-   The scalar value 0x4e8c is represented by the three-unit sequence:
+   The code point 0x4e8c is represented by the three-unit sequence:
    11100100 10111010 10001100 = 0xe4 0xba 0x8c (code units)
        ====   ======   ======
-       0100   111010   001100 -> 0100111010001100 = 0x4e8c (scalar value)
+       0100   111010   001100 -> 0100111010001100 = 0x4e8c (code point)
 
    And so on.  */
 
 
 mn_utf8_t *
-mn_get_utf8 (const mn_utf8_t *utf8, mn_unicode_t *scalar_p)
+mn_get_utf8 (const mn_utf8_t *utf8, mn_unicode_t *code_point_p)
 {
-  mn_unicode_t scalar;
+  mn_unicode_t code_point;
 
   if (*utf8 <= 0x7f)
-    scalar = *utf8++;
+    code_point = *utf8++;
   else if (*utf8 <= 0xbf)
-    abort ();
+    return NULL;
   else
     {
       if (*utf8 <= 0xdf)
-	scalar = (*utf8++ & 0x1f) << 6;
+	code_point = (*utf8++ & 0x1f) << 6;
       else
 	{
 	  if (*utf8 <= 0xef)
-	    scalar = (*utf8++ & 0x0f) << 12;
+	    code_point = (*utf8++ & 0x0f) << 12;
 	  else
 	    {
 	      if (*utf8 <= 0xf7)
-		scalar = (*utf8++ & 0x07) << 18;
+		code_point = (*utf8++ & 0x07) << 18;
 	      else
-		abort ();
+		return NULL;
 
-	      scalar |= (*utf8++ & 0x3f) << 12;
+	      code_point |= (*utf8++ & 0x3f) << 12;
 	    }
 
-	  scalar |= (*utf8++ & 0x3f) << 6;
+	  code_point |= (*utf8++ & 0x3f) << 6;
 	}
 
-      scalar |= *utf8++ & 0x3f;
+      code_point |= *utf8++ & 0x3f;
     }
 
-  *scalar_p = scalar;
+  *code_point_p = code_point;
   return (mn_utf8_t *) utf8;
 }
 
 
 mn_utf8_t *
-mn_put_utf8 (mn_utf8_t *utf8, mn_unicode_t scalar)
+mn_put_utf8 (mn_utf8_t *utf8, mn_unicode_t code_point)
 {
-  mn_unicode_t s = scalar;
+  mn_unicode_t c = code_point;
 
-  if (s < 0)
+  if (c < 0)
     abort ();
-  else if (s <= 0x7f)
-    *utf8++ = s;
+  else if (c <= 0x7f)
+    *utf8++ = c;
   else
     {
-      if (s <= 0x3ff)
-	*utf8++ = 0xc0 | (s >> 6);
+      if (c <= 0x3ff)
+	*utf8++ = 0xc0 | (c >> 6);
       else
 	{
-	  if (s <= 0xffff)
-	    *utf8++ = 0xe0 | (s >> 12);
+	  if (c <= 0xffff)
+	    *utf8++ = 0xe0 | (c >> 12);
 	  else
 	    {
-	      if (s <= 0x1fffff)
-		*utf8++ = 0xf0 | (s >> 18);
+	      if (c <= 0x1fffff)
+		*utf8++ = 0xf0 | (c >> 18);
 	      else
 		abort ();
 
-	      *utf8++ = 0x80 | ((s >> 12) & 0x3f);
+	      *utf8++ = 0x80 | ((c >> 12) & 0x3f);
 	    }
 
-	  *utf8++ = 0x80 | ((s >> 6) & 0x3f);
+	  *utf8++ = 0x80 | ((c >> 6) & 0x3f);
 	}
       
-      *utf8++ = 0x80 | (s & 0x3f);
+      *utf8++ = 0x80 | (c & 0x3f);
     }
 
   return utf8;
@@ -115,7 +115,7 @@
       if (*utf8 <= 0x7f)
 	utf8++;
       else if (*utf8 <= 0xbf)
-	abort ();
+	return NULL;
       else if (*utf8 <= 0xdf)
 	utf8 += 2;
       else if (*utf8 <= 0xef)
@@ -123,7 +123,7 @@
       else if (*utf8 <= 0xf7)
 	utf8 += 4;
       else
-	abort ();
+	return NULL;
 
       if (utf8 > limit)
 	return (mn_utf8_t *) limit;

Modified: trunk/include/minor/minor.h
==============================================================================
--- trunk/include/minor/minor.h	(original)
+++ trunk/include/minor/minor.h	Sun Apr 24 19:50:55 2005
@@ -521,74 +521,35 @@
    functions in this API accept or return 'char' or 'wchar_t' values,
    or strings made from them, those values use the current C execution
    character set; the API converts to and from Minor's internal
-   representation as needed.  This means that you can use these values
-   with the standard C library functions that operate on text
+   representation as needed.  This means that you can use such values
+   with the standard C library functions for working with text
    (getchar, printf, atoi, and so on) in the normal way, without
    worrying about what representation Minor is using.
 
-   Strings of 'char' values are always treated as containing multibyte
-   characters (if the execution character set has any), never as plain
-   byte strings.
-
    Since the encoding of characters in the current C execution
    character set is determined by the current locale, the behavior of
    these functions may depend on the current locale --- specifically,
    that established for the LC_CTYPE category.
 
-
-   Reporting Conversion Errors:   
-
-   Various problems can occur during conversion:
-
-   - A multi-byte C string using a variable-width character encoding
-     scheme might be unparseable as a stream of characters.
-
-     For example, in UTF-8, the byte sequence 0x80 is an ill-formed
-     character: 0x80 may only appear in UTF-8 as part of a multi-byte
-     character, and never as its first byte.
-
-   - A well-formed stream of code points might contain code points
-     that don't correspond to characters.
-
-     For example, the byte sequence 0xed 0xb0 0x80 is a well-formed
-     UTF-8 sequence, but it represents the code point 0xdc00 --- an
-     "isolated surrogate" value reserved for use in UTF-16 encoding
-     forms, and not assigned to any character.
-
-     (The distinction being attempted here is that these errors are
-     due to a code point being unassigned in the given character set,
-     and not due to some syntactic problem in the byte sequence.)
-
-   - A stream of well-formed characters in one character set may
-     contain characters that don't exist in the other.  For example,
-     there is no character in ISO Latin-1 corresponding to the Unicode
-     character U+2638 ("Wheel of Dharma").
-
-   The functions in this API return exceptions when they encounter any
-   of the above problems, except in special cases where it is possible
-   to carry through the operation without losing information.  If
-   information would be lost, the functions always return an
-   exception.
-
-   For example, Minor characters hold Unicode code points (up to
-   U+ffffff) without regard for whether that code point is actually
-   assigned to any particular character.  In locales where the C
-   wchar_t type uses Unicode as well, the wide character L'\xdc00' can
-   be converted to a Minor character and back to a C wide character
-   without loss of information.  In this case, the conversion
-   functions may not return an exception, even though L'\xdc00' is not
-   a valid Unicode character.
-
-
-   Guaranteed Conversions
-
-   ISO C divides the execution character set into the "basic character
-   set" (roughly the upper- and lower-case letters, the digits, the
-   graphic symbols used in C syntax --- that does not include '$',
-   '@', or '`' --- and the whitespace characters), and "extended
-   characters".  Characters in the basic character set, and strings
-   containing them, may always be converted to and from Minor values
-   without error.  */
+   Errors can occur during conversion: byte strings may not be
+   well-formed encodings of code points; code points may be
+   unassigned; and characters may not exist in the destination
+   character set.
+
+   Minor reports errors that would result in the loss of information.
+   However, if a conversion can be performed without doing so, Minor
+   may carry it through; for example, if the C execution character set
+   is also Unicode, then Minor can convert arbitrary code points to characters
+   or store them in strings, even if those code points have no
+   character assigned to them.
+
+   ISO C divides the C execution character set into the "basic
+   character set" --- the upper- and lower-case letters, the digits,
+   the graphic symbols used in C syntax (all the ASCII symbols but
+   '$', '@', or '`'), the whitespace characters, and the null
+   character --- and "extended characters".  Characters in the basic
+   character set, and strings containing only such characters, may
+   always be converted to and from Minor values without error.  */
 
 
 /* Return true if REF refers to a character; otherwise, return false.  */
@@ -607,7 +568,7 @@
 /* Return CHARACTER as a C char / wchar_t.  If CHARACTER cannot be
    represented in the given type, return EOF / WEOF, and set the
    pending exception.  If CHARACTER is not a character, abort.  */
-int mn_to_char (mn_call *, mn_ref *character);
+int    mn_to_char  (mn_call *, mn_ref *character);
 wint_t mn_to_wchar (mn_call *, mn_ref *character);
 
 /* Return the Minor character corresponding to the 'char' or 'wchar_t'
@@ -627,9 +588,10 @@
    See the comments in the "Characters" section describing the general
    conventions for handling text and dealing with conversion errors.
 
-   These functions all copy the entire string for the user's use.  If
-   it's important to avoid this, then we could introduce a lease-based
-   interface here.  Leases are described in the file doc/leases.  */
+   The functions here that provide the contents of a string all
+   produce copies of the text for the user's use.  If it's important
+   to avoid this, then we could introduce a lease-based interface
+   here.  Leases are described in the file doc/leases.  */
 
 
 /* Return true if REF refers to a string; otherwise, return false.  */
@@ -647,19 +609,19 @@
    memory for the string returned is allocated using malloc; the
    caller is responsible for freeing it.
 
+   If STRING contains null characters, truncate it just before the
+   first one.  (Would it be more useful to just return the entire
+   string, embedded nulls and all, with an extra null on the end?)
+
    If STRING cannot be fully and accurately converted to the C
    execution character set, return NULL and set the pending exception.
 
-   If STRING contains null characters, truncate it just before the
-   first one.  (Would it be more helpful to just return the entire
-   string, embedded nulls and all, with an extra null on the end?) 
-
    If STRING is not a string, abort. */
 char *mn_string_to_str (mn_call *, mn_ref *string);
 
 /* Return the contents of STRING as a block of characters, and set
-   *LENGTH to its length.  The memory returned is allocated using
-   malloc; the caller is responsible for freeing it.
+   *LENGTH to its length in bytes.  The memory returned is allocated
+   using malloc; the caller is responsible for freeing it.
 
    If STRING cannot be fully and accurately converted to the C
    execution character set, return NULL and set the pending exception.
@@ -705,6 +667,10 @@
    abort.  */
 mn_ref *mn_string_to_symbol (mn_call *, mn_ref *name);
 
+/* Return the name of the symbol SYMBOL as a Minor string.  If SYMBOL
+   is not a symbol, abort.  */
+mn_ref *mn_symbol_name (mn_call *, mn_ref *symbol);
+
 /* Return the symbol whose name is the null-terminated C string NAME.
 
    Every symbol's name is a valid string.  If NAME cannot be fully and
@@ -713,7 +679,7 @@
 mn_ref *mn_symbol_from_str (mn_call *, const char *name);
 
 /* Return the name of the symbol SYMBOL, as a malloc'd block of
-   characters, and set *LENGTH to its length.
+   characters, and set *LENGTH to its length in bytes.
 
    The memory for the string returned is allocated using malloc; the
    caller is responsible for freeing it.
@@ -724,10 +690,6 @@
    If SYMBOL is not a symbol, abort.  */
 char *mn_symbol_to_mem (mn_call *, mn_ref *symbol, size_t *length);
 
-/* Return the name of the symbol SYMBOL as a Minor string.  If SYMBOL
-   is not a symbol, abort.  */
-mn_ref *mn_symbol_name (mn_call *, mn_ref *symbol);
-
 
 
 /* Procedures.  */

Modified: trunk/include/minor/unicode.h
==============================================================================
--- trunk/include/minor/unicode.h	(original)
+++ trunk/include/minor/unicode.h	Sun Apr 24 19:50:55 2005
@@ -8,21 +8,33 @@
 #include <minor/essentials.h>
 
 /* Minor uses Unicode to represent characters and strings.  Each
-   element of a Minor string is a Unicode "scalar value": a number
-   assigned to a character.
+   element of a Minor string is a Unicode code point.
 
    I found _Unicode Technical Report #17: Character Encoding Model_ to
    be really helpful in getting a handle on all the terminology
-   involved with this stuff.  The writing is clear and to the point,
-   even if the subject matter is mind-numbing.  That tech report,
+   involved with this stuff.  The writing is pretty clear and to the
+   point, given the mind-numbing subject matter.  That tech report,
    along with the rest of the Unicode spec, is available on the web
    from http://www.unicode.org/.  If you find something here that's
-   inconsistent with that, it's unintentional; please let me know
-   about it.  */
+   inconsistent with that, it's a mistake; please let me know about
+   it.
 
+   Some of the definitions we use:
 
-/* The type representing a Unicode scalar value.
-   And a type representing a Unicode scalar value plus one distinguished
+   - A "code point" is an integer in the general range covered by a
+     character set.
+
+   - A "scalar value" is a code point that's actually assigned to a
+     character.
+
+   - A "code unit" is the fixed-width unit that a variable-width
+     character encoding form is made out of.  For example, UTF-8 uses
+     eight-bit code units, while UTF-16 uses 16-bit code units.  */
+
+
+/* The type representing a Unicode code point.
+
+   And a type representing a Unicode code point plus one distinguished
    value, MN_UEOF.
 
    On systems which use ISO 10646 as the coded character set for the
@@ -47,35 +59,12 @@
 
 #endif
 
-
-/* Return a reference to the Minor character corresponding to the
-   Unicode character whose scalar value is SCALAR.  */
-mn_ref *mn_from_unicode (mn_call *, mn_unicode_t scalar);
-
-/* Return the scalar value for the Unicode character corresponding to
-   the Minor character CHARACTER.  If CHARACTER is not a Minor
-   character, abort.  */
-mn_unicode_t mn_to_unicode (mn_call *, mn_ref *character);
-
-
-/* The type representing a UTF-8 code unit.  */
+/* The type representing a UTF-8 code unit.  This is always an
+   eight-bit byte, but using the typedef may make one's intent
+   clearer.  */
 typedef uint8_t mn_utf8_t;
 
 
-/* Return a string containing the characters encoded in UTF-8 by the
-   SIZE bytes at UTF8.  If those bytes are not valid UTF-8 text, then
-   return NULL.  */
-mn_ref *mn_string_from_utf8 (mn_call *, const mn_utf8_t *utf8, size_t size);
-
-/* Return the contents of STR as UTF-8 text, and set *SIZE to its size
-   in bytes.  If STR is not a string, abort.  */
-mn_utf8_t *mn_string_to_utf8 (mn_call *, mn_ref *str, size_t *size);
-
-
-/* There should also be functions here to access string contents as
-   arrays of mn_unicode_t values.  */
-
-
 
 /* Conversions between Unicode and the C execution character set.  */
 
@@ -89,12 +78,13 @@
    mn_call arguments or references.  They use custom conventions for
    reporting errors, rather than setting the pending exception.  */
 
+
 /* Given a Unicode scalar value U, return the corresponding 'char' /
    'wchar_t' value in the current C execution character set.
 
    If U cannot be represented as a 'char' / 'wchar_t', return EOF /
    WEOF.  */
-int mn_char_from_unicode (mn_unicode_t u);
+int    mn_char_from_unicode  (mn_unicode_t u);
 wint_t mn_wchar_from_unicode (mn_unicode_t u);
 
 
@@ -102,22 +92,23 @@
    the current C execution character set, return the corresponding
    Unicode scalar value.
 
-   If there is no corresponding Unicode character, return
-   MN_UNICODE_EOF.  (Since Unicode is pretty comprehensive at this
-   point, this probably means that the input value isn't a valid
-   scalar value in the C execution character set, either.  */
-mn_unicode_int_t mn_char_to_unicode (int c);
+   If C is not assigned a character in the current C execution
+   character set, or if there is no corresponding Unicode character,
+   return MN_UNICODE_EOF.
+
+   (As a special case, if the conversion can be done without losing
+   information, these functions may go ahead and do it anyway.  For
+   example, if the current locale uses Unicode code points for
+   wchar_t, as Linux does, then these are the identity function.)  */
+mn_unicode_int_t mn_char_to_unicode  (int    c);
 mn_unicode_int_t mn_wchar_to_unicode (wint_t c);
 
 
-... /* These should distinguish the error cases described in minor.h,
-       under "Characters".  */
-
 /* Given a string / wide string MEM in the current C execution
    character set, which is LEN bytes / wchar_t values long, return the
    corresponding string in UTF-8, set *RESULT_SIZE to its size in
-   UTF-8 code units (bytes), and set *RESULT_CHARS to the number of
-   characters it encodes.
+   UTF-8 code units, and set *RESULT_CHARS to the number of characters
+   it encodes.
 
    Any null characters in MEM are converted to null characters in the
    result.
@@ -125,7 +116,7 @@
    The result is allocated using malloc; the caller is responsible for
    freeing it.
 
-   If MEM cannot be converted in its entirety to Unicode, return
+   If MEM cannot be fully and accurately converted to Unicode, return
    NULL.  */
 mn_utf8_t *mn_mem_to_utf8 (const char *mem, size_t len,
 			   size_t *result_size, size_t *result_chars);
@@ -142,7 +133,7 @@
    The result is allocated using malloc; the caller is responsible for
    freeing it.
 
-   If MEM cannot be converted in its entirety to the C execution
+   If MEM cannot be fully and accurately converted to the C execution
    character set, return NULL.  */
 char *mn_mem_from_utf8 (const mn_utf8_t *mem, size_t len, size_t *result_len);
 wchar_t *mn_wmem_from_utf8 (const mn_utf8_t *mem, size_t len,
@@ -162,29 +153,32 @@
    reporting errors, rather than setting the pending exception.  */
 
 /* The maximum length of a Unicode character encoded in UTF-8, in code
-   units (bytes).  */
+   units.  */
 #define MN_UTF8_MAX_LEN (4)
 
 /* The largest code point that can be encoded in UTF-8.
 
    (ISO 10646's version of UTF-8 actually allows characters up to six
-   bytes long, and can thus represent code points up to 0x7fffffff.
-   But there are no characters allocated in that additional area;
-   Unicode and ISO-10646 have, by agreement, identical code point
-   assignments.  So we're going to go with the Unicode limit.)  */
+   code units long, and can thus represent code points up to
+   0x7fffffff.  But there are no characters allocated in that
+   additional area; Unicode and ISO-10646 have, by agreement,
+   identical code point assignments.  So we're going to go with the
+   Unicode limit.)  */
 #define MN_UTF8_MAX_CODE_POINT (0x1fffff)
 
-/* Return the scalar value encoded in UTF-8 at UTF8.  If UTF8 does not point
-   at a well-formed UTF-8 sequence, abort.  */
-mn_utf8_t *mn_get_utf8 (const mn_utf8_t *utf8, mn_unicode_t *scalar);
-
-/* Encode the scalar value SCALAR in UTF-8 at UTF8; return the address
-   of the next code unit after the encoded character.  */
-mn_utf8_t *mn_put_utf8 (mn_utf8_t *utf8, mn_unicode_t scalar);
+/* Set *CODE_POINT to the code point encoded in UTF-8 at UTF8, and
+   return the address of the next code unit.  If UTF8 does not point
+   at a well-formed UTF-8 sequence, return NULL.  */
+mn_utf8_t *mn_get_utf8 (const mn_utf8_t *utf8, mn_unicode_t *code_point);
+
+/* Encode the code point CODE_POINT in UTF-8 at UTF8; return the
+   address of the next code unit after the encoded character.  */
+mn_utf8_t *mn_put_utf8 (mn_utf8_t *utf8, mn_unicode_t code_point);
 
 /* Scan forward N UTF-8 characters from UTF8, and return the address
    of the start of the next character.  Scan no further than LIMIT; if
-   we reach that, return LIMIT.  */
+   we reach that, return LIMIT.  If we encounter an ill-formed UTF-8
+   character, return NULL.  */
 mn_utf8_t *mn_forward_utf8 (const mn_utf8_t *utf8, size_t n,
 			    const mn_utf8_t *limit);
 
@@ -224,6 +218,39 @@
    exceptions.  */
 _Bool mn_ill_formed_number_exception_p (mn_call *, mn_ref *exception);
 
+
+
+/* Building and extracting string and character contents in Unicode.  */
+
+
+/* Return a reference to the Minor character corresponding to the
+   Unicode character whose scalar value is SCALAR.  (This actually
+   accepts any code point, not just scalar values.)  */
+mn_ref *mn_from_unicode (mn_call *, mn_unicode_t scalar);
+
+/* Return the scalar value for the Unicode character corresponding to
+   the Minor character CHARACTER.  If CHARACTER is not a Minor
+   character, abort.  (Minor characters are actually just code points;
+   we don't check if they really represent assigned characters, so
+   this can return unassigned code points as well.)  */
+mn_unicode_t mn_to_unicode (mn_call *, mn_ref *character);
+
+/* Return a string containing the characters encoded in UTF-8 by the
+   SIZE code units at UTF8.  If those code units are not valid UTF-8
+   text, then return NULL.  */
+mn_ref *mn_string_from_utf8 (mn_call *, const mn_utf8_t *utf8, size_t size);
+
+/* Return the contents of STR as UTF-8 text, and set *SIZE to its size
+   in code units.  The returned value is allocated using 'malloc'; it
+   is the caller's responsibility to free it.  If STR is not a string,
+   or if STR contains ill-formed UTF-8 text, abort.  */
+mn_utf8_t *mn_string_to_utf8 (mn_call *, mn_ref *str, size_t *size);
+
+
+/* There should also be functions here to access string contents as
+   arrays of mn_unicode_t values.  */
+
+
 
 /* Unicode character general categories.  */
 
@@ -316,7 +343,7 @@
    you want Unicode code points, unless you know that your C execution
    character set is a subset of Unicode, like ASCII.  (Hey, you never
    know when someone will want to port your code --- and Minor --- to
-   OS/400.)
+   OS/400!)
 
    Names are as they appear in the Unicode Character Database, with
    some traditional extras.  */




More information about the Minor mailing list