This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

(C++) patch to add universal-character-names


This patch accomplishes the straightforward bits of adding UCNs to
g++.  The remaining part is figuring out what to do with UCNs in
identifiers; how do we encode them in the names?  How do we mangle
them in the assembly output?  What code can we share with the Java
frontend?

2000-03-07  Jason Merrill  <jason@casey.cygnus.com>

	Add initial support for '\uNNNN' specifier.
	* lex.c (read_ucs): New fn.
	(readescape, skip_white_space): Call it.
	(is_extended_char, is_extended_char_1): New fns.
	(utf8_extend_token): New fn, #if 0'd out.
	(real_yylex): Treat extended chars like letters.

Index: lex.c
===================================================================
RCS file: /cvs/gcc/egcs/gcc/cp/lex.c,v
retrieving revision 1.183
diff -c -p -r1.183 lex.c
*** lex.c	2000/03/03 02:27:15	1.183
--- lex.c	2000/03/08 08:41:08
*************** static int read_line_number PARAMS ((int
*** 90,95 ****
--- 90,98 ----
  static int token_getch PARAMS ((void));
  static void token_put_back PARAMS ((int));
  static void mark_impl_file_chain PARAMS ((void *));
+ static int read_ucs PARAMS ((int));
+ static int is_extended_char PARAMS ((int));
+ static int is_extended_char_1 PARAMS ((int));
  
  /* Given a file name X, return the nondirectory portion.
     Keep in mind that X can be computed more than once.  */
*************** skip_white_space (c)
*** 2236,2245 ****
  	case '\\':
  	  c = getch ();
  	  if (c == '\n')
! 	    lineno++;
  	  else
  	    error ("stray '\\' in program");
- 	  c = getch ();
  	  break;
  
  	default:
--- 2239,2254 ----
  	case '\\':
  	  c = getch ();
  	  if (c == '\n')
! 	    {
! 	      lineno++;
! 	      c = getch ();
! 	    }
! 	  else if (c == 'u')
! 	    c = read_ucs (4);
! 	  else if (c == 'U')
! 	    c = read_ucs (8);
  	  else
  	    error ("stray '\\' in program");
  	  break;
  
  	default:
*************** do_pending_lang_change ()
*** 2799,2804 ****
--- 2808,3183 ----
      pop_lang_context ();
  }
  
+ /* Parse a '\uNNNN' or '\UNNNNNNNN' sequence.
+    LENGTH is the number of hex digits to read (4 for \u, 8 for \U).
+    [lex.charset]: The character designated by the universal-character-name 
+    \UNNNNNNNN is that character whose character short name in ISO/IEC 10646
+    is NNNNNNNN; the character designated by the universal-character-name
+    \uNNNN is that character whose character short name in ISO/IEC 10646 is
+    0000NNNN. If the hexadecimal value for a universal character name is
+    less than 0x20 or in the range 0x7F-0x9F (inclusive), or if the
+    universal character name designates a character in the basic source
+    character set, then the program is ill-formed.
+ 
+    For now, we just assume that wchar_t is Unicode, so we don't need to do
+    any mapping.  */
+ 
+ static int
+ read_ucs (length)
+      int length;
+ {
+   unsigned int code = 0;
+   int c;
+ 
+   for (; length; --length)
+     {
+       c = getch ();
+       if (! ISXDIGIT (c))
+ 	{
+ 	  error ("non hex digit '%c' in universal-character-name", c);
+ 	  put_back (c);
+ 	  break;
+ 	}
+       code <<= 4;	/* Make room for the next hex digit.  */
+       if (c >= 'a' && c <= 'f')
+ 	code += c - 'a' + 10;
+       if (c >= 'A' && c <= 'F')
+ 	code += c - 'A' + 10;
+       if (c >= '0' && c <= '9')
+ 	code += c - '0';
+     }
+ 
+ #ifdef TARGET_EBCDIC
+   sorry ("universal-character-name on EBCDIC target");
+   return 0x3F;
+ #endif
+ 
+   if (code > 0x9f && !(code & 0x80000000))
+     /* True extended character, OK.  */;
+   else if (code >= 0x20 && code < 0x7f)
+     {
+       /* ASCII printable character.  The C character set consists of all of
+ 	 these except $, @ and `.  We use hex escapes so that this also
+ 	 works with EBCDIC hosts.  */
+       if (code != 0x24 && code != 0x40 && code != 0x60)
+ 	error ("universal-character-name designates `%c', part of the basic source character set", code);
+     }
+   else	/* Below 0x20, in 0x7f-0x9f, or with the top bit set.  */
+     error ("invalid universal-character-name");
+   return code;	/* Returned even if an error was given above.  */
+ }
+ 
+ /* Returns nonzero if C is a universal-character-name.  Give an error if it
+    is not one which may appear in an identifier, as per [extendid].  */
+ 
+ static inline int
+ is_extended_char (c)
+      int c;
+ {
+ #ifdef TARGET_EBCDIC
+   return 0;	/* No extended characters on EBCDIC targets (yet).  */
+ #else
+   /* ASCII, including DEL (0x7f), is never an extended character.  */
+   if (c < 0x80)
+     return 0;
+   
+   return is_extended_char_1 (c);
+ #endif
+ }
+ 
+ static int
+ is_extended_char_1 (c)
+      int c;
+ {
+   /* Always returns 1; gives an error unless C may appear in an identifier.
+      None of the valid chars are outside the BMP (the low 16 bits).  */
+   if (c > 0xffff)
+     {
+       error ("universal-character-name `\\U%08x' not valid in identifier", c);
+       return 1;
+     }
+   
+   /* Latin */
+   if ((c >= 0x00c0 && c <= 0x00d6)
+       || (c >= 0x00d8 && c <= 0x00f6)
+       || (c >= 0x00f8 && c <= 0x01f5)
+       || (c >= 0x01fa && c <= 0x0217)
+       || (c >= 0x0250 && c <= 0x02a8)
+       || (c >= 0x1e00 && c <= 0x1e9a)
+       || (c >= 0x1ea0 && c <= 0x1ef9))
+     return 1;
+ 
+   /* Greek */
+   if ((c == 0x0384)
+       || (c >= 0x0388 && c <= 0x038a)
+       || (c == 0x038c)
+       || (c >= 0x038e && c <= 0x03a1)
+       || (c >= 0x03a3 && c <= 0x03ce)
+       || (c >= 0x03d0 && c <= 0x03d6)
+       || (c == 0x03da)
+       || (c == 0x03dc)
+       || (c == 0x03de)
+       || (c == 0x03e0)
+       || (c >= 0x03e2 && c <= 0x03f3)
+       || (c >= 0x1f00 && c <= 0x1f15)
+       || (c >= 0x1f18 && c <= 0x1f1d)
+       || (c >= 0x1f20 && c <= 0x1f45)
+       || (c >= 0x1f48 && c <= 0x1f4d)
+       || (c >= 0x1f50 && c <= 0x1f57)
+       || (c == 0x1f59)
+       || (c == 0x1f5b)
+       || (c == 0x1f5d)
+       || (c >= 0x1f5f && c <= 0x1f7d)
+       || (c >= 0x1f80 && c <= 0x1fb4)
+       || (c >= 0x1fb6 && c <= 0x1fbc)
+       || (c >= 0x1fc2 && c <= 0x1fc4)
+       || (c >= 0x1fc6 && c <= 0x1fcc)
+       || (c >= 0x1fd0 && c <= 0x1fd3)
+       || (c >= 0x1fd6 && c <= 0x1fdb)
+       || (c >= 0x1fe0 && c <= 0x1fec)
+       || (c >= 0x1ff2 && c <= 0x1ff4)
+       || (c >= 0x1ff6 && c <= 0x1ffc))
+     return 1;
+ 
+   /* Cyrillic */
+   if ((c >= 0x0401 && c <= 0x040d)
+       || (c >= 0x040f && c <= 0x044f)
+       || (c >= 0x0451 && c <= 0x045c)
+       || (c >= 0x045e && c <= 0x0481)
+       || (c >= 0x0490 && c <= 0x04c4)
+       || (c >= 0x04c7 && c <= 0x04c8)
+       || (c >= 0x04cb && c <= 0x04cc)
+       || (c >= 0x04d0 && c <= 0x04eb)
+       || (c >= 0x04ee && c <= 0x04f5)
+       || (c >= 0x04f8 && c <= 0x04f9))
+     return 1;
+ 
+   /* Armenian */
+   if ((c >= 0x0531 && c <= 0x0556)
+       || (c >= 0x0561 && c <= 0x0587))
+     return 1;
+ 
+   /* Hebrew */
+   if ((c >= 0x05d0 && c <= 0x05ea)
+       || (c >= 0x05f0 && c <= 0x05f4))
+     return 1;
+ 
+   /* Arabic */
+   if ((c >= 0x0621 && c <= 0x063a)
+       || (c >= 0x0640 && c <= 0x0652)
+       || (c >= 0x0670 && c <= 0x06b7)
+       || (c >= 0x06ba && c <= 0x06be)
+       || (c >= 0x06c0 && c <= 0x06ce)
+       || (c >= 0x06e5 && c <= 0x06e7))
+     return 1;
+ 
+   /* Devanagari */
+   if ((c >= 0x0905 && c <= 0x0939)
+       || (c >= 0x0958 && c <= 0x0962))
+     return 1;
+ 
+   /* Bengali */
+   if ((c >= 0x0985 && c <= 0x098c)
+       || (c >= 0x098f && c <= 0x0990)
+       || (c >= 0x0993 && c <= 0x09a8)
+       || (c >= 0x09aa && c <= 0x09b0)
+       || (c == 0x09b2)
+       || (c >= 0x09b6 && c <= 0x09b9)
+       || (c >= 0x09dc && c <= 0x09dd)
+       || (c >= 0x09df && c <= 0x09e1)
+       || (c >= 0x09f0 && c <= 0x09f1))
+     return 1;
+ 
+   /* Gurmukhi */
+   if ((c >= 0x0a05 && c <= 0x0a0a)
+       || (c >= 0x0a0f && c <= 0x0a10)
+       || (c >= 0x0a13 && c <= 0x0a28)
+       || (c >= 0x0a2a && c <= 0x0a30)
+       || (c >= 0x0a32 && c <= 0x0a33)
+       || (c >= 0x0a35 && c <= 0x0a36)
+       || (c >= 0x0a38 && c <= 0x0a39)
+       || (c >= 0x0a59 && c <= 0x0a5c)
+       || (c == 0x0a5e))
+     return 1;
+ 
+   /* Gujarati */
+   if ((c >= 0x0a85 && c <= 0x0a8b)
+       || (c == 0x0a8d)
+       || (c >= 0x0a8f && c <= 0x0a91)
+       || (c >= 0x0a93 && c <= 0x0aa8)
+       || (c >= 0x0aaa && c <= 0x0ab0)
+       || (c >= 0x0ab2 && c <= 0x0ab3)
+       || (c >= 0x0ab5 && c <= 0x0ab9)
+       || (c == 0x0ae0))
+     return 1;
+ 
+   /* Oriya */
+   if ((c >= 0x0b05 && c <= 0x0b0c)
+       || (c >= 0x0b0f && c <= 0x0b10)
+       || (c >= 0x0b13 && c <= 0x0b28)
+       || (c >= 0x0b2a && c <= 0x0b30)
+       || (c >= 0x0b32 && c <= 0x0b33)
+       || (c >= 0x0b36 && c <= 0x0b39)
+       || (c >= 0x0b5c && c <= 0x0b5d)
+       || (c >= 0x0b5f && c <= 0x0b61))
+     return 1;
+ 
+   /* Tamil */
+   if ((c >= 0x0b85 && c <= 0x0b8a)
+       || (c >= 0x0b8e && c <= 0x0b90)
+       || (c >= 0x0b92 && c <= 0x0b95)
+       || (c >= 0x0b99 && c <= 0x0b9a)
+       || (c == 0x0b9c)
+       || (c >= 0x0b9e && c <= 0x0b9f)
+       || (c >= 0x0ba3 && c <= 0x0ba4)
+       || (c >= 0x0ba8 && c <= 0x0baa)
+       || (c >= 0x0bae && c <= 0x0bb5)
+       || (c >= 0x0bb7 && c <= 0x0bb9))
+     return 1;
+ 
+   /* Telugu */
+   if ((c >= 0x0c05 && c <= 0x0c0c)
+       || (c >= 0x0c0e && c <= 0x0c10)
+       || (c >= 0x0c12 && c <= 0x0c28)
+       || (c >= 0x0c2a && c <= 0x0c33)
+       || (c >= 0x0c35 && c <= 0x0c39)
+       || (c >= 0x0c60 && c <= 0x0c61))
+     return 1;
+ 
+   /* Kannada */
+   if ((c >= 0x0c85 && c <= 0x0c8c)
+       || (c >= 0x0c8e && c <= 0x0c90)
+       || (c >= 0x0c92 && c <= 0x0ca8)
+       || (c >= 0x0caa && c <= 0x0cb3)
+       || (c >= 0x0cb5 && c <= 0x0cb9)
+       || (c >= 0x0ce0 && c <= 0x0ce1))
+     return 1;
+ 
+   /* Malayalam */
+   if ((c >= 0x0d05 && c <= 0x0d0c)
+       || (c >= 0x0d0e && c <= 0x0d10)
+       || (c >= 0x0d12 && c <= 0x0d28)
+       || (c >= 0x0d2a && c <= 0x0d39)
+       || (c >= 0x0d60 && c <= 0x0d61))
+     return 1;
+ 
+   /* Thai */
+   if ((c >= 0x0e01 && c <= 0x0e30)
+       || (c >= 0x0e32 && c <= 0x0e33)
+       || (c >= 0x0e40 && c <= 0x0e46)
+       || (c >= 0x0e4f && c <= 0x0e5b))
+     return 1;
+ 
+   /* Lao */
+   if ((c >= 0x0e81 && c <= 0x0e82)
+       || (c == 0x0e84)
+       || (c == 0x0e87)
+       || (c == 0x0e88)
+       || (c == 0x0e8a)
+       || (c == 0x0e8d)	/* Not 0x0e0d, which is Thai (accepted above).  */
+       || (c >= 0x0e94 && c <= 0x0e97)
+       || (c >= 0x0e99 && c <= 0x0e9f)
+       || (c >= 0x0ea1 && c <= 0x0ea3)
+       || (c == 0x0ea5)
+       || (c == 0x0ea7)
+       || (c == 0x0eaa)
+       || (c == 0x0eab)
+       || (c >= 0x0ead && c <= 0x0eb0)
+       || (c == 0x0eb2)
+       || (c == 0x0eb3)
+       || (c == 0x0ebd)
+       || (c >= 0x0ec0 && c <= 0x0ec4)
+       || (c == 0x0ec6))
+     return 1;
+ 
+   /* Georgian */
+   if ((c >= 0x10a0 && c <= 0x10c5)
+       || (c >= 0x10d0 && c <= 0x10f6))
+     return 1;
+ 
+   /* Hiragana */
+   if ((c >= 0x3041 && c <= 0x3094)
+       || (c >= 0x309b && c <= 0x309e))
+     return 1;
+ 
+   /* Katakana */
+   if ((c >= 0x30a1 && c <= 0x30fe))
+     return 1;
+ 
+   /* Bopomofo */
+   if ((c >= 0x3105 && c <= 0x312c))
+     return 1;
+ 
+   /* Hangul */
+   if ((c >= 0x1100 && c <= 0x1159)
+       || (c >= 0x1161 && c <= 0x11a2)
+       || (c >= 0x11a8 && c <= 0x11f9))
+     return 1;
+ 
+   /* CJK Unified Ideographs */
+   if ((c >= 0xf900 && c <= 0xfa2d)
+       || (c >= 0xfb1f && c <= 0xfb36)
+       || (c >= 0xfb38 && c <= 0xfb3c)
+       || (c == 0xfb3e)
+       || (c >= 0xfb40 && c <= 0xfb41)
+       || (c >= 0xfb42 && c <= 0xfb44)
+       || (c >= 0xfb46 && c <= 0xfbb1)
+       || (c >= 0xfbd3 && c <= 0xfd3f)
+       || (c >= 0xfd50 && c <= 0xfd8f)
+       || (c >= 0xfd92 && c <= 0xfdc7)
+       || (c >= 0xfdf0 && c <= 0xfdfb)
+       || (c >= 0xfe70 && c <= 0xfe72)
+       || (c == 0xfe74)
+       || (c >= 0xfe76 && c <= 0xfefc)
+       || (c >= 0xff21 && c <= 0xff3a)
+       || (c >= 0xff41 && c <= 0xff5a)
+       || (c >= 0xff66 && c <= 0xffbe)
+       || (c >= 0xffc2 && c <= 0xffc7)
+       || (c >= 0xffca && c <= 0xffcf)
+       || (c >= 0xffd2 && c <= 0xffd7)
+       || (c >= 0xffda && c <= 0xffdc)
+       || (c >= 0x4e00 && c <= 0x9fa5))
+     return 1;
+ 
+   error ("universal-character-name `\\u%04x' not valid in identifier", c);
+   return 1;
+ }
+ 
+ #if 0
+ /* Add the UTF-8 representation of C to the token_buffer.  */
+ 
+ static void
+ utf8_extend_token (c)
+      int c;
+ {
+   int shift, mask;
+ 
+   if      (c <= 0x0000007f)
+     {
+       extend_token (c);
+       return;
+     }
+   else if (c <= 0x000007ff)
+     shift = 6, mask = 0xc0;
+   else if (c <= 0x0000ffff)
+     shift = 12, mask = 0xe0;
+   else if (c <= 0x001fffff)
+     shift = 18, mask = 0xf0;
+   else if (c <= 0x03ffffff)
+     shift = 24, mask = 0xf8;
+   else
+     shift = 30, mask = 0xfc;
+ 
+   extend_token (mask | (c >> shift));	/* Lead byte.  */
+   do
+     {
+       shift -= 6;	/* Each continuation byte holds six bits of C.  */
+       extend_token ((unsigned char) (0x80 | ((c >> shift) & 0x3f)));
+     }
+   while (shift);
+ }
+ #endif
+ 
  #define ENDFILE -1  /* token that represents end-of-file */
  
  /* Read an escape sequence, returning its equivalent as a character,
*************** readescape (ignore_ptr)
*** 2869,2874 ****
--- 3248,3258 ----
        put_back (c);
        return code;
  
+     case 'U':
+       return read_ucs (8);
+     case 'u':
+       return read_ucs (4);
+ 
      case '\\': case '\'': case '"':
        return c;
  
*************** real_yylex ()
*** 3542,3549 ****
      case 'z':
      case '_':
      case '$':
- #if USE_CPPLIB
      letter:
        if (cpp_token == CPP_NAME)
  	{
  	  /* Note that one character has already been read from
--- 3926,3933 ----
      case 'z':
      case '_':
      case '$':
      letter:
+ #if USE_CPPLIB
        if (cpp_token == CPP_NAME)
  	{
  	  /* Note that one character has already been read from
*************** real_yylex ()
*** 3561,3582 ****
  #endif
  	{
  	  p = token_buffer;
! 	  while (ISALNUM (c) || (c == '_') || c == '$')
  	    {
  	      /* Make sure this char really belongs in an identifier.  */
! 	      if (c == '$')
  		{
  		  if (! dollars_in_ident)
  		    error ("`$' in identifier");
  		  else if (pedantic)
  		    pedwarn ("`$' in identifier");
  		}
  
  	      if (p >= token_buffer + maxtoken)
  		p = extend_token_buffer (p);
  
  	      *p++ = c;
  	      c = token_getch ();
  	    }
  
  	  *p = 0;
--- 3945,3987 ----
  #endif
  	{
  	  p = token_buffer;
! 	  while (1)
  	    {
  	      /* Make sure this char really belongs in an identifier.  */
! 	      if (ISALNUM (c) || c == '_')
! 		/* OK */;
! 	      else if (c == '$')
  		{
  		  if (! dollars_in_ident)
  		    error ("`$' in identifier");
  		  else if (pedantic)
  		    pedwarn ("`$' in identifier");
  		}
+ 	      /* FIXME we should use some sort of multibyte character
+ 		 encoding.  Locale-dependent?  Always UTF-8?  */
+ 	      else if (is_extended_char (c))
+ 		{
+ 		  sorry ("universal characters in identifiers");
+ 		  c = '_';
+ 		}
+ 	      else
+ 		break;
  
  	      if (p >= token_buffer + maxtoken)
  		p = extend_token_buffer (p);
  
  	      *p++ = c;
+ 
+ 	    idtryagain:
  	      c = token_getch ();
+ 	      
+ 	      if (c == '\\')
+ 		{
+ 		  int ignore = 0;
+ 		  c = readescape (&ignore);
+ 		  if (ignore)
+ 		    goto idtryagain;
+ 		}
  	    }
  
  	  *p = 0;
*************** real_yylex ()
*** 4634,4639 ****
--- 5039,5046 ----
        break;
  
      default:
+       if (is_extended_char (c))
+ 	goto letter;
        value = c;
      }
  

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]