ACE: UTF16_Encoding_Converter.cpp Source File

00001 // UTF16_Encoding_Converter.cpp,v 4.1 2006/01/09 15:18:53 elliott_c Exp
00002 
00003 // ======================================================================
00004 //
00005 // The actual conversion methods are covered by the copyright information
00006 // below.  It is not the actual code provided by Unicode, Inc. but is an
00007 // ACE-ified and only slightly modified version.
00008 // Chad Elliott 4/28/2005
00009 //
00010 // Copyright 2001-2004 Unicode, Inc.
00011 //
00012 // Limitations on Rights to Redistribute This Code
00013 //
00014 // Unicode, Inc. hereby grants the right to freely use the information
00015 // supplied in this file in the creation of products supporting the
00016 // Unicode Standard, and to make copies of this file in any form
00017 // for internal or external distribution as long as this notice
00018 // remains attached.
00019 //
00020 // ======================================================================
00021 
00022 #include "ace/UTF16_Encoding_Converter.h"
00023 
00024 #if defined (ACE_USES_WCHAR)
00025 #include "ace/OS_NS_stdio.h"
00026 #include "ace/OS_Memory.h"
00027 #include "ace/Min_Max.h"
00028 
00029 #if !defined (__ACE_INLINE__)
00030 #include "ace/UTF16_Encoding_Converter.inl"
00031 #endif /* __ACE_INLINE__ */
00032 
00033 ACE_BEGIN_VERSIONED_NAMESPACE_DECL
00034 
00035 static const ACE_UINT32 halfShift = 10;
00036 static const ACE_UINT32 halfBase  = 0x00010000;
00037 static const ACE_UINT32 halfMask  = 0x000003FF;
00038 
00039 static const ACE_UINT32 UNI_SUR_HIGH_START   = 0x0000D800;
00040 static const ACE_UINT32 UNI_SUR_HIGH_END     = 0x0000DBFF;
00041 static const ACE_UINT32 UNI_SUR_LOW_START    = 0x0000DC00;
00042 static const ACE_UINT32 UNI_SUR_LOW_END      = 0x0000DFFF;
00043 static const ACE_UINT32 UNI_REPLACEMENT_CHAR = 0x0000FFFD;
00044 static const ACE_UINT32 UNI_MAX_BMP          = 0x0000FFFF;
00045 static const ACE_UINT32 UNI_MAX_UTF16        = 0x0010FFFF;
00046 
00047 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
00048 // into the first byte, depending on how many bytes follow.  There are
00049 // as many entries in this table as there are UTF-8 sequence types.
00050 // (I.e., one byte sequence, two byte... etc.). Remember that sequencs
00051 // for *legal* UTF-8 will be 4 or fewer bytes total.
00052 static const ACE_Byte firstByteMark[7] = { 0x00, 0x00, 0xC0,
00053                                            0xE0, 0xF0, 0xF8, 0xFC };
00054 
00055 // Index into the table below with the first byte of a UTF-8 sequence to
00056 // get the number of trailing bytes that are supposed to follow it.
00057 // Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
00058 // left as-is for anyone who may want to do such conversion, which was
00059 // allowed in earlier algorithms.
00060 static const ACE_Byte trailingBytesForUTF8[256] = {
00061     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
00062     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
00063     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
00064     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
00065     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
00066     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
00067     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00068     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
00069 };
00070 
00071 // Magic values subtracted from a buffer value during UTF8 conversion.
00072 // This table contains as many values as there might be trailing bytes
00073 // in a UTF-8 sequence.
00074 static const ACE_UINT32 offsetsFromUTF8[6] = { 0x00000000, 0x00003080,
00075                                                0x000E2080, 0x03C82080,
00076                                                0xFA082080, 0x82082080 };
00077 
00078 
00079 ACE_UTF16_Encoding_Converter::ACE_UTF16_Encoding_Converter (bool swap)
00080  : swap_ (swap)
00081 {
00082 }
00083 
00084 ACE_UTF16_Encoding_Converter::~ACE_UTF16_Encoding_Converter (void)
00085 {
00086 }
00087 
00088 ACE_UTF16_Encoding_Converter::Result
00089 ACE_UTF16_Encoding_Converter::to_utf8 (const void* source,
00090                                        size_t source_size,
00091                                        ACE_Byte* target,
00092                                        size_t target_size,
00093                                        bool strict)
00094 {
00095   static const ACE_UINT32 byteMask = 0xBF;
00096   static const ACE_UINT32 byteMark = 0x80;
00097   Result result = CONVERSION_OK;
00098 
00099   ACE_Byte* targetEnd = target + target_size;
00100   const ACE_UINT16* sourceStart = static_cast<const ACE_UINT16*> (source);
00101   const ACE_UINT16* sourceEnd   = sourceStart +
00102                                   (source_size / sizeof (ACE_UINT16));
00103 
00104   while (sourceStart < sourceEnd)
00105     {
00106       ACE_UINT16 nw = *sourceStart++;
00107       ACE_UINT32 ch = (this->swap_ ? ACE_SWAP_WORD (nw) : nw);
00108 
00109       // If we have a surrogate pair, convert to ACE_UINT32 first.
00110       if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
00111         {
00112           // If the 16 bits following the high surrogate are in the
00113           // sourceStart buffer...
00114           if (sourceStart < sourceEnd)
00115             {
00116               ACE_UINT32 ch2 = (this->swap_ ? ACE_SWAP_WORD (*sourceStart) :
00117                                               *sourceStart);
00118               // If it's a low surrogate, convert to ACE_UINT32.
00119               if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
00120                 {
00121                   ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
00122                     + (ch2 - UNI_SUR_LOW_START) + halfBase;
00123                   ++sourceStart;
00124                 }
00125               else if (strict)
00126                 {
00127                   // it's an unpaired high surrogate
00128                   result = SOURCE_ILLEGAL;
00129                   break;
00130                 }
00131             }
00132           else
00133             {
00134               // We don't have the 16 bits following the high surrogate.
00135               result = SOURCE_EXHAUSTED;
00136               break;
00137             }
00138         }
00139       else if (strict)
00140         {
00141           // UTF-16 surrogate values are illegal in UTF-32
00142           if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
00143             {
00144               result = SOURCE_ILLEGAL;
00145               break;
00146             }
00147         }
00148 
00149       // Figure out how many bytes the result will require
00150       unsigned short bytesToWrite = 0;
00151       if (ch < 0x80)
00152         bytesToWrite = 1;
00153       else if (ch < 0x800)
00154         bytesToWrite = 2;
00155       else if (ch < 0x10000)
00156         bytesToWrite = 3;
00157       else if (ch < 0x110000)
00158         bytesToWrite = 4;
00159       else
00160         {
00161           bytesToWrite = 3;
00162           ch = UNI_REPLACEMENT_CHAR;
00163         }
00164 
00165       target += bytesToWrite;
00166       if (target > targetEnd)
00167         {
00168           result = TARGET_EXHAUSTED;
00169           break;
00170         }
00171 
00172       // NOTE: Everything falls through for efficiency purposes.
00173       switch (bytesToWrite)
00174         {
00175         case 4:
00176           *--target = (ACE_Byte)((ch | byteMark) & byteMask);
00177           ch >>= 6;
00178         case 3:
00179           *--target = (ACE_Byte)((ch | byteMark) & byteMask);
00180           ch >>= 6;
00181         case 2:
00182           *--target = (ACE_Byte)((ch | byteMark) & byteMask);
00183           ch >>= 6;
00184         case 1:
00185           *--target = (ACE_Byte)(ch | firstByteMark[bytesToWrite]);
00186         }
00187       target += bytesToWrite;
00188     }
00189 
00190   return result;
00191 }
00192 
00193 ACE_UTF16_Encoding_Converter::Result
00194 ACE_UTF16_Encoding_Converter::from_utf8 (const ACE_Byte* source,
00195                                          size_t source_size,
00196                                          void* target,
00197                                          size_t target_size,
00198                                          bool strict)
00199 {
00200   Result result = CONVERSION_OK;
00201   const ACE_Byte* sourceEnd = source + source_size;
00202   ACE_UINT16* targetStart   = static_cast<ACE_UINT16*> (target);
00203   ACE_UINT16* targetEnd     = targetStart + target_size;
00204 
00205   while (source < sourceEnd)
00206     {
00207       ACE_UINT32 ch = 0;
00208       unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
00209       if (source + extraBytesToRead >= sourceEnd)
00210         {
00211           result = SOURCE_EXHAUSTED;
00212           break;
00213         }
00214 
00215       // Do this check whether lenient or strict
00216       if (!this->is_legal_utf8 (source, extraBytesToRead + 1))
00217         {
00218           result = SOURCE_ILLEGAL;
00219           break;
00220         }
00221 
00222       // The cases all fall through. See "Note A" below.
00223       switch (extraBytesToRead)
00224         {
00225         case 5: // remember, illegal UTF-8
00226           ch += *source++;
00227           ch <<= 6;
00228         case 4: // remember, illegal UTF-8
00229           ch += *source++;
00230           ch <<= 6;
00231         case 3:
00232           ch += *source++;
00233           ch <<= 6;
00234         case 2:
00235           ch += *source++;
00236           ch <<= 6;
00237         case 1:
00238           ch += *source++;
00239           ch <<= 6;
00240         case 0:
00241           ch += *source++;
00242       }
00243       ch -= offsetsFromUTF8[extraBytesToRead];
00244 
00245       if (targetStart >= targetEnd)
00246         {
00247           result = TARGET_EXHAUSTED;
00248           break;
00249         }
00250 
00251       if (ch <= UNI_MAX_BMP) // Target is a character <= 0xFFFF
00252         {
00253           // UTF-16 surrogate values are illegal in UTF-32
00254           if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
00255             {
00256               if (strict)
00257                 {
00258                   result = SOURCE_ILLEGAL;
00259                   break;
00260                 }
00261               else
00262                 {
00263                   *targetStart++ = UNI_REPLACEMENT_CHAR;
00264                 }
00265             }
00266           else
00267             {
00268               *targetStart++ = (ACE_UINT16)ch;
00269             }
00270         }
00271       else if (ch > UNI_MAX_UTF16)
00272         {
00273           if (strict)
00274             {
00275               result = SOURCE_ILLEGAL;
00276               break;
00277             }
00278           else
00279             {
00280               *targetStart++ = UNI_REPLACEMENT_CHAR;
00281             }
00282         }
00283       else
00284         {
00285           // targetStart is a character in range 0xFFFF - 0x10FFFF.
00286           if (targetStart + 1 >= targetEnd)
00287             {
00288               result = TARGET_EXHAUSTED;
00289               break;
00290             }
00291           ch -= halfBase;
00292           *targetStart++ = (ACE_UINT16)((ch >> halfShift) + UNI_SUR_HIGH_START);
00293           *targetStart++ = (ACE_UINT16)((ch & halfMask) + UNI_SUR_LOW_START);
00294         }
00295     }
00296 
00297   return result;
00298 }
00299 
00300 ACE_UTF16_Encoding_Converter*
00301 ACE_UTF16_Encoding_Converter::encoded (const ACE_Byte* source,
00302                                        size_t source_size)
00303 {
00304   static const size_t begin = 16;
00305   static const size_t converted = begin * 4;
00306 
00307   ACE_Byte target[converted];
00308   ACE_UTF16_Encoding_Converter* converter;
00309   ACE_NEW_RETURN (converter,
00310                   ACE_UTF16_Encoding_Converter (false),
00311                   0);
00312   if (converter->to_utf8 (source,
00313                           ACE_MIN (begin, source_size),
00314                           target,
00315                           converted) == CONVERSION_OK)
00316     {
00317       return converter;
00318     }
00319   else
00320     {
00321       delete converter;
00322     }
00323 
00324   return 0;
00325 }
00326 
00327 ACE_UINT32
00328 ACE_UTF16_Encoding_Converter::get_UNI_SUR_HIGH_START (void)
00329 {
00330   return UNI_SUR_HIGH_START;
00331 }
00332 
00333 ACE_UINT32
00334 ACE_UTF16_Encoding_Converter::get_UNI_SUR_LOW_END (void)
00335 {
00336   return UNI_SUR_LOW_END;
00337 }
00338 
00339 ACE_UINT32
00340 ACE_UTF16_Encoding_Converter::get_UNI_REPLACEMENT_CHAR (void)
00341 {
00342   return UNI_REPLACEMENT_CHAR;
00343 }
00344 
00345 const ACE_Byte*
00346 ACE_UTF16_Encoding_Converter::get_first_byte_mark (void)
00347 {
00348   return firstByteMark;
00349 }
00350 
00351 const ACE_Byte*
00352 ACE_UTF16_Encoding_Converter::get_trailing_bytes_for_utf8 (void)
00353 {
00354   return trailingBytesForUTF8;
00355 }
00356 
00357 const ACE_UINT32*
00358 ACE_UTF16_Encoding_Converter::get_offsets_from_utf8 (void)
00359 {
00360   return offsetsFromUTF8;
00361 }
00362 
00363 ACE_END_VERSIONED_NAMESPACE_DECL
00364 #endif /* ACE_USES_WCHAR */