ACE: UTF32_Encoding_Converter.cpp Source File

00001 // UTF32_Encoding_Converter.cpp,v 4.2 2006/03/12 19:15:22 jwillemsen Exp
00002 
00003 // ======================================================================
00004 //
00005 // The actual conversion methods are covered by the copyright information
00006 // below.  It is not the actual code provided by Unicode, Inc. but is an
00007 // ACE-ified and only slightly modified version.
00008 //
00009 // Chad Elliott 4/28/2005
00010 //
00011 // Copyright 2001-2004 Unicode, Inc.
00012 //
00013 // Limitations on Rights to Redistribute This Code
00014 //
00015 // Unicode, Inc. hereby grants the right to freely use the information
00016 // supplied in this file in the creation of products supporting the
00017 // Unicode Standard, and to make copies of this file in any form
00018 // for internal or external distribution as long as this notice
00019 // remains attached.
00020 //
00021 // ======================================================================
00022 
00023 #include "ace/UTF32_Encoding_Converter.h"
00024 
00025 #if defined (ACE_USES_WCHAR)
00026 #include "ace/OS_NS_stdio.h"
00027 #include "ace/OS_Memory.h"
00028 #include "ace/Min_Max.h"
00029 
00030 ACE_BEGIN_VERSIONED_NAMESPACE_DECL
00031 
00032 static const ACE_UINT32 UNI_MAX_LEGAL_UTF32 = 0x0010FFFF;
00033 
00034 ACE_UTF32_Encoding_Converter::ACE_UTF32_Encoding_Converter (bool swap)
00035  : ACE_UTF16_Encoding_Converter (swap)
00036 {
00037 }
00038 
00039 ACE_UTF32_Encoding_Converter::~ACE_UTF32_Encoding_Converter (void)
00040 {
00041 }
00042 
00043 ACE_UTF32_Encoding_Converter::Result
00044 ACE_UTF32_Encoding_Converter::to_utf8 (const void* source,
00045                                        size_t source_size,
00046                                        ACE_Byte* target,
00047                                        size_t target_size,
00048                                        bool strict)
00049 {
00050   static const ACE_UINT32 byteMask = 0xBF;
00051   static const ACE_UINT32 byteMark = 0x80;
00052   static const ACE_UINT32 UNI_SUR_HIGH_START = get_UNI_SUR_HIGH_START ();
00053   static const ACE_UINT32 UNI_SUR_LOW_END = get_UNI_SUR_LOW_END ();
00054   static const ACE_Byte* firstByteMark = get_first_byte_mark ();
00055 
00056   Result result = CONVERSION_OK;
00057   ACE_Byte* targetEnd = target + target_size;
00058   const ACE_UINT32* sourceStart = static_cast<const ACE_UINT32*> (source);
00059   const ACE_UINT32* sourceEnd = sourceStart + (source_size / sizeof (ACE_UINT32));
00060 
00061   while (sourceStart < sourceEnd)
00062     {
00063       ACE_UINT32 nw = *sourceStart++;
00064       ACE_UINT32 ch = (this->swap_ ? ACE_SWAP_LONG (nw) : nw);
00065       unsigned short bytesToWrite = 0;
00066 
00067       if (strict)
00068         {
00069           // UTF-16 surrogate values are illegal in UTF-32
00070           if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
00071             {
00072               result = SOURCE_ILLEGAL;
00073               break;
00074             }
00075         }
00076 
00077       // Figure out how many bytes the result will require. Turn any
00078       // illegally large ACE_UINT32 things (> Plane 17) into replacement
00079       // chars.
00080       if (ch < 0x80)
00081         {
00082           bytesToWrite = 1;
00083         }
00084       else if (ch < 0x800)
00085         {
00086           bytesToWrite = 2;
00087         }
00088       else if (ch < 0x10000)
00089         {
00090           bytesToWrite = 3;
00091         }
00092       else if (ch <= UNI_MAX_LEGAL_UTF32)
00093         {
00094           bytesToWrite = 4;
00095         }
00096       else
00097         {
00098           result = SOURCE_ILLEGAL;
00099           break;
00100         }
00101 
00102       target += bytesToWrite;
00103       if (target > targetEnd)
00104         {
00105           result = TARGET_EXHAUSTED;
00106           break;
00107         }
00108 
00109       // NOTE: everything falls through.
00110       switch (bytesToWrite)
00111         {
00112         case 4:
00113           *--target = (ACE_Byte)((ch | byteMark) & byteMask);
00114           ch >>= 6;
00115         case 3:
00116           *--target = (ACE_Byte)((ch | byteMark) & byteMask);
00117           ch >>= 6;
00118         case 2:
00119           *--target = (ACE_Byte)((ch | byteMark) & byteMask);
00120           ch >>= 6;
00121         case 1:
00122           *--target = (ACE_Byte) (ch | firstByteMark[bytesToWrite]);
00123       }
00124       target += bytesToWrite;
00125     }
00126 
00127   return result;
00128 }
00129 
00130 ACE_UTF32_Encoding_Converter::Result
00131 ACE_UTF32_Encoding_Converter::from_utf8 (const ACE_Byte* source,
00132                                          size_t source_size,
00133                                          void* target,
00134                                          size_t target_size,
00135                                          bool strict)
00136 {
00137   static const ACE_UINT32 UNI_SUR_HIGH_START = get_UNI_SUR_HIGH_START ();
00138   static const ACE_UINT32 UNI_SUR_LOW_END = get_UNI_SUR_LOW_END ();
00139   static const ACE_UINT32 UNI_REPLACEMENT_CHAR = get_UNI_REPLACEMENT_CHAR ();
00140   static const ACE_Byte* trailingBytesForUTF8 = get_trailing_bytes_for_utf8 ();
00141   static const ACE_UINT32* offsetsFromUTF8 = get_offsets_from_utf8 ();
00142 
00143   Result result = CONVERSION_OK;
00144   const ACE_Byte* sourceEnd = source + source_size;
00145   ACE_UINT32* targetStart = static_cast<ACE_UINT32*> (target);
00146   ACE_UINT32* targetEnd   = targetStart + target_size;
00147 
00148   while (source < sourceEnd)
00149     {
00150       ACE_UINT32 ch = 0;
00151       unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
00152       if (source + extraBytesToRead >= sourceEnd)
00153         {
00154           result = SOURCE_EXHAUSTED;
00155           break;
00156         }
00157 
00158       // Do this check whether lenient or strict
00159       if (!this->is_legal_utf8 (source, extraBytesToRead + 1))
00160         {
00161           result = SOURCE_ILLEGAL;
00162           break;
00163         }
00164 
00165       // The cases all fall through. See "Note A" below.
00166       switch (extraBytesToRead)
00167         {
00168         case 5:
00169           ch += *source++;
00170           ch <<= 6;
00171         case 4:
00172           ch += *source++;
00173           ch <<= 6;
00174         case 3:
00175           ch += *source++;
00176           ch <<= 6;
00177         case 2:
00178           ch += *source++;
00179           ch <<= 6;
00180         case 1:
00181           ch += *source++;
00182           ch <<= 6;
00183         case 0:
00184           ch += *source++;
00185       }
00186       ch -= offsetsFromUTF8[extraBytesToRead];
00187 
00188       if (targetStart >= targetEnd)
00189         {
00190           result = TARGET_EXHAUSTED;
00191           break;
00192         }
00193 
00194       if (ch <= UNI_MAX_LEGAL_UTF32)
00195         {
00196           // UTF-16 surrogate values are illegal in UTF-32, and anything
00197           // over Plane 17 (> 0x10FFFF) is illegal.
00198           if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
00199             {
00200               if (strict)
00201                 {
00202                   result = SOURCE_ILLEGAL;
00203                   break;
00204                 }
00205               else
00206                 {
00207                   *targetStart++ = UNI_REPLACEMENT_CHAR;
00208                 }
00209             }
00210           else
00211             {
00212               *targetStart++ = ch;
00213             }
00214         }
00215       else
00216         {
00217           result = SOURCE_ILLEGAL;
00218           break;
00219         }
00220     }
00221 
00222   return result;
00223 }
00224 
00225 ACE_UTF32_Encoding_Converter*
00226 ACE_UTF32_Encoding_Converter::encoded (const ACE_Byte* source,
00227                                        size_t source_size)
00228 {
00229   static const size_t begin = 16;
00230   static const size_t converted = begin * 4;
00231 
00232   ACE_Byte target[converted];
00233   ACE_UTF32_Encoding_Converter* converter = 0;
00234   ACE_NEW_RETURN (converter,
00235                   ACE_UTF32_Encoding_Converter (false),
00236                   0);
00237 
00238   if (converter->to_utf8 (source,
00239                           ACE_MIN (begin, source_size),
00240                           target,
00241                           converted) == CONVERSION_OK)
00242     {
00243       return converter;
00244     }
00245   else
00246     {
00247       delete converter;
00248     }
00249 
00250   return 0;
00251 }
00252 
00253 ACE_END_VERSIONED_NAMESPACE_DECL
00254 #endif /* ACE_USES_WCHAR */