00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include "ace/UTF32_Encoding_Converter.h"
00024
00025 #if defined (ACE_USES_WCHAR)
00026 #include "ace/OS_NS_stdio.h"
00027 #include "ace/OS_Memory.h"
00028 #include "ace/Min_Max.h"
00029
00030 ACE_BEGIN_VERSIONED_NAMESPACE_DECL
00031
00032 static const ACE_UINT32 UNI_MAX_LEGAL_UTF32 = 0x0010FFFF;
00033
00034 ACE_UTF32_Encoding_Converter::ACE_UTF32_Encoding_Converter (bool swap)
00035 : ACE_UTF16_Encoding_Converter (swap)
00036 {
00037 }
00038
00039 ACE_UTF32_Encoding_Converter::~ACE_UTF32_Encoding_Converter (void)
00040 {
00041 }
00042
00043 ACE_UTF32_Encoding_Converter::Result
00044 ACE_UTF32_Encoding_Converter::to_utf8 (const void* source,
00045 size_t source_size,
00046 ACE_Byte* target,
00047 size_t target_size,
00048 bool strict)
00049 {
00050 static const ACE_UINT32 byteMask = 0xBF;
00051 static const ACE_UINT32 byteMark = 0x80;
00052 static const ACE_UINT32 UNI_SUR_HIGH_START = get_UNI_SUR_HIGH_START ();
00053 static const ACE_UINT32 UNI_SUR_LOW_END = get_UNI_SUR_LOW_END ();
00054 static const ACE_Byte* firstByteMark = get_first_byte_mark ();
00055
00056 Result result = CONVERSION_OK;
00057 ACE_Byte* targetEnd = target + target_size;
00058 const ACE_UINT32* sourceStart = static_cast<const ACE_UINT32*> (source);
00059 const ACE_UINT32* sourceEnd = sourceStart + (source_size / sizeof (ACE_UINT32));
00060
00061 while (sourceStart < sourceEnd)
00062 {
00063 ACE_UINT32 nw = *sourceStart++;
00064 ACE_UINT32 ch = (this->swap_ ? ACE_SWAP_LONG (nw) : nw);
00065 unsigned short bytesToWrite = 0;
00066
00067 if (strict)
00068 {
00069
00070 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
00071 {
00072 result = SOURCE_ILLEGAL;
00073 break;
00074 }
00075 }
00076
00077
00078
00079
00080 if (ch < 0x80)
00081 {
00082 bytesToWrite = 1;
00083 }
00084 else if (ch < 0x800)
00085 {
00086 bytesToWrite = 2;
00087 }
00088 else if (ch < 0x10000)
00089 {
00090 bytesToWrite = 3;
00091 }
00092 else if (ch <= UNI_MAX_LEGAL_UTF32)
00093 {
00094 bytesToWrite = 4;
00095 }
00096 else
00097 {
00098 result = SOURCE_ILLEGAL;
00099 break;
00100 }
00101
00102 target += bytesToWrite;
00103 if (target > targetEnd)
00104 {
00105 result = TARGET_EXHAUSTED;
00106 break;
00107 }
00108
00109
00110 switch (bytesToWrite)
00111 {
00112 case 4:
00113 *--target = (ACE_Byte)((ch | byteMark) & byteMask);
00114 ch >>= 6;
00115 case 3:
00116 *--target = (ACE_Byte)((ch | byteMark) & byteMask);
00117 ch >>= 6;
00118 case 2:
00119 *--target = (ACE_Byte)((ch | byteMark) & byteMask);
00120 ch >>= 6;
00121 case 1:
00122 *--target = (ACE_Byte) (ch | firstByteMark[bytesToWrite]);
00123 }
00124 target += bytesToWrite;
00125 }
00126
00127 return result;
00128 }
00129
00130 ACE_UTF32_Encoding_Converter::Result
00131 ACE_UTF32_Encoding_Converter::from_utf8 (const ACE_Byte* source,
00132 size_t source_size,
00133 void* target,
00134 size_t target_size,
00135 bool strict)
00136 {
00137 static const ACE_UINT32 UNI_SUR_HIGH_START = get_UNI_SUR_HIGH_START ();
00138 static const ACE_UINT32 UNI_SUR_LOW_END = get_UNI_SUR_LOW_END ();
00139 static const ACE_UINT32 UNI_REPLACEMENT_CHAR = get_UNI_REPLACEMENT_CHAR ();
00140 static const ACE_Byte* trailingBytesForUTF8 = get_trailing_bytes_for_utf8 ();
00141 static const ACE_UINT32* offsetsFromUTF8 = get_offsets_from_utf8 ();
00142
00143 Result result = CONVERSION_OK;
00144 const ACE_Byte* sourceEnd = source + source_size;
00145 ACE_UINT32* targetStart = static_cast<ACE_UINT32*> (target);
00146 ACE_UINT32* targetEnd = targetStart + target_size;
00147
00148 while (source < sourceEnd)
00149 {
00150 ACE_UINT32 ch = 0;
00151 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
00152 if (source + extraBytesToRead >= sourceEnd)
00153 {
00154 result = SOURCE_EXHAUSTED;
00155 break;
00156 }
00157
00158
00159 if (!this->is_legal_utf8 (source, extraBytesToRead + 1))
00160 {
00161 result = SOURCE_ILLEGAL;
00162 break;
00163 }
00164
00165
00166 switch (extraBytesToRead)
00167 {
00168 case 5:
00169 ch += *source++;
00170 ch <<= 6;
00171 case 4:
00172 ch += *source++;
00173 ch <<= 6;
00174 case 3:
00175 ch += *source++;
00176 ch <<= 6;
00177 case 2:
00178 ch += *source++;
00179 ch <<= 6;
00180 case 1:
00181 ch += *source++;
00182 ch <<= 6;
00183 case 0:
00184 ch += *source++;
00185 }
00186 ch -= offsetsFromUTF8[extraBytesToRead];
00187
00188 if (targetStart >= targetEnd)
00189 {
00190 result = TARGET_EXHAUSTED;
00191 break;
00192 }
00193
00194 if (ch <= UNI_MAX_LEGAL_UTF32)
00195 {
00196
00197
00198 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
00199 {
00200 if (strict)
00201 {
00202 result = SOURCE_ILLEGAL;
00203 break;
00204 }
00205 else
00206 {
00207 *targetStart++ = UNI_REPLACEMENT_CHAR;
00208 }
00209 }
00210 else
00211 {
00212 *targetStart++ = ch;
00213 }
00214 }
00215 else
00216 {
00217 result = SOURCE_ILLEGAL;
00218 break;
00219 }
00220 }
00221
00222 return result;
00223 }
00224
00225 ACE_UTF32_Encoding_Converter*
00226 ACE_UTF32_Encoding_Converter::encoded (const ACE_Byte* source,
00227 size_t source_size)
00228 {
00229 static const size_t begin = 16;
00230 static const size_t converted = begin * 4;
00231
00232 ACE_Byte target[converted];
00233 ACE_UTF32_Encoding_Converter* converter = 0;
00234 ACE_NEW_RETURN (converter,
00235 ACE_UTF32_Encoding_Converter (false),
00236 0);
00237
00238 if (converter->to_utf8 (source,
00239 ACE_MIN (begin, source_size),
00240 target,
00241 converted) == CONVERSION_OK)
00242 {
00243 return converter;
00244 }
00245 else
00246 {
00247 delete converter;
00248 }
00249
00250 return 0;
00251 }
00252
00253 ACE_END_VERSIONED_NAMESPACE_DECL
00254 #endif