00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "ace/UTF16_Encoding_Converter.h"
00023
00024 #if defined (ACE_USES_WCHAR)
00025 #include "ace/OS_NS_stdio.h"
00026 #include "ace/OS_Memory.h"
00027 #include "ace/Min_Max.h"
00028
00029 #if !defined (__ACE_INLINE__)
00030 #include "ace/UTF16_Encoding_Converter.inl"
00031 #endif
00032
00033 ACE_BEGIN_VERSIONED_NAMESPACE_DECL
00034
00035 static const ACE_UINT32 halfShift = 10;
00036 static const ACE_UINT32 halfBase = 0x00010000;
00037 static const ACE_UINT32 halfMask = 0x000003FF;
00038
00039 static const ACE_UINT32 UNI_SUR_HIGH_START = 0x0000D800;
00040 static const ACE_UINT32 UNI_SUR_HIGH_END = 0x0000DBFF;
00041 static const ACE_UINT32 UNI_SUR_LOW_START = 0x0000DC00;
00042 static const ACE_UINT32 UNI_SUR_LOW_END = 0x0000DFFF;
00043 static const ACE_UINT32 UNI_REPLACEMENT_CHAR = 0x0000FFFD;
00044 static const ACE_UINT32 UNI_MAX_BMP = 0x0000FFFF;
00045 static const ACE_UINT32 UNI_MAX_UTF16 = 0x0010FFFF;
00046
00047
00048
00049
00050
00051
00052 static const ACE_Byte firstByteMark[7] = { 0x00, 0x00, 0xC0,
00053 0xE0, 0xF0, 0xF8, 0xFC };
00054
00055
00056
00057
00058
00059
00060 static const ACE_Byte trailingBytesForUTF8[256] = {
00061 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
00062 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
00063 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
00064 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
00065 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
00066 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
00067 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00068 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
00069 };
00070
00071
00072
00073
00074 static const ACE_UINT32 offsetsFromUTF8[6] = { 0x00000000, 0x00003080,
00075 0x000E2080, 0x03C82080,
00076 0xFA082080, 0x82082080 };
00077
00078
00079 ACE_UTF16_Encoding_Converter::ACE_UTF16_Encoding_Converter (bool swap)
00080 : swap_ (swap)
00081 {
00082 }
00083
00084 ACE_UTF16_Encoding_Converter::~ACE_UTF16_Encoding_Converter (void)
00085 {
00086 }
00087
00088 ACE_UTF16_Encoding_Converter::Result
00089 ACE_UTF16_Encoding_Converter::to_utf8 (const void* source,
00090 size_t source_size,
00091 ACE_Byte* target,
00092 size_t target_size,
00093 bool strict)
00094 {
00095 static const ACE_UINT32 byteMask = 0xBF;
00096 static const ACE_UINT32 byteMark = 0x80;
00097 Result result = CONVERSION_OK;
00098
00099 ACE_Byte* targetEnd = target + target_size;
00100 const ACE_UINT16* sourceStart = static_cast<const ACE_UINT16*> (source);
00101 const ACE_UINT16* sourceEnd = sourceStart +
00102 (source_size / sizeof (ACE_UINT16));
00103
00104 while (sourceStart < sourceEnd)
00105 {
00106 ACE_UINT16 nw = *sourceStart++;
00107 ACE_UINT32 ch = (this->swap_ ? ACE_SWAP_WORD (nw) : nw);
00108
00109
00110 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
00111 {
00112
00113
00114 if (sourceStart < sourceEnd)
00115 {
00116 ACE_UINT32 ch2 = (this->swap_ ? ACE_SWAP_WORD (*sourceStart) :
00117 *sourceStart);
00118
00119 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
00120 {
00121 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
00122 + (ch2 - UNI_SUR_LOW_START) + halfBase;
00123 ++sourceStart;
00124 }
00125 else if (strict)
00126 {
00127
00128 result = SOURCE_ILLEGAL;
00129 break;
00130 }
00131 }
00132 else
00133 {
00134
00135 result = SOURCE_EXHAUSTED;
00136 break;
00137 }
00138 }
00139 else if (strict)
00140 {
00141
00142 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
00143 {
00144 result = SOURCE_ILLEGAL;
00145 break;
00146 }
00147 }
00148
00149
00150 unsigned short bytesToWrite = 0;
00151 if (ch < 0x80)
00152 bytesToWrite = 1;
00153 else if (ch < 0x800)
00154 bytesToWrite = 2;
00155 else if (ch < 0x10000)
00156 bytesToWrite = 3;
00157 else if (ch < 0x110000)
00158 bytesToWrite = 4;
00159 else
00160 {
00161 bytesToWrite = 3;
00162 ch = UNI_REPLACEMENT_CHAR;
00163 }
00164
00165 target += bytesToWrite;
00166 if (target > targetEnd)
00167 {
00168 result = TARGET_EXHAUSTED;
00169 break;
00170 }
00171
00172
00173 switch (bytesToWrite)
00174 {
00175 case 4:
00176 *--target = (ACE_Byte)((ch | byteMark) & byteMask);
00177 ch >>= 6;
00178 case 3:
00179 *--target = (ACE_Byte)((ch | byteMark) & byteMask);
00180 ch >>= 6;
00181 case 2:
00182 *--target = (ACE_Byte)((ch | byteMark) & byteMask);
00183 ch >>= 6;
00184 case 1:
00185 *--target = (ACE_Byte)(ch | firstByteMark[bytesToWrite]);
00186 }
00187 target += bytesToWrite;
00188 }
00189
00190 return result;
00191 }
00192
00193 ACE_UTF16_Encoding_Converter::Result
00194 ACE_UTF16_Encoding_Converter::from_utf8 (const ACE_Byte* source,
00195 size_t source_size,
00196 void* target,
00197 size_t target_size,
00198 bool strict)
00199 {
00200 Result result = CONVERSION_OK;
00201 const ACE_Byte* sourceEnd = source + source_size;
00202 ACE_UINT16* targetStart = static_cast<ACE_UINT16*> (target);
00203 ACE_UINT16* targetEnd = targetStart + target_size;
00204
00205 while (source < sourceEnd)
00206 {
00207 ACE_UINT32 ch = 0;
00208 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
00209 if (source + extraBytesToRead >= sourceEnd)
00210 {
00211 result = SOURCE_EXHAUSTED;
00212 break;
00213 }
00214
00215
00216 if (!this->is_legal_utf8 (source, extraBytesToRead + 1))
00217 {
00218 result = SOURCE_ILLEGAL;
00219 break;
00220 }
00221
00222
00223 switch (extraBytesToRead)
00224 {
00225 case 5:
00226 ch += *source++;
00227 ch <<= 6;
00228 case 4:
00229 ch += *source++;
00230 ch <<= 6;
00231 case 3:
00232 ch += *source++;
00233 ch <<= 6;
00234 case 2:
00235 ch += *source++;
00236 ch <<= 6;
00237 case 1:
00238 ch += *source++;
00239 ch <<= 6;
00240 case 0:
00241 ch += *source++;
00242 }
00243 ch -= offsetsFromUTF8[extraBytesToRead];
00244
00245 if (targetStart >= targetEnd)
00246 {
00247 result = TARGET_EXHAUSTED;
00248 break;
00249 }
00250
00251 if (ch <= UNI_MAX_BMP)
00252 {
00253
00254 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
00255 {
00256 if (strict)
00257 {
00258 result = SOURCE_ILLEGAL;
00259 break;
00260 }
00261 else
00262 {
00263 *targetStart++ = UNI_REPLACEMENT_CHAR;
00264 }
00265 }
00266 else
00267 {
00268 *targetStart++ = (ACE_UINT16)ch;
00269 }
00270 }
00271 else if (ch > UNI_MAX_UTF16)
00272 {
00273 if (strict)
00274 {
00275 result = SOURCE_ILLEGAL;
00276 break;
00277 }
00278 else
00279 {
00280 *targetStart++ = UNI_REPLACEMENT_CHAR;
00281 }
00282 }
00283 else
00284 {
00285
00286 if (targetStart + 1 >= targetEnd)
00287 {
00288 result = TARGET_EXHAUSTED;
00289 break;
00290 }
00291 ch -= halfBase;
00292 *targetStart++ = (ACE_UINT16)((ch >> halfShift) + UNI_SUR_HIGH_START);
00293 *targetStart++ = (ACE_UINT16)((ch & halfMask) + UNI_SUR_LOW_START);
00294 }
00295 }
00296
00297 return result;
00298 }
00299
00300 ACE_UTF16_Encoding_Converter*
00301 ACE_UTF16_Encoding_Converter::encoded (const ACE_Byte* source,
00302 size_t source_size)
00303 {
00304 static const size_t begin = 16;
00305 static const size_t converted = begin * 4;
00306
00307 ACE_Byte target[converted];
00308 ACE_UTF16_Encoding_Converter* converter;
00309 ACE_NEW_RETURN (converter,
00310 ACE_UTF16_Encoding_Converter (false),
00311 0);
00312 if (converter->to_utf8 (source,
00313 ACE_MIN (begin, source_size),
00314 target,
00315 converted) == CONVERSION_OK)
00316 {
00317 return converter;
00318 }
00319 else
00320 {
00321 delete converter;
00322 }
00323
00324 return 0;
00325 }
00326
00327 ACE_UINT32
00328 ACE_UTF16_Encoding_Converter::get_UNI_SUR_HIGH_START (void)
00329 {
00330 return UNI_SUR_HIGH_START;
00331 }
00332
00333 ACE_UINT32
00334 ACE_UTF16_Encoding_Converter::get_UNI_SUR_LOW_END (void)
00335 {
00336 return UNI_SUR_LOW_END;
00337 }
00338
00339 ACE_UINT32
00340 ACE_UTF16_Encoding_Converter::get_UNI_REPLACEMENT_CHAR (void)
00341 {
00342 return UNI_REPLACEMENT_CHAR;
00343 }
00344
00345 const ACE_Byte*
00346 ACE_UTF16_Encoding_Converter::get_first_byte_mark (void)
00347 {
00348 return firstByteMark;
00349 }
00350
00351 const ACE_Byte*
00352 ACE_UTF16_Encoding_Converter::get_trailing_bytes_for_utf8 (void)
00353 {
00354 return trailingBytesForUTF8;
00355 }
00356
00357 const ACE_UINT32*
00358 ACE_UTF16_Encoding_Converter::get_offsets_from_utf8 (void)
00359 {
00360 return offsetsFromUTF8;
00361 }
00362
00363 ACE_END_VERSIONED_NAMESPACE_DECL
00364 #endif