Go to the documentation of this file.00001
00002
00003 #include "ACEXML/common/Transcode.h"
00004 #include "ace/OS_NS_string.h"
00005 #include "ace/Truncate.h"
00006
00007 int
00008 ACEXML_Transcoder::utf162utf8 (ACEXML_UTF16 src,
00009 ACEXML_UTF8 *dst,
00010 size_t len)
00011 {
00012
00013
00014 if (dst == 0)
00015 return ACEXML_INVALID_ARGS;
00016
00017 if (src < 0x80)
00018 {
00019 if (len < 1)
00020 return ACEXML_DESTINATION_TOO_SHORT;
00021
00022 *dst = static_cast<ACEXML_UTF8> (src);
00023 return 1;
00024 }
00025 else if (src < 0x800)
00026 {
00027 if (len < 2)
00028 return ACEXML_DESTINATION_TOO_SHORT;
00029
00030 *dst = 0xc0 | (static_cast<ACEXML_UTF8> (src) / 0x40);
00031 *(dst+1) = 0x80 | (static_cast<ACEXML_UTF8> (src) % 0x40);
00032 return 2;
00033 }
00034 else
00035 {
00036 if (len < 3)
00037 return ACEXML_DESTINATION_TOO_SHORT;
00038
00039
00040 if (src >= 0xD800 && src < 0xE000)
00041 return ACEXML_IS_SURROGATE;
00042
00043 *dst = 0xe0 | (static_cast<ACEXML_UTF8> (src) / 0x1000);
00044 *(dst+1) = 0x80 | ((static_cast<ACEXML_UTF8> (src) % 0x1000) / 0x40);
00045 *(dst+2) = 0x80 | (static_cast<ACEXML_UTF8> (src) % 0x40);
00046 return 3;
00047 }
00048 }
00049
00050 int
00051 ACEXML_Transcoder::ucs42utf8 (ACEXML_UCS4 src,
00052 ACEXML_UTF8 *dst,
00053 size_t len)
00054 {
00055 if (src < 0x10000)
00056 {
00057 int retv = ACEXML_Transcoder::utf162utf8
00058 (static_cast<ACEXML_UTF16> (src),
00059 dst, len);
00060 return (retv == ACEXML_IS_SURROGATE ? ACEXML_NON_UNICODE : retv);
00061 }
00062 else if (src >= 0x100000 && src < 0x110000)
00063 {
00064 if (len < 4)
00065 return ACEXML_DESTINATION_TOO_SHORT;
00066
00067 if (dst == 0)
00068 return ACEXML_INVALID_ARGS;
00069
00070 *dst = 0xf0 | (static_cast<ACEXML_UTF8> (src / 0x40000));
00071 *(dst+1) = 0x80 | ((static_cast<ACEXML_UTF8> (src % 0x40000)) / 0x1000);
00072 *(dst+2) = 0x80 | ((static_cast<ACEXML_UTF8> (src % 0x1000)) / 0x40);
00073 *(dst+3) = 0x80 | (static_cast<ACEXML_UTF8> (src % 0x40));
00074 return 4;
00075 }
00076 return ACEXML_NON_UNICODE;
00077 }
00078
00079
00080 int
00081 ACEXML_Transcoder::ucs42utf16 (ACEXML_UCS4 src,
00082 ACEXML_UTF16 *dst,
00083 size_t len)
00084 {
00085 if (dst == 0)
00086 return ACEXML_INVALID_ARGS;
00087
00088 if (src < 0x10000)
00089 {
00090 if (len < 1)
00091 return ACEXML_DESTINATION_TOO_SHORT;
00092
00093 if (src >= 0xD800 && src < 0xE000)
00094 return ACEXML_NON_UNICODE;
00095
00096 *dst = static_cast<ACEXML_UTF16> (src);
00097 return 1;
00098 }
00099 else if (src >= 0x100000 && src < 0x110000)
00100
00101 {
00102 if (len < 2)
00103 return ACEXML_DESTINATION_TOO_SHORT;
00104
00105 *dst = 0xD800 | (static_cast<ACEXML_UTF16> (src) / 0x400);
00106 *(dst+1) = 0xDC00 | (static_cast<ACEXML_UTF16> (src) % 0x400);
00107 return 2;
00108 }
00109
00110 return ACEXML_NON_UNICODE;
00111 }
00112
00113 int
00114 ACEXML_Transcoder::surrogate2utf8 (ACEXML_UTF16 high,
00115 ACEXML_UTF16 low,
00116 ACEXML_UTF8 *dst,
00117 size_t len)
00118 {
00119 if (len < 3)
00120 return ACEXML_DESTINATION_TOO_SHORT;
00121
00122 if (dst == 0 ||
00123 (high >= 0xD800 && high < 0xDC00) ||
00124 (low >= 0xDC00 && low < 0xE000))
00125 return ACEXML_INVALID_ARGS;
00126
00127 ACEXML_UCS4 src = (high - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000;
00128 *dst = static_cast<ACEXML_UTF8> (0xD800 | (src / 0x400));
00129 *(dst+1) = static_cast<ACEXML_UTF8> (0xDC00 | (src % 0x400));
00130 return 2;
00131 }
00132
00133 int
00134 ACEXML_Transcoder::surrogate2ucs4 (ACEXML_UTF16 high,
00135 ACEXML_UTF16 low,
00136 ACEXML_UCS4 &dst)
00137 {
00138 if ((high >= 0xD800 && high < 0xDC00) ||
00139 (low >= 0xDC00 && low < 0xE000))
00140 return ACEXML_INVALID_ARGS;
00141
00142 dst = (high - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000;
00143 return ACEXML_SUCCESS;
00144 }
00145
00146 int
00147 ACEXML_Transcoder::utf82ucs4 (const ACEXML_UTF8 *the_src,
00148 size_t len,
00149 ACEXML_UCS4 &dst)
00150 {
00151 if (the_src == 0)
00152 {
00153 return ACEXML_INVALID_ARGS;
00154 }
00155
00156 const unsigned char *src = reinterpret_cast<const unsigned char *> (the_src);
00157
00158 size_t forward = 1;
00159
00160 if (forward > len)
00161 {
00162 return ACEXML_END_OF_SOURCE;
00163 }
00164
00165 if (static_cast<unsigned char> (*src) < 0x80)
00166 {
00167 dst = *src;
00168 }
00169 else if ((*src & 0xE0) == 0xC0)
00170 {
00171 dst = (*(src++) & 0x1f) * 0x40;
00172 if (++forward > len)
00173 return ACEXML_END_OF_SOURCE;
00174 if ((*src & 0xC0) != 0x80)
00175 return ACEXML_NON_UNICODE;
00176 dst += *src & 0x3f;
00177 }
00178 else if ((*src & 0xF0) == 0xE0)
00179 {
00180 dst = (*src++ & 0x0f) * 0x40;
00181 if (++forward > len)
00182 return ACEXML_END_OF_SOURCE;
00183 if ((*src & 0xC0) != 0x80)
00184 return ACEXML_NON_UNICODE;
00185 dst = (dst + (*src++ & 0x3f)) * 0x40;
00186 if (++forward > len)
00187 return ACEXML_END_OF_SOURCE;
00188 if ((*src & 0xC0) != 0x80)
00189 return ACEXML_NON_UNICODE;
00190 dst += *src & 0x3f;
00191 }
00192 else if ((*src & 0xF8) == 0xF0)
00193 {
00194 dst = (*src++ & 0x0f) * 0x40;
00195 if (++forward > len)
00196 return ACEXML_END_OF_SOURCE;
00197 if ((*src & 0xC0) != 0x80)
00198 return ACEXML_NON_UNICODE;
00199 dst = (dst + (*src++ & 0x3f)) * 0x40;
00200 if (++forward > len)
00201 return ACEXML_END_OF_SOURCE;
00202 if ((*src & 0xC0) != 0x80)
00203 return ACEXML_NON_UNICODE;
00204 dst = (dst + (*src++ & 0x3f)) * 0x40;
00205 if (++forward > len)
00206 return ACEXML_END_OF_SOURCE;
00207 if ((*src & 0xC0) != 0x80)
00208 return ACEXML_NON_UNICODE;
00209 dst += *src & 0x3f;
00210 }
00211 else
00212 {
00213 return ACEXML_NON_UNICODE;
00214 }
00215
00216 return ACE_Utils::truncate_cast<int> (forward);
00217 }
00218
00219 int
00220 ACEXML_Transcoder::utf162ucs4 (const ACEXML_UTF16 *src,
00221 size_t len,
00222 ACEXML_UCS4 &dst)
00223 {
00224 if (src == 0)
00225 {
00226 return ACEXML_INVALID_ARGS;
00227 }
00228
00229 size_t forward = 1;
00230 if (*src >= 0xDC00 && *src < 0xE000)
00231 {
00232 if (len < 2)
00233 {
00234 return ACEXML_END_OF_SOURCE;
00235 }
00236
00237 return ACEXML_Transcoder::surrogate2ucs4 (*src,
00238 *(src+1),
00239 dst);
00240 }
00241 else
00242 {
00243 if (len < 1)
00244 {
00245 return ACEXML_END_OF_SOURCE;
00246 }
00247
00248 dst = *src;
00249 }
00250
00251 return ACE_Utils::truncate_cast<int> (forward);
00252 }
00253
00254 int
00255 ACEXML_Transcoder::utf8s2utf16s (const ACEXML_UTF8 *src,
00256 ACEXML_UTF16 *dst,
00257 size_t len)
00258 {
00259 if (src == 0 || dst == 0)
00260 {
00261 return ACEXML_INVALID_ARGS;
00262 }
00263
00264 size_t src_len = ACE_OS::strlen (src) + 1;
00265
00266 size_t total_len = 0;
00267 int forward;
00268 ACEXML_UCS4 temp;
00269
00270 while (src_len > 0)
00271 {
00272 if ((forward = ACEXML_Transcoder::utf82ucs4 (src,
00273 src_len,
00274 temp)) <= 0)
00275 return forward;
00276
00277 src += forward;
00278 src_len -= forward;
00279
00280 if ((forward = ACEXML_Transcoder::ucs42utf16 (temp,
00281 dst,
00282 len)) <= 0)
00283 return forward;
00284
00285 total_len += forward;
00286 dst += forward;
00287 len -= forward;
00288 }
00289
00290 return ACE_Utils::truncate_cast<int> (total_len);
00291 }
00292
00293 int
00294 ACEXML_Transcoder::utf16s2utf8s (const ACEXML_UTF16 *src,
00295 ACEXML_UTF8 *dst,
00296 size_t len)
00297 {
00298 if (src == 0 || dst == 0)
00299 return ACEXML_INVALID_ARGS;
00300
00301 size_t src_len = 1;
00302 for (const ACEXML_UTF16 *p = src; *p++ != 0; ++src_len)
00303 ;
00304
00305 size_t total_len = 0;
00306 int forward;
00307 ACEXML_UCS4 temp;
00308
00309 while (src_len > 0)
00310 {
00311 if ((forward = ACEXML_Transcoder::utf162ucs4 (src,
00312 src_len,
00313 temp)) <= 0)
00314 return forward;
00315
00316 src += forward;
00317 src_len -= forward;
00318
00319 if ((forward = ACEXML_Transcoder::ucs42utf8 (temp,
00320 dst,
00321 len)) <= 0)
00322 return forward;
00323
00324 total_len += forward;
00325 dst += forward;
00326 len -= forward;
00327 }
00328
00329 return ACE_Utils::truncate_cast<int> (total_len);
00330 }