00001
00002
00003 #include "ACEXML/common/Transcode.h"
00004 #include "ace/OS_NS_string.h"
00005
00006 int
00007 ACEXML_Transcoder::utf162utf8 (ACEXML_UTF16 src,
00008 ACEXML_UTF8 *dst,
00009 size_t len)
00010 {
00011
00012
00013 if (dst == 0)
00014 return ACEXML_INVALID_ARGS;
00015
00016 if (src < 0x80)
00017 {
00018 if (len < 1)
00019 return ACEXML_DESTINATION_TOO_SHORT;
00020
00021 *dst = static_cast<ACEXML_UTF8> (src);
00022 return 1;
00023 }
00024 else if (src < 0x800)
00025 {
00026 if (len < 2)
00027 return ACEXML_DESTINATION_TOO_SHORT;
00028
00029 *dst = 0xc0 | (static_cast<ACEXML_UTF8> (src) / 0x40);
00030 *(dst+1) = 0x80 | (static_cast<ACEXML_UTF8> (src) % 0x40);
00031 return 2;
00032 }
00033 else
00034 {
00035 if (len < 3)
00036 return ACEXML_DESTINATION_TOO_SHORT;
00037
00038
00039 if (src >= 0xD800 && src < 0xE000)
00040 return ACEXML_IS_SURROGATE;
00041
00042 *dst = 0xe0 | (static_cast<ACEXML_UTF8> (src) / 0x1000);
00043 *(dst+1) = 0x80 | ((static_cast<ACEXML_UTF8> (src) % 0x1000) / 0x40);
00044 *(dst+2) = 0x80 | (static_cast<ACEXML_UTF8> (src) % 0x40);
00045 return 3;
00046 }
00047 }
00048
00049 int
00050 ACEXML_Transcoder::ucs42utf8 (ACEXML_UCS4 src,
00051 ACEXML_UTF8 *dst,
00052 size_t len)
00053 {
00054 if (src < 0x10000)
00055 {
00056 int retv = ACEXML_Transcoder::utf162utf8
00057 (static_cast<ACEXML_UTF16> (src),
00058 dst, len);
00059 return (retv == ACEXML_IS_SURROGATE ? ACEXML_NON_UNICODE : retv);
00060 }
00061 else if (src >= 0x100000 && src < 0x110000)
00062 {
00063 if (len < 4)
00064 return ACEXML_DESTINATION_TOO_SHORT;
00065
00066 if (dst == 0)
00067 return ACEXML_INVALID_ARGS;
00068
00069 *dst = 0xf0 | (static_cast<ACEXML_UTF8> (src / 0x40000));
00070 *(dst+1) = 0x80 | ((static_cast<ACEXML_UTF8> (src % 0x40000)) / 0x1000);
00071 *(dst+2) = 0x80 | ((static_cast<ACEXML_UTF8> (src % 0x1000)) / 0x40);
00072 *(dst+3) = 0x80 | (static_cast<ACEXML_UTF8> (src % 0x40));
00073 return 4;
00074 }
00075 return ACEXML_NON_UNICODE;
00076 }
00077
00078
00079 int
00080 ACEXML_Transcoder::ucs42utf16 (ACEXML_UCS4 src,
00081 ACEXML_UTF16 *dst,
00082 size_t len)
00083 {
00084 if (dst == 0)
00085 return ACEXML_INVALID_ARGS;
00086
00087 if (src < 0x10000)
00088 {
00089 if (len < 1)
00090 return ACEXML_DESTINATION_TOO_SHORT;
00091
00092 if (src >= 0xD800 && src < 0xE000)
00093 return ACEXML_NON_UNICODE;
00094
00095 *dst = static_cast<ACEXML_UTF16> (src);
00096 return 1;
00097 }
00098 else if (src >= 0x100000 && src < 0x110000)
00099
00100 {
00101 if (len < 2)
00102 return ACEXML_DESTINATION_TOO_SHORT;
00103
00104 *dst = 0xD800 | (static_cast<ACEXML_UTF16> (src) / 0x400);
00105 *(dst+1) = 0xDC00 | (static_cast<ACEXML_UTF16> (src) % 0x400);
00106 return 2;
00107 }
00108
00109 return ACEXML_NON_UNICODE;
00110 }
00111
00112 int
00113 ACEXML_Transcoder::surrogate2utf8 (ACEXML_UTF16 high,
00114 ACEXML_UTF16 low,
00115 ACEXML_UTF8 *dst,
00116 size_t len)
00117 {
00118 if (len < 3)
00119 return ACEXML_DESTINATION_TOO_SHORT;
00120
00121 if (dst == 0 ||
00122 (high >= 0xD800 && high < 0xDC00) ||
00123 (low >= 0xDC00 && low < 0xE000))
00124 return ACEXML_INVALID_ARGS;
00125
00126 ACEXML_UCS4 src = (high - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000;
00127 *dst = static_cast<ACEXML_UTF8> (0xD800 | (src / 0x400));
00128 *(dst+1) = static_cast<ACEXML_UTF8> (0xDC00 | (src % 0x400));
00129 return 2;
00130 }
00131
00132 int
00133 ACEXML_Transcoder::surrogate2ucs4 (ACEXML_UTF16 high,
00134 ACEXML_UTF16 low,
00135 ACEXML_UCS4 &dst)
00136 {
00137 if ((high >= 0xD800 && high < 0xDC00) ||
00138 (low >= 0xDC00 && low < 0xE000))
00139 return ACEXML_INVALID_ARGS;
00140
00141 dst = (high - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000;
00142 return ACEXML_SUCCESS;
00143 }
00144
00145 int
00146 ACEXML_Transcoder::utf82ucs4 (const ACEXML_UTF8 *the_src,
00147 size_t len,
00148 ACEXML_UCS4 &dst)
00149 {
00150 if (the_src == 0)
00151 return ACEXML_INVALID_ARGS;
00152
00153 const unsigned char *src = reinterpret_cast<const unsigned char *> (the_src);
00154
00155 size_t forward = 1;
00156
00157 if (forward > len)
00158 return ACEXML_END_OF_SOURCE;
00159
00160 if (static_cast<unsigned char> (*src) < 0x80)
00161 dst = *src;
00162 else if ((*src & 0xE0) == 0xC0)
00163 {
00164 dst = (*(src++) & 0x1f) * 0x40;
00165 if (++forward > len)
00166 return ACEXML_END_OF_SOURCE;
00167 if ((*src & 0xC0) != 0x80)
00168 return ACEXML_NON_UNICODE;
00169 dst += *src & 0x3f;
00170 }
00171 else if ((*src & 0xF0) == 0xE0)
00172 {
00173 dst = (*src++ & 0x0f) * 0x40;
00174 if (++forward > len)
00175 return ACEXML_END_OF_SOURCE;
00176 if ((*src & 0xC0) != 0x80)
00177 return ACEXML_NON_UNICODE;
00178 dst = (dst + (*src++ & 0x3f)) * 0x40;
00179 if (++forward > len)
00180 return ACEXML_END_OF_SOURCE;
00181 if ((*src & 0xC0) != 0x80)
00182 return ACEXML_NON_UNICODE;
00183 dst += *src & 0x3f;
00184 }
00185 else if ((*src & 0xF8) == 0xF0)
00186 {
00187 dst = (*src++ & 0x0f) * 0x40;
00188 if (++forward > len)
00189 return ACEXML_END_OF_SOURCE;
00190 if ((*src & 0xC0) != 0x80)
00191 return ACEXML_NON_UNICODE;
00192 dst = (dst + (*src++ & 0x3f)) * 0x40;
00193 if (++forward > len)
00194 return ACEXML_END_OF_SOURCE;
00195 if ((*src & 0xC0) != 0x80)
00196 return ACEXML_NON_UNICODE;
00197 dst = (dst + (*src++ & 0x3f)) * 0x40;
00198 if (++forward > len)
00199 return ACEXML_END_OF_SOURCE;
00200 if ((*src & 0xC0) != 0x80)
00201 return ACEXML_NON_UNICODE;
00202 dst += *src & 0x3f;
00203 }
00204 else
00205 return ACEXML_NON_UNICODE;
00206
00207 return forward;
00208 }
00209
00210 int
00211 ACEXML_Transcoder::utf162ucs4 (const ACEXML_UTF16 *src,
00212 size_t len,
00213 ACEXML_UCS4 &dst)
00214 {
00215 if (src == 0)
00216 return ACEXML_INVALID_ARGS;
00217
00218 size_t forward = 1;
00219 if (*src >= 0xDC00 && *src < 0xE000)
00220 {
00221 if (len < 2)
00222 return ACEXML_END_OF_SOURCE;
00223 return ACEXML_Transcoder::surrogate2ucs4 (*src,
00224 *(src+1),
00225 dst);
00226 }
00227 else
00228 {
00229 if (len < 1)
00230 return ACEXML_END_OF_SOURCE;
00231 dst = *src;
00232 }
00233
00234 return forward;
00235 }
00236
00237 int
00238 ACEXML_Transcoder::utf8s2utf16s (const ACEXML_UTF8 *src,
00239 ACEXML_UTF16 *dst,
00240 size_t len)
00241 {
00242 if (src == 0 || dst == 0)
00243 return ACEXML_INVALID_ARGS;
00244
00245 size_t src_len = ACE_OS::strlen (src) + 1;
00246
00247 size_t total_len = 0;
00248 int forward;
00249 ACEXML_UCS4 temp;
00250
00251 while (src_len > 0)
00252 {
00253 if ((forward = ACEXML_Transcoder::utf82ucs4 (src,
00254 src_len,
00255 temp)) <= 0)
00256 return forward;
00257
00258 src += forward;
00259 src_len -= forward;
00260
00261 if ((forward = ACEXML_Transcoder::ucs42utf16 (temp,
00262 dst,
00263 len)) <= 0)
00264 return forward;
00265
00266 total_len += forward;
00267 dst += forward;
00268 len -= forward;
00269 }
00270
00271 return static_cast<int> (total_len);
00272 }
00273
00274 int
00275 ACEXML_Transcoder::utf16s2utf8s (const ACEXML_UTF16 *src,
00276 ACEXML_UTF8 *dst,
00277 size_t len)
00278 {
00279 if (src == 0 || dst == 0)
00280 return ACEXML_INVALID_ARGS;
00281
00282 size_t src_len = 1;
00283 for (const ACEXML_UTF16 *p = src; *p++ != 0; ++src_len)
00284 ;
00285
00286 size_t total_len = 0;
00287 int forward;
00288 ACEXML_UCS4 temp;
00289
00290 while (src_len > 0)
00291 {
00292 if ((forward = ACEXML_Transcoder::utf162ucs4 (src,
00293 src_len,
00294 temp)) <= 0)
00295 return forward;
00296
00297 src += forward;
00298 src_len -= forward;
00299
00300 if ((forward = ACEXML_Transcoder::ucs42utf8 (temp,
00301 dst,
00302 len)) <= 0)
00303 return forward;
00304
00305 total_len += forward;
00306 dst += forward;
00307 len -= forward;
00308 }
00309
00310 return static_cast<int> (total_len);
00311 }