Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members  

textcodec.cpp

Go to the documentation of this file.
00001 /***************************************************************************
00002     textcodec.cpp  -  Definitions of TextCodec class methods.
00003                              -------------------
00004     begin                : June 21 2002
00005     copyright            : (C) 2003 by Vojtěch Toman
00006     email                : vtoman@lit.cz
00007  ***************************************************************************/
00008 
00009 /***************************************************************************
00010  *                                                                         *
00011  *   This program is free software; you can redistribute it and/or modify  *
00012  *   it under the terms of the GNU General Public License as published by  *
00013  *   the Free Software Foundation; either version 2 of the License, or     *
00014  *   (at your option) any later version.                                   *
00015  *                                                                         *
00016  ***************************************************************************/
00017 
00026 #ifdef __GNUG__
00027 # pragma implementation
00028 #endif
00029 
00030 
00031 #include "textcodec.h"
00032 
00033 
00037 
00038 
/**
 * Upper half of the US-ASCII conversion table.
 *
 * Entry k describes byte value 128 + k. US-ASCII defines no characters
 * above 127, so all 128 entries are -1 (invalid).
 *
 * The table is only ever read, hence it is declared const.
 */
static const int US_ASCII_to_UTF8_table[] =
{
  //character codes greater than 127 are all marked as invalid
  -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1
};
00062 
00063 
00064 
/**
 * Upper half of the ISO-8859-2 conversion table.
 *
 * Entry k describes byte value 128 + k. Each entry packs the two-byte
 * UTF-8 encoding of the corresponding character into one integer, with
 * the lead byte in the high octet (e.g. byte 0xA1 -> U+0104 -> 0xC4 0x84
 * -> 0xc484).
 *
 * The table is only ever read, hence it is declared const.
 *
 * NOTE(review): if these values end up in expat's XML_Encoding::map,
 * expat documents that array as holding Unicode scalar values, not
 * packed UTF-8 bytes -- confirm which unit the consumer expects.
 */
static const int ISO_8859_2_to_UTF8_table[] =
{
  //0x80 - 0x9f: C1 control characters
  0xc280, 0xc281, 0xc282, 0xc283, 0xc284, 0xc285, 0xc286, 0xc287,
  0xc288, 0xc289, 0xc28a, 0xc28b, 0xc28c, 0xc28d, 0xc28e, 0xc28f,
  0xc290, 0xc291, 0xc292, 0xc293, 0xc294, 0xc295, 0xc296, 0xc297,
  0xc298, 0xc299, 0xc29a, 0xc29b, 0xc29c, 0xc29d, 0xc29e, 0xc29f,
  //0xa0 - 0xff: printable characters
  0xc2a0, 0xc484, 0xcb98, 0xc581, 0xc2a4, 0xc4bd, 0xc59a, 0xc2a7,
  0xc2a8, 0xc5a0, 0xc59e, 0xc5a4, 0xc5b9, 0xc2ad, 0xc5bd, 0xc5bb,
  0xc2b0, 0xc485, 0xcb9b, 0xc582, 0xc2b4, 0xc4be, 0xc59b, 0xcb87,
  0xc2b8, 0xc5a1, 0xc59f, 0xc5a5, 0xc5ba, 0xcb9d, 0xc5be, 0xc5bc,
  0xc594, 0xc381, 0xc382, 0xc482, 0xc384, 0xc4b9, 0xc486, 0xc387,
  0xc48c, 0xc389, 0xc498, 0xc38b, 0xc49a, 0xc38d, 0xc38e, 0xc48e,
  0xc490, 0xc583, 0xc587, 0xc393, 0xc394, 0xc590, 0xc396, 0xc397,
  0xc598, 0xc5ae, 0xc39a, 0xc5b0, 0xc39c, 0xc39d, 0xc5a2, 0xc39f,
  0xc595, 0xc3a1, 0xc3a2, 0xc483, 0xc3a4, 0xc4ba, 0xc487, 0xc3a7,
  0xc48d, 0xc3a9, 0xc499, 0xc3ab, 0xc49b, 0xc3ad, 0xc3ae, 0xc48f,
  0xc491, 0xc584, 0xc588, 0xc3b3, 0xc3b4, 0xc591, 0xc3b6, 0xc3b7,
  0xc599, 0xc5af, 0xc3ba, 0xc5b1, 0xc3bc, 0xc3bd, 0xc5a3, 0xcb99
};
00087 
00088 
00089 
/**
 * Fills the 256-entry map of a single-byte encoding.
 *
 * Entries 0..127 are set to their own index; entries 128..255 are copied
 * from the table named <_enc_>_to_UTF8_table (the name is built by token
 * pasting, so _enc_ must be a bare identifier prefix such as US_ASCII).
 *
 * The body is wrapped in do { } while (0) so that the macro followed by
 * a semicolon forms exactly one statement and can be used safely inside
 * unbraced if/else branches.
 *
 * \param _enc_   Identifier prefix of the conversion table to use.
 * \param _info_  Pointer to an object with an indexable member "map".
 */
#define EXPAT_MAP_SINGLE_BYTE_ENCODING_TO_UTF8(_enc_, _info_)   \
do                                                              \
{                                                               \
  int mapIndex_;                                                \
                                                                \
  for (mapIndex_ = 0; mapIndex_ < 128; mapIndex_++)             \
    (_info_)->map[mapIndex_] = mapIndex_;                       \
                                                                \
  for (mapIndex_ = 128; mapIndex_ < 256; mapIndex_++)           \
    (_info_)->map[mapIndex_] =                                  \
      _enc_ ## _to_UTF8_table[mapIndex_ - 128];                 \
}                                                               \
while (0)
00106 
00107 
00108 
/**
 * Returns _size_ from the enclosing function when _mib_ equals _mibGiven_.
 *
 * Wrapped in do { } while (0) so the macro plus its trailing semicolon is
 * a single statement; a bare "if" would capture a following "else" or
 * break an enclosing unbraced if/else.
 *
 * \param _mib_       MIB this size applies to.
 * \param _mibGiven_  MIB being queried.
 * \param _size_      Value to return on a match.
 */
#define ENCODING_SIZE(_mib_, _mibGiven_, _size_)        \
do                                                      \
{                                                       \
  if ((_mib_) == (_mibGiven_))                          \
    return (_size_);                                    \
}                                                       \
while (0)
00119 
00120 
00121 
/**
 * Returns _mib_ from the enclosing function when xmlchar_cstrcmp() reports
 * that _encGiven_ and _enc_ compare equal (i.e. it returns 0).
 *
 * Wrapped in do { } while (0) so the macro plus its trailing semicolon is
 * a single statement. The definition no longer ends with a line
 * continuation, so it cannot swallow the line that follows it.
 *
 * \param _enc_       Encoding name to test against.
 * \param _encGiven_  Encoding name being looked up.
 * \param _mib_       Value to return on a match.
 */
#define ENCODING_MIB(_enc_, _encGiven_, _mib_)          \
do                                                      \
{                                                       \
  if (!xmlchar_cstrcmp((_encGiven_), (_enc_)))          \
    return (_mib_);                                     \
}                                                       \
while (0)
00132 
00133 
00134 
00138 
00139 
00145 unsigned long TextCodec::suggestAlphabetBaseSize(Encodings::MIB mib) throw (ExaltUnknownEncodingException)
00146 {
00147   ENCODING_SIZE(Encodings::UTF_8, mib, 256);
00148   ENCODING_SIZE(Encodings::UTF_16, mib, 256);
00149   ENCODING_SIZE(Encodings::ISO_8859_1, mib, 256);
00150   ENCODING_SIZE(Encodings::ISO_8859_2, mib, 256);
00151   ENCODING_SIZE(Encodings::US_ASCII, mib, 128);
00152 
00153   throw ExaltUnknownEncodingException();
00154   return DEFAULT_ALPHABET_BASE_SIZE;
00155 }
00156 
00157 
00158 
00165 bool TextCodec::isAbleToConvert(Encodings::MIB mib)
00166 {
00167   switch (mib)
00168     {
00169     case Encodings::US_ASCII:
00170       //(expat will probably never cause this :)
00171       //just for illustration
00172       return true;
00173 
00174     case Encodings::ISO_8859_2:
00175       return true;
00176 
00177     default:
00178       return false;
00179     }
00180 }
00181 
00182 
00183 
00190 void TextCodec::fillInMapArray(XmlEncoding *info, Encodings::MIB mib)
00191 {
00192   //support for UTF-8 and UTF-16 missing!
00193   //when XML declaration contains for example: encoding="utf8", textcodec
00194   //has to "convert" this!
00195 
00196   //IMPORTANT
00197   //we should to take care about how expat has been compiled
00198   //here we silently suppose that it uses UTF-8 encoding internally,
00199   //which may be simply wrong! (It can use UTF-16 as well)
00200 
00201   switch (mib)
00202     {
00203     case Encodings::US_ASCII:
00204       //the first 128 characters map directly to Unicode values (i believe)
00205       //next 128 are invalid
00206       //(expat will probably never cause this :)
00207       EXPAT_MAP_SINGLE_BYTE_ENCODING_TO_UTF8(US_ASCII, info);
00208       break;
00209 
00210     case Encodings::ISO_8859_2:
00211       EXPAT_MAP_SINGLE_BYTE_ENCODING_TO_UTF8(ISO_8859_2, info);
00212 
00213       break;
00214       
00215     default:
00216       //unable to convert: all items of the map array are marked as invalid
00217       for (size_t i = 0; i < 256; i++)
00218         info->map[i] = -1;
00219     }
00220 }
00221 
00222 
00223 
00232 int TextCodec::convert(const char *s, Encodings::MIB mib)
00233 {
00234   switch (mib)
00235     {
00236     case Encodings::US_ASCII:
00237       //return first char of the byte sequence
00238       //should be sufficient
00239       //(expat will probably never cause this :)
00240       return *s;
00241   
00242     default:
00243       //stupid: s is always invalid sequence
00244       return -1;
00245     }
00246 }
00247 
00248 
00254 void TextCodec::release(Encodings::MIB mib)
00255 {
00256   //nothing special
00257 }
00258 
00259 
00260 
00271 bool TextCodec::knowsMIB(Encodings::MIB mib)
00272 {
00273   size_t i = 0;
00274   Encodings::EncodingName encodingNames[] = { ENCODING_NAMES }; //ENCODING_NAMES defined in encodings.h
00275 
00276   while (encodingNames[i].name)
00277     {
00278       if (encodingNames[i].mib == mib)
00279         return true;
00280 
00281       i++;
00282     }
00283 
00284   return false;
00285 }
00286 
00287 
00288 
00296 Encodings::MIB TextCodec::getMIB(const XmlChar *encoding) throw (ExaltUnknownEncodingException)
00297 {
00298   if (!encoding)
00299     {
00300 #ifdef XML_UNICODE
00301       //implicit expat encoding is UTF-16
00302       return Encodings::UTF_16;
00303 #else
00304       //implicit expat encoding is UTF-8
00305       return Encodings::UTF_8;
00306 #endif
00307     }
00308 
00309   ENCODING_MIB("US-ASCII", encoding, Encodings::US_ASCII);
00310 
00311   ENCODING_MIB("KOI8_V", encoding, Encodings::KOI8_V);
00312   ENCODING_MIB("KOI8_R", encoding, Encodings::KOI8_R);
00313 
00314   ENCODING_MIB("UTF-8", encoding, Encodings::UTF_8);
00315   ENCODING_MIB("UTF-16", encoding, Encodings::UTF_16);
00316 
00317   ENCODING_MIB("ISO-8859-1", encoding, Encodings::ISO_8859_1);
00318   ENCODING_MIB("ISO-8859-2", encoding, Encodings::ISO_8859_2);
00319   ENCODING_MIB("ISO-8859-3", encoding, Encodings::ISO_8859_3);
00320   ENCODING_MIB("ISO-8859-4", encoding, Encodings::ISO_8859_4);
00321   ENCODING_MIB("ISO-8859-5", encoding, Encodings::ISO_8859_5);
00322   ENCODING_MIB("ISO-8859-6", encoding, Encodings::ISO_8859_6);
00323   ENCODING_MIB("ISO-8859-7", encoding, Encodings::ISO_8859_7);
00324   ENCODING_MIB("ISO-8859-8", encoding, Encodings::ISO_8859_8);
00325   ENCODING_MIB("ISO-8859-9", encoding, Encodings::ISO_8859_9);
00326   ENCODING_MIB("ISO-8859-10", encoding, Encodings::ISO_8859_10);
00327   ENCODING_MIB("ISO-8859-11", encoding, Encodings::ISO_8859_11);
00328   ENCODING_MIB("ISO-8859-13", encoding, Encodings::ISO_8859_13);
00329   ENCODING_MIB("ISO-8859-14", encoding, Encodings::ISO_8859_14);
00330   ENCODING_MIB("ISO-8859-15", encoding, Encodings::ISO_8859_15);
00331 
00332   ENCODING_MIB("CP_1250", encoding, Encodings::CP_1250);
00333   ENCODING_MIB("CP_1251", encoding, Encodings::CP_1251);
00334   ENCODING_MIB("CP_1252", encoding, Encodings::CP_1252);
00335   ENCODING_MIB("CP_1253", encoding, Encodings::CP_1253);
00336   ENCODING_MIB("CP_1254", encoding, Encodings::CP_1254);
00337   ENCODING_MIB("CP_1255", encoding, Encodings::CP_1255);
00338   ENCODING_MIB("CP_1256", encoding, Encodings::CP_1256);
00339   ENCODING_MIB("CP_1257", encoding, Encodings::CP_1257);
00340   ENCODING_MIB("CP_1258", encoding, Encodings::CP_1258);
00341 
00342 
00343   ERR("Unknown character encoding: " << encoding);
00344   throw ExaltUnknownEncodingException();
00345   return Encodings::Unknown;
00346 }
00347 
00348 
00349 
00357 void TextCodec::output(IODevice *device, const XmlChar c, Encodings::MIB toEncoding) throw (ExaltEncodingException, ExaltIOException)
00358 {
00359   switch (toEncoding)
00360     {
00361     case Encodings::UTF_8:
00362     case Encodings::UTF_16:
00363       device->writeData((const char *)&c, SIZEOF_XML_CHAR);
00364       break;
00365 
00366     default:
00367       //unable to output in specified encoding
00368       ERR("Unsupported output encoding!");
00369       throw ExaltUnsupportedOutputEncodingException();
00370     }
00371 }
00372 
00373 
00374 
00382 void TextCodec::output(IODevice *device, const XmlChar *str, Encodings::MIB toEncoding) throw (ExaltEncodingException, ExaltIOException)
00383 {
00384   switch (toEncoding)
00385     {
00386     case Encodings::UTF_8:
00387     case Encodings::UTF_16:
00388       for (size_t i = 0; str[i]; i++)
00389         device->writeData((const char *)&str[i], SIZEOF_XML_CHAR);
00390       break;
00391 
00392     default:
00393       //unable to output in specified encoding
00394       ERR("Unsupported output encoding!");
00395       throw ExaltUnsupportedOutputEncodingException();
00396     }
00397 }
00398 
00399 
00400 
00409 void TextCodec::output(IODevice *device, const XmlChar *str, size_t length, Encodings::MIB toEncoding) throw (ExaltEncodingException, ExaltIOException)
00410 {
00411   switch (toEncoding)
00412     {
00413     case Encodings::UTF_8:
00414     case Encodings::UTF_16:
00415       for (size_t i = 0; i < length; i++)
00416         device->writeData((const char *)&str[i], SIZEOF_XML_CHAR);
00417       break;
00418 
00419     default:
00420       //unable to output in specified encoding
00421       ERR("Unsupported output encoding!");
00422       throw ExaltUnsupportedOutputEncodingException();
00423     }
00424 }
00425 

Generated on Wed Feb 5 10:43:02 2003 for Exalt by doxygen1.2.14 written by Dimitri van Heesch, © 1997-2002