/*
 * Copyright (c) 2004 Nokia. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the
 * distribution.
 *
 * Neither the name of Nokia nor the names of its contributors may be
 * used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "KWQTextCodec.h"

#include "KWQAssertions.h"
#include "KWQCharsets.h"

#include "KWIQMacros.h"
#include <errno.h>   // errno for iconv error return values
#include <iconv.h>   // iconv errno value definitions
#include <glib.h>    // g_iconv

// g_iconv helper functions

/** returns invalid GIConv handle */
inline static GIConv
_g_iconv_invalid()
{
    // The sentinel is the all-ones handle value, i.e. -1 cast to the handle type.
    const GIConv invalidHandle = (GIConv)-1;
    return invalidHandle;
}

/** checks whether iconv handle is valid */
inline static gboolean
_g_iconv_is_valid(GIConv c)
{
    // A handle is usable exactly when it differs from the invalid sentinel.
    const GIConv invalidHandle = _g_iconv_invalid();
    return !(c == invalidHandle);
}

/**  clears iconv handle state
 * man 3 iconv_open
 * To bring the state back to the initial state, use iconv with NULL as inbuf
 * argument.
 *
 * This is pretty much shot in the dark, as it is not documented in g_iconv
 */
static void
_g_iconv_clear_state(GIConv handle)
{
    // Following iconv(3): a NULL input buffer asks the converter to return
    // to its initial shift state. No output buffer is supplied.
    gchar *noInput = NULL;
    gsize inputRemaining = 0;
    gsize outputRemaining = 0;
    g_iconv(handle, &noInput, &inputRemaining, NULL, &outputRemaining);
}



// Apple compatibility definitions:
// errno values reported by iconv are translated to these constants

// Result codes used by the iconv-based decoding paths in this file.
enum OSStatus {
    noErr,                      // conversion call succeeded
    kTECOutputBufferFullStatus, // iconv reported E2BIG: no room left in the output buffer
    kTECPartialCharErr,         // iconv reported EINVAL: input ended inside a multibyte sequence
    kTECUnmappableElementErr,   // used when a buffered partial character could not be converted at all
    kTextMalformedInputErr,     // iconv reported EILSEQ: invalid multibyte sequence in the input
    kTextUndefinedElementErr,   // handled like malformed input by convertUsingIConv; not produced in the code visible here
    // own (not part of the Apple-compatible set above)
    kIConvUnknownCharset,       // g_iconv_open failed with EINVAL
    kUnknownError               // any other errno value
};



// Code unit 0xFEFF; the decoding helpers in this file drop it from converted text.
const UniChar BOM = 0xFEFF;


// Streaming decoder that turns bytes in a source encoding into a QString.
// Latin-1 and UTF-16 input are decoded by hand; every other encoding goes
// through a g_iconv converter. Bytes that cannot be decoded yet (a possible
// BOM prefix or a partial multibyte character) are kept in _bufferedBytes
// between calls.
class KWQTextDecoder : public QTextDecoder {
public:
    KWQTextDecoder(CFStringEncoding, KWQEncodingFlags, CFStringEncoding targetEncoding = kCFStringEncodingUnicode, KWQEncodingFlags targetFlags = NoEncodingFlags);
    ~KWQTextDecoder();
    
    // Decodes len bytes starting at chs; flush tells the decoder that no
    // further input will follow.
    QString toUnicode(const char *chs, int len, bool flush);
private:

    // Dispatches on _encoding to one of the convert* helpers below.
    QString convert(const char *chs, int len, bool flush)
        { return convert(reinterpret_cast<const unsigned char *>(chs), len, flush); }
    QString convert(const unsigned char *chs, int len, bool flush);
    QString convertLatin1(const unsigned char *chs, int len);
    QString convertUTF16(const unsigned char *chs, int len);
    QString convertUsingIConv(const unsigned char *chs, int len, bool flush);
    
    // Obtains _converter, either from the one-entry static cache or by opening a new one.
    OSStatus createIConvConverter();
    // Runs a single conversion call; inputLength/outputLength receive the byte counts consumed/produced.
    OSStatus convertOneChunkUsingIConv(const unsigned char *inputBuffer, int inputBufferLength, int &inputLength,
        void *outputBuffer, int outputBufferLength, int &outputLength);
    // Appends the given code units to s, skipping every 0 and BOM unit.
    static void appendOmittingNullsAndBOMs(QString &s, const UniChar *characters, int byteCount);
    
    // Copying is disabled: declared private and not defined in the code visible here.
    KWQTextDecoder(const KWQTextDecoder &);
    KWQTextDecoder &operator=(const KWQTextDecoder &);

    CFStringEncoding _encoding;       // source encoding; may be replaced when toUnicode detects a BOM
    CFStringEncoding _targetEncoding; // encoding the iconv converter produces
    bool _littleEndian;               // byte order used by convertUTF16
    bool _atStart;                    // true until toUnicode has finished looking for a BOM
    bool _error;                      // set after an unrecoverable decoding error; later calls return empty strings

    unsigned _numBufferedBytes;       // number of valid bytes in _bufferedBytes
    unsigned char _bufferedBytes[16]; // bigger than any single multi-byte character

    // State for decoding.
    GIConv _converter;

    // One-entry cache of the converter released by the most recently destroyed decoder.
    static GIConv  _cachedConverter;
    static CFStringEncoding _cachedConverterEncoding;
    static CFStringEncoding _cachedConverterTargetEncoding;
};

// The converter cache starts out empty: invalid handle, invalid encoding ids.
GIConv KWQTextDecoder::_cachedConverter = _g_iconv_invalid();
CFStringEncoding KWQTextDecoder::_cachedConverterEncoding = kCFStringEncodingInvalidId;
CFStringEncoding KWQTextDecoder::_cachedConverterTargetEncoding = kCFStringEncodingInvalidId;

extern "C" {
// Value-destroy callback for the encoding-to-codec hash table: each stored
// value is a heap-allocated QTextCodec owned by the table.
static void
encodingToCodecValueDestroyFunc(gpointer data)
{
    delete static_cast<QTextCodec*>(data);
}
}

/**  Deallocates static objects when the library is deinitialized (unloaded).
 * Used so that memory checkers (valgrind) complain less.
 *
 * No constructor, to avoid static init code. Not sure whether it is possible for a compiler not to
 * generate any init code. Check this.
 */
struct static_deinitialized {
    // Lazily created table mapping a codec hash value to its shared QTextCodec.
    static GHashTable *encodingToCodec;

    ~static_deinitialized()
    {
        // Destroying the table also runs its value-destroy callback on every codec.
        if (encodingToCodec != 0) {
            g_hash_table_destroy(encodingToCodec);
            encodingToCodec = 0;
        }
    }
};
GHashTable *static_deinitialized::encodingToCodec = 0;
// The destructor of this file-level object runs when the library unloads.
static static_deinitialized deinitializer;
    
/**
 * Returns the shared QTextCodec for an (encoding, flags) pair, creating and
 * caching it on first use. The returned codec is owned by the cache table.
 *
 * @param encoding  encoding id; kCFStringEncodingInvalidId yields 0
 * @param flags     encoding flags that are part of the codec identity
 * @return the cached codec, or 0 for the invalid encoding id
 *
 * NOTE(review): the table is keyed by QTextCodec::hash() alone, so two
 * different (encoding, flags) pairs with the same hash value would share one
 * codec. This matches the previous behaviour and is left unchanged here.
 */
static QTextCodec *codecForCFStringEncoding(CFStringEncoding encoding, KWQEncodingFlags flags)
{
    if (encoding == kCFStringEncodingInvalidId) {
        return 0;
    }

    if (G_UNLIKELY(static_deinitialized::encodingToCodec == 0)) {
        static_deinitialized::encodingToCodec = g_hash_table_new_full(g_direct_hash,
                                                                      g_direct_equal,
                                                                      NULL,
                                                                      encodingToCodecValueDestroyFunc);
    }

    // Compute the key from a stack-allocated probe so that a cache hit (the
    // common case) no longer costs a heap allocation and an immediate delete.
    const QTextCodec probe(encoding, flags);
    const gint hashValue = probe.hash();
    const gpointer key = GINT_TO_POINTER(hashValue);

    QTextCodec *cached = static_cast<QTextCodec*>(g_hash_table_lookup(static_deinitialized::encodingToCodec, key));
    if (cached) {
        return cached;
    }

    // First request for this key: the table takes ownership of the new codec.
    QTextCodec *created = new QTextCodec(encoding, flags);
    g_hash_table_insert(static_deinitialized::encodingToCodec, key, created);
    return created;
}

// Looks up the shared codec for an IANA charset name.
QTextCodec *QTextCodec::codecForName(const char *name)
{
    // The name lookup receives a pointer to encodingFlags as an out-parameter.
    KWQEncodingFlags encodingFlags;
    const CFStringEncoding resolved = KWQCFStringEncodingFromIANACharsetName(name, &encodingFlags);
    return codecForCFStringEncoding(resolved, encodingFlags);
}


// Same as codecForName, except that a name resolving to
// kCFStringEncodingUnicode is mapped to the UTF-8 codec instead.
QTextCodec *QTextCodec::codecForNameEightBitOnly(const char *name)
{
    KWQEncodingFlags encodingFlags;
    CFStringEncoding resolved = KWQCFStringEncodingFromIANACharsetName(name, &encodingFlags);
    if (resolved == kCFStringEncodingUnicode) {
        resolved = kCFStringEncodingUTF8;
    }
    return codecForCFStringEncoding(resolved, encodingFlags);
}

// Returns the codec for the charset name that g_get_charset reports.
QTextCodec *QTextCodec::codecForLocale()
{
    const gchar *localeCharset;
    g_get_charset(&localeCharset);
    return codecForName(localeCharset);
}

// Returns the stored encoding string when there is one; otherwise the IANA
// charset name derived from _encoding.
const char *QTextCodec::name() const
{
    if (_encodingString) {
        return _encodingString;
    }
    return KWQCFStringEncodingToIANACharsetName(_encoding);
}

// Creates a new decoder for this codec's encoding; the caller receives a
// heap-allocated object.
QTextDecoder *QTextCodec::makeDecoder() const
{
    QTextDecoder *decoder = new KWQTextDecoder(_encoding, _flags);
    return decoder;
}

// Maps ISO Latin-1 and ASCII onto Windows Latin-1; every other encoding is
// returned unchanged.
inline CFStringEncoding effectiveEncoding(CFStringEncoding e)
{
    const bool treatedAsWindowsLatin1 =
        (e == kCFStringEncodingISOLatin1) || (e == kCFStringEncodingASCII);
    if (treatedAsWindowsLatin1) {
        return kCFStringEncodingWindowsLatin1;
    }
    return e;
}

/**
 * Encodes a Unicode string into this codec's encoding.
 *
 * The UTF-16 data is first converted to UTF-8 and then, unless UTF-8 is the
 * requested encoding, to the charset returned by name() using g_convert.
 *
 * @param qcs  string to encode
 * @return the encoded bytes, or an empty QCString when qcs is empty or a
 *         conversion step fails (the failure is reported through
 *         REPORT_G_CONV_ERROR when GLib supplied an error)
 */
QCString QTextCodec::fromUnicode(const QString &qcs) const
{
    if (qcs.length() <= 0) {
        //g_warning("QTextCodec::fromUnicode empty string");
        return QCString();
    }

    const QChar* data = qcs.unicode();
    int len = qcs.length();

    // FIXME: Since there's no "force ASCII range" mode in CFString, we change the backslash into a yen sign.
    QString copy;
    QChar currencySymbol = backslashAsCurrencySymbol();
    if (currencySymbol != '\\' && qcs.find('\\') != -1) {
        copy = qcs;
        copy.replace('\\', currencySymbol);
        data = copy.unicode();
        len = copy.length();
    }

    // Handle UCS-2 -> UCS-2
    if (_encoding == kCFStringEncodingUnicode && _flags == NoEncodingFlags)
        return QCString(reinterpret_cast<const char*>(data), 2*len);

    GError* err = NULL;
    glong r_utf8 = 0;
    glong w_utf8 = 0;

    // UCS-2 -> UTF-8
    gchar *utf8 = g_utf16_to_utf8(reinterpret_cast<const gunichar2*>(data), len,
                                  &r_utf8, &w_utf8, &err);

    // Stop right here when the first step fails: w_utf8 is not meaningful in
    // that case, and a GError that is already set must not be handed to
    // another GLib call.
    if (err != NULL || utf8 == NULL) {
        if (err != NULL) {
            REPORT_G_CONV_ERROR(err);
            g_error_free(err);
        }
        g_free(utf8);
        return QCString();
    }

    gchar *dest = 0;
    gsize r_dest = 0;
    gsize w_dest = 0;

    if (_encoding == kCFStringEncodingUTF8 && _flags == NoEncodingFlags) {
        // If UTF-8 is what we wanted
        dest = utf8;
        w_dest = w_utf8;
    } else {
        // Handle UTF-8 -> destination charset.
        // The length includes the terminating NUL, so it is converted as well.
        dest = g_convert(reinterpret_cast<const gchar*>(utf8),
                         w_utf8+1,
                         name(),
                         "UTF-8",
                         &r_dest,
                         &w_dest,
                         &err);
        g_free(utf8);
    }

    if (err != NULL || dest == NULL) {
        if (err != NULL) {
            REPORT_G_CONV_ERROR(err);
            g_error_free(err);
        }
        g_free(dest); // g_free accepts NULL
        return QCString();
    }

    QCString str(dest, w_dest+1);
    g_free(dest);
    return str;
}


// Decodes len bytes in one shot using a temporary decoder with flush enabled.
QString QTextCodec::toUnicode(const char *chs, int len) const
{
    KWQTextDecoder oneShotDecoder(_encoding, _flags);
    return oneShotDecoder.toUnicode(chs, len, true);
}

// Decodes the first len bytes of qba in one shot using a temporary decoder
// with flush enabled.
QString QTextCodec::toUnicode(const QByteArray &qba, int len) const
{
    KWQTextDecoder oneShotDecoder(_encoding, _flags);
    return oneShotDecoder.toUnicode(qba.data(), len, true);
}
    
// Returns the character that stands in for a backslash in this encoding:
// the yen sign for the two Japanese encodings listed, a plain backslash otherwise.
QChar QTextCodec::backslashAsCurrencySymbol() const
{
    // FIXME: We should put this information into KWQCharsetData instead of having a switch here.
    if (_encoding == kCFStringEncodingShiftJIS_X0213_00 || _encoding == kCFStringEncodingEUC_JP) {
        return 0x00A5; // yen sign
    }
    return '\\';
}

// Two codecs are equal when both their encoding and their flags match.
bool operator==(const QTextCodec &a, const QTextCodec &b)
{
    if (a._encoding != b._encoding) {
        return false;
    }
    return a._flags == b._flags;
}

// Mixes _encoding and _flags into a single unsigned hash value.
unsigned QTextCodec::hash() const
{
    unsigned mixed = _encoding;

    // Spread the encoding bits before folding in the flags.
    mixed += mixed << 10;
    mixed ^= mixed << 6;

    mixed ^= _flags;

    // Final avalanche steps.
    mixed += mixed << 3;
    mixed ^= mixed >> 11;
    mixed += mixed << 15;

    return mixed;
}
#if 0 //KWIQ: Codec comparison not used at the moment
// Disabled equality callback: compares two QTextCodec objects passed as untyped pointers.
static boolean QTextCodecsEqual(const void *a, const void *b)
{
    return *static_cast<const QTextCodec *>(a) == *static_cast<const QTextCodec *>(b);
}

// Disabled hash callback: forwards to QTextCodec::hash().
static CFHashCode QTextCodecHash(const void *value)
{
    return static_cast<const QTextCodec *>(value)->hash();
}
#endif

// ================

// Out-of-line destructor of the decoder base class; it has nothing to release.
QTextDecoder::~QTextDecoder()
{
}

// ================

// Sets up a decoder at the start of a stream: nothing buffered, no error,
// and no iconv converter yet (one is obtained by createIConvConverter).
// The targetFlags argument is accepted but not used.
KWQTextDecoder::KWQTextDecoder(CFStringEncoding e, KWQEncodingFlags f, CFStringEncoding targetEncoding, KWQEncodingFlags /*targetFlags*/)
    : _encoding(e),
      _targetEncoding(targetEncoding),
      _littleEndian((f & ::LittleEndian) != 0),
      _atStart(true),
      _error(false),
      _numBufferedBytes(0),
      _converter(_g_iconv_invalid())
{
}

// Hands this decoder's converter (if it has one) to the one-entry static
// cache, closing whatever converter the cache held before.
KWQTextDecoder::~KWQTextDecoder()
{
    if (!_g_iconv_is_valid(_converter)) {
        return;
    }

    if (_g_iconv_is_valid(_cachedConverter)) {
        g_iconv_close(_cachedConverter);
    }
    _cachedConverter = _converter;

    // createIConvConverter opens converters for, and looks the cache up by,
    // the *effective* encodings. Tag the cache entry the same way; tagging it
    // with the raw ids made the lookup miss for encodings that
    // effectiveEncoding remaps (such as ASCII).
    _cachedConverterEncoding = effectiveEncoding(_encoding);
    _cachedConverterTargetEncoding = effectiveEncoding(_targetEncoding);
}

// Builds a QString from 8-bit input, leaving out every NUL byte.
QString KWQTextDecoder::convertLatin1(const unsigned char *s, int length)
{
    ASSERT(_numBufferedBytes == 0);

    const char *bytes = reinterpret_cast<const char *>(s);

    // Fast path: without any NUL byte the input converts in a single step.
    int firstNul = 0;
    while (firstNul != length && s[firstNul] != 0) {
        ++firstNul;
    }
    if (firstNul == length) {
        return QString(bytes, length);
    }

    QString result;
    result.reserve(length);

    // Everything before the first NUL goes in as one piece.
    result.append(bytes, firstNul);

    // After that, append each run of non-NUL bytes between consecutive NULs.
    int runStart = firstNul + 1;
    for (int pos = runStart; pos != length; ++pos) {
        if (s[pos] != 0) {
            continue;
        }
        if (runStart != pos) {
            result.append(bytes + runStart, pos - runStart);
        }
        runStart = pos + 1;
    }

    // Trailing run after the last NUL, if any.
    if (runStart != length) {
        result.append(bytes + runStart, length - runStart);
    }

    return result;
}

/**
 * Decodes UTF-16 input in the byte order given by _littleEndian, dropping
 * NUL and BOM code units.
 *
 * A code unit may be split across two calls: a trailing odd byte is kept in
 * _bufferedBytes[0] and completed with the first byte of the next call.
 *
 * @param s       input bytes
 * @param length  number of bytes available at s
 * @return the decoded characters
 */
QString KWQTextDecoder::convertUTF16(const unsigned char *s, int length)
{
    ASSERT(_numBufferedBytes == 0 || _numBufferedBytes == 1);

    const unsigned char *p = s;
    unsigned len = length;

    QString result;

    result.reserve(length / 2);

    // Complete a code unit whose first byte arrived with the previous call.
    if (_numBufferedBytes != 0 && len != 0) {
        ASSERT(_numBufferedBytes == 1);
        UniChar c;
        if (_littleEndian) {
            c = _bufferedBytes[0] | (p[0] << 8);
        } else {
            c = (_bufferedBytes[0] << 8) | p[0];
        }
        // Apply the same filter as the main loop below, so the result does
        // not depend on where the input happened to be split.
        if (c && c != BOM) {
            result.append(reinterpret_cast<QChar *>(&c), 1);
        }
        _numBufferedBytes = 0;
        p += 1;
        len -= 1;
    }

    while (len > 1) {
        UniChar buffer[16384];
        const int runLength = MIN(len / 2, sizeof(buffer) / sizeof(buffer[0]));
        int bufferLength = 0;
        if (_littleEndian) {
            for (int i = 0; i < runLength; ++i) {
                UniChar c = p[0] | (p[1] << 8);
                p += 2;
                if (c && c != BOM) {
                    buffer[bufferLength++] = c;
                }
            }
        } else {
            for (int i = 0; i < runLength; ++i) {
                UniChar c = (p[0] << 8) | p[1];
                p += 2;
                if (c && c != BOM) {
                    buffer[bufferLength++] = c;
                }
            }
        }
        result.append(reinterpret_cast<QChar *>(buffer), bufferLength);

        // Account for every byte consumed, not only for the code units kept:
        // p advanced by runLength * 2 even where NULs or BOMs were dropped.
        // Using bufferLength here left len too large and made the loop read
        // past the end of the input.
        len -= runLength * 2;
    }

    // Keep a trailing odd byte for the next call.
    if (len) {
        ASSERT(_numBufferedBytes == 0);
        _numBufferedBytes = 1;
        _bufferedBytes[0] = p[0];
    }

    return result;
}

/**
 * Obtains the iconv converter for this decoder and stores it in _converter.
 *
 * The one-entry static cache is used when it holds a valid converter for the
 * same effective source and target encodings; otherwise a new converter is
 * opened with g_iconv_open.
 *
 * @return noErr on success; kIConvUnknownCharset when a charset name is
 *         missing or g_iconv_open fails with EINVAL; kUnknownError for any
 *         other g_iconv_open failure
 */
OSStatus KWQTextDecoder::createIConvConverter()
{
    const CFStringEncoding encoding = effectiveEncoding(_encoding);
    const CFStringEncoding targetEncoding = effectiveEncoding(_targetEncoding);

    // Require a valid cached handle: an empty cache is tagged with
    // kCFStringEncodingInvalidId, which would otherwise "match" a decoder
    // whose own encoding is the invalid id.
    if (_g_iconv_is_valid(_cachedConverter)
        && _cachedConverterEncoding == encoding
        && _cachedConverterTargetEncoding == targetEncoding) {
        ASSERT(_encoding != kCFStringEncodingInvalidId);

        // Take the converter out of the cache and reset its shift state.
        _converter = _cachedConverter;
        _cachedConverter = _g_iconv_invalid();
        _cachedConverterEncoding = kCFStringEncodingInvalidId;
        _cachedConverterTargetEncoding = kCFStringEncodingInvalidId;
        _g_iconv_clear_state(_converter);
        return noErr;
    }

    const gchar *from = KWQCFStringEncodingToIANACharsetName(encoding);
    const gchar *to = KWQCFStringEncodingToIANACharsetName(targetEncoding);

    // NOTE(review): whether the name lookup can return NULL is not visible
    // here; guard against it so NULL is never handed to g_iconv_open or "%s".
    OSStatus status = kIConvUnknownCharset;
    if (from && to) {
        _converter = g_iconv_open(to, from);
        if (_g_iconv_is_valid(_converter)) {
            return noErr;
        }
        status = (errno == EINVAL) ? kIConvUnknownCharset : kUnknownError;
    }

    ERROR("the Text Encoding Converter won't convert from text encoding 0x%X to encoding 0x%X, (%s -> %s) error %d", encoding, targetEncoding, from ? from : "(null)", to ? to : "(null)", status);
    return status;
}

// Appends the code units in characters (byteCount bytes in total) to s,
// leaving out every unit that is 0 or the BOM value. Kept units are appended
// run by run rather than one at a time.
void KWQTextDecoder::appendOmittingNullsAndBOMs(QString &s, const UniChar *characters, int byteCount)
{
    ASSERT(byteCount % sizeof(UniChar) == 0);

    const int characterCount = byteCount / sizeof(UniChar);
    int runStart = 0;
    int pos = 0;

    while (pos != characterCount) {
        const UniChar unit = characters[pos];
        const bool omitted = (unit == 0) || (unit == BOM);
        if (omitted) {
            // Flush the run of kept units that ends just before this one.
            if (pos != runStart) {
                s.append(reinterpret_cast<const QChar *>(&characters[runStart]), pos - runStart);
            }
            runStart = pos + 1;
        }
        ++pos;
    }

    // Flush the final run, if any.
    if (runStart != characterCount) {
        s.append(reinterpret_cast<const QChar *>(&characters[runStart]), characterCount - runStart);
    }
}


/*   The  conversion can stop for four reasons:
 *   1.  An  invalid multibyte sequence is encountered in the input. In this
 *   case it sets errno to EILSEQ and returns (size_t)(-1). *inbuf  is  left
 *   pointing to the beginning of the invalid multibyte sequence.
 *
 *   2.   The   input  byte  sequence  has  been  entirely  converted,  i.e.
 *        *inbytesleft has gone down to 0. In this case iconv returns the  number
 *        of non-reversible conversions performed during this call.
 *
 *   3.  An  incomplete  multibyte sequence is encountered in the input, and
 *   the input byte sequence terminates after it. In this case it sets errno
 *   to  EINVAL  and  returns  (size_t)(-1).  *inbuf is left pointing to the
 *   beginning of the incomplete multibyte sequence.
 *
 *   4. The output buffer has no more room for the next converted character.
 *   In this case it sets errno to E2BIG and returns (size_t)(-1).
 */

/* we are lazy, and like the idea that return results make sense semantically, so 
 * we emulate TECConvertText()
 */

// Runs one g_iconv call and reports the outcome as an OSStatus.
// *outBytesRead and *outBytesWritten receive the number of input bytes
// consumed and output bytes produced, whether or not the call succeeded.
static OSStatus
IConvConvertText(GIConv converter, unsigned char* inputBuffer, int inputLen, unsigned long *outBytesRead,
                 unsigned char* outputBuffer, int outputLen, unsigned long* outBytesWritten)
{
    ASSERT(_g_iconv_is_valid(converter));

    gchar *src = reinterpret_cast<gchar *>(inputBuffer);
    gchar *dst = reinterpret_cast<gchar *>(outputBuffer);
    gsize srcRemaining = (gsize) inputLen;
    gsize dstRemaining = (gsize) outputLen;

    const size_t converted = g_iconv(converter, &src, &srcRemaining, &dst, &dstRemaining);
    *outBytesRead = inputLen - srcRemaining;
    *outBytesWritten = outputLen - dstRemaining;

    if (converted != ((size_t)-1)) {
        return noErr;
    }

    // Translate the errno value left by the failed call.
    switch (errno) {
    case EILSEQ:
        return kTextMalformedInputErr;
    case E2BIG:
        return kTECOutputBufferFullStatus;
    case EINVAL:
        return kTECPartialCharErr;
    default:
        ERROR("Unknonwn error: %d", errno);
        return kUnknownError;
    }
}

// Performs a single conversion step with _converter.
//
// When bytes of a partial character are pending in _bufferedBytes, the buffer
// is topped up from inputBuffer and converted on its own; otherwise
// inputBuffer is converted directly.
//
// inputLength receives the number of bytes consumed from inputBuffer (bytes
// that were already buffered before the call are not counted), outputLength
// the number of bytes written to outputBuffer. Returns the status of the
// conversion, adjusted as described in the comments below.
OSStatus KWQTextDecoder::convertOneChunkUsingIConv(const unsigned char *inputBuffer, int inputBufferLength, int &inputLength,
    void *outputBuffer, int outputBufferLength, int &outputLength)
{
    OSStatus status;
    unsigned long bytesRead = 0;
    unsigned long bytesWritten = 0;

    if (_numBufferedBytes != 0) {
        // Finish converting a partial character that's in our buffer.
        
        // First, fill the partial character buffer with as many bytes as are available.
        ASSERT(_numBufferedBytes < sizeof(_bufferedBytes));
        const int spaceInBuffer = sizeof(_bufferedBytes) - _numBufferedBytes;
        const int bytesToPutInBuffer = MIN(spaceInBuffer, inputBufferLength);
        ASSERT(bytesToPutInBuffer != 0);
        memcpy(_bufferedBytes + _numBufferedBytes, inputBuffer, bytesToPutInBuffer);

        // Now, do a conversion on the buffer.
	status = IConvConvertText(_converter, _bufferedBytes, _numBufferedBytes + bytesToPutInBuffer, &bytesRead,
				reinterpret_cast<unsigned char *>(outputBuffer), outputBufferLength, &bytesWritten);

        if (status == kTECPartialCharErr && bytesRead == 0) {
            // Handle the case where the partial character was not converted.
            if (bytesToPutInBuffer >= spaceInBuffer) {
                // The buffer is completely full and still holds no complete
                // character: give up on its contents.
                // NOTE(review): "%u" is paired with a sizeof expression (size_t) here.
                ERROR("TECConvertText gave a kTECPartialCharErr but read none of the %u bytes in the buffer", sizeof(_bufferedBytes));
                _numBufferedBytes = 0;
                status = kTECUnmappableElementErr; // should never happen, but use this error code
            } else {
                // Tell the caller we read all the source bytes and keep them in the buffer.
                _numBufferedBytes += bytesToPutInBuffer;
                bytesRead = bytesToPutInBuffer;
                status = noErr;
            }
        } else {
            // We are done with the partial character buffer.
            // Also, we have read some of the bytes from the main buffer.
            if (bytesRead > _numBufferedBytes) {
                // Report only the bytes that came from inputBuffer.
                bytesRead -= _numBufferedBytes;
            } else {
                ERROR("TECConvertText accepted some bytes it previously rejected with kTECPartialCharErr");
                bytesRead = 0;
            }
            _numBufferedBytes = 0;
            if (status == kTECPartialCharErr) {
                // While there may be a partial character problem in the small buffer,
                // we have to try again and not get confused and think there is a partial
                // character problem in the large buffer.
                status = noErr;
            }
        }
    } else {
        // Nothing pending: convert straight from the caller's buffer.
        status = IConvConvertText(_converter, const_cast<unsigned char*>(inputBuffer), inputBufferLength, &bytesRead,
				  reinterpret_cast<unsigned char *>(outputBuffer), 
				  outputBufferLength, &bytesWritten);
    }

#if 0
    // Work around bug 3351093, where sometimes we get kTECBufferBelowMinimumSizeErr instead of kTECOutputBufferFullStatus.
    if (status == kTECBufferBelowMinimumSizeErr && bytesWritten != 0) {
        status = kTECOutputBufferFullStatus;
    }
#endif

    inputLength = bytesRead;
    outputLength = bytesWritten;
    return status;
}

/**
 * Decodes input through the iconv converter, chunk by chunk.
 *
 * Malformed input is skipped one byte at a time. A partial character at the
 * end of the input is saved in _bufferedBytes for the next call. NUL and BOM
 * code units are dropped from the result.
 *
 * @param chs    input bytes
 * @param len    number of bytes available at chs
 * @param flush  currently unused (the flushing code is disabled)
 * @return the decoded text; an empty string when no converter is available
 *         or an unexpected conversion error occurs (the latter also sets _error)
 */
QString KWQTextDecoder::convertUsingIConv(const unsigned char *chs, int len, bool flush)
{
    // Get a converter for the passed-in encoding.
    if (!_g_iconv_is_valid(_converter) && createIConvConverter() != noErr) {
        return QString();
    }

    QString result;

    result.reserve(len);

    const unsigned char *sourcePointer = chs;
    int sourceLength = len;
    bool bufferWasFull = false;
    UniChar buffer[16384];

    while (sourceLength || bufferWasFull) {
        int bytesRead = 0;
        int bytesWritten = 0;
        OSStatus status = convertOneChunkUsingIConv(sourcePointer, sourceLength, bytesRead, buffer, sizeof(buffer), bytesWritten);
        ASSERT(bytesRead <= sourceLength);
        sourcePointer += bytesRead;
        sourceLength -= bytesRead;

        switch (status) {
            case noErr:
            case kTECOutputBufferFullStatus:
                break;
            case kTextMalformedInputErr:
            case kTextUndefinedElementErr:
                // FIXME: Put FFFD character into the output string in this case?
                // Reset the converter and skip the offending byte.
                _g_iconv_clear_state(_converter);
                if (sourceLength) {
                    sourcePointer += 1;
                    sourceLength -= 1;
                }
                break;
            case kTECPartialCharErr: {
                // Put the partial character into the buffer.
                ASSERT(_numBufferedBytes == 0);
                // The limit is the capacity of the byte buffer. This used to
                // be sizeof(_numBufferedBytes), i.e. the size of the counter,
                // which rejected partial characters longer than 3 bytes.
                const int bufferSize = sizeof(_bufferedBytes);
                if (sourceLength < bufferSize) {
                    memcpy(_bufferedBytes, sourcePointer, sourceLength);
                    _numBufferedBytes = sourceLength;
                } else {
                    ERROR("TECConvertText gave a kTECPartialCharErr, but left %u bytes in the buffer", sourceLength);
                }
                sourceLength = 0;
                break;
            }
            default:
                ERROR("text decoding failed with error %d", status);
                _error = true;
                return QString();
        }

        appendOmittingNullsAndBOMs(result, buffer, bytesWritten);

        // A full output buffer means more converted data may be pending, so
        // go around again even if all the input has been consumed.
        bufferWasFull = status == kTECOutputBufferFullStatus;
    }
#if 0    
    if (flush) {
        unsigned long bytesWritten = 0;
        TECFlushText(_converter, reinterpret_cast<unsigned char *>(buffer), sizeof(buffer), &bytesWritten);
        appendOmittingNullsAndBOMs(result, buffer, bytesWritten);
    }
#endif

#if 0
    // Workaround for a bug in the Text Encoding Converter (see bug 3225472).
    // Simplified Chinese pages use the code U+A3A0 to mean "full-width space".
    // But GB18030 decodes it to U+E5E5, which is correct in theory but not in practice.
    // To work around, just change all occurences of U+E5E5 to U+3000 (ideographic space).
    if (_encoding == kCFStringEncodingGB_18030_2000) {
        result.replace(0xE5E5, 0x3000);
    }
#endif

    return result;
}

// Routes the input to the decoding helper that matches _encoding:
// Latin-1 variants and UTF-16 are decoded by hand, everything else via iconv.
QString KWQTextDecoder::convert(const unsigned char *chs, int len, bool flush)
{
    //#define PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE 1000

    if (_encoding == kCFStringEncodingISOLatin1 || _encoding == kCFStringEncodingWindowsLatin1) {
        return convertLatin1(chs, len);
    }

    if (_encoding == kCFStringEncodingUnicode) {
        return convertUTF16(chs, len);
    }

#if PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE
    // Test mode: feed the converter small chunks to exercise the
    // partial-character handling.
    QString result;
    int offset = 0;
    while (offset != len) {
        int chunkSize = len - offset;
        if (chunkSize > PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE) {
            chunkSize = PARTIAL_CHARACTER_HANDLING_TEST_CHUNK_SIZE;
        }
        result += convertUsingIConv(chs + offset, chunkSize, flush && (offset + chunkSize == len));
        offset += chunkSize;
    }
    return result;
#else
    return convertUsingIConv(chs, len, flush);
#endif
}

/**
 * Decodes the next piece of a byte stream.
 *
 * At the start of the stream the first bytes are examined for a UTF-16 or
 * UTF-8 byte-order mark. A BOM overrides the decoder's encoding and is
 * skipped. Until it is known whether a BOM is present, input is held back in
 * _bufferedBytes and an empty string is returned.
 *
 * @param chs    input bytes
 * @param len    number of bytes available at chs (must not be negative)
 * @param flush  true when no further input will follow
 * @return the text decoded by this call; empty after an earlier error, for a
 *         null chs, or when there is nothing to do
 */
QString KWQTextDecoder::toUnicode(const char *chs, int len, bool flush)
{
    ASSERT_ARG(len, len >= 0);

    if (_error || !chs || (len <= 0 && !flush)) {
        return QString();
    }

    // Handle normal case.
    if (!_atStart) {
        return convert(chs, len, flush);
    }

    // Check to see if we found a BOM.
    // The first three bytes of the stream are taken from the held-back bytes
    // first, then from chs; missing bytes read as 0.
    int numBufferedBytes = _numBufferedBytes;
    int buf1Len = numBufferedBytes;
    int buf2Len = len;
    const unsigned char *buf1 = _bufferedBytes;
    const unsigned char *buf2 = reinterpret_cast<const unsigned char *>(chs);
    unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    int BOMLength = 0;
    if (c1 == 0xFF && c2 == 0xFE) {
        _encoding = kCFStringEncodingUnicode;
        _littleEndian = true;
        BOMLength = 2;
    } else if (c1 == 0xFE && c2 == 0xFF) {
        _encoding = kCFStringEncodingUnicode;
        _littleEndian = false;
        BOMLength = 2;
    } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
        _encoding = kCFStringEncodingUTF8;
        BOMLength = 3;
    }

    // Handle case where we found a BOM.
    if (BOMLength != 0) {
        ASSERT(numBufferedBytes + len >= BOMLength);
        // Part of the BOM may already sit in the held-back bytes; skip only
        // the remainder in chs.
        int skip = BOMLength - numBufferedBytes;
        _numBufferedBytes = 0;
        _atStart = false;
        return len == skip ? QString() : convert(chs + skip, len - skip, flush);
    }

    // Handle case where we know there is no BOM coming.
    const int bufferSize = sizeof(_bufferedBytes);
    if (numBufferedBytes + len > bufferSize || flush) {
        _atStart = false;
        if (numBufferedBytes == 0) {
            return convert(chs, len, flush);
        }
        unsigned char bufferedBytes[sizeof(_bufferedBytes)];
        memcpy(bufferedBytes, _bufferedBytes, numBufferedBytes);
        _numBufferedBytes = 0;

        // convert() changes decoder state, so the held-back bytes must be
        // decoded before the new input. In "convert(a) + convert(b)" the order
        // of the two calls is unspecified; use separate statements instead.
        const QString head = convert(bufferedBytes, numBufferedBytes, false);
        const QString tail = convert(chs, len, flush);
        return head + tail;
    }

    // Continue to look for the BOM.
    memcpy(&_bufferedBytes[numBufferedBytes], chs, len);
    _numBufferedBytes += len;
    return QString();
}
