ultimatepp/bazaar/plugin/gdal/port/cpl_recode.cpp
cxl 23ff1e7e82 .gdal moved to bazaar
git-svn-id: svn://ultimatepp.org/upp/trunk@9273 f0d560ea-af0d-0410-9eb7-867de7ffcac7
2015-12-07 13:36:24 +00:00

363 lines
14 KiB
C++

/**********************************************************************
* $Id: cpl_recode.cpp 27044 2014-03-16 23:41:27Z rouault $
*
* Name: cpl_recode.cpp
* Project: CPL - Common Portability Library
* Purpose: Character set recoding and char/wchar_t conversions.
* Author: Andrey Kiselev, dron@ak4719.spb.edu
*
**********************************************************************
* Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu>
* Copyright (c) 2008, Frank Warmerdam
* Copyright (c) 2011-2014, Even Rouault <even dot rouault at mines-paris dot org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
**********************************************************************/
#include "cpl_string.h"
CPL_CVSID("$Id: cpl_recode.cpp 27044 2014-03-16 23:41:27Z rouault $");
/* -------------------------------------------------------------------- */
/*      Prototypes of the recoding backends used by the public          */
/*      functions in this file. They are declared here with extern      */
/*      linkage and not defined in this file. The iconv-based variants  */
/*      are only declared when CPL_RECODE_ICONV is defined; the stub    */
/*      variants are always declared.                                   */
/* -------------------------------------------------------------------- */
#ifdef CPL_RECODE_ICONV
extern void CPLClearRecodeIconvWarningFlags();
extern char *CPLRecodeIconv( const char *, const char *, const char * );
extern char *CPLRecodeFromWCharIconv( const wchar_t *,
const char *, const char * );
extern wchar_t *CPLRecodeToWCharIconv( const char *,
const char *, const char * );
#endif /* CPL_RECODE_ICONV */
extern void CPLClearRecodeStubWarningFlags();
extern char *CPLRecodeStub( const char *, const char *, const char * );
extern char *CPLRecodeFromWCharStub( const wchar_t *,
const char *, const char * );
extern wchar_t *CPLRecodeToWCharStub( const char *,
const char *, const char * );
/* Used by CPLIsUTF8(): takes the data pointer and a length. */
extern int CPLIsUTF8Stub( const char *, int );
/************************************************************************/
/* CPLRecode() */
/************************************************************************/
/**
 * Recode a string from one character encoding into another.
 *
 * Only CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8859_1 are guaranteed to
 * be supported. The conversions currently supported are:
 * <ul>
 *  <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (a plain copy, no
 *      conversion is needed)</li>
 *  <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
 *  <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
 * </ul>
 *
 * When a conversion fails, an error may or may not be posted with
 * CPLError().
 *
 * @param pszSource a NULL terminated string.
 * @param pszSrcEncoding the source encoding.
 * @param pszDstEncoding the destination encoding.
 *
 * @return a NULL terminated string which should be freed with CPLFree().
 *
 * @since GDAL 1.6.0
 */
char CPL_DLL *CPLRecode( const char *pszSource,
                         const char *pszSrcEncoding,
                         const char *pszDstEncoding )
{
/* -------------------------------------------------------------------- */
/*      Short cut: identical encodings only need a copy.                */
/* -------------------------------------------------------------------- */
    if( EQUAL(pszSrcEncoding, pszDstEncoding) )
        return CPLStrdup(pszSource);

    const int bDstIsUTF8 = EQUAL(pszDstEncoding, CPL_ENC_UTF8);
    const int bDstIsLatin1 = EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1);

/* -------------------------------------------------------------------- */
/*      Short cut: ASCII text is copied unchanged when the target is    */
/*      UTF-8 or ISO-8859-1.                                            */
/* -------------------------------------------------------------------- */
    if( EQUAL(pszSrcEncoding, CPL_ENC_ASCII)
        && ( bDstIsUTF8 || bDstIsLatin1 ) )
        return CPLStrdup(pszSource);

#ifdef CPL_RECODE_ICONV
/* -------------------------------------------------------------------- */
/*      CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8                               */
/*      and CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1 conversions are handled   */
/*      very well by the stub implementation which is faster than the   */
/*      iconv() route. Anything other than these two pairs goes         */
/*      through iconv().                                                */
/* -------------------------------------------------------------------- */
    const int bLatin1ToUTF8 =
        EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1) && bDstIsUTF8;
    const int bUTF8ToLatin1 =
        EQUAL(pszSrcEncoding, CPL_ENC_UTF8) && bDstIsLatin1;

    if( !bLatin1ToUTF8 && !bUTF8ToLatin1 )
        return CPLRecodeIconv( pszSource, pszSrcEncoding, pszDstEncoding );
#endif /* CPL_RECODE_ICONV */

    /* Stub path: the only path without iconv, and the fast path with it. */
    return CPLRecodeStub( pszSource, pszSrcEncoding, pszDstEncoding );
}
/************************************************************************/
/* CPLRecodeFromWChar() */
/************************************************************************/
/**
 * Convert a wchar_t string to a multi-byte string (typically UTF-8).
 *
 * The only source encoding guaranteed to be supported is CPL_ENC_UCS2, and
 * the only destination encodings guaranteed to be supported are
 * CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8859_1. Other encodings may be
 * available in some cases (i.e. when iconv() is used).
 *
 * Note that the size of wchar_t differs between systems: it is normally
 * 2 bytes on win32 and 4 bytes on unix.
 *
 * When a conversion fails, an error may or may not be posted with
 * CPLError().
 *
 * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
 * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
 * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
 *
 * @return a zero terminated multi-byte string which should be freed with
 * CPLFree(), or NULL if an error occurs.
 *
 * @since GDAL 1.6.0
 */
char CPL_DLL *CPLRecodeFromWChar( const wchar_t *pwszSource,
                                  const char *pszSrcEncoding,
                                  const char *pszDstEncoding )
{
#ifdef CPL_RECODE_ICONV
/* -------------------------------------------------------------------- */
/*      The stub implementation handles CPL_ENC_UCS2 / "WCHAR_T"        */
/*      sources well when the target is CPL_ENC_UTF8, CPL_ENC_ASCII     */
/*      or CPL_ENC_ISO8859_1. Every other combination is sent to        */
/*      iconv().                                                        */
/* -------------------------------------------------------------------- */
    const int bSrcIsWide =
        EQUAL(pszSrcEncoding, CPL_ENC_UCS2)
        || EQUAL(pszSrcEncoding, "WCHAR_T");
    const int bDstIsStubFriendly =
        EQUAL(pszDstEncoding, CPL_ENC_UTF8)
        || EQUAL(pszDstEncoding, CPL_ENC_ASCII)
        || EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1);

    if( !( bSrcIsWide && bDstIsStubFriendly ) )
        return CPLRecodeFromWCharIconv( pwszSource,
                                        pszSrcEncoding, pszDstEncoding );
#endif /* CPL_RECODE_ICONV */

    return CPLRecodeFromWCharStub( pwszSource,
                                   pszSrcEncoding, pszDstEncoding );
}
/************************************************************************/
/* CPLRecodeToWChar() */
/************************************************************************/
/**
 * Convert a multi-byte string (typically UTF-8) to a wchar_t string.
 *
 * Converts an 8bit, multi-byte per character input string into a wide
 * character (wchar_t) string. The only source encodings guaranteed to be
 * supported are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8859_1 (LATIN1),
 * and the only destination encoding guaranteed to be supported is
 * CPL_ENC_UCS2. Other source and destination encodings may be supported
 * depending on the underlying implementation.
 *
 * Note that the size of wchar_t differs between systems: it is normally
 * 2 bytes on win32 and 4 bytes on unix.
 *
 * When a conversion fails, an error may or may not be posted with
 * CPLError().
 *
 * @param pszSource input multi-byte character string.
 * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
 * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
 *
 * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
 * NULL on error.
 *
 * @since GDAL 1.6.0
 */
wchar_t CPL_DLL *CPLRecodeToWChar( const char *pszSource,
                                   const char *pszSrcEncoding,
                                   const char *pszDstEncoding )
{
#ifdef CPL_RECODE_ICONV
/* -------------------------------------------------------------------- */
/*      The stub implementation handles CPL_ENC_UCS2 / "WCHAR_T"        */
/*      targets well when the source is CPL_ENC_UTF8, CPL_ENC_ASCII     */
/*      or CPL_ENC_ISO8859_1. Every other combination is sent to        */
/*      iconv().                                                        */
/* -------------------------------------------------------------------- */
    const int bDstIsWide =
        EQUAL(pszDstEncoding, CPL_ENC_UCS2)
        || EQUAL(pszDstEncoding, "WCHAR_T");
    const int bSrcIsStubFriendly =
        EQUAL(pszSrcEncoding, CPL_ENC_UTF8)
        || EQUAL(pszSrcEncoding, CPL_ENC_ASCII)
        || EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1);

    if( !( bDstIsWide && bSrcIsStubFriendly ) )
        return CPLRecodeToWCharIconv( pszSource,
                                      pszSrcEncoding, pszDstEncoding );
#endif /* CPL_RECODE_ICONV */

    return CPLRecodeToWCharStub( pszSource, pszSrcEncoding, pszDstEncoding );
}
/************************************************************************/
/* CPLIsUTF8() */
/************************************************************************/
/**
 * Check whether a string is encoded as UTF-8.
 *
 * The check itself is delegated to CPLIsUTF8Stub().
 *
 * @param pabyData input string to test
 * @param nLen length of the input string, or -1 to have the function compute
 * the length itself, in which case the string must be null terminated.
 * @return TRUE if the string is encoded as UTF-8. FALSE otherwise
 *
 * @since GDAL 1.7.0
 */
int CPLIsUTF8(const char* pabyData, int nLen)
{
    const int bIsUTF8 = CPLIsUTF8Stub( pabyData, nLen );

    return bIsUTF8;
}
/************************************************************************/
/* CPLForceToASCII() */
/************************************************************************/
/**
 * Return a new string that is made only of ASCII characters. Every byte of
 * the input whose value is above 127 is replaced by the provided replacement
 * character; all other bytes are copied unchanged.
 *
 * @param pabyData input string to convert
 * @param nLen length of the input string in bytes, or a negative value
 * (conventionally -1) if the function must compute the length itself, in
 * which case the input must be null terminated.
 * @param chReplacementChar character which will be used when the input stream
 * contains a non ASCII character. Must be valid ASCII !
 *
 * @return a new null terminated string that must be freed with CPLFree().
 *
 * @since GDAL 1.7.0
 */
char CPL_DLL *CPLForceToASCII(const char* pabyData, int nLen, char chReplacementChar)
{
    /* Keep the length in size_t: the strlen() result is then never narrowed */
    /* to int, and the "+ 1" for the terminator cannot overflow a signed int. */
    const size_t nBytes = (nLen < 0) ? strlen(pabyData) : (size_t)nLen;

    char *pszOutputString = (char *)CPLMalloc(nBytes + 1);

    /* Inspect the input as unsigned bytes (without dropping const) so the */
    /* "> 127" test does not depend on the signedness of plain char.       */
    const unsigned char *pabyBytes = (const unsigned char *)pabyData;

    size_t i;
    for( i = 0; i < nBytes; i++ )
    {
        if( pabyBytes[i] > 127 )
            pszOutputString[i] = chReplacementChar;
        else
            pszOutputString[i] = pabyData[i];
    }
    pszOutputString[nBytes] = '\0';

    return pszOutputString;
}
/************************************************************************/
/* CPLEncodingCharSize() */
/************************************************************************/
/**
 * Return bytes per character for encoding.
 *
 * The value returned is the size in bytes of the smallest character of the
 * encoding. For fixed width encodings (ASCII, UCS-2, UCS-4) this is simply
 * the character width. For encodings such as UTF8 and UTF16, where some
 * characters are represented by a sequence of atomic units, the size of the
 * atomic unit is returned (1 for UTF8, 2 for UTF16).
 *
 * The result is correct for the well known encodings that have a matching
 * CPL_ENC_ value. It may not be correct for other encodings, even if they
 * are supported by the underlying iconv or windows transliteration
 * services. Hopefully it will improve over time.
 *
 * @param pszEncoding the name of the encoding.
 *
 * @return the size of a minimal character in bytes or -1 if the size is
 * unknown.
 */
int CPLEncodingCharSize( const char *pszEncoding )
{
    if( EQUAL(pszEncoding, CPL_ENC_UTF8) )
        return 1;

    /* Two-byte units. */
    if( EQUAL(pszEncoding, CPL_ENC_UTF16)
        || EQUAL(pszEncoding, CPL_ENC_UCS2) )
        return 2;

    if( EQUAL(pszEncoding, CPL_ENC_UCS4) )
        return 4;

    /* ASCII and any encoding whose name starts with "ISO-8859-". */
    if( EQUAL(pszEncoding, CPL_ENC_ASCII)
        || EQUALN(pszEncoding, "ISO-8859-", 9) )
        return 1;

    /* Unknown encoding. */
    return -1;
}
/************************************************************************/
/* CPLClearRecodeWarningFlags() */
/************************************************************************/
/**
 * Clear the warning flags of the recoding backends.
 *
 * Calls CPLClearRecodeIconvWarningFlags() when the build defines
 * CPL_RECODE_ICONV, then always calls CPLClearRecodeStubWarningFlags().
 */
void CPLClearRecodeWarningFlags()
{
#if defined(CPL_RECODE_ICONV)
    CPLClearRecodeIconvWarningFlags();
#endif /* CPL_RECODE_ICONV */

    CPLClearRecodeStubWarningFlags();
}
/************************************************************************/
/* CPLStrlenUTF8() */
/************************************************************************/
/**
 * Count the UTF-8 characters in a nul-terminated string.
 *
 * Unlike strlen(), which counts bytes, this counts every byte that is not
 * of the form 10xxxxxx, i.e. every byte that is not a UTF-8 continuation
 * byte.
 *
 * @param pszUTF8Str a nul-terminated UTF-8 string
 *
 * @return the number of UTF-8 characters.
 */
int CPLStrlenUTF8(const char *pszUTF8Str) {
    int nCharCount = 0;
    const char *pszIter;

    for (pszIter = pszUTF8Str; *pszIter != '\0'; ++pszIter) {
        /* The top two bits being "10" marks a continuation byte, which */
        /* belongs to a character that has already been counted.        */
        const int bIsContinuation = ((*pszIter & 0xc0) == 0x80);

        if (!bIsContinuation)
            nCharCount++;
    }

    return nCharCount;
}