/*H*************************************************************************************************/ /*! \File utf8.c \Description This module implements routines for converting to and from UTF-8. \Notes This code only decodes the first three octets of UTF-8, thus it only handles UCS-2 codes, not UCS-4 codes. It also does not handle UTF-16 (and surrogate pairs), and is therefore limited to encoding to/decoding from the basic reference plane. Helpful references: http://www.utf-8.com/ - links http://www.cis.ohio-state.edu/cgi-bin/rfc/rfc2279.html - RFC 2279 http://www.unicode.org/charts/ - UNICODE character charts http://www-106.ibm.com/developerworks/library/utfencodingforms/ - UNICODE primer http://www.columbia.edu/kermit/utf8.html - UTF-8 samples \Copyright Copyright (c) Tiburon Entertainment / Electronic Arts 2003. ALL RIGHTS RESERVED. \Version 1.0 03/25/03 (JLB) First version. */ /*************************************************************************************************H*/ /*** Include files *********************************************************************/ #include "DirtySDK/util/utf8.h" /*** Defines ***************************************************************************/ /*** Macros ****************************************************************************/ /*** Type Definitions ******************************************************************/ /*** Function Prototypes ***************************************************************/ /*** Variables *************************************************************************/ // Private variables // Public variables /*** Private Functions *****************************************************************/ /*F*************************************************************************************************/ /*! \Function _Utf8GetNumBytes \Description Decode the number of bytes in a UTF-8 encoded sequence. \Input cLead - lead character of UTF-8 sequence \Output int32_t - number of bytes in the sequence \Version 03/25/03 (JLB) */ /*************************************************************************************************F*/ static int32_t _Utf8GetNumBytes(unsigned char cLead) { int32_t iCodeSize; if ((cLead & 0x80) == 0x00) { iCodeSize = 1; } else if ((cLead & 0xE0) == 0xC0) { iCodeSize = 2; } else if ((cLead & 0xF0) == 0xE0) { iCodeSize = 3; } else { iCodeSize = 4; } return(iCodeSize); } /*F*************************************************************************************************/ /*! \Function _Utf8DecodeToUCS2 \Description Decode a UTF-8 sequence into a UCS-2 code point. \Input *pOutPtr - pointer to output for decoded UCS-2 value \Input *pStr - pointer to input UTF-8 string \Output int32_t - number of input 8bit characters consumed \Notes UCS-2 range (hex) UTF-8 octet sequence (binary) 007F 0xxxxxxx 07FF 110xxxxx 10xxxxxx FFFF 1110xxxx 10xxxxxx 10xxxxxx \Version 03/26/03 (JLB) */ /*************************************************************************************************F*/ static int32_t _Utf8DecodeToUCS2(uint16_t *pOutPtr, const unsigned char *pStr) { int32_t iCodeSize; if ((*pStr & 0x80) == 0x00) { pOutPtr[0] = (uint16_t)pStr[0]; iCodeSize = 1; } else if ((*pStr & 0xE0) == 0xC0) { pOutPtr[0] = ((pStr[0] & ~0xE0) << 6) | (pStr[1] & ~0xC0); iCodeSize = 2; } else if ((*pStr & 0xF0) == 0xE0) { pOutPtr[0] = ((pStr[0] & ~0xF0) << 12) | ((pStr[1] & ~0xC0) << 6) | (pStr[2] & ~0xC0); iCodeSize = 3; } else { iCodeSize = 4; } return(iCodeSize); } /*F*************************************************************************************************/ /*! \Function Utf8EncodeFromUCS2CodePt \Description Encode a single UCS-2 code point ("char") into a UTF-8 sequence. \Input uCodePt - input UCS-2 code point \Input *pOutPtr - pointer to output for encoded UTF-8 sequence. \Output int32_t - number of 8bit characters output \Notes See notes for _Utf8DecodeToUCS2() \Version 03/26/03 (JLB) */ /*************************************************************************************************F*/ int32_t Utf8EncodeFromUCS2CodePt(char *pOutPtr, uint16_t uCodePt) { int32_t iCodeSize; if (uCodePt < 0x0080) { pOutPtr[0] = (char)uCodePt; iCodeSize = 1; } else if (uCodePt < 0x800) { pOutPtr[0] = 0xC0 | (uCodePt >> 6); pOutPtr[1] = 0x80 | (uCodePt & 0x3F); iCodeSize = 2; } else { pOutPtr[0] = 0xE0 | (uCodePt >> 12); pOutPtr[1] = 0x80 | ((uCodePt >> 6) & 0x3F); pOutPtr[2] = 0x80 | (uCodePt & 0x3F); iCodeSize = 3; } return(iCodeSize); } /*F*************************************************************************************************/ /*! \Function _Utf8Translate \Description Look through translation subtables and translate uCodePt \Input *pOutBuf - output buffer to store 8bit translated output \Input *pTransTbl - translation table to translate with \Input uCodePt - UCS-2 code point to translate \Input cReplace - character to replace untranslatable characters with (or null-termination char to strip) \Output int32_t - number of ascii characters output \Version 03/26/03 (JLB) */ /*************************************************************************************************F*/ static int32_t _Utf8Translate(char *pOutBuf, const Utf8TransTblT *pTransTbl, uint16_t uCodePt, char cReplace) { unsigned char cCode; char *pOldBuf = pOutBuf; int32_t bFound; // look through subtables for (bFound = FALSE; pTransTbl->uRangeEnd != 0; pTransTbl++) { // are we in range? if ((uCodePt >= pTransTbl->uRangeBegin) && (uCodePt <= (pTransTbl->uRangeEnd))) { // dereference table uCodePt -= pTransTbl->uRangeBegin; cCode = (unsigned char)pTransTbl->pCodeTbl[uCodePt]; if ((cCode == 0xFF) && (cReplace != '\0')) { // untranslatable - replace *(pOutBuf++) = cReplace; } else { // translate *(pOutBuf++) = cCode; } bFound = TRUE; break; } } // not found and replacing? if ((bFound == FALSE) && (cReplace != '\0')) { // replace *(pOutBuf++) = cReplace; } // return number of characters output return((int32_t)(pOutBuf-pOldBuf)); } /*** Public Functions ******************************************************************/ /*F*************************************************************************************************/ /*! \Function Utf8Strip \Description Strip non-ASCII UTF-8 encoded data. \Input *pOutStr - pointer to output buffer (may be same as input buffer) \Input iBufSize - size of output buffer in ASCII units (char) \Input *pInStr - pointer to source string \Output int32_t - number of characters in new string, or zero if no utf8 data was stripped \Version 03/25/03 (JLB) */ /*************************************************************************************************F*/ int32_t Utf8Strip(char *pOutStr, int32_t iBufSize, const char *pInStr) { return(Utf8Replace(pOutStr, iBufSize, pInStr, '\0')); } /*F*************************************************************************************************/ /*! \Function Utf8Replace \Description Replace non-ASCII UTF-8 encoded data with the given character. \Input *pOutStr - pointer to output buffer (may be same as input buffer) \Input iBufSize - size of output buffer in ASCII units (char) \Input *pInStr - pointer to source string \Input cReplace - character to replace non-ASCII UTF-8 characters with \Output int32_t - number of characters in new string, or zero if no UTF-8 data was replaced \Version 03/25/03 (JLB) */ /*************************************************************************************************F*/ int32_t Utf8Replace(char *pOutStr, int32_t iBufSize, const char *pInStr, char cReplace) { int32_t iSrcIdx, iDstIdx = 0; // fast scan to find any utf8 encoded data for (iSrcIdx = 0; ((pInStr[iSrcIdx] & 0x80) == 0) && (pInStr[iSrcIdx] != '\0'); iSrcIdx++) { } // did we find any? if (pInStr[iSrcIdx] != '\0') { // yes, so replace/strip it for (iDstIdx = iSrcIdx; pInStr[iSrcIdx] != '\0' && iDstIdx < iBufSize; ) { // do we have utf8 data? if (pInStr[iSrcIdx] & 0x80) { // figure out how many bytes of utf8 data we have int32_t iNumBytes = _Utf8GetNumBytes(pInStr[iSrcIdx]); // skip them iSrcIdx += iNumBytes; // replace with cReplace if (cReplace != '\0') { pOutStr[iDstIdx++] = cReplace; } } else { // normal string data - copy it pOutStr[iDstIdx++] = pInStr[iSrcIdx++]; } } // terminate pOutStr[iDstIdx++] = '\0'; } return(iDstIdx); } /*F*************************************************************************************************/ /*! \Function Utf8StrLen \Description Returns the number of code points in a UTF-8 encoded string. \Input *pStr - pointer to string to get string length of \Output int32_t - number of code points in string \Version 03/26/03 (JLB) */ /*************************************************************************************************F*/ int32_t Utf8StrLen(const char *pStr) { int32_t iSrcIdx, iStrLen; for (iSrcIdx = iStrLen = 0; pStr[iSrcIdx] != '\0'; iStrLen++) { if (pStr[iSrcIdx] & 0x80) { // figure out how many bytes of utf8 data we have int32_t iNumBytes = _Utf8GetNumBytes(pStr[iSrcIdx]); // skip them iSrcIdx += iNumBytes; } else { iSrcIdx++; } } return(iStrLen); } /*F*************************************************************************************************/ /*! \Function Utf8EncodeFromUCS2 \Description Convert a UCS-2 code point sequence into a UTF-8 code point sequence. \Input *pOutStr - pointer to buffer to encode string to \Input iBufLen - length of output buffer, in char units \Input *pInStr - pointer to string to encode \Output int32_t - output string length, in char units \Version 03/27/03 (JLB) */ /*************************************************************************************************F*/ int32_t Utf8EncodeFromUCS2(char *pOutStr, int32_t iBufLen, const uint16_t *pInStr) { int32_t iStrLen; // a UCS-2 encoded string can generate up to three chars. iBufLen -= 2; // encode for (iStrLen = 0; (*pInStr != 0x0000) && (iStrLen < iBufLen); pInStr++) { iStrLen += Utf8EncodeFromUCS2CodePt(&pOutStr[iStrLen], *pInStr); } // NULL terminate & return length to caller pOutStr[iStrLen++] = '\0'; return(iStrLen); } /*F*************************************************************************************************/ /*! \Function Utf8DecodeToUCS2 \Description Convert a UTF-8 code point sequence into a UCS-2 code point sequence. \Input *pOutStr - pointer to buffer to decode string to \Input iBufLen - length of output buffer, in UCS-2 units (int16_t) \Input *pInStr - pointer to string to decode \Output int32_t - output string length, in code points \Version 03/26/03 (JLB) */ /*************************************************************************************************F*/ int32_t Utf8DecodeToUCS2(uint16_t *pOutStr, int32_t iBufLen, const char *pInStr) { int32_t iSrcIdx, iStrLen, iCodeSize; // ensure room for NULL terminator iBufLen--; // decode string for (iSrcIdx = iStrLen = 0; (pInStr[iSrcIdx] != '\0') && (iStrLen < iBufLen); ) { if ((iCodeSize = _Utf8DecodeToUCS2(&pOutStr[iStrLen], (const unsigned char *)&pInStr[iSrcIdx])) <= 3) { iStrLen++; } iSrcIdx += iCodeSize; } // NULL terminate & return length to caller pOutStr[iStrLen++] = 0x0000; return(iStrLen); } /*F*************************************************************************************************/ /*! \Function Utf8EncodeFrom8Bit \Description Encode the given 8bit input string to UTF-8, based on the input translation table \Input *pOutStr - pointer to output UTF-8 string buffer \Input iBufLen - length of buffer \Input *pInStr - pointer to input 8bit string \Input *pEncodeTbl - pointer to translation table to map 8bit string to UCS-2 \Output int32_t - length of output UCS-8 string \Version 03/28/03 (JLB) */ /*************************************************************************************************F*/ int32_t Utf8EncodeFrom8Bit(char *pOutStr, int32_t iBufLen, const char *pInStr, const Utf8EncodeTblT *pEncodeTbl) { int32_t iStrLen; uint16_t uCodePt; // a UCS-2 encoded value can generate up to three chars. iBufLen -= 2; // encode for (iStrLen = 0; (*pInStr != 0x0000) && (iStrLen < iBufLen); pInStr++) { uCodePt = pEncodeTbl->uCodeTbl[*(const unsigned char *)pInStr]; iStrLen += Utf8EncodeFromUCS2CodePt(&pOutStr[iStrLen], uCodePt); } // NULL terminate & return length to caller pOutStr[iStrLen++] = '\0'; return(iStrLen); } /*F*************************************************************************************************/ /*! \Function Utf8TranslateTo8Bit \Description Translates the given UTF-8 sequence based on the input translation table. \Input *pOutStr - pointer to buffer to decode string to \Input iBufLen - length of output buffer, in ASCII units (char) \Input *pInStr - pointer to string to decode \Input cReplace - \verbatim character to replace code point with if untranslateable ('\0' to strip) \endverbatim \Input *pTransTbl - pointer to NULL-terminated translation table array \Output int32_t - length of output string in ASCII characters (8bit) \Notes 'pTransTbl' is expected to be a NULL-terminated array of Utf8TransTblT structures that represent a sparse translation table from 16bit UCS-2 space to 8bit game font space. \Version 03/26/03 (JLB) */ /*************************************************************************************************F*/ int32_t Utf8TranslateTo8Bit(char *pOutStr, int32_t iBufLen, const char *pInStr, char cReplace, const Utf8TransTblT *pTransTbl) { int32_t iSrcIdx, iDstIdx; uint16_t uCodePt = 0; // ensure room for NULL terminator iBufLen--; // translate string for (iSrcIdx = iDstIdx = 0; (pInStr[iSrcIdx] != '\0') && (iDstIdx < iBufLen); ) { iSrcIdx += _Utf8DecodeToUCS2(&uCodePt, (const unsigned char *)&pInStr[iSrcIdx]); iDstIdx += _Utf8Translate(&pOutStr[iDstIdx], pTransTbl, uCodePt, cReplace); } // NULL terminate & return length to caller pOutStr[iDstIdx++] = '\0'; return(iDstIdx); }