mirror of
https://github.com/Mauler125/r5sdk.git
synced 2025-02-09 19:15:03 +01:00
DirtySDK (EA's Dirty Sockets library) will be used for the LiveAPI implementation, and depends on: EABase, EAThread.
528 lines
16 KiB
C
528 lines
16 KiB
C
/*H*************************************************************************************************/
/*!
    \File    utf8.c

    \Description
        This module implements routines for converting to and from UTF-8.

    \Notes
        This code only decodes UTF-8 sequences of up to three octets, thus it only handles
        UCS-2 codes, not UCS-4 codes.  It also does not handle UTF-16 (and surrogate pairs),
        and is therefore limited to encoding to/decoding from the Basic Multilingual Plane.

        Helpful references:

            http://www.utf-8.com/                                             - links
            http://www.cis.ohio-state.edu/cgi-bin/rfc/rfc2279.html            - RFC 2279
            http://www.unicode.org/charts/                                    - UNICODE character charts
            http://www-106.ibm.com/developerworks/library/utfencodingforms/   - UNICODE primer
            http://www.columbia.edu/kermit/utf8.html                          - UTF-8 samples

    \Copyright
        Copyright (c) Tiburon Entertainment / Electronic Arts 2003.  ALL RIGHTS RESERVED.

    \Version    1.0        03/25/03 (JLB) First version.
*/
/*************************************************************************************************H*/
|
|
|
|
|
|
/*** Include files *********************************************************************/
|
|
|
|
#include "DirtySDK/util/utf8.h"
|
|
|
|
/*** Defines ***************************************************************************/
|
|
|
|
/*** Macros ****************************************************************************/
|
|
|
|
/*** Type Definitions ******************************************************************/
|
|
|
|
/*** Function Prototypes ***************************************************************/
|
|
|
|
/*** Variables *************************************************************************/
|
|
|
|
// Private variables
|
|
|
|
// Public variables
|
|
|
|
|
|
/*** Private Functions *****************************************************************/
|
|
|
|
|
|
/*F*************************************************************************************************/
/*!
    \Function _Utf8GetNumBytes

    \Description
        Determine the length of a UTF-8 sequence from its lead octet.

    \Input cLead    - lead octet of the UTF-8 sequence

    \Output
        int32_t     - 1, 2 or 3 for the lead patterns 0xxxxxxx, 110xxxxx and 1110xxxx;
                      4 for every other octet value

    \Version 03/25/03 (JLB)
*/
/*************************************************************************************************F*/
static int32_t _Utf8GetNumBytes(unsigned char cLead)
{
    // 0xxxxxxx - plain seven-bit character
    if (cLead < 0x80)
    {
        return(1);
    }

    // 110xxxxx - lead of a two-octet sequence
    if ((cLead >> 5) == 0x06)
    {
        return(2);
    }

    // 1110xxxx - lead of a three-octet sequence
    if ((cLead >> 4) == 0x0E)
    {
        return(3);
    }

    // anything else is reported as four octets
    return(4);
}
|
|
|
|
/*F*************************************************************************************************/
/*!
    \Function _Utf8DecodeToUCS2

    \Description
        Decode a single UTF-8 sequence into a UCS-2 code point.

    \Input *pOutPtr - [out] storage for the decoded UCS-2 value; left untouched when the
                      sequence is not a one-, two- or three-octet sequence
    \Input *pStr    - pointer to the lead octet of the UTF-8 sequence

    \Output
        int32_t     - number of input octets consumed (1-3), or 4 if nothing was decoded

    \Notes
        UCS-2 range (hex)   UTF-8 octet sequence (binary)
        0000-007F           0xxxxxxx
        0080-07FF           110xxxxx 10xxxxxx
        0800-FFFF           1110xxxx 10xxxxxx 10xxxxxx

        The trailing octets pStr[1] (and pStr[2] for three-octet sequences) are read without
        checking for a string terminator or for the 10xxxxxx continuation pattern.

    \Version 03/26/03 (JLB)
*/
/*************************************************************************************************F*/
static int32_t _Utf8DecodeToUCS2(uint16_t *pOutPtr, const unsigned char *pStr)
{
    const unsigned char cLead = pStr[0];
    uint16_t uCodePt;

    // 0xxxxxxx - the octet is the code point
    if (cLead < 0x80)
    {
        *pOutPtr = (uint16_t)cLead;
        return(1);
    }

    // 110xxxxx 10xxxxxx - five bits from the lead, six from the trailing octet
    if ((cLead >> 5) == 0x06)
    {
        uCodePt = (uint16_t)(cLead & 0x1F);
        uCodePt = (uint16_t)((uCodePt << 6) | (pStr[1] & 0x3F));
        *pOutPtr = uCodePt;
        return(2);
    }

    // 1110xxxx 10xxxxxx 10xxxxxx - four bits from the lead, six from each trailing octet
    if ((cLead >> 4) == 0x0E)
    {
        uCodePt = (uint16_t)(cLead & 0x0F);
        uCodePt = (uint16_t)((uCodePt << 6) | (pStr[1] & 0x3F));
        uCodePt = (uint16_t)((uCodePt << 6) | (pStr[2] & 0x3F));
        *pOutPtr = uCodePt;
        return(3);
    }

    // not representable in UCS-2: report four octets and leave the output alone
    return(4);
}
|
|
|
|
/*F*************************************************************************************************/
/*!
    \Function Utf8EncodeFromUCS2CodePt

    \Description
        Encode a single UCS-2 code point ("char") into a UTF-8 sequence.  The output is
        not terminated.

    \Input *pOutPtr - [out] storage for the encoded UTF-8 sequence (up to three chars are written)
    \Input uCodePt  - input UCS-2 code point

    \Output
        int32_t     - number of 8bit characters output (1, 2 or 3)

    \Notes
        UCS-2 range (hex)   UTF-8 octet sequence (binary)
        0000-007F           0xxxxxxx
        0080-07FF           110xxxxx 10xxxxxx
        0800-FFFF           1110xxxx 10xxxxxx 10xxxxxx

    \Version 03/26/03 (JLB)
*/
/*************************************************************************************************F*/
int32_t Utf8EncodeFromUCS2CodePt(char *pOutPtr, uint16_t uCodePt)
{
    // seven bits or fewer: the code point is its own encoding
    if (uCodePt < 0x0080)
    {
        *pOutPtr = (char)uCodePt;
        return(1);
    }

    // up to eleven bits: 110xxxxx 10xxxxxx
    if (uCodePt < 0x0800)
    {
        pOutPtr[1] = (char)(0x80 | (uCodePt & 0x3F));
        pOutPtr[0] = (char)(0xC0 | (uCodePt >> 6));
        return(2);
    }

    // everything else: 1110xxxx 10xxxxxx 10xxxxxx
    pOutPtr[2] = (char)(0x80 | (uCodePt & 0x3F));
    pOutPtr[1] = (char)(0x80 | ((uCodePt >> 6) & 0x3F));
    pOutPtr[0] = (char)(0xE0 | (uCodePt >> 12));
    return(3);
}
|
|
|
|
/*F*************************************************************************************************/
|
|
/*!
|
|
\Function _Utf8Translate
|
|
|
|
\Description
|
|
Look through translation subtables and translate uCodePt
|
|
|
|
\Input *pOutBuf - output buffer to store 8bit translated output
|
|
\Input *pTransTbl - translation table to translate with
|
|
\Input uCodePt - UCS-2 code point to translate
|
|
\Input cReplace - character to replace untranslatable characters with (or null-termination char to strip)
|
|
|
|
\Output
|
|
int32_t - number of ascii characters output
|
|
|
|
\Version 03/26/03 (JLB)
|
|
*/
|
|
/*************************************************************************************************F*/
|
|
static int32_t _Utf8Translate(char *pOutBuf, const Utf8TransTblT *pTransTbl, uint16_t uCodePt, char cReplace)
|
|
{
|
|
unsigned char cCode;
|
|
char *pOldBuf = pOutBuf;
|
|
int32_t bFound;
|
|
|
|
// look through subtables
|
|
for (bFound = FALSE; pTransTbl->uRangeEnd != 0; pTransTbl++)
|
|
{
|
|
// are we in range?
|
|
if ((uCodePt >= pTransTbl->uRangeBegin) && (uCodePt <= (pTransTbl->uRangeEnd)))
|
|
{
|
|
// dereference table
|
|
uCodePt -= pTransTbl->uRangeBegin;
|
|
cCode = (unsigned char)pTransTbl->pCodeTbl[uCodePt];
|
|
|
|
if ((cCode == 0xFF) && (cReplace != '\0'))
|
|
{
|
|
// untranslatable - replace
|
|
*(pOutBuf++) = cReplace;
|
|
}
|
|
else
|
|
{
|
|
// translate
|
|
*(pOutBuf++) = cCode;
|
|
}
|
|
|
|
bFound = TRUE;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// not found and replacing?
|
|
if ((bFound == FALSE) && (cReplace != '\0'))
|
|
{
|
|
// replace
|
|
*(pOutBuf++) = cReplace;
|
|
}
|
|
|
|
// return number of characters output
|
|
return((int32_t)(pOutBuf-pOldBuf));
|
|
}
|
|
|
|
|
|
/*** Public Functions ******************************************************************/
|
|
|
|
|
|
/*F*************************************************************************************************/
/*!
    \Function Utf8Strip

    \Description
        Strip non-ASCII UTF-8 encoded data.  Implemented as Utf8Replace() with the
        null-termination char as replacement character.

    \Input *pOutStr - pointer to output buffer (may be same as input buffer)
    \Input iBufSize - size of output buffer in ASCII units (char)
    \Input *pInStr  - pointer to source string

    \Output
        int32_t     - number of characters in new string, or zero if no utf8 data was stripped

    \Version 03/25/03 (JLB)
*/
/*************************************************************************************************F*/
int32_t Utf8Strip(char *pOutStr, int32_t iBufSize, const char *pInStr)
{
    // a null replacement character tells Utf8Replace() to drop UTF-8 sequences entirely
    const char cNoReplacement = '\0';
    int32_t iResult;

    iResult = Utf8Replace(pOutStr, iBufSize, pInStr, cNoReplacement);
    return(iResult);
}
|
|
|
|
/*F*************************************************************************************************/
/*!
    \Function Utf8Replace

    \Description
        Replace non-ASCII UTF-8 encoded data with the given character.

    \Input *pOutStr - pointer to output buffer (may be same as input buffer)
    \Input iBufSize - size of output buffer in ASCII units (char)
    \Input *pInStr  - pointer to source string
    \Input cReplace - character to replace non-ASCII UTF-8 characters with (null-termination
                      char to strip them instead)

    \Output
        int32_t     - number of characters in new string (including the terminator), or
                      zero if no UTF-8 data was replaced

    \Notes
        When the input contains no UTF-8 data (or iBufSize leaves no room for even a
        terminator) the output buffer is not written at all and zero is returned.

        Otherwise the output is always terminated within iBufSize chars, the ASCII data
        preceding the first UTF-8 sequence is copied when the output buffer is not the
        input buffer, and a multi-octet sequence cut short by the end of the string is
        skipped only up to the terminator.

    \Version 03/25/03 (JLB)
*/
/*************************************************************************************************F*/
int32_t Utf8Replace(char *pOutStr, int32_t iBufSize, const char *pInStr, char cReplace)
{
    int32_t iSrcIdx, iDstIdx, iNumBytes, iMaxLen;

    // fast scan to find any utf8 encoded data
    for (iSrcIdx = 0; ((pInStr[iSrcIdx] & 0x80) == 0) && (pInStr[iSrcIdx] != '\0'); iSrcIdx++)
    {
    }

    // nothing to replace, or no room to store a result?
    if ((pInStr[iSrcIdx] == '\0') || (iBufSize <= 0))
    {
        return(0);
    }

    // reserve room for the terminator
    iMaxLen = iBufSize - 1;

    // the leading ASCII data is kept as-is (clamped to what the output can hold)
    iDstIdx = (iSrcIdx < iMaxLen) ? iSrcIdx : iMaxLen;
    if (pOutStr != pInStr)
    {
        int32_t iCopyIdx;
        for (iCopyIdx = 0; iCopyIdx < iDstIdx; iCopyIdx++)
        {
            pOutStr[iCopyIdx] = pInStr[iCopyIdx];
        }
    }

    // replace/strip the remainder
    while ((pInStr[iSrcIdx] != '\0') && (iDstIdx < iMaxLen))
    {
        // do we have utf8 data?
        if (pInStr[iSrcIdx] & 0x80)
        {
            // figure out how many bytes of utf8 data we have
            iNumBytes = _Utf8GetNumBytes((unsigned char)pInStr[iSrcIdx]);

            // skip them, but never step over the terminator of a truncated sequence
            for ( ; (iNumBytes > 0) && (pInStr[iSrcIdx] != '\0'); iNumBytes--)
            {
                iSrcIdx++;
            }

            // replace with cReplace
            if (cReplace != '\0')
            {
                pOutStr[iDstIdx++] = cReplace;
            }
        }
        else
        {
            // normal string data - copy it
            pOutStr[iDstIdx++] = pInStr[iSrcIdx++];
        }
    }

    // terminate
    pOutStr[iDstIdx++] = '\0';
    return(iDstIdx);
}
|
|
|
|
/*F*************************************************************************************************/
/*!
    \Function Utf8StrLen

    \Description
        Returns the number of code points in a UTF-8 encoded string.

    \Input *pStr    - pointer to string to get string length of

    \Output
        int32_t     - number of code points in string

    \Notes
        The length of each sequence is taken from its lead octet only.  A sequence that
        is cut short by the end of the string counts as one code point, and the scan
        stops at the terminator instead of stepping over it.

    \Version 03/26/03 (JLB)
*/
/*************************************************************************************************F*/
int32_t Utf8StrLen(const char *pStr)
{
    int32_t iSrcIdx = 0, iStrLen = 0, iNumBytes;

    while (pStr[iSrcIdx] != '\0')
    {
        // figure out how many bytes this code point occupies (one for plain ASCII)
        iNumBytes = _Utf8GetNumBytes((unsigned char)pStr[iSrcIdx]);

        // skip them, but never step over the terminator of a truncated sequence
        for ( ; (iNumBytes > 0) && (pStr[iSrcIdx] != '\0'); iNumBytes--)
        {
            iSrcIdx++;
        }

        iStrLen++;
    }

    return(iStrLen);
}
|
|
|
|
/*F*************************************************************************************************/
/*!
    \Function Utf8EncodeFromUCS2

    \Description
        Convert a UCS-2 code point sequence into a UTF-8 code point sequence.

    \Input *pOutStr - pointer to buffer to encode string to
    \Input iBufLen  - length of output buffer, in char units
    \Input *pInStr  - pointer to zero-terminated UCS-2 string to encode

    \Output
        int32_t     - output string length, in char units (including the terminator), or
                      zero if iBufLen is not positive (nothing is written in that case)

    \Notes
        Encoding stops at the first code point whose UTF-8 sequence would not fit together
        with the terminator, so the output never ends in a partial sequence and never
        exceeds iBufLen chars.

    \Version 03/27/03 (JLB)
*/
/*************************************************************************************************F*/
int32_t Utf8EncodeFromUCS2(char *pOutStr, int32_t iBufLen, const uint16_t *pInStr)
{
    char aCode[3];  // a UCS-2 code point generates up to three chars
    int32_t iStrLen = 0, iCodeSize, iByte;

    // no room for even a terminator?
    if (iBufLen <= 0)
    {
        return(0);
    }

    // encode
    for ( ; *pInStr != 0x0000; pInStr++)
    {
        // encode to scratch space first so we know the size before touching the output
        iCodeSize = Utf8EncodeFromUCS2CodePt(aCode, *pInStr);

        // stop if the sequence plus the terminator would not fit
        if (iCodeSize >= (iBufLen - iStrLen))
        {
            break;
        }

        for (iByte = 0; iByte < iCodeSize; iByte++)
        {
            pOutStr[iStrLen++] = aCode[iByte];
        }
    }

    // NULL terminate & return length to caller
    pOutStr[iStrLen++] = '\0';
    return(iStrLen);
}
|
|
|
|
/*F*************************************************************************************************/
/*!
    \Function Utf8DecodeToUCS2

    \Description
        Convert a UTF-8 code point sequence into a UCS-2 code point sequence.

    \Input *pOutStr - pointer to buffer to decode string to
    \Input iBufLen  - length of output buffer, in UCS-2 units (int16_t)
    \Input *pInStr  - pointer to string to decode

    \Output
        int32_t     - output string length, in code points (including the terminator), or
                      zero if iBufLen is not positive (nothing is written in that case)

    \Notes
        Sequences that cannot be represented in UCS-2 (anything that is not a one-, two-
        or three-octet sequence) are dropped from the output.  A sequence cut short by the
        end of the string is dropped as well; it is neither decoded nor skipped past the
        terminator.

    \Version 03/26/03 (JLB)
*/
/*************************************************************************************************F*/
int32_t Utf8DecodeToUCS2(uint16_t *pOutStr, int32_t iBufLen, const char *pInStr)
{
    int32_t iSrcIdx, iStrLen, iCodeSize, iAvail;

    // no room for even a terminator?
    if (iBufLen <= 0)
    {
        return(0);
    }

    // ensure room for NULL terminator
    iBufLen--;

    // decode string
    for (iSrcIdx = iStrLen = 0; (pInStr[iSrcIdx] != '\0') && (iStrLen < iBufLen); iSrcIdx += iAvail)
    {
        // how many octets does the lead octet announce?
        iCodeSize = _Utf8GetNumBytes((unsigned char)pInStr[iSrcIdx]);

        // how many of them are actually present before the terminator?
        for (iAvail = 1; (iAvail < iCodeSize) && (pInStr[iSrcIdx+iAvail] != '\0'); iAvail++)
        {
        }

        // only complete sequences of up to three octets can be decoded to UCS-2
        if ((iAvail == iCodeSize) && (iCodeSize <= 3))
        {
            _Utf8DecodeToUCS2(&pOutStr[iStrLen], (const unsigned char *)&pInStr[iSrcIdx]);
            iStrLen++;
        }
    }

    // NULL terminate & return length to caller
    pOutStr[iStrLen++] = 0x0000;
    return(iStrLen);
}
|
|
|
|
/*F*************************************************************************************************/
|
|
/*!
|
|
\Function Utf8EncodeFrom8Bit
|
|
|
|
\Description
|
|
Encode the given 8bit input string to UTF-8, based on the input translation table
|
|
|
|
\Input *pOutStr - pointer to output UTF-8 string buffer
|
|
\Input iBufLen - length of buffer
|
|
\Input *pInStr - pointer to input 8bit string
|
|
\Input *pEncodeTbl - pointer to translation table to map 8bit string to UCS-2
|
|
|
|
\Output
|
|
int32_t - length of output UCS-8 string
|
|
|
|
\Version 03/28/03 (JLB)
|
|
*/
|
|
/*************************************************************************************************F*/
|
|
int32_t Utf8EncodeFrom8Bit(char *pOutStr, int32_t iBufLen, const char *pInStr, const Utf8EncodeTblT *pEncodeTbl)
|
|
{
|
|
int32_t iStrLen;
|
|
uint16_t uCodePt;
|
|
|
|
// a UCS-2 encoded value can generate up to three chars.
|
|
iBufLen -= 2;
|
|
|
|
// encode
|
|
for (iStrLen = 0; (*pInStr != 0x0000) && (iStrLen < iBufLen); pInStr++)
|
|
{
|
|
uCodePt = pEncodeTbl->uCodeTbl[*(const unsigned char *)pInStr];
|
|
iStrLen += Utf8EncodeFromUCS2CodePt(&pOutStr[iStrLen], uCodePt);
|
|
}
|
|
|
|
// NULL terminate & return length to caller
|
|
pOutStr[iStrLen++] = '\0';
|
|
return(iStrLen);
|
|
}
|
|
|
|
/*F*************************************************************************************************/
|
|
/*!
|
|
\Function Utf8TranslateTo8Bit
|
|
|
|
\Description
|
|
Translates the given UTF-8 sequence based on the input translation table.
|
|
|
|
\Input *pOutStr - pointer to buffer to decode string to
|
|
\Input iBufLen - length of output buffer, in ASCII units (char)
|
|
\Input *pInStr - pointer to string to decode
|
|
\Input cReplace - \verbatim character to replace code point with if untranslateable ('\0' to strip) \endverbatim
|
|
\Input *pTransTbl - pointer to NULL-terminated translation table array
|
|
|
|
\Output
|
|
int32_t - length of output string in ASCII characters (8bit)
|
|
|
|
\Notes
|
|
'pTransTbl' is expected to be a NULL-terminated array of Utf8TransTblT structures that
|
|
represent a sparse translation table from 16bit UCS-2 space to 8bit game font space.
|
|
|
|
\Version 03/26/03 (JLB)
|
|
*/
|
|
/*************************************************************************************************F*/
|
|
int32_t Utf8TranslateTo8Bit(char *pOutStr, int32_t iBufLen, const char *pInStr, char cReplace, const Utf8TransTblT *pTransTbl)
|
|
{
|
|
int32_t iSrcIdx, iDstIdx;
|
|
uint16_t uCodePt = 0;
|
|
|
|
// ensure room for NULL terminator
|
|
iBufLen--;
|
|
|
|
// translate string
|
|
for (iSrcIdx = iDstIdx = 0; (pInStr[iSrcIdx] != '\0') && (iDstIdx < iBufLen); )
|
|
{
|
|
iSrcIdx += _Utf8DecodeToUCS2(&uCodePt, (const unsigned char *)&pInStr[iSrcIdx]);
|
|
iDstIdx += _Utf8Translate(&pOutStr[iDstIdx], pTransTbl, uCodePt, cReplace);
|
|
}
|
|
|
|
// NULL terminate & return length to caller
|
|
pOutStr[iDstIdx++] = '\0';
|
|
return(iDstIdx);
|
|
}
|