SqMod/vendor/POCO/PDF/src/hpdf_encoder_utf.c

/*
 * << Haru Free PDF Library >> -- hpdf_encoder_utf.c
 *
 * URL: http://libharu.org
 *
 * Copyright (c) 1999-2006 Takeshi Kanno <takeshi_kanno@est.hi-ho.ne.jp>
 * Copyright (c) 2007-2008 Antony Dovgal <tony@daylessday.org>
 * Copyright (c) 2010      Sergey Konovalov <webmaster@crynet.ru>
 * Copyright (c) 2011      Koen Deforche <koen@emweb.be>
 *
 * Permission to use, copy, modify, distribute and sell this software
 * and its documentation for any purpose is hereby granted without fee,
 * provided that the above copyright notice appear in all copies and
 * that both that copyright notice and this permission notice appear
 * in supporting documentation.
 * It is provided "as is" without express or implied warranty.
 *
 */

#include "hpdf_conf.h"
#include "hpdf_utils.h"
#include "hpdf_encoder.h"
#include "hpdf.h"

/*
 * Decoder state for the UTF-8 character currently being assembled.
 *
 *   current_byte  index in utf8_bytes where the next input byte is stored;
 *                 0 means no multi-byte sequence is in progress.
 *   end_byte      index of the last byte of the current sequence
 *                 (0 for a single-byte character, at most 3).
 *   utf8_bytes    raw bytes of the character collected so far.
 */
struct _UTF8_EncoderAttr_Rec {
    HPDF_BYTE  current_byte;
    HPDF_BYTE  end_byte;
    HPDF_BYTE  utf8_bytes[8];
};

typedef struct _UTF8_EncoderAttr_Rec   UTF8_EncoderAttr_Rec;
typedef UTF8_EncoderAttr_Rec          *UTF8_EncoderAttr;

/* Range handed to HPDF_CMapEncoder_AddNotDefRange by UTF8_Init. */
static const HPDF_CidRange_Rec UTF8_NOTDEF_RANGE = {0x0000, 0x001F, 1};
/* Range handed to HPDF_CMapEncoder_AddCodeSpaceRange by UTF8_Init. */
static const HPDF_CidRange_Rec UTF8_SPACE_RANGE =  {0x0000, 0xFFFF, 0};
/*
 * Table handed to HPDF_CMapEncoder_AddCMap by UTF8_Init.
 * NOTE(review): the trailing { 0xFFFF, 0xFFFF, 0x0 } entry looks like an
 * end-of-table sentinel -- confirm against HPDF_CMapEncoder_AddCMap.
 */
static const HPDF_CidRange_Rec UTF8_CID_RANGE[] = {
  { 0x0000, 0xFFFF, 0x0 },
  { 0xFFFF, 0xFFFF, 0x0 }
};

/*
 * Forward declarations of the file-local encoder callbacks and the
 * initializer; the definitions follow below in the same order.
 */
static HPDF_ByteType
UTF8_Encoder_ByteType_Func  (HPDF_Encoder        encoder,
                             HPDF_ParseText_Rec  *state);

static HPDF_UNICODE
UTF8_Encoder_ToUnicode_Func  (HPDF_Encoder   encoder,
                              HPDF_UINT16    code);

static char *
UTF8_Encoder_EncodeText_Func  (HPDF_Encoder        encoder,
			       const char         *text,
			       HPDF_UINT           len,
			       HPDF_UINT          *length);

static HPDF_STATUS
UTF8_Init  (HPDF_Encoder    encoder);

/*--------------------------------------------------------------------------*/


/*
 * This function is taken from hpdf_encoder_utf8.c, originally submitted
 * to libharu by 'Mirco'
 *
 * Consumes one byte of state->text, advances state->index past it and
 * reports whether that byte completes a UTF-8 character.
 *
 * Returns HPDF_BYTE_TYPE_SINGLE when the byte is a plain 7-bit byte or the
 * last byte of a multi-byte sequence; the bytes of the finished character
 * are then held in the UTF8_EncoderAttr_Rec state.
 * Returns HPDF_BYTE_TYPE_TRIAL for every other byte: lead bytes, inner
 * bytes of a sequence, and bytes that cannot start a sequence (skipped).
 */
static HPDF_ByteType
UTF8_Encoder_ByteType_Func  (HPDF_Encoder        encoder,
                             HPDF_ParseText_Rec  *state)
{
    // Caller contract, as noted by the original author:
    //  - this callback itself has to advance state->index (see
    //    HPDF_String_Write in hpdf_string.c);
    //  - on HPDF_BYTE_TYPE_SINGLE the current byte is the CODE argument
    //    of the following ToUnicode_Func call;
    //  - on HPDF_BYTE_TYPE_LEAD the current byte (msb) and the next byte
    //    (lsb) form the CODE argument;
    //  - on HPDF_BYTE_TYPE_TRIAL the current byte is ignored.

    HPDF_CMapEncoderAttr  cmap_attr = (HPDF_CMapEncoderAttr) encoder->attr;
    // The decoder state lives in the memory of the first cid_map row.
    UTF8_EncoderAttr      dec = (UTF8_EncoderAttr) ((void *)cmap_attr->cid_map[0]);
    HPDF_BYTE             ch;
    HPDF_BYTE             last_index = 0;

    if (state->index == 0) {
        // Start of a new text: drop any half-finished sequence.
        HPDF_PTRACE ((" UTF8_Encoder_ByteType_Func - Initialize: (%u) %s\n",
                      state->len, state->text));

        dec->current_byte = 0;
    }

    ch = state->text[state->index];
    state->index++;

    HPDF_PTRACE ((" UTF8_Encoder_ByteType_Func - Byte: %hx\n", ch));

    if (dec->current_byte != 0) {
        // Inside a multi-byte sequence: store the byte in its slot.
        dec->utf8_bytes[dec->current_byte] = ch;

        if (dec->current_byte != dec->end_byte) {
            dec->current_byte++;
            return HPDF_BYTE_TYPE_TRIAL;
        }

        // That was the final byte of the sequence.
        dec->current_byte = 0;
        return HPDF_BYTE_TYPE_SINGLE;
    }

    // First byte of a character.
    dec->utf8_bytes[0] = ch;

    if ((ch & 0x80) == 0) {
        // 7-bit byte: a complete character on its own.
        dec->end_byte = 0;
        return HPDF_BYTE_TYPE_SINGLE;
    }

    // Lead byte: its high bits give the index of the last byte.
    if ((ch & 0xe0) == 0xc0)
        last_index = 1;
    else if ((ch & 0xf0) == 0xe0)
        last_index = 2;
    else if ((ch & 0xf8) == 0xf0)
        last_index = 3;

    if (last_index != 0) {
        dec->end_byte = last_index;
        dec->current_byte = 1;
    }
    // Otherwise the byte cannot start a sequence: it is skipped,
    // current_byte stays 0 and end_byte keeps its previous value.

    return HPDF_BYTE_TYPE_TRIAL;
}

/*
 * This function is taken from hpdf_encoder_utf8.c, originally submitted
 * to libharu by 'Mirco'
 *
 * Combines the bytes that UTF8_Encoder_ByteType_Func collected for the
 * most recent character into one code point.
 *
 * encoder  encoder whose UTF-8 state (stored in cid_map[0]) is read.
 * code     unused; the character is taken from the stored bytes instead.
 *
 * Returns the decoded code point, or 32 (space) when the value does not
 * fit in 16 bits or the stored sequence length is not 0..3.
 *
 * The bytes are not validated: overlong forms and bytes without the
 * 10xxxxxx continuation marker are decoded as they are.
 */
static HPDF_UNICODE
UTF8_Encoder_ToUnicode_Func  (HPDF_Encoder   encoder,
                              HPDF_UINT16    code)
{
    // This function is always called after ByteType_Func, which
    // recognizes the UTF-8 bytes belonging to one character.

    HPDF_CMapEncoderAttr encoder_attr;
    UTF8_EncoderAttr     utf8_attr;
    const HPDF_BYTE     *b;
    unsigned int         val;

    (void) code;

    encoder_attr = (HPDF_CMapEncoderAttr) encoder->attr;
    utf8_attr = (UTF8_EncoderAttr) ((void *)encoder_attr->cid_map[0]);
    b = utf8_attr->utf8_bytes;

    // Every continuation byte is masked with 0x3f so that only its six
    // payload bits reach the result; the lead byte keeps the bits left
    // over after its length prefix.
    switch (utf8_attr->end_byte) {
    case 3:
        val = ((unsigned int) (b[0] & 0x07) << 18) +
              ((unsigned int) (b[1] & 0x3f) << 12) +
              ((unsigned int) (b[2] & 0x3f) << 6)  +
               (unsigned int) (b[3] & 0x3f);
        break;
    case 2:
        val = ((unsigned int) (b[0] & 0x0f) << 12) +
              ((unsigned int) (b[1] & 0x3f) << 6)  +
               (unsigned int) (b[2] & 0x3f);
        break;
    case 1:
        val = ((unsigned int) (b[0] & 0x1f) << 6) +
               (unsigned int) (b[1] & 0x3f);
        break;
    case 0:
        val = (unsigned int) b[0];
        break;
    default:
        val = 32; // Unknown character
        break;
    }

    if (val > 65535) // Convert everything outside UCS-2 to space
        val = 32;

    return (HPDF_UNICODE) val;
}

/*
 * Converts UTF-8 text into a buffer of 16-bit code units, two bytes per
 * decoded character.
 *
 * encoder  encoder whose byte-type and to-unicode callbacks do the decoding.
 * text     input bytes; exactly len bytes are consumed.
 * len      number of input bytes.
 * length   out: number of bytes written to the returned buffer; set to 0
 *          when the allocation fails.
 *
 * Returns a buffer obtained from malloc, or NULL when the allocation
 * fails. The buffer is not freed here, so the caller has to release it
 * with free().
 */
static char *
UTF8_Encoder_EncodeText_Func  (HPDF_Encoder        encoder,
			       const char         *text,
			       HPDF_UINT           len,
			       HPDF_UINT          *length)
{
    char *result;
    char *c;
    HPDF_ParseText_Rec  parse_state;
    HPDF_UINT i;

    *length = 0;

    /* Each input byte yields at most one 2-byte unit, so len * 2 suffices. */
    result = malloc (len * 2);
    if (!result)
        return NULL;

    c = result;

    HPDF_Encoder_SetParseText (encoder, &parse_state,
                               (const HPDF_BYTE *)text, len);

    for (i = 0; i < len; i++) {
        HPDF_UNICODE tmp_unicode;
        HPDF_ByteType btype = HPDF_Encoder_ByteType (encoder, &parse_state);

        /* Only a byte that completes a character produces output. */
        if (btype != HPDF_BYTE_TYPE_TRIAL) {
            tmp_unicode = HPDF_Encoder_ToUnicode (encoder, 0);

            /* NOTE(review): presumably swaps to big-endian order -- confirm
             * against HPDF_UInt16Swap. */
            HPDF_UInt16Swap (&tmp_unicode);
            HPDF_MemCpy ((HPDF_BYTE *)c, (const HPDF_BYTE*)&tmp_unicode, 2);
            c += 2;
        }
    }

    *length = (HPDF_UINT) (c - result);

    return result;
}

/*
 * Initializer passed to HPDF_CMapEncoder_New for the "UTF-8" encoder.
 *
 * Sets up the CMap attributes, installs the three UTF-8 callbacks,
 * registers the CID, code-space and not-defined ranges, and fills in the
 * registry/ordering/writing-mode fields.
 *
 * Returns HPDF_OK on success, the status of HPDF_CMapEncoder_InitAttr if
 * that call fails, or encoder->error->error_no if registering a range
 * fails.
 */
static HPDF_STATUS
UTF8_Init  (HPDF_Encoder  encoder)
{
    HPDF_CMapEncoderAttr cmap;
    HPDF_STATUS status = HPDF_CMapEncoder_InitAttr (encoder);

    if (status != HPDF_OK)
        return status;

    /* Replace the three encoder callbacks with the UTF-8 versions. */
    encoder->byte_type_fn   = UTF8_Encoder_ByteType_Func;
    encoder->to_unicode_fn  = UTF8_Encoder_ToUnicode_Func;
    encoder->encode_text_fn = UTF8_Encoder_EncodeText_Func;

    cmap = (HPDF_CMapEncoderAttr) encoder->attr;

    /* Registration stops at the first failing call. */
    if (HPDF_CMapEncoder_AddCMap (encoder, UTF8_CID_RANGE) != HPDF_OK ||
        HPDF_CMapEncoder_AddCodeSpaceRange (encoder, UTF8_SPACE_RANGE)
                != HPDF_OK ||
        HPDF_CMapEncoder_AddNotDefRange (encoder, UTF8_NOTDEF_RANGE)
                != HPDF_OK)
        return encoder->error->error_no;

    cmap->is_lead_byte_fn = NULL;
    cmap->is_trial_byte_fn = NULL;

    HPDF_StrCpy (cmap->registry, "Adobe",
                 cmap->registry + HPDF_LIMIT_MAX_NAME_LEN);
    HPDF_StrCpy (cmap->ordering, "Identity-H",
                 cmap->ordering + HPDF_LIMIT_MAX_NAME_LEN);
    cmap->suppliment = 0;
    cmap->writing_mode = HPDF_WMODE_HORIZONTAL;

    /*
     * uid_offset and xuid[0..2] are not touched here; the original author
     * was unsure whether they should be zeroed.
     */

    encoder->type = HPDF_ENCODER_TYPE_DOUBLE_BYTE;

    return HPDF_OK;
}

/*--------------------------------------------------------------------------*/

/*
 * Creates the "UTF-8" CMap encoder and registers it with the document.
 *
 * Returns HPDF_INVALID_DOCUMENT when pdf fails the HPDF_HasDoc check,
 * otherwise the status reported by HPDF_Doc_RegisterEncoder.
 */
HPDF_EXPORT(HPDF_STATUS)
HPDF_UseUTFEncodings   (HPDF_Doc   pdf)
{
    HPDF_Encoder utf8_encoder;

    if (!HPDF_HasDoc (pdf))
        return HPDF_INVALID_DOCUMENT;

    utf8_encoder = HPDF_CMapEncoder_New (pdf->mmgr, "UTF-8", UTF8_Init);

    /* The registration status is the result, whether HPDF_OK or an error. */
    return HPDF_Doc_RegisterEncoder (pdf, utf8_encoder);
}
Major plugin refactor and cleanup. Switched to POCO library for unified platform/library interface. Deprecated the external module API. It was creating more problems than solving. Removed most built-in libraries in favor of system libraries for easier maintenance. Cleaned and secured code with help from static analyzers. 2021-01-30 08:51:39 +02:00			`/*`
			`* << Haru Free PDF Library >> -- hpdf_encoder_utf.c`
			`*`
			`* URL: http://libharu.org`
			`*`
			`* Copyright (c) 1999-2006 Takeshi Kanno <takeshi_kanno@est.hi-ho.ne.jp>`
			`* Copyright (c) 2007-2008 Antony Dovgal <tony@daylessday.org>`
			`* Copyright (c) 2010 Sergey Konovalov <webmaster@crynet.ru>`
			`* Copyright (c) 2011 Koen Deforche <koen@emweb.be>`
			`*`
			`* Permission to use, copy, modify, distribute and sell this software`
			`* and its documentation for any purpose is hereby granted without fee,`
			`* provided that the above copyright notice appear in all copies and`
			`* that both that copyright notice and this permission notice appear`
			`* in supporting documentation.`
			`* It is provided "as is" without express or implied warranty.`
			`*`
			`*/`

			`#include "hpdf_conf.h"`
			`#include "hpdf_utils.h"`
			`#include "hpdf_encoder.h"`
			`#include "hpdf.h"`

			`typedef struct _UTF8_EncoderAttr_Rec *UTF8_EncoderAttr;`
			`typedef struct _UTF8_EncoderAttr_Rec {`
			`HPDF_BYTE current_byte;`
			`HPDF_BYTE end_byte;`
			`HPDF_BYTE utf8_bytes[8];`
			`} UTF8_EncoderAttr_Rec;`

			`static const HPDF_CidRange_Rec UTF8_NOTDEF_RANGE = {0x0000, 0x001F, 1};`
			`static const HPDF_CidRange_Rec UTF8_SPACE_RANGE = {0x0000, 0xFFFF, 0};`
			`static const HPDF_CidRange_Rec UTF8_CID_RANGE[] = {`
			`{ 0x0000, 0xFFFF, 0x0 },`
			`{ 0xFFFF, 0xFFFF, 0x0 }`
			`};`

			`static HPDF_ByteType`
			`UTF8_Encoder_ByteType_Func (HPDF_Encoder encoder,`
			`HPDF_ParseText_Rec *state);`

			`static HPDF_UNICODE`
			`UTF8_Encoder_ToUnicode_Func (HPDF_Encoder encoder,`
			`HPDF_UINT16 code);`

			`static char *`
			`UTF8_Encoder_EncodeText_Func (HPDF_Encoder encoder,`
			`const char *text,`
			`HPDF_UINT len,`
			`HPDF_UINT *length);`

			`static HPDF_STATUS`
			`UTF8_Init (HPDF_Encoder encoder);`

			`/--------------------------------------------------------------------------/`


			`/*`
			`* This function is taken from hpdf_encoder_utf8.c, originally submitted`
			`* to libharu by 'Mirco'`
			`*/`
			`static HPDF_ByteType`
			`UTF8_Encoder_ByteType_Func (HPDF_Encoder encoder,`
			`HPDF_ParseText_Rec *state)`
			`{`
			`// This function is supposed to increment state->index`
			`// Not logical ! (look at function HPDF_String_Write in hpdf_string.c)`

			`// When HPDF_BYTE_TYPE_SINGLE is returned, the current byte is the`
			`// CODE argument in call ToUnicode_Func`
			`// When HPDF_BYTE_TYPE_LEAD is returned, the current byte (msb) and the`
			`// next byte (lsb) is the CODE arguement in call ToUnicodeFunc`
			`// When HPDF_BYTE_TYPE_TRIAL is returned, the current byte is ignored`

			`HPDF_CMapEncoderAttr encoder_attr;`
			`HPDF_BYTE byte;`
			`UTF8_EncoderAttr utf8_attr;`

			`encoder_attr = (HPDF_CMapEncoderAttr) encoder->attr;`
			`utf8_attr = (UTF8_EncoderAttr) ((void *)encoder_attr->cid_map[0]);`

			`if (state->index == 0) {`
			`//First byte, initialize.`
			`HPDF_PTRACE ((" UTF8_Encoder_ByteType_Func - Initialize: (%u) %s\n",`
			`state->len, state->text));`

			`utf8_attr->current_byte = 0;`
			`}`

			`byte = state->text[state->index];`
			`state->index++;`

			`HPDF_PTRACE ((" UTF8_Encoder_ByteType_Func - Byte: %hx\n", byte));`

			`if (utf8_attr->current_byte == 0) {`
			`utf8_attr->utf8_bytes[0] = byte;`
			`utf8_attr->current_byte = 1;`

			`if (!(byte & 0x80)) {`
			`utf8_attr->current_byte = 0;`
			`utf8_attr->end_byte = 0;`
			`return HPDF_BYTE_TYPE_SINGLE;`
			`}`

			`if ((byte & 0xf8) == 0xf0)`
			`utf8_attr->end_byte = 3;`
			`else if ((byte & 0xf0) == 0xe0)`
			`utf8_attr->end_byte = 2;`
			`else if ((byte & 0xe0) == 0xc0)`
			`utf8_attr->end_byte = 1;`
			`else`
			`utf8_attr->current_byte = 0; //ERROR, skip this byte`
			`} else {`
			`utf8_attr->utf8_bytes[utf8_attr->current_byte] = byte;`
			`if (utf8_attr->current_byte == utf8_attr->end_byte) {`
			`utf8_attr->current_byte = 0;`
			`return HPDF_BYTE_TYPE_SINGLE;`
			`}`

			`utf8_attr->current_byte++;`
			`}`

			`return HPDF_BYTE_TYPE_TRIAL;`
			`}`

			`/*`
			`* This function is taken from hpdf_encoder_utf8.c, originally submitted`
			`* to libharu by 'Mirco'`
			`*/`
			`static HPDF_UNICODE`
			`UTF8_Encoder_ToUnicode_Func (HPDF_Encoder encoder,`
			`HPDF_UINT16 code)`
			`{`
			`// Supposed to convert CODE to unicode.`
			`// This function is allways called after ByteType_Func.`
			`// ByteType_Func recognizes the utf-8 bytes belonging to one character.`

			`HPDF_CMapEncoderAttr encoder_attr;`
			`UTF8_EncoderAttr utf8_attr;`
			`unsigned int val;`

			`encoder_attr = (HPDF_CMapEncoderAttr) encoder->attr;`
			`utf8_attr = (UTF8_EncoderAttr) ((void *)encoder_attr->cid_map[0]);`

			`switch (utf8_attr->end_byte) {`
			`case 3:`
			`val = (unsigned int) ((utf8_attr->utf8_bytes[0] & 0x7) << 18) +`
			`(unsigned int) ((utf8_attr->utf8_bytes[1]) << 12) +`
			`(unsigned int) ((utf8_attr->utf8_bytes[2] & 0x3f) << 6) +`
			`(unsigned int) ((utf8_attr->utf8_bytes[3] & 0x3f));`
			`break;`
			`case 2:`
			`val = (unsigned int) ((utf8_attr->utf8_bytes[0] & 0xf) << 12) +`
			`(unsigned int) ((utf8_attr->utf8_bytes[1] & 0x3f) << 6) +`
			`(unsigned int) ((utf8_attr->utf8_bytes[2] & 0x3f));`
			`break;`
			`case 1:`
			`val = (unsigned int) ((utf8_attr->utf8_bytes[0] & 0x1f) << 6) +`
			`(unsigned int) ((utf8_attr->utf8_bytes[1] & 0x3f));`
			`break;`
			`case 0:`
			`val = (unsigned int) utf8_attr->utf8_bytes[0];`
			`break;`
			`default:`
			`val = 32; // Unknown character`
			`}`

			`if (val > 65535) //Convert everything outside UCS-2 to space`
			`val = 32;`

			`return val;`
			`}`

			`static char *`
			`UTF8_Encoder_EncodeText_Func (HPDF_Encoder encoder,`
			`const char *text,`
			`HPDF_UINT len,`
			`HPDF_UINT *length)`
			`{`
			`char result = malloc(len 2);`
			`char *c = result;`
			`HPDF_ParseText_Rec parse_state;`
			`HPDF_UINT i;`

			`HPDF_Encoder_SetParseText (encoder, &parse_state,`
			`(const HPDF_BYTE *)text, len);`

			`for (i = 0; i < len; i++) {`
			`HPDF_UNICODE tmp_unicode;`
			`HPDF_ByteType btype = HPDF_Encoder_ByteType (encoder, &parse_state);`

			`if (btype != HPDF_BYTE_TYPE_TRIAL) {`
			`tmp_unicode = HPDF_Encoder_ToUnicode (encoder, 0);`

			`HPDF_UInt16Swap (&tmp_unicode);`
			`HPDF_MemCpy ((HPDF_BYTE )c, (const HPDF_BYTE)&tmp_unicode, 2);`
			`c += 2;`
			`}`
			`}`

			`*length = c - result;`

			`return result;`
			`}`

			`static HPDF_STATUS`
			`UTF8_Init (HPDF_Encoder encoder)`
			`{`
			`HPDF_CMapEncoderAttr attr;`
			`HPDF_STATUS ret;`

			`if ((ret = HPDF_CMapEncoder_InitAttr (encoder)) != HPDF_OK)`
			`return ret;`

			`/*`
			`* We override these two`
			`*/`
			`encoder->byte_type_fn = UTF8_Encoder_ByteType_Func;`
			`encoder->to_unicode_fn = UTF8_Encoder_ToUnicode_Func;`
			`encoder->encode_text_fn = UTF8_Encoder_EncodeText_Func;`

			`attr = (HPDF_CMapEncoderAttr)encoder->attr;`

			`if (HPDF_CMapEncoder_AddCMap (encoder, UTF8_CID_RANGE) != HPDF_OK)`
			`return encoder->error->error_no;`

			`if (HPDF_CMapEncoder_AddCodeSpaceRange (encoder, UTF8_SPACE_RANGE)`
			`!= HPDF_OK)`
			`return encoder->error->error_no;`

			`if (HPDF_CMapEncoder_AddNotDefRange (encoder, UTF8_NOTDEF_RANGE)`
			`!= HPDF_OK)`
			`return encoder->error->error_no;`

			`attr->is_lead_byte_fn = NULL;`
			`attr->is_trial_byte_fn = NULL;`

			`HPDF_StrCpy (attr->registry, "Adobe", attr->registry +`
			`HPDF_LIMIT_MAX_NAME_LEN);`
			`HPDF_StrCpy (attr->ordering, "Identity-H", attr->ordering +`
			`HPDF_LIMIT_MAX_NAME_LEN);`
			`attr->suppliment = 0;`
			`attr->writing_mode = HPDF_WMODE_HORIZONTAL;`
Update POCO library. 2023-03-23 20:19:11 +02:00
Major plugin refactor and cleanup. Switched to POCO library for unified platform/library interface. Deprecated the external module API. It was creating more problems than solving. Removed most built-in libraries in favor of system libraries for easier maintenance. Cleaned and secured code with help from static analyzers. 2021-01-30 08:51:39 +02:00			`/* Not sure about this`
			`attr->uid_offset = 0;`
			`attr->xuid[0] = 0;`
			`attr->xuid[1] = 0;`
			`attr->xuid[2] = 0;`
			`*/`

			`encoder->type = HPDF_ENCODER_TYPE_DOUBLE_BYTE;`

			`return HPDF_OK;`
			`}`

			`/--------------------------------------------------------------------------/`

			`HPDF_EXPORT(HPDF_STATUS)`
			`HPDF_UseUTFEncodings (HPDF_Doc pdf)`
			`{`
			`HPDF_Encoder encoder;`
			`HPDF_STATUS ret;`

			`if (!HPDF_HasDoc (pdf))`
			`return HPDF_INVALID_DOCUMENT;`

			`encoder = HPDF_CMapEncoder_New (pdf->mmgr, "UTF-8",`
			`UTF8_Init);`

			`if ((ret = HPDF_Doc_RegisterEncoder (pdf, encoder)) != HPDF_OK)`
			`return ret;`

			`return HPDF_OK;`
			`}`