/* * << Haru Free PDF Library >> -- hpdf_encoder_utf.c * * URL: http://libharu.org * * Copyright (c) 1999-2006 Takeshi Kanno * Copyright (c) 2007-2008 Antony Dovgal * Copyright (c) 2010 Sergey Konovalov * Copyright (c) 2011 Koen Deforche * * Permission to use, copy, modify, distribute and sell this software * and its documentation for any purpose is hereby granted without fee, * provided that the above copyright notice appear in all copies and * that both that copyright notice and this permission notice appear * in supporting documentation. * It is provided "as is" without express or implied warranty. * */ #include "hpdf_conf.h" #include "hpdf_utils.h" #include "hpdf_encoder.h" #include "hpdf.h" typedef struct _UTF8_EncoderAttr_Rec *UTF8_EncoderAttr; typedef struct _UTF8_EncoderAttr_Rec { HPDF_BYTE current_byte; HPDF_BYTE end_byte; HPDF_BYTE utf8_bytes[8]; } UTF8_EncoderAttr_Rec; static const HPDF_CidRange_Rec UTF8_NOTDEF_RANGE = {0x0000, 0x001F, 1}; static const HPDF_CidRange_Rec UTF8_SPACE_RANGE = {0x0000, 0xFFFF, 0}; static const HPDF_CidRange_Rec UTF8_CID_RANGE[] = { { 0x0000, 0xFFFF, 0x0 }, { 0xFFFF, 0xFFFF, 0x0 } }; static HPDF_ByteType UTF8_Encoder_ByteType_Func (HPDF_Encoder encoder, HPDF_ParseText_Rec *state); static HPDF_UNICODE UTF8_Encoder_ToUnicode_Func (HPDF_Encoder encoder, HPDF_UINT16 code); static char * UTF8_Encoder_EncodeText_Func (HPDF_Encoder encoder, const char *text, HPDF_UINT len, HPDF_UINT *length); static HPDF_STATUS UTF8_Init (HPDF_Encoder encoder); /*--------------------------------------------------------------------------*/ /* * This function is taken from hpdf_encoder_utf8.c, originally submitted * to libharu by 'Mirco' */ static HPDF_ByteType UTF8_Encoder_ByteType_Func (HPDF_Encoder encoder, HPDF_ParseText_Rec *state) { // This function is supposed to increment state->index // Not logical ! (look at function HPDF_String_Write in hpdf_string.c) // When HPDF_BYTE_TYPE_SINGLE is returned, the current byte is the // CODE argument in call ToUnicode_Func // When HPDF_BYTE_TYPE_LEAD is returned, the current byte (msb) and the // next byte (lsb) is the CODE arguement in call ToUnicodeFunc // When HPDF_BYTE_TYPE_TRIAL is returned, the current byte is ignored HPDF_CMapEncoderAttr encoder_attr; HPDF_BYTE byte; UTF8_EncoderAttr utf8_attr; encoder_attr = (HPDF_CMapEncoderAttr) encoder->attr; utf8_attr = (UTF8_EncoderAttr) ((void *)encoder_attr->cid_map[0]); if (state->index == 0) { //First byte, initialize. HPDF_PTRACE ((" UTF8_Encoder_ByteType_Func - Initialize: (%u) %s\n", state->len, state->text)); utf8_attr->current_byte = 0; } byte = state->text[state->index]; state->index++; HPDF_PTRACE ((" UTF8_Encoder_ByteType_Func - Byte: %hx\n", byte)); if (utf8_attr->current_byte == 0) { utf8_attr->utf8_bytes[0] = byte; utf8_attr->current_byte = 1; if (!(byte & 0x80)) { utf8_attr->current_byte = 0; utf8_attr->end_byte = 0; return HPDF_BYTE_TYPE_SINGLE; } if ((byte & 0xf8) == 0xf0) utf8_attr->end_byte = 3; else if ((byte & 0xf0) == 0xe0) utf8_attr->end_byte = 2; else if ((byte & 0xe0) == 0xc0) utf8_attr->end_byte = 1; else utf8_attr->current_byte = 0; //ERROR, skip this byte } else { utf8_attr->utf8_bytes[utf8_attr->current_byte] = byte; if (utf8_attr->current_byte == utf8_attr->end_byte) { utf8_attr->current_byte = 0; return HPDF_BYTE_TYPE_SINGLE; } utf8_attr->current_byte++; } return HPDF_BYTE_TYPE_TRIAL; } /* * This function is taken from hpdf_encoder_utf8.c, originally submitted * to libharu by 'Mirco' */ static HPDF_UNICODE UTF8_Encoder_ToUnicode_Func (HPDF_Encoder encoder, HPDF_UINT16 code) { // Supposed to convert CODE to unicode. // This function is allways called after ByteType_Func. // ByteType_Func recognizes the utf-8 bytes belonging to one character. HPDF_CMapEncoderAttr encoder_attr; UTF8_EncoderAttr utf8_attr; unsigned int val; encoder_attr = (HPDF_CMapEncoderAttr) encoder->attr; utf8_attr = (UTF8_EncoderAttr) ((void *)encoder_attr->cid_map[0]); switch (utf8_attr->end_byte) { case 3: val = (unsigned int) ((utf8_attr->utf8_bytes[0] & 0x7) << 18) + (unsigned int) ((utf8_attr->utf8_bytes[1]) << 12) + (unsigned int) ((utf8_attr->utf8_bytes[2] & 0x3f) << 6) + (unsigned int) ((utf8_attr->utf8_bytes[3] & 0x3f)); break; case 2: val = (unsigned int) ((utf8_attr->utf8_bytes[0] & 0xf) << 12) + (unsigned int) ((utf8_attr->utf8_bytes[1] & 0x3f) << 6) + (unsigned int) ((utf8_attr->utf8_bytes[2] & 0x3f)); break; case 1: val = (unsigned int) ((utf8_attr->utf8_bytes[0] & 0x1f) << 6) + (unsigned int) ((utf8_attr->utf8_bytes[1] & 0x3f)); break; case 0: val = (unsigned int) utf8_attr->utf8_bytes[0]; break; default: val = 32; // Unknown character } if (val > 65535) //Convert everything outside UCS-2 to space val = 32; return val; } static char * UTF8_Encoder_EncodeText_Func (HPDF_Encoder encoder, const char *text, HPDF_UINT len, HPDF_UINT *length) { char *result = malloc(len * 2); char *c = result; HPDF_ParseText_Rec parse_state; HPDF_UINT i; HPDF_Encoder_SetParseText (encoder, &parse_state, (const HPDF_BYTE *)text, len); for (i = 0; i < len; i++) { HPDF_UNICODE tmp_unicode; HPDF_ByteType btype = HPDF_Encoder_ByteType (encoder, &parse_state); if (btype != HPDF_BYTE_TYPE_TRIAL) { tmp_unicode = HPDF_Encoder_ToUnicode (encoder, 0); HPDF_UInt16Swap (&tmp_unicode); HPDF_MemCpy ((HPDF_BYTE *)c, (const HPDF_BYTE*)&tmp_unicode, 2); c += 2; } } *length = c - result; return result; } static HPDF_STATUS UTF8_Init (HPDF_Encoder encoder) { HPDF_CMapEncoderAttr attr; HPDF_STATUS ret; if ((ret = HPDF_CMapEncoder_InitAttr (encoder)) != HPDF_OK) return ret; /* * We override these two */ encoder->byte_type_fn = UTF8_Encoder_ByteType_Func; encoder->to_unicode_fn = UTF8_Encoder_ToUnicode_Func; encoder->encode_text_fn = UTF8_Encoder_EncodeText_Func; attr = (HPDF_CMapEncoderAttr)encoder->attr; if (HPDF_CMapEncoder_AddCMap (encoder, UTF8_CID_RANGE) != HPDF_OK) return encoder->error->error_no; if (HPDF_CMapEncoder_AddCodeSpaceRange (encoder, UTF8_SPACE_RANGE) != HPDF_OK) return encoder->error->error_no; if (HPDF_CMapEncoder_AddNotDefRange (encoder, UTF8_NOTDEF_RANGE) != HPDF_OK) return encoder->error->error_no; attr->is_lead_byte_fn = NULL; attr->is_trial_byte_fn = NULL; HPDF_StrCpy (attr->registry, "Adobe", attr->registry + HPDF_LIMIT_MAX_NAME_LEN); HPDF_StrCpy (attr->ordering, "Identity-H", attr->ordering + HPDF_LIMIT_MAX_NAME_LEN); attr->suppliment = 0; attr->writing_mode = HPDF_WMODE_HORIZONTAL; /* Not sure about this attr->uid_offset = 0; attr->xuid[0] = 0; attr->xuid[1] = 0; attr->xuid[2] = 0; */ encoder->type = HPDF_ENCODER_TYPE_DOUBLE_BYTE; return HPDF_OK; } /*--------------------------------------------------------------------------*/ HPDF_EXPORT(HPDF_STATUS) HPDF_UseUTFEncodings (HPDF_Doc pdf) { HPDF_Encoder encoder; HPDF_STATUS ret; if (!HPDF_HasDoc (pdf)) return HPDF_INVALID_DOCUMENT; encoder = HPDF_CMapEncoder_New (pdf->mmgr, "UTF-8", UTF8_Init); if ((ret = HPDF_Doc_RegisterEncoder (pdf, encoder)) != HPDF_OK) return ret; return HPDF_OK; }