Changeset 27 for trunk/poppler/mypoppler/poppler/UnicodeTypeTable.cc
 Timestamp:
 May 25, 2006, 4:09:55 PM (16 years ago)
 File:

 1 edited
Legend:
 Unmodified
 Added
 Removed

trunk/poppler/mypoppler/poppler/UnicodeTypeTable.cc
r2 r27 10 10 #include "CharTypes.h" 11 11 #include "UnicodeTypeTable.h" 12 #include "goo/gmem.h" 12 13 13 14 struct UnicodeMapTableEntry { … … 948 949 } 949 950 951 #define UNICODE_LAST_CHAR 0x10FFFF 952 #define UNICODE_MAX_TABLE_INDEX (UNICODE_LAST_CHAR / 256 + 1) 953 // large empty block between U+2FA1D and U+E0001 954 #define UNICODE_LAST_CHAR_PART1 0x2FAFF 955 #define UNICODE_LAST_PAGE_PART1 (UNICODE_LAST_CHAR_PART1 / 256) 956 #define UNICODE_PART2_START 0xE0000 957 958 #include "UnicodeCClassTables.h" 959 #include "UnicodeCompTables.h" 960 #include "UnicodeDecompTables.h" 961 962 #define CC_PART1(Page, Char) \ 963 ((combining_class_table_part1[Page] >= UNICODE_MAX_TABLE_INDEX) \ 964 ? (combining_class_table_part1[Page]  UNICODE_MAX_TABLE_INDEX) \ 965 : (cclass_data[combining_class_table_part1[Page]][Char])) 966 967 #define CC_PART2(Page, Char) \ 968 ((combining_class_table_part2[Page] >= UNICODE_MAX_TABLE_INDEX) \ 969 ? (combining_class_table_part2[Page]  UNICODE_MAX_TABLE_INDEX) \ 970 : (cclass_data[combining_class_table_part2[Page]][Char])) 971 972 #define COMBINING_CLASS(u) (((u) <= UNICODE_LAST_CHAR_PART1) \ 973 ? CC_PART1((u) / 256, (u) % 256) \ 974 : (((u) >= UNICODE_PART2_START && (u) <= UNICODE_LAST_CHAR) \ 975 ? CC_PART2(((u)  UNICODE_PART2_START) / 256, (u) % 256) \ 976 : 0)) 977 978 // Write the compatibility decomposition of @u into @buf, returning the number 979 // of characters written. @buf may be NULL, in which case the length of the 980 // decomposition is returned but nothing is written. If @u is its own 981 // decomposition, write @u into @buf and return 1. 982 static int decomp_compat(Unicode u, Unicode *buf) { 983 // decomposition tables stored as lists {character, decomp_length, offset} 984 // so we do a binary search 985 int start = 0, end = DECOMP_TABLE_LENGTH; 986 if (u >= decomp_table[start].character 987 && u <= decomp_table[end  1].character) 988 while (gTrue) { 989 int midpoint = (start + end) / 2; 990 if (u == decomp_table[midpoint].character) { 991 int offset = decomp_table[midpoint].offset; 992 if (offset == 1) 993 break; 994 else { 995 int length = decomp_table[midpoint].length, i; 996 if (buf) 997 for (i = 0; i < length; ++i) 998 buf[i] = decomp_expansion[offset + i]; 999 return length; 1000 } 1001 } else if (midpoint == start) 1002 break; 1003 else if (u > decomp_table[midpoint].character) 1004 start = midpoint; 1005 else 1006 end = midpoint; 1007 } 1008 if (buf) 1009 *buf = u; 1010 return 1; 1011 } 1012 1013 #define CI(Page, Char) \ 1014 ((compose_table[Page] >= UNICODE_MAX_TABLE_INDEX) \ 1015 ? (compose_table[Page]  UNICODE_MAX_TABLE_INDEX) \ 1016 : (compose_data[compose_table[Page]][Char])) 1017 1018 #define COMPOSE_INDEX(u) \ 1019 ((((u) / 256) > (COMPOSE_TABLE_LAST)) ? 0 : CI((u) / 256, (u) % 255)) 1020 1021 // If @add combines with @base, write the combination to @out and return 1022 // gTrue. Otherwise return gFalse. 1023 static GBool combine(Unicode base, Unicode add, Unicode *out) { 1024 unsigned short idx_base, idx_add; 1025 1026 idx_base = COMPOSE_INDEX(base); 1027 if (idx_base >= COMPOSE_FIRST_SINGLE_START 1028 && idx_base < COMPOSE_SECOND_START) { 1029 if (compose_first_single[idx_base  COMPOSE_FIRST_SINGLE_START][0] 1030 == add) { 1031 *out = compose_first_single[idx_base  COMPOSE_FIRST_SINGLE_START][1]; 1032 return gTrue; 1033 } else 1034 return gFalse; 1035 } 1036 1037 idx_add = COMPOSE_INDEX(add); 1038 if (idx_add >= COMPOSE_SECOND_SINGLE_START) { 1039 if (compose_second_single[idx_add  COMPOSE_SECOND_SINGLE_START][0] 1040 == base) { 1041 *out = compose_second_single[idx_add  COMPOSE_SECOND_SINGLE_START][1]; 1042 return gTrue; 1043 } else 1044 return gFalse; 1045 } 1046 1047 if (idx_base >= COMPOSE_FIRST_START && idx_base < COMPOSE_FIRST_SINGLE_START 1048 && idx_add >= COMPOSE_SECOND_START 1049 && idx_add < COMPOSE_SECOND_SINGLE_START) { 1050 Unicode o = compose_array[idx_base  COMPOSE_FIRST_START] 1051 [idx_add  COMPOSE_SECOND_START]; 1052 if (o) { 1053 *out = o; 1054 return gTrue; 1055 } 1056 } 1057 1058 return gFalse; 1059 } 1060 1061 #define HANGUL_S_BASE 0xAC00 1062 #define HANGUL_L_BASE 0x1100 1063 #define HANGUL_V_BASE 0x1161 1064 #define HANGUL_T_BASE 0x11A7 1065 #define HANGUL_L_COUNT 19 1066 #define HANGUL_V_COUNT 21 1067 #define HANGUL_T_COUNT 28 1068 #define HANGUL_S_COUNT (HANGUL_L_COUNT * HANGUL_V_COUNT * HANGUL_T_COUNT) 1069 #define HANGUL_N_COUNT (HANGUL_V_COUNT * HANGUL_T_COUNT) 1070 #define HANGUL_IS_L(u) (((u) >= HANGUL_L_BASE) \ 1071 && ((u) < HANGUL_L_BASE + HANGUL_L_COUNT)) 1072 #define HANGUL_IS_V(u) (((u) >= HANGUL_V_BASE) \ 1073 && ((u) < HANGUL_V_BASE + HANGUL_V_COUNT)) 1074 #define HANGUL_IS_T(u) (((u) >= HANGUL_T_BASE) \ 1075 && ((u) < HANGUL_T_BASE + HANGUL_T_COUNT)) 1076 #define HANGUL_IS_SYLLABLE(u) (((u) >= HANGUL_S_BASE) \ 1077 && ((u) < HANGUL_S_BASE + HANGUL_S_COUNT)) 1078 #define HANGUL_SYLLABLE_IS_LV(u) (((u)  HANGUL_S_BASE) % HANGUL_T_COUNT == 0) 1079 #define IS_HANGUL(u) (HANGUL_IS_L(u)  HANGUL_IS_V(u)  HANGUL_IS_T(u) \ 1080  HANGUL_IS_SYLLABLE(u)) 1081 #define HANGUL_COMPOSE_L_V(l, v) (HANGUL_S_BASE + (HANGUL_T_COUNT * \ 1082 (((v)  HANGUL_V_BASE) + (HANGUL_V_COUNT * ((l)  HANGUL_L_BASE))))) 1083 #define HANGUL_COMPOSE_LV_T(lv, t) ((lv) + ((t)  HANGUL_T_BASE)) 1084 1085 // Converts Unicode string @in of length @len to its normalization in form 1086 // NFKC (compatibility decomposition + canonical composition). The length of 1087 // the resulting Unicode string is returned in @out_len. If nonNULL, @indices 1088 // is assigned the location of a newlyallocated array of length @out_len + 1, 1089 // for each character in the normalized string giving the index in @in of the 1090 // corresponding unnormalized character. @indices is not guaranteed monotone or 1091 // onto. 1092 Unicode *unicodeNormalizeNFKC(Unicode *in, int len, 1093 int *out_len, int **indices) { 1094 Unicode *out; 1095 int i, o, *classes, *idx = NULL; 1096 1097 for (i = 0, o = 0; i < len; ++i) { 1098 if (HANGUL_IS_L(in[i])  HANGUL_IS_SYLLABLE(in[i])) { 1099 o += 1; 1100 } else 1101 o += decomp_compat(in[i], NULL); 1102 } 1103 1104 out = (Unicode *) gmallocn(o, sizeof(Unicode)); 1105 classes = (int *) gmallocn(o, sizeof(int)); 1106 if (indices) 1107 idx = (int *) gmallocn(o + 1, sizeof(int)); 1108 1109 for (i = 0, o = 0; i < len; ) { 1110 Unicode u = in[i]; 1111 if (IS_HANGUL(u)) { 1112 if (HANGUL_IS_L(u)) { 1113 Unicode l = u; 1114 if (i+1 < len && HANGUL_IS_V(in[i+1])) { 1115 Unicode lv = HANGUL_COMPOSE_L_V(l, in[++i]); 1116 if (i+1 < len && HANGUL_IS_T(in[i+1])) 1117 out[o] = HANGUL_COMPOSE_LV_T(lv, in[++i]); 1118 else 1119 out[o] = lv; 1120 } else 1121 out[o] = l; 1122 } else if (HANGUL_SYLLABLE_IS_LV(u)) { 1123 Unicode lv = u; 1124 if (i+1 < len && HANGUL_IS_T(in[i+1])) 1125 out[o] = HANGUL_COMPOSE_LV_T(lv, in[++i]); 1126 else 1127 out[o] = lv; 1128 } else 1129 out[o] = u; 1130 if (indices) 1131 idx[o] = i; 1132 ++i; ++o; 1133 } else { 1134 int j, p, q, r, s, dlen; 1135 // write compatibility decompositions into out (we have enough space) 1136 // chomp in until a starter is reached 1137 for (j = i, p = o; j < len; ++j) { 1138 u = in[j]; 1139 if (j != i && COMBINING_CLASS(u) == 0) 1140 break; 1141 dlen = decomp_compat(u, out + p); 1142 for (q = p; q < p + dlen; ++q) { 1143 classes[q] = COMBINING_CLASS(out[q]); 1144 if (indices) 1145 idx[q] = j; 1146 } 1147 p += dlen; 1148 } 1149 // put out[o, p) in canonical ordering 1150 for (q = o + 1; q < p; ++q) 1151 for (r = q; r > o + 1; r) { // FIXME worth using a better sort? 1152 int swap; 1153 if (classes[r] >= classes[r1]) 1154 break; 1155 u = out[r]; out[r] = out[r  1]; out[r  1] = u; 1156 swap = classes[r]; classes[r] = classes[r  1]; classes[r  1] = swap; 1157 if (indices) 1158 swap = idx[r]; idx[r] = idx[r  1]; idx[r  1] = swap; 1159 } 1160 // canonical compose out[o, p) 1161 for (q = o + 1; q < p; ++q) 1162 if (!combine(out[o], out[q], &out[o])) 1163 break; 1164 // move out[q, p) back to [o+1, ?) 1165 if (q != o + 1) 1166 for (r = q, s = o + 1; r < p; ++r, ++s) { 1167 out[s] = out[r]; 1168 if (indices) 1169 idx[s] = idx[r]; 1170 } 1171 else 1172 s = p; 1173 i = j; o = s; 1174 } 1175 } 1176 1177 *out_len = o; 1178 gfree(classes); 1179 if (indices) { 1180 idx[o] = len; 1181 *indices = idx; 1182 } 1183 return out; 1184 }
Note: See TracChangeset
for help on using the changeset viewer.