]>
Commit | Line | Data |
---|---|---|
f9995f31 MG |
1 | /* Copyright 2013 Google Inc. All Rights Reserved. |
2 | ||
3 | Distributed under MIT license. | |
4 | See file LICENSE for detail or copy at https://opensource.org/licenses/MIT | |
5 | */ | |
6 | ||
7 | /* Transformations on dictionary words. */ | |
8 | ||
9 | #ifndef BROTLI_DEC_TRANSFORM_H_ | |
10 | #define BROTLI_DEC_TRANSFORM_H_ | |
11 | ||
12 | #include <stdio.h> | |
13 | #include <ctype.h> | |
14 | #include "./port.h" | |
15 | #include "./types.h" | |
16 | ||
17 | #if defined(__cplusplus) || defined(c_plusplus) | |
18 | extern "C" { | |
19 | #endif | |
20 | ||
/* Kinds of edit applied to a dictionary word before it is emitted.
   The numeric values are significant: TransformDictionaryWord subtracts the
   value itself from the word length for the kOmitLastN kinds, and derives the
   number of leading bytes to skip from (value - (kOmitFirst1 - 1)) for the
   kOmitFirstN kinds. Do not renumber. */
enum WordTransformType {
  /* Copy the word unchanged. */
  kIdentity = 0,

  /* Drop the last N bytes of the word (value == N). */
  kOmitLast1 = 1,
  kOmitLast2 = 2,
  kOmitLast3 = 3,
  kOmitLast4 = 4,
  kOmitLast5 = 5,
  kOmitLast6 = 6,
  kOmitLast7 = 7,
  kOmitLast8 = 8,
  kOmitLast9 = 9,

  /* Apply ToUpperCase to the first character / to every character. */
  kUppercaseFirst = 10,
  kUppercaseAll = 11,

  /* Drop the first N bytes of the word (value == N + 11). */
  kOmitFirst1 = 12,
  kOmitFirst2 = 13,
  kOmitFirst3 = 14,
  kOmitFirst4 = 15,
  kOmitFirst5 = 16,
  kOmitFirst6 = 17,
  kOmitFirst7 = 18,
  kOmitFirst8 = 19,
  kOmitFirst9 = 20
};
44 | ||
/* One row of the transform table: a prefix, a word edit and a suffix. */
typedef struct {
  /* Offset into kPrefixSuffix of the NUL-terminated prefix string. */
  const uint8_t prefix_id;
  /* Word edit to apply; compared against WordTransformType values. */
  const uint8_t transform;
  /* Offset into kPrefixSuffix of the NUL-terminated suffix string. */
  const uint8_t suffix_id;
} Transform;
50 | ||
/* All prefix/suffix strings packed back to back, each NUL-terminated.
   The number after each fragment is the byte offset at which it starts;
   those offsets are the values of the kPFix_* constants. */
static const char kPrefixSuffix[208] =
    "\0"          /*   0 */
    " \0"         /*   1 */
    ", \0"        /*   3 */
    " of the \0"  /*   6 (tails reused: 9 = " the ", 12 = "e ") */
    " of \0"      /*  15 */
    "s \0"        /*  20 */
    ".\0"         /*  23 */
    " and \0"     /*  25 */
    " in \0"      /*  31 */
    "\"\0"        /*  36 */
    " to \0"      /*  38 */
    "\">\0"       /*  43 */
    "\n\0"        /*  46 */
    ". \0"        /*  48 */
    "]\0"         /*  51 */
    " for \0"     /*  53 */
    " a \0"       /*  59 */
    " that \0"    /*  63 */
    "\'\0"        /*  70 */
    " with \0"    /*  72 */
    " from \0"    /*  79 */
    " by \0"      /*  86 */
    "(\0"         /*  91 */
    ". The \0"    /*  93 */
    " on \0"      /* 100 */
    " as \0"      /* 105 */
    " is \0"      /* 110 */
    "ing \0"      /* 115 */
    "\n\t\0"      /* 120 */
    ":\0"         /* 123 */
    "ed \0"       /* 125 */
    "=\"\0"       /* 129 */
    " at \0"      /* 132 */
    "ly \0"       /* 137 */
    ",\0"         /* 141 */
    "=\'\0"       /* 143 */
    ".com/\0"     /* 146 */
    ". This \0"   /* 152 */
    " not \0"     /* 160 */
    "er \0"       /* 166 */
    "al \0"       /* 170 */
    "ful \0"      /* 174 */
    "ive \0"      /* 179 */
    "less \0"     /* 184 */
    "est \0"      /* 190 */
    "ize \0"      /* 195 */
    "\xc2\xa0\0"  /* 200 */
    "ous ";       /* 203; the literal's own terminator is byte 207 */
56 | ||
/* Byte offsets of the individual affix strings inside kPrefixSuffix.
   Naming legend:
     EMPTY   = ""
     SP      = " "
     DQUOT   = "\""
     SQUOT   = "'"
     CLOSEBR = "]"
     OPEN    = "("
     SLASH   = "/"
     NBSP    = non-breaking space "\xc2\xa0"
   The trailing comment on each constant shows the string found at that
   offset. */
enum {
  kPFix_EMPTY = 0,          /* ""         */
  kPFix_SP = 1,             /* " "        */
  kPFix_COMMASP = 3,        /* ", "       */
  kPFix_SPofSPtheSP = 6,    /* " of the " */
  kPFix_SPtheSP = 9,        /* " the "    */
  kPFix_eSP = 12,           /* "e "       */
  kPFix_SPofSP = 15,        /* " of "     */
  kPFix_sSP = 20,           /* "s "       */
  kPFix_DOT = 23,           /* "."        */
  kPFix_SPandSP = 25,       /* " and "    */
  kPFix_SPinSP = 31,        /* " in "     */
  kPFix_DQUOT = 36,         /* "\""       */
  kPFix_SPtoSP = 38,        /* " to "     */
  kPFix_DQUOTGT = 43,       /* "\">"      */
  kPFix_NEWLINE = 46,       /* "\n"       */
  kPFix_DOTSP = 48,         /* ". "       */
  kPFix_CLOSEBR = 51,       /* "]"        */
  kPFix_SPforSP = 53,       /* " for "    */
  kPFix_SPaSP = 59,         /* " a "      */
  kPFix_SPthatSP = 63,      /* " that "   */
  kPFix_SQUOT = 70,         /* "'"        */
  kPFix_SPwithSP = 72,      /* " with "   */
  kPFix_SPfromSP = 79,      /* " from "   */
  kPFix_SPbySP = 86,        /* " by "     */
  kPFix_OPEN = 91,          /* "("        */
  kPFix_DOTSPTheSP = 93,    /* ". The "   */
  kPFix_SPonSP = 100,       /* " on "     */
  kPFix_SPasSP = 105,       /* " as "     */
  kPFix_SPisSP = 110,       /* " is "     */
  kPFix_ingSP = 115,        /* "ing "     */
  kPFix_NEWLINETAB = 120,   /* "\n\t"     */
  kPFix_COLON = 123,        /* ":"        */
  kPFix_edSP = 125,         /* "ed "      */
  kPFix_EQDQUOT = 129,      /* "=\""      */
  kPFix_SPatSP = 132,       /* " at "     */
  kPFix_lySP = 137,         /* "ly "      */
  kPFix_COMMA = 141,        /* ","        */
  kPFix_EQSQUOT = 143,      /* "='"       */
  kPFix_DOTcomSLASH = 146,  /* ".com/"    */
  kPFix_DOTSPThisSP = 152,  /* ". This "  */
  kPFix_SPnotSP = 160,      /* " not "    */
  kPFix_erSP = 166,         /* "er "      */
  kPFix_alSP = 170,         /* "al "      */
  kPFix_fulSP = 174,        /* "ful "     */
  kPFix_iveSP = 179,        /* "ive "     */
  kPFix_lessSP = 184,       /* "less "    */
  kPFix_estSP = 190,        /* "est "     */
  kPFix_izeSP = 195,        /* "ize "     */
  kPFix_NBSP = 200,         /* "\xc2\xa0" */
  kPFix_ousSP = 203         /* "ous "     */
};
118 | ||
119 | ||
120 | static const Transform kTransforms[] = { | |
121 | { kPFix_EMPTY, kIdentity, kPFix_EMPTY }, | |
122 | { kPFix_EMPTY, kIdentity, kPFix_SP }, | |
123 | { kPFix_SP, kIdentity, kPFix_SP }, | |
124 | { kPFix_EMPTY, kOmitFirst1, kPFix_EMPTY }, | |
125 | { kPFix_EMPTY, kUppercaseFirst, kPFix_SP }, | |
126 | { kPFix_EMPTY, kIdentity, kPFix_SPtheSP }, | |
127 | { kPFix_SP, kIdentity, kPFix_EMPTY }, | |
128 | { kPFix_sSP, kIdentity, kPFix_SP }, | |
129 | { kPFix_EMPTY, kIdentity, kPFix_SPofSP }, | |
130 | { kPFix_EMPTY, kUppercaseFirst, kPFix_EMPTY }, | |
131 | { kPFix_EMPTY, kIdentity, kPFix_SPandSP }, | |
132 | { kPFix_EMPTY, kOmitFirst2, kPFix_EMPTY }, | |
133 | { kPFix_EMPTY, kOmitLast1, kPFix_EMPTY }, | |
134 | { kPFix_COMMASP, kIdentity, kPFix_SP }, | |
135 | { kPFix_EMPTY, kIdentity, kPFix_COMMASP }, | |
136 | { kPFix_SP, kUppercaseFirst, kPFix_SP }, | |
137 | { kPFix_EMPTY, kIdentity, kPFix_SPinSP }, | |
138 | { kPFix_EMPTY, kIdentity, kPFix_SPtoSP }, | |
139 | { kPFix_eSP, kIdentity, kPFix_SP }, | |
140 | { kPFix_EMPTY, kIdentity, kPFix_DQUOT }, | |
141 | { kPFix_EMPTY, kIdentity, kPFix_DOT }, | |
142 | { kPFix_EMPTY, kIdentity, kPFix_DQUOTGT }, | |
143 | { kPFix_EMPTY, kIdentity, kPFix_NEWLINE }, | |
144 | { kPFix_EMPTY, kOmitLast3, kPFix_EMPTY }, | |
145 | { kPFix_EMPTY, kIdentity, kPFix_CLOSEBR }, | |
146 | { kPFix_EMPTY, kIdentity, kPFix_SPforSP }, | |
147 | { kPFix_EMPTY, kOmitFirst3, kPFix_EMPTY }, | |
148 | { kPFix_EMPTY, kOmitLast2, kPFix_EMPTY }, | |
149 | { kPFix_EMPTY, kIdentity, kPFix_SPaSP }, | |
150 | { kPFix_EMPTY, kIdentity, kPFix_SPthatSP }, | |
151 | { kPFix_SP, kUppercaseFirst, kPFix_EMPTY }, | |
152 | { kPFix_EMPTY, kIdentity, kPFix_DOTSP }, | |
153 | { kPFix_DOT, kIdentity, kPFix_EMPTY }, | |
154 | { kPFix_SP, kIdentity, kPFix_COMMASP }, | |
155 | { kPFix_EMPTY, kOmitFirst4, kPFix_EMPTY }, | |
156 | { kPFix_EMPTY, kIdentity, kPFix_SPwithSP }, | |
157 | { kPFix_EMPTY, kIdentity, kPFix_SQUOT }, | |
158 | { kPFix_EMPTY, kIdentity, kPFix_SPfromSP }, | |
159 | { kPFix_EMPTY, kIdentity, kPFix_SPbySP }, | |
160 | { kPFix_EMPTY, kOmitFirst5, kPFix_EMPTY }, | |
161 | { kPFix_EMPTY, kOmitFirst6, kPFix_EMPTY }, | |
162 | { kPFix_SPtheSP, kIdentity, kPFix_EMPTY }, | |
163 | { kPFix_EMPTY, kOmitLast4, kPFix_EMPTY }, | |
164 | { kPFix_EMPTY, kIdentity, kPFix_DOTSPTheSP }, | |
165 | { kPFix_EMPTY, kUppercaseAll, kPFix_EMPTY }, | |
166 | { kPFix_EMPTY, kIdentity, kPFix_SPonSP }, | |
167 | { kPFix_EMPTY, kIdentity, kPFix_SPasSP }, | |
168 | { kPFix_EMPTY, kIdentity, kPFix_SPisSP }, | |
169 | { kPFix_EMPTY, kOmitLast7, kPFix_EMPTY }, | |
170 | { kPFix_EMPTY, kOmitLast1, kPFix_ingSP }, | |
171 | { kPFix_EMPTY, kIdentity, kPFix_NEWLINETAB }, | |
172 | { kPFix_EMPTY, kIdentity, kPFix_COLON }, | |
173 | { kPFix_SP, kIdentity, kPFix_DOTSP }, | |
174 | { kPFix_EMPTY, kIdentity, kPFix_edSP }, | |
175 | { kPFix_EMPTY, kOmitFirst9, kPFix_EMPTY }, | |
176 | { kPFix_EMPTY, kOmitFirst7, kPFix_EMPTY }, | |
177 | { kPFix_EMPTY, kOmitLast6, kPFix_EMPTY }, | |
178 | { kPFix_EMPTY, kIdentity, kPFix_OPEN }, | |
179 | { kPFix_EMPTY, kUppercaseFirst, kPFix_COMMASP }, | |
180 | { kPFix_EMPTY, kOmitLast8, kPFix_EMPTY }, | |
181 | { kPFix_EMPTY, kIdentity, kPFix_SPatSP }, | |
182 | { kPFix_EMPTY, kIdentity, kPFix_lySP }, | |
183 | { kPFix_SPtheSP, kIdentity, kPFix_SPofSP }, | |
184 | { kPFix_EMPTY, kOmitLast5, kPFix_EMPTY }, | |
185 | { kPFix_EMPTY, kOmitLast9, kPFix_EMPTY }, | |
186 | { kPFix_SP, kUppercaseFirst, kPFix_COMMASP }, | |
187 | { kPFix_EMPTY, kUppercaseFirst, kPFix_DQUOT }, | |
188 | { kPFix_DOT, kIdentity, kPFix_OPEN }, | |
189 | { kPFix_EMPTY, kUppercaseAll, kPFix_SP }, | |
190 | { kPFix_EMPTY, kUppercaseFirst, kPFix_DQUOTGT }, | |
191 | { kPFix_EMPTY, kIdentity, kPFix_EQDQUOT }, | |
192 | { kPFix_SP, kIdentity, kPFix_DOT }, | |
193 | { kPFix_DOTcomSLASH, kIdentity, kPFix_EMPTY }, | |
194 | { kPFix_SPtheSP, kIdentity, kPFix_SPofSPtheSP }, | |
195 | { kPFix_EMPTY, kUppercaseFirst, kPFix_SQUOT }, | |
196 | { kPFix_EMPTY, kIdentity, kPFix_DOTSPThisSP }, | |
197 | { kPFix_EMPTY, kIdentity, kPFix_COMMA }, | |
198 | { kPFix_DOT, kIdentity, kPFix_SP }, | |
199 | { kPFix_EMPTY, kUppercaseFirst, kPFix_OPEN }, | |
200 | { kPFix_EMPTY, kUppercaseFirst, kPFix_DOT }, | |
201 | { kPFix_EMPTY, kIdentity, kPFix_SPnotSP }, | |
202 | { kPFix_SP, kIdentity, kPFix_EQDQUOT }, | |
203 | { kPFix_EMPTY, kIdentity, kPFix_erSP }, | |
204 | { kPFix_SP, kUppercaseAll, kPFix_SP }, | |
205 | { kPFix_EMPTY, kIdentity, kPFix_alSP }, | |
206 | { kPFix_SP, kUppercaseAll, kPFix_EMPTY }, | |
207 | { kPFix_EMPTY, kIdentity, kPFix_EQSQUOT }, | |
208 | { kPFix_EMPTY, kUppercaseAll, kPFix_DQUOT }, | |
209 | { kPFix_EMPTY, kUppercaseFirst, kPFix_DOTSP }, | |
210 | { kPFix_SP, kIdentity, kPFix_OPEN }, | |
211 | { kPFix_EMPTY, kIdentity, kPFix_fulSP }, | |
212 | { kPFix_SP, kUppercaseFirst, kPFix_DOTSP }, | |
213 | { kPFix_EMPTY, kIdentity, kPFix_iveSP }, | |
214 | { kPFix_EMPTY, kIdentity, kPFix_lessSP }, | |
215 | { kPFix_EMPTY, kUppercaseAll, kPFix_SQUOT }, | |
216 | { kPFix_EMPTY, kIdentity, kPFix_estSP }, | |
217 | { kPFix_SP, kUppercaseFirst, kPFix_DOT }, | |
218 | { kPFix_EMPTY, kUppercaseAll, kPFix_DQUOTGT }, | |
219 | { kPFix_SP, kIdentity, kPFix_EQSQUOT }, | |
220 | { kPFix_EMPTY, kUppercaseFirst, kPFix_COMMA }, | |
221 | { kPFix_EMPTY, kIdentity, kPFix_izeSP }, | |
222 | { kPFix_EMPTY, kUppercaseAll, kPFix_DOT }, | |
223 | { kPFix_NBSP, kIdentity, kPFix_EMPTY }, | |
224 | { kPFix_SP, kIdentity, kPFix_COMMA }, | |
225 | { kPFix_EMPTY, kUppercaseFirst, kPFix_EQDQUOT }, | |
226 | { kPFix_EMPTY, kUppercaseAll, kPFix_EQDQUOT }, | |
227 | { kPFix_EMPTY, kIdentity, kPFix_ousSP }, | |
228 | { kPFix_EMPTY, kUppercaseAll, kPFix_COMMASP }, | |
229 | { kPFix_EMPTY, kUppercaseFirst, kPFix_EQSQUOT }, | |
230 | { kPFix_SP, kUppercaseFirst, kPFix_COMMA }, | |
231 | { kPFix_SP, kUppercaseAll, kPFix_EQDQUOT }, | |
232 | { kPFix_SP, kUppercaseAll, kPFix_COMMASP }, | |
233 | { kPFix_EMPTY, kUppercaseAll, kPFix_COMMA }, | |
234 | { kPFix_EMPTY, kUppercaseAll, kPFix_OPEN }, | |
235 | { kPFix_EMPTY, kUppercaseAll, kPFix_DOTSP }, | |
236 | { kPFix_SP, kUppercaseAll, kPFix_DOT }, | |
237 | { kPFix_EMPTY, kUppercaseAll, kPFix_EQSQUOT }, | |
238 | { kPFix_SP, kUppercaseAll, kPFix_DOTSP }, | |
239 | { kPFix_SP, kUppercaseFirst, kPFix_EQDQUOT }, | |
240 | { kPFix_SP, kUppercaseAll, kPFix_EQSQUOT }, | |
241 | { kPFix_SP, kUppercaseFirst, kPFix_EQSQUOT }, | |
242 | }; | |
243 | ||
244 | static const int kNumTransforms = sizeof(kTransforms) / sizeof(kTransforms[0]); | |
245 | ||
/* Applies the format's simplified "uppercase" step, in place, to the character
   that starts at p, and returns the number of bytes that character is treated
   as occupying (1, 2 or 3). The width is chosen from the lead byte alone:
     lead <  0xc0 : 1 byte;  'a'..'z' is flipped to 'A'..'Z', else untouched
     lead <  0xe0 : 2 bytes; bit 5 of the second byte is toggled
     otherwise    : 3 bytes; the third byte is XORed with 5
   No bounds are checked, so p must have that many accessible bytes. */
static int ToUpperCase(uint8_t *p) {
  const uint8_t lead = p[0];

  if (lead >= 0xe0) {
    /* An arbitrary transform for three byte characters. */
    p[2] ^= 5;
    return 3;
  }
  if (lead >= 0xc0) {
    /* An overly simplified uppercasing model for utf-8. */
    p[1] ^= 32;
    return 2;
  }
  if (lead >= 'a' && lead <= 'z') {
    p[0] = (uint8_t)(lead ^ 32);
  }
  return 1;
}
262 | ||
263 | static BROTLI_NOINLINE int TransformDictionaryWord( | |
264 | uint8_t* dst, const uint8_t* word, int len, int transform) { | |
265 | int idx = 0; | |
266 | { | |
267 | const char* prefix = &kPrefixSuffix[kTransforms[transform].prefix_id]; | |
268 | while (*prefix) { dst[idx++] = (uint8_t)*prefix++; } | |
269 | } | |
270 | { | |
271 | const int t = kTransforms[transform].transform; | |
272 | int skip = t < kOmitFirst1 ? 0 : t - (kOmitFirst1 - 1); | |
273 | int i = 0; | |
274 | uint8_t* uppercase; | |
275 | if (skip > len) { | |
276 | skip = len; | |
277 | } | |
278 | word += skip; | |
279 | len -= skip; | |
280 | if (t <= kOmitLast9) { | |
281 | len -= t; | |
282 | } | |
283 | while (i < len) { dst[idx++] = word[i++]; } | |
284 | uppercase = &dst[idx - len]; | |
285 | if (t == kUppercaseFirst) { | |
286 | ToUpperCase(uppercase); | |
287 | } else if (t == kUppercaseAll) { | |
288 | while (len > 0) { | |
289 | int step = ToUpperCase(uppercase); | |
290 | uppercase += step; | |
291 | len -= step; | |
292 | } | |
293 | } | |
294 | } | |
295 | { | |
296 | const char* suffix = &kPrefixSuffix[kTransforms[transform].suffix_id]; | |
297 | while (*suffix) { dst[idx++] = (uint8_t)*suffix++; } | |
298 | return idx; | |
299 | } | |
300 | } | |
301 | ||
302 | #if defined(__cplusplus) || defined(c_plusplus) | |
303 | } /* extern "C" */ | |
304 | #endif | |
305 | ||
306 | #endif /* BROTLI_DEC_TRANSFORM_H_ */ |