File: | libs/apr-util/xml/expat/lib/xmltok.c |
Location: | line 1252, column 17 |
Description: | Assigned value is garbage or undefined |
1 | /* | |||
2 | Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd | |||
3 | See the file COPYING for copying permission. | |||
4 | */ | |||
5 | ||||
6 | static char RCSId[] | |||
7 | = "$Header: /home/cvs/apr-util/xml/expat/lib/xmltok.c,v 1.1 2001/02/28 14:41:26 gstein Exp $"; | |||
8 | ||||
9 | #ifdef COMPILED_FROM_DSP | |||
10 | # include "winconfig.h" | |||
11 | #else | |||
12 | # include <config.h> | |||
13 | #endif /* ndef COMPILED_FROM_DSP */ | |||
14 | ||||
15 | #include "xmltok.h" | |||
16 | #include "nametab.h" | |||
17 | ||||
/* Extra scanner-table entry that is only present when DTD support is
   compiled in; otherwise it expands to nothing. */
#ifdef XML_DTD
#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif
23 | ||||
/* Initializer fragment for the leading members of an ENCODING: the scanner
   tables, the literal scanners and the name/position helpers, each taken
   from the implementation selected by the current PREFIX. */
#define VTABLE1 \
  { PREFIX(prologTok), PREFIX(contentTok), \
    PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  PREFIX(sameName), \
  PREFIX(nameMatchesAscii), \
  PREFIX(nameLength), \
  PREFIX(skipS), \
  PREFIX(getAtts), \
  PREFIX(charRefNumber), \
  PREFIX(predefinedEntityName), \
  PREFIX(updatePosition), \
  PREFIX(isPublicId)
37 | ||||
/* VTABLE1 followed by the converters of the same PREFIX. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
39 | ||||
/* Test one bit of namingBitmap for a 16-bit character given as high and
   low byte: pages[hi] selects a group of 8 words, the top 3 bits of lo
   select the word, and the low 5 bits of lo select the bit. */
#define UCS2_GET_NAMING(pages, hi, lo) \
   (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))
42 | ||||
/* A 2 byte UTF-8 representation splits the characters 11 bits
between the bottom 5 and 6 bits of the bytes.
We need 8 bits to index into pages, 3 bits to add to that index and
5 bits to generate the mask. */
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                      + ((((byte)[0]) & 3) << 1) \
                      + ((((byte)[1]) >> 5) & 1)] \
         & (1 << (((byte)[1]) & 0x1F)))
52 | ||||
/* A 3 byte UTF-8 representation splits the characters 16 bits
between the bottom 4, 6 and 6 bits of the bytes.
We need 8 bits to index into pages, 3 bits to add to that index and
5 bits to generate the mask. */
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                             + ((((byte)[1]) >> 2) & 0xF)] \
                       << 3) \
                      + ((((byte)[1]) & 3) << 1) \
                      + ((((byte)[2]) >> 5) & 1)] \
         & (1 << (((byte)[2]) & 0x1F)))
64 | ||||
/* Dispatch on the sequence length n: 2 and 3 byte sequences are looked up
   in the naming bitmap, any other length yields 0. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
  ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
  : ((n) == 3 \
     ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
     : 0))
71 | ||||
/* Nonzero for the 3-byte sequences this tokenizer rejects: a 0xED lead
   whose second byte has bit 0x20 set, and 0xEF 0xBF followed by 0xBF or
   0xBE.  p must point at unsigned bytes. */
#define UTF8_INVALID3(p) \
  (*(p) == 0xED \
  ? (((p)[1] & 0x20) != 0) \
  : (*(p) == 0xEF \
     ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
     : 0))
78 | ||||
/* Nonzero for a 4-byte sequence with lead 0xF4 whose second byte has any
   of the 0x30 bits set.  p must point at unsigned bytes. */
#define UTF8_INVALID4(p) (*(p) == 0xF4 && ((p)[1] & 0x30) != 0)
80 | ||||
81 | static | |||
82 | int isNever(const ENCODING *enc, const char *p) | |||
83 | { | |||
84 | return 0; | |||
85 | } | |||
86 | ||||
87 | static | |||
88 | int utf8_isName2(const ENCODING *enc, const char *p) | |||
89 | { | |||
90 | return UTF8_GET_NAMING2(namePages, (const unsigned char *)p)(namingBitmap[((namePages)[((((const unsigned char *)p)[0]) >> 2) & 7] << 3) + (((((const unsigned char *)p)[0]) & 3) << 1) + (((((const unsigned char *)p)[1]) >> 5 ) & 1)] & (1 << ((((const unsigned char *)p)[1] ) & 0x1F))); | |||
91 | } | |||
92 | ||||
93 | static | |||
94 | int utf8_isName3(const ENCODING *enc, const char *p) | |||
95 | { | |||
96 | return UTF8_GET_NAMING3(namePages, (const unsigned char *)p)(namingBitmap[((namePages)[(((((const unsigned char *)p)[0]) & 0xF) << 4) + (((((const unsigned char *)p)[1]) >> 2) & 0xF)] << 3) + (((((const unsigned char *)p)[1 ]) & 3) << 1) + (((((const unsigned char *)p)[2]) >> 5) & 1)] & (1 << ((((const unsigned char *)p)[ 2]) & 0x1F))); | |||
97 | } | |||
98 | ||||
/* The 4-byte name predicate is an alias for isNever. */
#define utf8_isName4 isNever
100 | ||||
101 | static | |||
102 | int utf8_isNmstrt2(const ENCODING *enc, const char *p) | |||
103 | { | |||
104 | return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p)(namingBitmap[((nmstrtPages)[((((const unsigned char *)p)[0]) >> 2) & 7] << 3) + (((((const unsigned char * )p)[0]) & 3) << 1) + (((((const unsigned char *)p)[ 1]) >> 5) & 1)] & (1 << ((((const unsigned char *)p)[1]) & 0x1F))); | |||
105 | } | |||
106 | ||||
107 | static | |||
108 | int utf8_isNmstrt3(const ENCODING *enc, const char *p) | |||
109 | { | |||
110 | return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p)(namingBitmap[((nmstrtPages)[(((((const unsigned char *)p)[0] ) & 0xF) << 4) + (((((const unsigned char *)p)[1]) >> 2) & 0xF)] << 3) + (((((const unsigned char *)p)[1 ]) & 3) << 1) + (((((const unsigned char *)p)[2]) >> 5) & 1)] & (1 << ((((const unsigned char *)p)[ 2]) & 0x1F))); | |||
111 | } | |||
112 | ||||
/* The 4-byte name-start predicate and the 2-byte invalid-sequence
   predicate are both aliases for isNever. */
#define utf8_isNmstrt4 isNever

#define utf8_isInvalid2 isNever
116 | ||||
117 | static | |||
118 | int utf8_isInvalid3(const ENCODING *enc, const char *p) | |||
119 | { | |||
120 | return UTF8_INVALID3((const unsigned char *)p)((*(const unsigned char *)p) == 0xED ? ((((const unsigned char *)p)[1] & 0x20) != 0) : ((*(const unsigned char *)p) == 0xEF ? (((const unsigned char *)p)[1] == 0xBF && (((const unsigned char *)p)[2] == 0xBF || ((const unsigned char *)p)[ 2] == 0xBE)) : 0)); | |||
121 | } | |||
122 | ||||
123 | static | |||
124 | int utf8_isInvalid4(const ENCODING *enc, const char *p) | |||
125 | { | |||
126 | return UTF8_INVALID4((const unsigned char *)p)((*(const unsigned char *)p) == 0xF4 && (((const unsigned char *)p)[1] & 0x30) != 0); | |||
127 | } | |||
128 | ||||
129 | struct normal_encoding { | |||
130 | ENCODING enc; | |||
131 | unsigned char type[256]; | |||
132 | #ifdef XML_MIN_SIZE | |||
133 | int (*byteType)(const ENCODING *, const char *); | |||
134 | int (*isNameMin)(const ENCODING *, const char *); | |||
135 | int (*isNmstrtMin)(const ENCODING *, const char *); | |||
136 | int (*byteToAscii)(const ENCODING *, const char *); | |||
137 | int (*charMatches)(const ENCODING *, const char *, int); | |||
138 | #endif /* XML_MIN_SIZE */ | |||
139 | int (*isName2)(const ENCODING *, const char *); | |||
140 | int (*isName3)(const ENCODING *, const char *); | |||
141 | int (*isName4)(const ENCODING *, const char *); | |||
142 | int (*isNmstrt2)(const ENCODING *, const char *); | |||
143 | int (*isNmstrt3)(const ENCODING *, const char *); | |||
144 | int (*isNmstrt4)(const ENCODING *, const char *); | |||
145 | int (*isInvalid2)(const ENCODING *, const char *); | |||
146 | int (*isInvalid3)(const ENCODING *, const char *); | |||
147 | int (*isInvalid4)(const ENCODING *, const char *); | |||
148 | }; | |||
149 | ||||
/* Initializers for the XML_MIN_SIZE-only members of struct normal_encoding,
   built by pasting prefix E onto each member name.  Expands to nothing in
   the normal build, where those members do not exist. */
#ifdef XML_MIN_SIZE

#define STANDARD_VTABLE(E) \
 E ## byteType, \
 E ## isNameMin, \
 E ## isNmstrtMin, \
 E ## byteToAscii, \
 E ## charMatches,

#else

#define STANDARD_VTABLE(E) /* as nothing */

#endif
164 | ||||
/* Initializers for the nine multi-byte predicate members of
   struct normal_encoding, built by pasting prefix E onto each name. */
#define NORMAL_VTABLE(E) \
 E ## isName2, \
 E ## isName3, \
 E ## isName4, \
 E ## isNmstrt2, \
 E ## isNmstrt3, \
 E ## isNmstrt4, \
 E ## isInvalid2, \
 E ## isInvalid3, \
 E ## isInvalid4
175 | ||||
176 | static int checkCharRefNumber(int); | |||
177 | ||||
178 | #include "xmltok_impl.h" | |||
179 | #include "ascii.h" | |||
180 | ||||
#ifdef XML_MIN_SIZE
/* The sb_ minimum-width name predicates are aliases for isNever. */
#define sb_isNameMin isNever
#define sb_isNmstrtMin isNever
#endif

#ifdef XML_MIN_SIZE
/* Size-optimized build: read the width from the encoding object. */
#define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#define MINBPC(enc) 1
#endif
192 | ||||
/* Byte-type code of the byte at p, looked up in the encoding's type table. */
#define SB_BYTE_TYPE(enc, p) \
  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

#ifdef XML_MIN_SIZE
/* Function form of SB_BYTE_TYPE, suitable for the byteType slot. */
static
int sb_byteType(const ENCODING *enc, const char *p)
{
  return SB_BYTE_TYPE(enc, p);
}
#define BYTE_TYPE(enc, p) \
 (((const struct normal_encoding *)(enc))->byteType(enc, p))
#else
#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
207 | ||||
/* Value of the character at p for comparison with ASCII: dispatched
   through the encoding in the size-optimized build, otherwise the byte
   itself. */
#ifdef XML_MIN_SIZE
#define BYTE_TO_ASCII(enc, p) \
 (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
static
int sb_byteToAscii(const ENCODING *enc, const char *p)
{
  return *p;
}
#else
#define BYTE_TO_ASCII(enc, p) (*(p))
#endif
219 | ||||
/* Call the n-byte predicate stored in the encoding (n is pasted onto the
   member name, so it must be a literal 2, 3 or 4). */
#define IS_NAME_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isName ## n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
#define IS_INVALID_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))

/* Minimum-width variants: dispatched through the encoding in the
   size-optimized build, constant 0 otherwise. */
#ifdef XML_MIN_SIZE
#define IS_NAME_CHAR_MINBPC(enc, p) \
 (((const struct normal_encoding *)(enc))->isNameMin(enc, p))
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
 (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
#else
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif
236 | ||||
/* Nonzero if the character at p equals the ASCII character c. */
#ifdef XML_MIN_SIZE
#define CHAR_MATCHES(enc, p, c) \
 (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
static
int sb_charMatches(const ENCODING *enc, const char *p, int c)
{
  return *p == c;
}
#else
/* c is an ASCII character */
/* c is parenthesized so that an expression argument cannot rebind
   around the ==. */
#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
249 | ||||
250 | #define PREFIX(ident) normal_ ## ident | |||
251 | #include "xmltok_impl.c" | |||
252 | ||||
253 | #undef MINBPC | |||
254 | #undef BYTE_TYPE | |||
255 | #undef BYTE_TO_ASCII | |||
256 | #undef CHAR_MATCHES | |||
257 | #undef IS_NAME_CHAR | |||
258 | #undef IS_NAME_CHAR_MINBPC | |||
259 | #undef IS_NMSTRT_CHAR | |||
260 | #undef IS_NMSTRT_CHAR_MINBPC | |||
261 | #undef IS_INVALID_CHAR | |||
262 | ||||
enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
  UTF8_cval1 = 0x00,
  UTF8_cval2 = 0xc0,
  UTF8_cval3 = 0xe0,
  UTF8_cval4 = 0xf0
};
269 | ||||
270 | static | |||
271 | void utf8_toUtf8(const ENCODING *enc, | |||
272 | const char **fromP, const char *fromLim, | |||
273 | char **toP, const char *toLim) | |||
274 | { | |||
275 | char *to; | |||
276 | const char *from; | |||
277 | if (fromLim - *fromP > toLim - *toP) { | |||
278 | /* Avoid copying partial characters. */ | |||
279 | for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--) | |||
280 | if (((unsigned char)fromLim[-1] & 0xc0) != 0x80) | |||
281 | break; | |||
282 | } | |||
283 | for (to = *toP, from = *fromP; from != fromLim; from++, to++) | |||
284 | *to = *from; | |||
285 | *fromP = from; | |||
286 | *toP = to; | |||
287 | } | |||
288 | ||||
289 | static | |||
290 | void utf8_toUtf16(const ENCODING *enc, | |||
291 | const char **fromP, const char *fromLim, | |||
292 | unsigned short **toP, const unsigned short *toLim) | |||
293 | { | |||
294 | unsigned short *to = *toP; | |||
295 | const char *from = *fromP; | |||
296 | while (from != fromLim && to != toLim) { | |||
297 | switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) { | |||
298 | case BT_LEAD2: | |||
299 | *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f); | |||
300 | from += 2; | |||
301 | break; | |||
302 | case BT_LEAD3: | |||
303 | *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f); | |||
304 | from += 3; | |||
305 | break; | |||
306 | case BT_LEAD4: | |||
307 | { | |||
308 | unsigned long n; | |||
309 | if (to + 1 == toLim) | |||
310 | break; | |||
311 | n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f); | |||
312 | n -= 0x10000; | |||
313 | to[0] = (unsigned short)((n >> 10) | 0xD800); | |||
314 | to[1] = (unsigned short)((n & 0x3FF) | 0xDC00); | |||
315 | to += 2; | |||
316 | from += 4; | |||
317 | } | |||
318 | break; | |||
319 | default: | |||
320 | *to++ = *from++; | |||
321 | break; | |||
322 | } | |||
323 | } | |||
324 | *fromP = from; | |||
325 | *toP = to; | |||
326 | } | |||
327 | ||||
#ifdef XML_NS
/* UTF-8 encoding object for the XML_NS build.  The byte-type table is
   assembled from asciitab.h and utf8tab.h, included unmodified. */
static const struct normal_encoding utf8_encoding_ns = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include "asciitab.h"
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
#endif
338 | ||||
339 | static const struct normal_encoding utf8_encoding = { | |||
340 | { VTABLE1{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, | |||
341 | { | |||
342 | #define BT_COLON BT_NMSTRT | |||
343 | #include "asciitab.h" | |||
344 | #undef BT_COLON | |||
345 | #include "utf8tab.h" | |||
346 | }, | |||
347 | STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)utf8_isName2, utf8_isName3, isNever, utf8_isNmstrt2, utf8_isNmstrt3 , isNever, isNever, utf8_isInvalid3, utf8_isInvalid4 | |||
348 | }; | |||
349 | ||||
#ifdef XML_NS

/* Same as utf8_encoding_ns except that the ASCII part of the byte-type
   table comes from iasciitab.h. */
static const struct normal_encoding internal_utf8_encoding_ns = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include "iasciitab.h"
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};

#endif
362 | ||||
363 | static const struct normal_encoding internal_utf8_encoding = { | |||
364 | { VTABLE1{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, | |||
365 | { | |||
366 | #define BT_COLON BT_NMSTRT | |||
367 | #include "iasciitab.h" | |||
368 | #undef BT_COLON | |||
369 | #include "utf8tab.h" | |||
370 | }, | |||
371 | STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)utf8_isName2, utf8_isName3, isNever, utf8_isNmstrt2, utf8_isNmstrt3 , isNever, isNever, utf8_isInvalid3, utf8_isInvalid4 | |||
372 | }; | |||
373 | ||||
374 | static | |||
375 | void latin1_toUtf8(const ENCODING *enc, | |||
376 | const char **fromP, const char *fromLim, | |||
377 | char **toP, const char *toLim) | |||
378 | { | |||
379 | for (;;) { | |||
380 | unsigned char c; | |||
381 | if (*fromP == fromLim) | |||
382 | break; | |||
383 | c = (unsigned char)**fromP; | |||
384 | if (c & 0x80) { | |||
385 | if (toLim - *toP < 2) | |||
386 | break; | |||
387 | *(*toP)++ = ((c >> 6) | UTF8_cval2); | |||
388 | *(*toP)++ = ((c & 0x3f) | 0x80); | |||
389 | (*fromP)++; | |||
390 | } | |||
391 | else { | |||
392 | if (*toP == toLim) | |||
393 | break; | |||
394 | *(*toP)++ = *(*fromP)++; | |||
395 | } | |||
396 | } | |||
397 | } | |||
398 | ||||
399 | static | |||
400 | void latin1_toUtf16(const ENCODING *enc, | |||
401 | const char **fromP, const char *fromLim, | |||
402 | unsigned short **toP, const unsigned short *toLim) | |||
403 | { | |||
404 | while (*fromP != fromLim && *toP != toLim) | |||
405 | *(*toP)++ = (unsigned char)*(*fromP)++; | |||
406 | } | |||
407 | ||||
#ifdef XML_NS

/* Latin-1 encoding object for the XML_NS build.  Only the
   STANDARD_VTABLE members are initialized after the type table; the
   remaining predicate members are left zero-initialized. */
static const struct normal_encoding latin1_encoding_ns = {
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(sb_)
};

#endif
420 | ||||
421 | static const struct normal_encoding latin1_encoding = { | |||
422 | { VTABLE1{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, | |||
423 | { | |||
424 | #define BT_COLON BT_NMSTRT | |||
425 | #include "asciitab.h" | |||
426 | #undef BT_COLON | |||
427 | #include "latin1tab.h" | |||
428 | }, | |||
429 | STANDARD_VTABLE(sb_) | |||
430 | }; | |||
431 | ||||
432 | static | |||
433 | void ascii_toUtf8(const ENCODING *enc, | |||
434 | const char **fromP, const char *fromLim, | |||
435 | char **toP, const char *toLim) | |||
436 | { | |||
437 | while (*fromP != fromLim && *toP != toLim) | |||
438 | *(*toP)++ = *(*fromP)++; | |||
439 | } | |||
440 | ||||
#ifdef XML_NS

/* ASCII encoding object for the XML_NS build.  Only the first half of the
   type table is supplied; the rest is zero-initialized, i.e. BT_NONXML. */
static const struct normal_encoding ascii_encoding_ns = {
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  {
#include "asciitab.h"
/* BT_NONXML == 0 */
  },
  STANDARD_VTABLE(sb_)
};

#endif
453 | ||||
454 | static const struct normal_encoding ascii_encoding = { | |||
455 | { VTABLE1{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, | |||
456 | { | |||
457 | #define BT_COLON BT_NMSTRT | |||
458 | #include "asciitab.h" | |||
459 | #undef BT_COLON | |||
460 | /* BT_NONXML == 0 */ | |||
461 | }, | |||
462 | STANDARD_VTABLE(sb_) | |||
463 | }; | |||
464 | ||||
465 | static int unicode_byte_type(char hi, char lo) | |||
466 | { | |||
467 | switch ((unsigned char)hi) { | |||
468 | case 0xD8: case 0xD9: case 0xDA: case 0xDB: | |||
469 | return BT_LEAD4; | |||
470 | case 0xDC: case 0xDD: case 0xDE: case 0xDF: | |||
471 | return BT_TRAIL; | |||
472 | case 0xFF: | |||
473 | switch ((unsigned char)lo) { | |||
474 | case 0xFF: | |||
475 | case 0xFE: | |||
476 | return BT_NONXML; | |||
477 | } | |||
478 | break; | |||
479 | } | |||
480 | return BT_NONASCII; | |||
481 | } | |||
482 | ||||
/* Define E##toUtf8: convert 2-byte units from [*fromP, fromLim) to UTF-8 in
   [*toP, toLim).  Byte order comes from the GET_LO/GET_HI macros in effect
   where the macro is instantiated.  On insufficient output space *fromP is
   set to the unit that did not fit and the function returns.
   NOTE(review): the surrogate case reads the following unit without
   comparing against fromLim; presumably callers pass whole characters --
   confirm. */
#define DEFINE_UTF16_TO_UTF8(E) \
static \
void E ## toUtf8(const ENCODING *enc, \
                 const char **fromP, const char *fromLim, \
                 char **toP, const char *toLim) \
{ \
  const char *from; \
  for (from = *fromP; from != fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim -  *toP < 2) { \
        *fromP = from; \
        return; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim -  *toP < 3)  { \
        *fromP = from; \
        return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      if (toLim -  *toP < 4) { \
        *fromP = from; \
        return; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
}
545 | ||||
/* Define E##toUtf16: copy 2-byte units from [*fromP, fromLim) into 16-bit
   output units in [*toP, toLim), assembling each unit with the
   GET_HI/GET_LO macros in effect where the macro is instantiated. */
#define DEFINE_UTF16_TO_UTF16(E) \
static \
void E ## toUtf16(const ENCODING *enc, \
                  const char **fromP, const char *fromLim, \
                  unsigned short **toP, const unsigned short *toLim) \
{ \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
}
559 | ||||
560 | #define SET2(ptr, ch) \ | |||
561 | (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8))) | |||
562 | #define GET_LO(ptr) ((unsigned char)(ptr)[0]) | |||
563 | #define GET_HI(ptr) ((unsigned char)(ptr)[1]) | |||
564 | ||||
565 | DEFINE_UTF16_TO_UTF8(little2_)static void little2_toUtf8(const ENCODING *enc, const char ** fromP, const char *fromLim, char **toP, const char *toLim) { const char *from; for (from = *fromP; from != fromLim; from += 2) { int plane; unsigned char lo2; unsigned char lo = GET_LO(from ); unsigned char hi = GET_HI(from); switch (hi) { case 0: if ( lo < 0x80) { if (*toP == toLim) { *fromP = from; return; } *(*toP)++ = lo; break; } case 0x1: case 0x2: case 0x3: case 0x4 : case 0x5: case 0x6: case 0x7: if (toLim - *toP < 2) { *fromP = from; return; } *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); *(*toP)++ = ((lo & 0x3f) | 0x80); break ; default: if (toLim - *toP < 3) { *fromP = from; return; } *(*toP)++ = ((hi >> 4) | UTF8_cval3); *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); *(*toP)++ = ((lo & 0x3f) | 0x80); break; case 0xD8: case 0xD9: case 0xDA : case 0xDB: if (toLim - *toP < 4) { *fromP = from; return ; } plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; *(*toP)++ = ((plane >> 2) | UTF8_cval4); *( *toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); from += 2; lo2 = GET_LO(from); *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2 ) | (lo2 >> 6) | 0x80); *(*toP)++ = ((lo2 & 0x3f) | 0x80); break; } } *fromP = from; } | |||
566 | DEFINE_UTF16_TO_UTF16(little2_)static void little2_toUtf16(const ENCODING *enc, const char * *fromP, const char *fromLim, unsigned short **toP, const unsigned short *toLim) { if (fromLim - *fromP > ((toLim - *toP) << 1) && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) fromLim -= 2; for (; *fromP != fromLim && *toP != toLim; *fromP += 2) *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP ); } | |||
567 | ||||
568 | #undef SET2 | |||
569 | #undef GET_LO | |||
570 | #undef GET_HI | |||
571 | ||||
572 | #define SET2(ptr, ch) \ | |||
573 | (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF))) | |||
574 | #define GET_LO(ptr) ((unsigned char)(ptr)[1]) | |||
575 | #define GET_HI(ptr) ((unsigned char)(ptr)[0]) | |||
576 | ||||
577 | DEFINE_UTF16_TO_UTF8(big2_)static void big2_toUtf8(const ENCODING *enc, const char **fromP , const char *fromLim, char **toP, const char *toLim) { const char *from; for (from = *fromP; from != fromLim; from += 2) { int plane; unsigned char lo2; unsigned char lo = GET_LO(from ); unsigned char hi = GET_HI(from); switch (hi) { case 0: if ( lo < 0x80) { if (*toP == toLim) { *fromP = from; return; } *(*toP)++ = lo; break; } case 0x1: case 0x2: case 0x3: case 0x4 : case 0x5: case 0x6: case 0x7: if (toLim - *toP < 2) { *fromP = from; return; } *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); *(*toP)++ = ((lo & 0x3f) | 0x80); break ; default: if (toLim - *toP < 3) { *fromP = from; return; } *(*toP)++ = ((hi >> 4) | UTF8_cval3); *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); *(*toP)++ = ((lo & 0x3f) | 0x80); break; case 0xD8: case 0xD9: case 0xDA : case 0xDB: if (toLim - *toP < 4) { *fromP = from; return ; } plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; *(*toP)++ = ((plane >> 2) | UTF8_cval4); *( *toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); from += 2; lo2 = GET_LO(from); *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2 ) | (lo2 >> 6) | 0x80); *(*toP)++ = ((lo2 & 0x3f) | 0x80); break; } } *fromP = from; } | |||
578 | DEFINE_UTF16_TO_UTF16(big2_)static void big2_toUtf16(const ENCODING *enc, const char **fromP , const char *fromLim, unsigned short **toP, const unsigned short *toLim) { if (fromLim - *fromP > ((toLim - *toP) << 1) && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) fromLim -= 2; for (; *fromP != fromLim && *toP != toLim; *fromP += 2) *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP ); } | |||
579 | ||||
580 | #undef SET2 | |||
581 | #undef GET_LO | |||
582 | #undef GET_HI | |||
583 | ||||
584 | #define LITTLE2_BYTE_TYPE(enc, p)((p)[1] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] : unicode_byte_type((p)[1], (p)[0])) \ | |||
585 | ((p)[1] == 0 \ | |||
586 | ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \ | |||
587 | : unicode_byte_type((p)[1], (p)[0])) | |||
588 | #define LITTLE2_BYTE_TO_ASCII(enc, p)((p)[1] == 0 ? (p)[0] : -1) ((p)[1] == 0 ? (p)[0] : -1) | |||
589 | #define LITTLE2_CHAR_MATCHES(enc, p, c)((p)[1] == 0 && (p)[0] == c) ((p)[1] == 0 && (p)[0] == c) | |||
590 | #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)(namingBitmap[(namePages[(unsigned char)p[1]] << 3) + ( ((unsigned char)p[0]) >> 5)] & (1 << (((unsigned char)p[0]) & 0x1F))) \ | |||
591 | UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])(namingBitmap[(namePages[(unsigned char)p[1]] << 3) + ( ((unsigned char)p[0]) >> 5)] & (1 << (((unsigned char)p[0]) & 0x1F))) | |||
592 | #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)(namingBitmap[(nmstrtPages[(unsigned char)p[1]] << 3) + (((unsigned char)p[0]) >> 5)] & (1 << (((unsigned char)p[0]) & 0x1F))) \ | |||
593 | UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])(namingBitmap[(nmstrtPages[(unsigned char)p[1]] << 3) + (((unsigned char)p[0]) >> 5)] & (1 << (((unsigned char)p[0]) & 0x1F))) | |||
594 | ||||
595 | #ifdef XML_MIN_SIZE | |||
596 | ||||
597 | static | |||
598 | int little2_byteType(const ENCODING *enc, const char *p) | |||
599 | { | |||
600 | return LITTLE2_BYTE_TYPE(enc, p)((p)[1] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] : unicode_byte_type((p)[1], (p)[0])); | |||
601 | } | |||
602 | ||||
603 | static | |||
604 | int little2_byteToAscii(const ENCODING *enc, const char *p) | |||
605 | { | |||
606 | return LITTLE2_BYTE_TO_ASCII(enc, p)((p)[1] == 0 ? (p)[0] : -1); | |||
607 | } | |||
608 | ||||
609 | static | |||
610 | int little2_charMatches(const ENCODING *enc, const char *p, int c) | |||
611 | { | |||
612 | return LITTLE2_CHAR_MATCHES(enc, p, c)((p)[1] == 0 && (p)[0] == c); | |||
613 | } | |||
614 | ||||
615 | static | |||
616 | int little2_isNameMin(const ENCODING *enc, const char *p) | |||
617 | { | |||
618 | return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)(namingBitmap[(namePages[(unsigned char)p[1]] << 3) + ( ((unsigned char)p[0]) >> 5)] & (1 << (((unsigned char)p[0]) & 0x1F))); | |||
619 | } | |||
620 | ||||
621 | static | |||
622 | int little2_isNmstrtMin(const ENCODING *enc, const char *p) | |||
623 | { | |||
624 | return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)(namingBitmap[(nmstrtPages[(unsigned char)p[1]] << 3) + (((unsigned char)p[0]) >> 5)] & (1 << (((unsigned char)p[0]) & 0x1F))); | |||
625 | } | |||
626 | ||||
627 | #undef VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16) | |||
628 | #define VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16) VTABLE1{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), little2_toUtf8, little2_toUtf16 | |||
629 | ||||
630 | #else /* not XML_MIN_SIZE */ | |||
631 | ||||
632 | #undef PREFIX | |||
633 | #define PREFIX(ident) little2_ ## ident | |||
634 | #define MINBPC(enc) 2 | |||
635 | /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ | |||
636 | #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)((p)[1] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] : unicode_byte_type((p)[1], (p)[0])) | |||
637 | #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)((p)[1] == 0 ? (p)[0] : -1) | |||
638 | #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)((p)[1] == 0 && (p)[0] == c) | |||
639 | #define IS_NAME_CHAR(enc, p, n) 0 | |||
640 | #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)(namingBitmap[(namePages[(unsigned char)p[1]] << 3) + ( ((unsigned char)p[0]) >> 5)] & (1 << (((unsigned char)p[0]) & 0x1F))) | |||
641 | #define IS_NMSTRT_CHAR(enc, p, n) (0) | |||
642 | #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)(namingBitmap[(nmstrtPages[(unsigned char)p[1]] << 3) + (((unsigned char)p[0]) >> 5)] & (1 << (((unsigned char)p[0]) & 0x1F))) | |||
643 | ||||
644 | #include "xmltok_impl.c" | |||
645 | ||||
646 | #undef MINBPC | |||
647 | #undef BYTE_TYPE | |||
648 | #undef BYTE_TO_ASCII | |||
649 | #undef CHAR_MATCHES | |||
650 | #undef IS_NAME_CHAR | |||
651 | #undef IS_NAME_CHAR_MINBPC | |||
652 | #undef IS_NMSTRT_CHAR | |||
653 | #undef IS_NMSTRT_CHAR_MINBPC | |||
654 | #undef IS_INVALID_CHAR | |||
655 | ||||
656 | #endif /* not XML_MIN_SIZE */ | |||
657 | ||||
658 | #ifdef XML_NS | |||
659 | ||||
660 | static const struct normal_encoding little2_encoding_ns = { | |||
661 | { VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16), 2, 0, | |||
662 | #if XML_BYTE_ORDER12 == 12 | |||
663 | 1 | |||
664 | #else | |||
665 | 0 | |||
666 | #endif | |||
667 | }, | |||
668 | { | |||
669 | #include "asciitab.h" | |||
670 | #include "latin1tab.h" | |||
671 | }, | |||
672 | STANDARD_VTABLE(little2_) | |||
673 | }; | |||
674 | ||||
675 | #endif | |||
676 | ||||
677 | static const struct normal_encoding little2_encoding = { | |||
678 | { VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16), 2, 0, | |||
679 | #if XML_BYTE_ORDER12 == 12 | |||
680 | 1 | |||
681 | #else | |||
682 | 0 | |||
683 | #endif | |||
684 | }, | |||
685 | { | |||
686 | #define BT_COLON BT_NMSTRT | |||
687 | #include "asciitab.h" | |||
688 | #undef BT_COLON | |||
689 | #include "latin1tab.h" | |||
690 | }, | |||
691 | STANDARD_VTABLE(little2_) | |||
692 | }; | |||
693 | ||||
694 | #if XML_BYTE_ORDER12 != 21 | |||
695 | ||||
696 | #ifdef XML_NS | |||
697 | ||||
698 | static const struct normal_encoding internal_little2_encoding_ns = { | |||
699 | { VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16), 2, 0, 1 }, | |||
700 | { | |||
701 | #include "iasciitab.h" | |||
702 | #include "latin1tab.h" | |||
703 | }, | |||
704 | STANDARD_VTABLE(little2_) | |||
705 | }; | |||
706 | ||||
707 | #endif | |||
708 | ||||
709 | static const struct normal_encoding internal_little2_encoding = { | |||
710 | { VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16), 2, 0, 1 }, | |||
711 | { | |||
712 | #define BT_COLON BT_NMSTRT | |||
713 | #include "iasciitab.h" | |||
714 | #undef BT_COLON | |||
715 | #include "latin1tab.h" | |||
716 | }, | |||
717 | STANDARD_VTABLE(little2_) | |||
718 | }; | |||
719 | ||||
720 | #endif | |||
721 | ||||
722 | ||||
723 | #define BIG2_BYTE_TYPE(enc, p)((p)[0] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] : unicode_byte_type((p)[0], (p)[1])) \ | |||
724 | ((p)[0] == 0 \ | |||
725 | ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \ | |||
726 | : unicode_byte_type((p)[0], (p)[1])) | |||
727 | #define BIG2_BYTE_TO_ASCII(enc, p)((p)[0] == 0 ? (p)[1] : -1) ((p)[0] == 0 ? (p)[1] : -1) | |||
728 | #define BIG2_CHAR_MATCHES(enc, p, c)((p)[0] == 0 && (p)[1] == c) ((p)[0] == 0 && (p)[1] == c) | |||
729 | #define BIG2_IS_NAME_CHAR_MINBPC(enc, p)(namingBitmap[(namePages[(unsigned char)p[0]] << 3) + ( ((unsigned char)p[1]) >> 5)] & (1 << (((unsigned char)p[1]) & 0x1F))) \ | |||
730 | UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])(namingBitmap[(namePages[(unsigned char)p[0]] << 3) + ( ((unsigned char)p[1]) >> 5)] & (1 << (((unsigned char)p[1]) & 0x1F))) | |||
731 | #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)(namingBitmap[(nmstrtPages[(unsigned char)p[0]] << 3) + (((unsigned char)p[1]) >> 5)] & (1 << (((unsigned char)p[1]) & 0x1F))) \ | |||
732 | UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])(namingBitmap[(nmstrtPages[(unsigned char)p[0]] << 3) + (((unsigned char)p[1]) >> 5)] & (1 << (((unsigned char)p[1]) & 0x1F))) | |||
733 | ||||
734 | #ifdef XML_MIN_SIZE | |||
735 | ||||
736 | static | |||
737 | int big2_byteType(const ENCODING *enc, const char *p) | |||
738 | { | |||
739 | return BIG2_BYTE_TYPE(enc, p)((p)[0] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] : unicode_byte_type((p)[0], (p)[1])); | |||
740 | } | |||
741 | ||||
742 | static | |||
743 | int big2_byteToAscii(const ENCODING *enc, const char *p) | |||
744 | { | |||
745 | return BIG2_BYTE_TO_ASCII(enc, p)((p)[0] == 0 ? (p)[1] : -1); | |||
746 | } | |||
747 | ||||
748 | static | |||
749 | int big2_charMatches(const ENCODING *enc, const char *p, int c) | |||
750 | { | |||
751 | return BIG2_CHAR_MATCHES(enc, p, c)((p)[0] == 0 && (p)[1] == c); | |||
752 | } | |||
753 | ||||
754 | static | |||
755 | int big2_isNameMin(const ENCODING *enc, const char *p) | |||
756 | { | |||
757 | return BIG2_IS_NAME_CHAR_MINBPC(enc, p)(namingBitmap[(namePages[(unsigned char)p[0]] << 3) + ( ((unsigned char)p[1]) >> 5)] & (1 << (((unsigned char)p[1]) & 0x1F))); | |||
758 | } | |||
759 | ||||
760 | static | |||
761 | int big2_isNmstrtMin(const ENCODING *enc, const char *p) | |||
762 | { | |||
763 | return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)(namingBitmap[(nmstrtPages[(unsigned char)p[0]] << 3) + (((unsigned char)p[1]) >> 5)] & (1 << (((unsigned char)p[1]) & 0x1F))); | |||
764 | } | |||
765 | ||||
766 | #undef VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16) | |||
767 | #define VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16) VTABLE1{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), big2_toUtf8, big2_toUtf16 | |||
768 | ||||
769 | #else /* not XML_MIN_SIZE */ | |||
770 | ||||
771 | #undef PREFIX | |||
772 | #define PREFIX(ident) big2_ ## ident | |||
773 | #define MINBPC(enc) 2 | |||
774 | /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ | |||
775 | #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)((p)[0] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] : unicode_byte_type((p)[0], (p)[1])) | |||
776 | #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)((p)[0] == 0 ? (p)[1] : -1) | |||
777 | #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)((p)[0] == 0 && (p)[1] == c) | |||
778 | #define IS_NAME_CHAR(enc, p, n) 0 | |||
779 | #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)(namingBitmap[(namePages[(unsigned char)p[0]] << 3) + ( ((unsigned char)p[1]) >> 5)] & (1 << (((unsigned char)p[1]) & 0x1F))) | |||
780 | #define IS_NMSTRT_CHAR(enc, p, n) (0) | |||
781 | #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)(namingBitmap[(nmstrtPages[(unsigned char)p[0]] << 3) + (((unsigned char)p[1]) >> 5)] & (1 << (((unsigned char)p[1]) & 0x1F))) | |||
782 | ||||
783 | #include "xmltok_impl.c" | |||
784 | ||||
785 | #undef MINBPC | |||
786 | #undef BYTE_TYPE | |||
787 | #undef BYTE_TO_ASCII | |||
788 | #undef CHAR_MATCHES | |||
789 | #undef IS_NAME_CHAR | |||
790 | #undef IS_NAME_CHAR_MINBPC | |||
791 | #undef IS_NMSTRT_CHAR | |||
792 | #undef IS_NMSTRT_CHAR_MINBPC | |||
793 | #undef IS_INVALID_CHAR | |||
794 | ||||
795 | #endif /* not XML_MIN_SIZE */ | |||
796 | ||||
797 | #ifdef XML_NS | |||
798 | ||||
799 | static const struct normal_encoding big2_encoding_ns = { | |||
800 | { VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16), 2, 0, | |||
801 | #if XML_BYTE_ORDER12 == 21 | |||
802 | 1 | |||
803 | #else | |||
804 | 0 | |||
805 | #endif | |||
806 | }, | |||
807 | { | |||
808 | #include "asciitab.h" | |||
809 | #include "latin1tab.h" | |||
810 | }, | |||
811 | STANDARD_VTABLE(big2_) | |||
812 | }; | |||
813 | ||||
814 | #endif | |||
815 | ||||
816 | static const struct normal_encoding big2_encoding = { | |||
817 | { VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16), 2, 0, | |||
818 | #if XML_BYTE_ORDER12 == 21 | |||
819 | 1 | |||
820 | #else | |||
821 | 0 | |||
822 | #endif | |||
823 | }, | |||
824 | { | |||
825 | #define BT_COLON BT_NMSTRT | |||
826 | #include "asciitab.h" | |||
827 | #undef BT_COLON | |||
828 | #include "latin1tab.h" | |||
829 | }, | |||
830 | STANDARD_VTABLE(big2_) | |||
831 | }; | |||
832 | ||||
833 | #if XML_BYTE_ORDER12 != 12 | |||
834 | ||||
835 | #ifdef XML_NS | |||
836 | ||||
837 | static const struct normal_encoding internal_big2_encoding_ns = { | |||
838 | { VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16), 2, 0, 1 }, | |||
839 | { | |||
840 | #include "iasciitab.h" | |||
841 | #include "latin1tab.h" | |||
842 | }, | |||
843 | STANDARD_VTABLE(big2_) | |||
844 | }; | |||
845 | ||||
846 | #endif | |||
847 | ||||
848 | static const struct normal_encoding internal_big2_encoding = { | |||
849 | { VTABLE{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok ) , PREFIX(ignoreSectionTok) }, { PREFIX(attributeValueTok), PREFIX (entityValueTok) }, PREFIX(sameName), PREFIX(nameMatchesAscii ), PREFIX(nameLength), PREFIX(skipS), PREFIX(getAtts), PREFIX (charRefNumber), PREFIX(predefinedEntityName), PREFIX(updatePosition ), PREFIX(isPublicId), PREFIX(toUtf8), PREFIX(toUtf16), 2, 0, 1 }, | |||
850 | { | |||
851 | #define BT_COLON BT_NMSTRT | |||
852 | #include "iasciitab.h" | |||
853 | #undef BT_COLON | |||
854 | #include "latin1tab.h" | |||
855 | }, | |||
856 | STANDARD_VTABLE(big2_) | |||
857 | }; | |||
858 | ||||
859 | #endif | |||
860 | ||||
861 | #undef PREFIX | |||
862 | ||||
863 | static | |||
864 | int streqci(const char *s1, const char *s2) | |||
865 | { | |||
866 | for (;;) { | |||
867 | char c1 = *s1++; | |||
868 | char c2 = *s2++; | |||
869 | if (ASCII_a0x61 <= c1 && c1 <= ASCII_z0x7A) | |||
870 | c1 += ASCII_A0x41 - ASCII_a0x61; | |||
871 | if (ASCII_a0x61 <= c2 && c2 <= ASCII_z0x7A) | |||
872 | c2 += ASCII_A0x41 - ASCII_a0x61; | |||
873 | if (c1 != c2) | |||
874 | return 0; | |||
875 | if (!c1) | |||
876 | break; | |||
877 | } | |||
878 | return 1; | |||
879 | } | |||
880 | ||||
881 | static | |||
882 | void initUpdatePosition(const ENCODING *enc, const char *ptr, | |||
883 | const char *end, POSITION *pos) | |||
884 | { | |||
885 | normal_updatePosition(&utf8_encoding.enc, ptr, end, pos); | |||
886 | } | |||
887 | ||||
888 | static | |||
889 | int toAscii(const ENCODING *enc, const char *ptr, const char *end) | |||
890 | { | |||
891 | char buf[1]; | |||
892 | char *p = buf; | |||
893 | XmlUtf8Convert(enc, &ptr, end, &p, p + 1)(((enc)->utf8Convert)(enc, &ptr, end, &p, p + 1)); | |||
894 | if (p == buf) | |||
895 | return -1; | |||
896 | else | |||
897 | return buf[0]; | |||
898 | } | |||
899 | ||||
/* Returns 1 when c is one of the four white-space codes 0x20, 0xD, 0xA or
   0x9, and 0 for every other value. */
static
int isSpace(int c)
{
  if (c == 0x20 || c == 0xD || c == 0xA || c == 0x9)
    return 1;
  return 0;
}
912 | ||||
913 | /* Return 1 if there's just optional white space | |||
914 | or there's an S followed by name=val. */ | |||
915 | static | |||
916 | int parsePseudoAttribute(const ENCODING *enc, | |||
917 | const char *ptr, | |||
918 | const char *end, | |||
919 | const char **namePtr, | |||
920 | const char **nameEndPtr, | |||
921 | const char **valPtr, | |||
922 | const char **nextTokPtr) | |||
923 | { | |||
924 | int c; | |||
925 | char open; | |||
926 | if (ptr == end) { | |||
927 | *namePtr = 0; | |||
928 | return 1; | |||
929 | } | |||
930 | if (!isSpace(toAscii(enc, ptr, end))) { | |||
931 | *nextTokPtr = ptr; | |||
932 | return 0; | |||
933 | } | |||
934 | do { | |||
935 | ptr += enc->minBytesPerChar; | |||
936 | } while (isSpace(toAscii(enc, ptr, end))); | |||
937 | if (ptr == end) { | |||
938 | *namePtr = 0; | |||
939 | return 1; | |||
940 | } | |||
941 | *namePtr = ptr; | |||
942 | for (;;) { | |||
943 | c = toAscii(enc, ptr, end); | |||
944 | if (c == -1) { | |||
945 | *nextTokPtr = ptr; | |||
946 | return 0; | |||
947 | } | |||
948 | if (c == ASCII_EQUALS0x3D) { | |||
949 | *nameEndPtr = ptr; | |||
950 | break; | |||
951 | } | |||
952 | if (isSpace(c)) { | |||
953 | *nameEndPtr = ptr; | |||
954 | do { | |||
955 | ptr += enc->minBytesPerChar; | |||
956 | } while (isSpace(c = toAscii(enc, ptr, end))); | |||
957 | if (c != ASCII_EQUALS0x3D) { | |||
958 | *nextTokPtr = ptr; | |||
959 | return 0; | |||
960 | } | |||
961 | break; | |||
962 | } | |||
963 | ptr += enc->minBytesPerChar; | |||
964 | } | |||
965 | if (ptr == *namePtr) { | |||
966 | *nextTokPtr = ptr; | |||
967 | return 0; | |||
968 | } | |||
969 | ptr += enc->minBytesPerChar; | |||
970 | c = toAscii(enc, ptr, end); | |||
971 | while (isSpace(c)) { | |||
972 | ptr += enc->minBytesPerChar; | |||
973 | c = toAscii(enc, ptr, end); | |||
974 | } | |||
975 | if (c != ASCII_QUOT0x22 && c != ASCII_APOS0x27) { | |||
976 | *nextTokPtr = ptr; | |||
977 | return 0; | |||
978 | } | |||
979 | open = c; | |||
980 | ptr += enc->minBytesPerChar; | |||
981 | *valPtr = ptr; | |||
982 | for (;; ptr += enc->minBytesPerChar) { | |||
983 | c = toAscii(enc, ptr, end); | |||
984 | if (c == open) | |||
985 | break; | |||
986 | if (!(ASCII_a0x61 <= c && c <= ASCII_z0x7A) | |||
987 | && !(ASCII_A0x41 <= c && c <= ASCII_Z0x5A) | |||
988 | && !(ASCII_00x30 <= c && c <= ASCII_90x39) | |||
989 | && c != ASCII_PERIOD0x2E | |||
990 | && c != ASCII_MINUS0x2D | |||
991 | && c != ASCII_UNDERSCORE0x5F) { | |||
992 | *nextTokPtr = ptr; | |||
993 | return 0; | |||
994 | } | |||
995 | } | |||
996 | *nextTokPtr = ptr + enc->minBytesPerChar; | |||
997 | return 1; | |||
998 | } | |||
999 | ||||
1000 | static const char KW_version[] = { | |||
1001 | ASCII_v0x76, ASCII_e0x65, ASCII_r0x72, ASCII_s0x73, ASCII_i0x69, ASCII_o0x6F, ASCII_n0x6E, '\0' | |||
1002 | }; | |||
1003 | ||||
1004 | static const char KW_encoding[] = { | |||
1005 | ASCII_e0x65, ASCII_n0x6E, ASCII_c0x63, ASCII_o0x6F, ASCII_d0x64, ASCII_i0x69, ASCII_n0x6E, ASCII_g0x67, '\0' | |||
1006 | }; | |||
1007 | ||||
1008 | static const char KW_standalone[] = { | |||
1009 | ASCII_s0x73, ASCII_t0x74, ASCII_a0x61, ASCII_n0x6E, ASCII_d0x64, ASCII_a0x61, ASCII_l0x6C, ASCII_o0x6F, ASCII_n0x6E, ASCII_e0x65, '\0' | |||
1010 | }; | |||
1011 | ||||
1012 | static const char KW_yes[] = { | |||
1013 | ASCII_y0x79, ASCII_e0x65, ASCII_s0x73, '\0' | |||
1014 | }; | |||
1015 | ||||
1016 | static const char KW_no[] = { | |||
1017 | ASCII_n0x6E, ASCII_o0x6F, '\0' | |||
1018 | }; | |||
1019 | ||||
1020 | static | |||
1021 | int doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, | |||
1022 | const char *, | |||
1023 | const char *), | |||
1024 | int isGeneralTextEntity, | |||
1025 | const ENCODING *enc, | |||
1026 | const char *ptr, | |||
1027 | const char *end, | |||
1028 | const char **badPtr, | |||
1029 | const char **versionPtr, | |||
1030 | const char **versionEndPtr, | |||
1031 | const char **encodingName, | |||
1032 | const ENCODING **encoding, | |||
1033 | int *standalone) | |||
1034 | { | |||
1035 | const char *val = 0; | |||
1036 | const char *name = 0; | |||
1037 | const char *nameEnd = 0; | |||
1038 | ptr += 5 * enc->minBytesPerChar; | |||
1039 | end -= 2 * enc->minBytesPerChar; | |||
1040 | if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) || !name) { | |||
1041 | *badPtr = ptr; | |||
1042 | return 0; | |||
1043 | } | |||
1044 | if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)(((enc)->nameMatchesAscii)(enc, name, nameEnd, KW_version) )) { | |||
1045 | if (!isGeneralTextEntity) { | |||
1046 | *badPtr = name; | |||
1047 | return 0; | |||
1048 | } | |||
1049 | } | |||
1050 | else { | |||
1051 | if (versionPtr) | |||
1052 | *versionPtr = val; | |||
1053 | if (versionEndPtr) | |||
1054 | *versionEndPtr = ptr; | |||
1055 | if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { | |||
1056 | *badPtr = ptr; | |||
1057 | return 0; | |||
1058 | } | |||
1059 | if (!name) { | |||
1060 | if (isGeneralTextEntity) { | |||
1061 | /* a TextDecl must have an EncodingDecl */ | |||
1062 | *badPtr = ptr; | |||
1063 | return 0; | |||
1064 | } | |||
1065 | return 1; | |||
1066 | } | |||
1067 | } | |||
1068 | if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)(((enc)->nameMatchesAscii)(enc, name, nameEnd, KW_encoding ))) { | |||
1069 | int c = toAscii(enc, val, end); | |||
1070 | if (!(ASCII_a0x61 <= c && c <= ASCII_z0x7A) && !(ASCII_A0x41 <= c && c <= ASCII_Z0x5A)) { | |||
1071 | *badPtr = val; | |||
1072 | return 0; | |||
1073 | } | |||
1074 | if (encodingName) | |||
1075 | *encodingName = val; | |||
1076 | if (encoding) | |||
1077 | *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar); | |||
1078 | if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) { | |||
1079 | *badPtr = ptr; | |||
1080 | return 0; | |||
1081 | } | |||
1082 | if (!name) | |||
1083 | return 1; | |||
1084 | } | |||
1085 | if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)(((enc)->nameMatchesAscii)(enc, name, nameEnd, KW_standalone )) || isGeneralTextEntity) { | |||
1086 | *badPtr = name; | |||
1087 | return 0; | |||
1088 | } | |||
1089 | if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)(((enc)->nameMatchesAscii)(enc, val, ptr - enc->minBytesPerChar , KW_yes))) { | |||
1090 | if (standalone) | |||
1091 | *standalone = 1; | |||
1092 | } | |||
1093 | else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)(((enc)->nameMatchesAscii)(enc, val, ptr - enc->minBytesPerChar , KW_no))) { | |||
1094 | if (standalone) | |||
1095 | *standalone = 0; | |||
1096 | } | |||
1097 | else { | |||
1098 | *badPtr = val; | |||
1099 | return 0; | |||
1100 | } | |||
1101 | while (isSpace(toAscii(enc, ptr, end))) | |||
1102 | ptr += enc->minBytesPerChar; | |||
1103 | if (ptr != end) { | |||
1104 | *badPtr = ptr; | |||
1105 | return 0; | |||
1106 | } | |||
1107 | return 1; | |||
1108 | } | |||
1109 | ||||
1110 | static | |||
1111 | int checkCharRefNumber(int result) | |||
1112 | { | |||
1113 | switch (result >> 8) { | |||
1114 | case 0xD8: case 0xD9: case 0xDA: case 0xDB: | |||
1115 | case 0xDC: case 0xDD: case 0xDE: case 0xDF: | |||
1116 | return -1; | |||
1117 | case 0: | |||
1118 | if (latin1_encoding.type[result] == BT_NONXML) | |||
1119 | return -1; | |||
1120 | break; | |||
1121 | case 0xFF: | |||
1122 | if (result == 0xFFFE || result == 0xFFFF) | |||
1123 | return -1; | |||
1124 | break; | |||
1125 | } | |||
1126 | return result; | |||
1127 | } | |||
1128 | ||||
1129 | int XmlUtf8Encode(int c, char *buf) | |||
1130 | { | |||
1131 | enum { | |||
1132 | /* minN is minimum legal resulting value for N byte sequence */ | |||
1133 | min2 = 0x80, | |||
1134 | min3 = 0x800, | |||
1135 | min4 = 0x10000 | |||
1136 | }; | |||
1137 | ||||
1138 | if (c < 0) | |||
1139 | return 0; | |||
1140 | if (c < min2) { | |||
1141 | buf[0] = (c | UTF8_cval1); | |||
1142 | return 1; | |||
1143 | } | |||
1144 | if (c < min3) { | |||
1145 | buf[0] = ((c >> 6) | UTF8_cval2); | |||
1146 | buf[1] = ((c & 0x3f) | 0x80); | |||
1147 | return 2; | |||
1148 | } | |||
1149 | if (c < min4) { | |||
1150 | buf[0] = ((c >> 12) | UTF8_cval3); | |||
1151 | buf[1] = (((c >> 6) & 0x3f) | 0x80); | |||
1152 | buf[2] = ((c & 0x3f) | 0x80); | |||
1153 | return 3; | |||
1154 | } | |||
1155 | if (c < 0x110000) { | |||
1156 | buf[0] = ((c >> 18) | UTF8_cval4); | |||
1157 | buf[1] = (((c >> 12) & 0x3f) | 0x80); | |||
1158 | buf[2] = (((c >> 6) & 0x3f) | 0x80); | |||
1159 | buf[3] = ((c & 0x3f) | 0x80); | |||
1160 | return 4; | |||
1161 | } | |||
1162 | return 0; | |||
1163 | } | |||
1164 | ||||
/* Encode the character number charNum as UTF-16 into buf.  Returns the
   number of 16-bit units written: 1 for values below 0x10000, 2 (a
   0xD800-based unit followed by a 0xDC00-based unit) for values below
   0x110000, and 0 without touching buf for anything else. */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
  if (charNum < 0 || charNum >= 0x110000)
    return 0;

  if (charNum < 0x10000) {
    buf[0] = charNum;
    return 1;
  }

  {
    /* Split the 20-bit offset above 0x10000 into two 10-bit halves. */
    const int offset = charNum - 0x10000;
    buf[0] = (offset >> 10) + 0xD800;
    buf[1] = (offset & 0x3FF) + 0xDC00;
  }
  return 2;
}
1181 | ||||
/* Per-instance state for a caller-supplied encoding.
   normal   - embedded normal_encoding; the struct is also accessed through
              ENCODING / normal_encoding pointers by the unknown_* functions.
   convert  - callback that maps the byte sequence at p to a character
              number, invoked with userData as its first argument.
   userData - opaque pointer handed back to convert.
   utf16    - per-byte UTF-16 unit; 0 means "call convert instead".
   utf8     - per-byte UTF-8 form: element [0] is the byte count and the
              following elements are the bytes; a count of 0 means "call
              convert instead". */
1182 | struct unknown_encoding { | |||
1183 | struct normal_encoding normal; | |||
1184 | int (*convert)(void *userData, const char *p); | |||
1185 | void *userData; | |||
1186 | unsigned short utf16[256]; | |||
1187 | char utf8[256][4]; | |||
1188 | }; | |||
1189 | ||||
1190 | int XmlSizeOfUnknownEncoding(void) | |||
1191 | { | |||
1192 | return sizeof(struct unknown_encoding); | |||
1193 | } | |||
1194 | ||||
1195 | static | |||
1196 | int unknown_isName(const ENCODING *enc, const char *p) | |||
1197 | { | |||
1198 | int c = ((const struct unknown_encoding *)enc) | |||
1199 | ->convert(((const struct unknown_encoding *)enc)->userData, p); | |||
1200 | if (c & ~0xFFFF) | |||
1201 | return 0; | |||
1202 | return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF)(namingBitmap[(namePages[c >> 8] << 3) + ((c & 0xFF) >> 5)] & (1 << ((c & 0xFF) & 0x1F ))); | |||
1203 | } | |||
1204 | ||||
1205 | static | |||
1206 | int unknown_isNmstrt(const ENCODING *enc, const char *p) | |||
1207 | { | |||
1208 | int c = ((const struct unknown_encoding *)enc) | |||
1209 | ->convert(((const struct unknown_encoding *)enc)->userData, p); | |||
1210 | if (c & ~0xFFFF) | |||
1211 | return 0; | |||
1212 | return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF)(namingBitmap[(nmstrtPages[c >> 8] << 3) + ((c & 0xFF) >> 5)] & (1 << ((c & 0xFF) & 0x1F ))); | |||
1213 | } | |||
1214 | ||||
1215 | static | |||
1216 | int unknown_isInvalid(const ENCODING *enc, const char *p) | |||
1217 | { | |||
1218 | int c = ((const struct unknown_encoding *)enc) | |||
1219 | ->convert(((const struct unknown_encoding *)enc)->userData, p); | |||
1220 | return (c & ~0xFFFF) || checkCharRefNumber(c) < 0; | |||
1221 | } | |||
1222 | ||||
1223 | static | |||
1224 | void unknown_toUtf8(const ENCODING *enc, | |||
1225 | const char **fromP, const char *fromLim, | |||
1226 | char **toP, const char *toLim) | |||
1227 | { | |||
1228 | char buf[XML_UTF8_ENCODE_MAX4]; | |||
1229 | for (;;) { | |||
| ||||
1230 | const char *utf8; | |||
1231 | int n; | |||
1232 | if (*fromP == fromLim) | |||
1233 | break; | |||
1234 | utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP]; | |||
1235 | n = *utf8++; | |||
1236 | if (n == 0) { | |||
1237 | int c = ((const struct unknown_encoding *)enc) | |||
1238 | ->convert(((const struct unknown_encoding *)enc)->userData, *fromP); | |||
1239 | n = XmlUtf8Encode(c, buf); | |||
1240 | if (n > toLim - *toP) | |||
1241 | break; | |||
1242 | utf8 = buf; | |||
1243 | *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP] | |||
1244 | - (BT_LEAD2 - 2); | |||
1245 | } | |||
1246 | else { | |||
1247 | if (n > toLim - *toP) | |||
1248 | break; | |||
1249 | (*fromP)++; | |||
1250 | } | |||
1251 | do { | |||
1252 | *(*toP)++ = *utf8++; | |||
| ||||
1253 | } while (--n != 0); | |||
1254 | } | |||
1255 | } | |||
1256 | ||||
1257 | static | |||
1258 | void unknown_toUtf16(const ENCODING *enc, | |||
1259 | const char **fromP, const char *fromLim, | |||
1260 | unsigned short **toP, const unsigned short *toLim) | |||
1261 | { | |||
1262 | while (*fromP != fromLim && *toP != toLim) { | |||
1263 | unsigned short c | |||
1264 | = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP]; | |||
1265 | if (c == 0) { | |||
1266 | c = (unsigned short)((const struct unknown_encoding *)enc) | |||
1267 | ->convert(((const struct unknown_encoding *)enc)->userData, *fromP); | |||
1268 | *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP] | |||
1269 | - (BT_LEAD2 - 2); | |||
1270 | } | |||
1271 | else | |||
1272 | (*fromP)++; | |||
1273 | *(*toP)++ = c; | |||
1274 | } | |||
1275 | } | |||
1276 | ||||
1277 | ENCODING * | |||
1278 | XmlInitUnknownEncoding(void *mem, | |||
1279 | int *table, | |||
1280 | int (*convert)(void *userData, const char *p), | |||
1281 | void *userData) | |||
1282 | { | |||
1283 | int i; | |||
1284 | struct unknown_encoding *e = mem; | |||
1285 | for (i = 0; i < (int)sizeof(struct normal_encoding); i++) | |||
1286 | ((char *)mem)[i] = ((char *)&latin1_encoding)[i]; | |||
1287 | for (i = 0; i < 128; i++) | |||
1288 | if (latin1_encoding.type[i] != BT_OTHER | |||
1289 | && latin1_encoding.type[i] != BT_NONXML | |||
1290 | && table[i] != i) | |||
1291 | return 0; | |||
1292 | for (i = 0; i < 256; i++) { | |||
1293 | int c = table[i]; | |||
1294 | if (c == -1) { | |||
1295 | e->normal.type[i] = BT_MALFORM; | |||
1296 | /* This shouldn't really get used. */ | |||
1297 | e->utf16[i] = 0xFFFF; | |||
1298 | e->utf8[i][0] = 1; | |||
1299 | e->utf8[i][1] = 0; | |||
1300 | } | |||
1301 | else if (c < 0) { | |||
1302 | if (c < -4) | |||
1303 | return 0; | |||
1304 | e->normal.type[i] = BT_LEAD2 - (c + 2); | |||
1305 | e->utf8[i][0] = 0; | |||
1306 | e->utf16[i] = 0; | |||
1307 | } | |||
1308 | else if (c < 0x80) { | |||
1309 | if (latin1_encoding.type[c] != BT_OTHER | |||
1310 | && latin1_encoding.type[c] != BT_NONXML | |||
1311 | && c != i) | |||
1312 | return 0; | |||
1313 | e->normal.type[i] = latin1_encoding.type[c]; | |||
1314 | e->utf8[i][0] = 1; | |||
1315 | e->utf8[i][1] = (char)c; | |||
1316 | e->utf16[i] = c == 0 ? 0xFFFF : c; | |||
1317 | } | |||
1318 | else if (checkCharRefNumber(c) < 0) { | |||
1319 | e->normal.type[i] = BT_NONXML; | |||
1320 | /* This shouldn't really get used. */ | |||
1321 | e->utf16[i] = 0xFFFF; | |||
1322 | e->utf8[i][0] = 1; | |||
1323 | e->utf8[i][1] = 0; | |||
1324 | } | |||
1325 | else { | |||
1326 | if (c > 0xFFFF) | |||
1327 | return 0; | |||
1328 | if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff)(namingBitmap[(nmstrtPages[c >> 8] << 3) + ((c & 0xff) >> 5)] & (1 << ((c & 0xff) & 0x1F )))) | |||
1329 | e->normal.type[i] = BT_NMSTRT; | |||
1330 | else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff)(namingBitmap[(namePages[c >> 8] << 3) + ((c & 0xff) >> 5)] & (1 << ((c & 0xff) & 0x1F )))) | |||
1331 | e->normal.type[i] = BT_NAME; | |||
1332 | else | |||
1333 | e->normal.type[i] = BT_OTHER; | |||
1334 | e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1); | |||
1335 | e->utf16[i] = c; | |||
1336 | } | |||
1337 | } | |||
1338 | e->userData = userData; | |||
1339 | e->convert = convert; | |||
1340 | if (convert) { | |||
1341 | e->normal.isName2 = unknown_isName; | |||
1342 | e->normal.isName3 = unknown_isName; | |||
1343 | e->normal.isName4 = unknown_isName; | |||
1344 | e->normal.isNmstrt2 = unknown_isNmstrt; | |||
1345 | e->normal.isNmstrt3 = unknown_isNmstrt; | |||
1346 | e->normal.isNmstrt4 = unknown_isNmstrt; | |||
1347 | e->normal.isInvalid2 = unknown_isInvalid; | |||
1348 | e->normal.isInvalid3 = unknown_isInvalid; | |||
1349 | e->normal.isInvalid4 = unknown_isInvalid; | |||
1350 | } | |||
1351 | e->normal.enc.utf8Convert = unknown_toUtf8; | |||
1352 | e->normal.enc.utf16Convert = unknown_toUtf16; | |||
1353 | return &(e->normal.enc); | |||
1354 | } | |||
1355 | ||||
/* Indices of the built-in encodings.
   If this enumeration is changed, getEncodingIndex and encodings
   must also be changed. */
enum {
  UNKNOWN_ENC    = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC   = 1,
  UTF_8_ENC      = 2,
  UTF_16_ENC     = 3,
  UTF_16BE_ENC   = 4,
  UTF_16LE_ENC   = 5,
  /* must match encodingNames up to here */
  NO_ENC         = 6
};
1369 | ||||
1370 | static const char KW_ISO_8859_1[] = { | |||
1371 | ASCII_I0x49, ASCII_S0x53, ASCII_O0x4F, ASCII_MINUS0x2D, ASCII_80x38, ASCII_80x38, ASCII_50x35, ASCII_90x39, ASCII_MINUS0x2D, ASCII_10x31, '\0' | |||
1372 | }; | |||
1373 | static const char KW_US_ASCII[] = { | |||
1374 | ASCII_U0x55, ASCII_S0x53, ASCII_MINUS0x2D, ASCII_A0x41, ASCII_S0x53, ASCII_C0x43, ASCII_I0x49, ASCII_I0x49, '\0' | |||
1375 | }; | |||
1376 | static const char KW_UTF_8[] = { | |||
1377 | ASCII_U0x55, ASCII_T0x54, ASCII_F0x46, ASCII_MINUS0x2D, ASCII_80x38, '\0' | |||
1378 | }; | |||
1379 | static const char KW_UTF_16[] = { | |||
1380 | ASCII_U0x55, ASCII_T0x54, ASCII_F0x46, ASCII_MINUS0x2D, ASCII_10x31, ASCII_60x36, '\0' | |||
1381 | }; | |||
1382 | static const char KW_UTF_16BE[] = { | |||
1383 | ASCII_U0x55, ASCII_T0x54, ASCII_F0x46, ASCII_MINUS0x2D, ASCII_10x31, ASCII_60x36, ASCII_B0x42, ASCII_E0x45, '\0' | |||
1384 | }; | |||
1385 | static const char KW_UTF_16LE[] = { | |||
1386 | ASCII_U0x55, ASCII_T0x54, ASCII_F0x46, ASCII_MINUS0x2D, ASCII_10x31, ASCII_60x36, ASCII_L0x4C, ASCII_E0x45, '\0' | |||
1387 | }; | |||
1388 | ||||
1389 | static | |||
1390 | int getEncodingIndex(const char *name) | |||
1391 | { | |||
1392 | static const char *encodingNames[] = { | |||
1393 | KW_ISO_8859_1, | |||
1394 | KW_US_ASCII, | |||
1395 | KW_UTF_8, | |||
1396 | KW_UTF_16, | |||
1397 | KW_UTF_16BE, | |||
1398 | KW_UTF_16LE, | |||
1399 | }; | |||
1400 | int i; | |||
1401 | if (name == 0) | |||
1402 | return NO_ENC; | |||
1403 | for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++) | |||
1404 | if (streqci(name, encodingNames[i])) | |||
1405 | return i; | |||
1406 | return UNKNOWN_ENC; | |||
1407 | } | |||
1408 | ||||
/* For binary compatibility, we store the index of the encoding specified
   at initialization in the isUtf16 member.

   INIT_ENC_INDEX(enc) reads that index back as an int.
   SET_INIT_ENC_INDEX(enc, i) stores i, narrowed to char; the argument is
   parenthesised so that the cast applies to the whole expression and not
   just to its first operand. */

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1414 | ||||
1415 | /* This is what detects the encoding. | |||
1416 | encodingTable maps from encoding indices to encodings; | |||
1417 | INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding; | |||
1418 | state is XML_CONTENT_STATE if we're parsing an external text entity, | |||
1419 | and XML_PROLOG_STATE otherwise. | |||
1420 | */ | |||
1421 | ||||
1422 | ||||
1423 | static | |||
1424 | int initScan(const ENCODING **encodingTable, | |||
1425 | const INIT_ENCODING *enc, | |||
1426 | int state, | |||
1427 | const char *ptr, | |||
1428 | const char *end, | |||
1429 | const char **nextTokPtr) | |||
1430 | { | |||
1431 | const ENCODING **encPtr; | |||
1432 | ||||
1433 | if (ptr == end) | |||
1434 | return XML_TOK_NONE-4; | |||
1435 | encPtr = enc->encPtr; | |||
1436 | if (ptr + 1 == end) { | |||
1437 | /* only a single byte available for auto-detection */ | |||
1438 | #ifndef XML_DTD /* FIXME */ | |||
1439 | /* a well-formed document entity must have more than one byte */ | |||
1440 | if (state != XML_CONTENT_STATE1) | |||
1441 | return XML_TOK_PARTIAL-1; | |||
1442 | #endif | |||
1443 | /* so we're parsing an external text entity... */ | |||
1444 | /* if UTF-16 was externally specified, then we need at least 2 bytes */ | |||
1445 | switch (INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16)) { | |||
1446 | case UTF_16_ENC: | |||
1447 | case UTF_16LE_ENC: | |||
1448 | case UTF_16BE_ENC: | |||
1449 | return XML_TOK_PARTIAL-1; | |||
1450 | } | |||
1451 | switch ((unsigned char)*ptr) { | |||
1452 | case 0xFE: | |||
1453 | case 0xFF: | |||
1454 | case 0xEF: /* possibly first byte of UTF-8 BOM */ | |||
1455 | if (INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16) == ISO_8859_1_ENC | |||
1456 | && state == XML_CONTENT_STATE1) | |||
1457 | break; | |||
1458 | /* fall through */ | |||
1459 | case 0x00: | |||
1460 | case 0x3C: | |||
1461 | return XML_TOK_PARTIAL-1; | |||
1462 | } | |||
1463 | } | |||
1464 | else { | |||
1465 | switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) { | |||
1466 | case 0xFEFF: | |||
1467 | if (INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16) == ISO_8859_1_ENC | |||
1468 | && state == XML_CONTENT_STATE1) | |||
1469 | break; | |||
1470 | *nextTokPtr = ptr + 2; | |||
1471 | *encPtr = encodingTable[UTF_16BE_ENC]; | |||
1472 | return XML_TOK_BOM14; | |||
1473 | /* 00 3C is handled in the default case */ | |||
1474 | case 0x3C00: | |||
1475 | if ((INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16) == UTF_16BE_ENC | |||
1476 | || INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16) == UTF_16_ENC) | |||
1477 | && state == XML_CONTENT_STATE1) | |||
1478 | break; | |||
1479 | *encPtr = encodingTable[UTF_16LE_ENC]; | |||
1480 | return XmlTok(*encPtr, state, ptr, end, nextTokPtr)(((*encPtr)->scanners[state])(*encPtr, ptr, end, nextTokPtr )); | |||
1481 | case 0xFFFE: | |||
1482 | if (INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16) == ISO_8859_1_ENC | |||
1483 | && state == XML_CONTENT_STATE1) | |||
1484 | break; | |||
1485 | *nextTokPtr = ptr + 2; | |||
1486 | *encPtr = encodingTable[UTF_16LE_ENC]; | |||
1487 | return XML_TOK_BOM14; | |||
1488 | case 0xEFBB: | |||
1489 | /* Maybe a UTF-8 BOM (EF BB BF) */ | |||
1490 | /* If there's an explicitly specified (external) encoding | |||
1491 | of ISO-8859-1 or some flavour of UTF-16 | |||
1492 | and this is an external text entity, | |||
1493 | don't look for the BOM, | |||
1494 | because it might be a legal data. */ | |||
1495 | if (state == XML_CONTENT_STATE1) { | |||
1496 | int e = INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16); | |||
1497 | if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC) | |||
1498 | break; | |||
1499 | } | |||
1500 | if (ptr + 2 == end) | |||
1501 | return XML_TOK_PARTIAL-1; | |||
1502 | if ((unsigned char)ptr[2] == 0xBF) { | |||
1503 | *nextTokPtr = ptr + 3; | |||
1504 | *encPtr = encodingTable[UTF_8_ENC]; | |||
1505 | return XML_TOK_BOM14; | |||
1506 | } | |||
1507 | break; | |||
1508 | default: | |||
1509 | if (ptr[0] == '\0') { | |||
1510 | /* 0 isn't a legal data character. Furthermore a document entity can only | |||
1511 | start with ASCII characters. So the only way this can fail to be big-endian | |||
1512 | UTF-16 if it it's an external parsed general entity that's labelled as | |||
1513 | UTF-16LE. */ | |||
1514 | if (state == XML_CONTENT_STATE1 && INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16) == UTF_16LE_ENC) | |||
1515 | break; | |||
1516 | *encPtr = encodingTable[UTF_16BE_ENC]; | |||
1517 | return XmlTok(*encPtr, state, ptr, end, nextTokPtr)(((*encPtr)->scanners[state])(*encPtr, ptr, end, nextTokPtr )); | |||
1518 | } | |||
1519 | else if (ptr[1] == '\0') { | |||
1520 | /* We could recover here in the case: | |||
1521 | - parsing an external entity | |||
1522 | - second byte is 0 | |||
1523 | - no externally specified encoding | |||
1524 | - no encoding declaration | |||
1525 | by assuming UTF-16LE. But we don't, because this would mean when | |||
1526 | presented just with a single byte, we couldn't reliably determine | |||
1527 | whether we needed further bytes. */ | |||
1528 | if (state == XML_CONTENT_STATE1) | |||
1529 | break; | |||
1530 | *encPtr = encodingTable[UTF_16LE_ENC]; | |||
1531 | return XmlTok(*encPtr, state, ptr, end, nextTokPtr)(((*encPtr)->scanners[state])(*encPtr, ptr, end, nextTokPtr )); | |||
1532 | } | |||
1533 | break; | |||
1534 | } | |||
1535 | } | |||
1536 | *encPtr = encodingTable[INIT_ENC_INDEX(enc)((int)(enc)->initEnc.isUtf16)]; | |||
1537 | return XmlTok(*encPtr, state, ptr, end, nextTokPtr)(((*encPtr)->scanners[state])(*encPtr, ptr, end, nextTokPtr )); | |||
1538 | } | |||
1539 | ||||
1540 | ||||
1541 | #define NS(x) x | |||
1542 | #define ns(x) x | |||
1543 | #include "xmltok_ns.c" | |||
1544 | #undef NS | |||
1545 | #undef ns | |||
1546 | ||||
1547 | #ifdef XML_NS | |||
1548 | ||||
1549 | #define NS(x) x ## NS | |||
1550 | #define ns(x) x ## _ns | |||
1551 | ||||
1552 | #include "xmltok_ns.c" | |||
1553 | ||||
1554 | #undef NS | |||
1555 | #undef ns | |||
1556 | ||||
1557 | ENCODING * | |||
1558 | XmlInitUnknownEncodingNS(void *mem, | |||
1559 | int *table, | |||
1560 | int (*convert)(void *userData, const char *p), | |||
1561 | void *userData) | |||
1562 | { | |||
1563 | ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData); | |||
1564 | if (enc) | |||
1565 | ((struct normal_encoding *)enc)->type[ASCII_COLON0x3A] = BT_COLON; | |||
1566 | return enc; | |||
1567 | } | |||
1568 | ||||
1569 | #endif /* XML_NS */ |