Line data Source code
1 : /* valamarkupreader.vala
2 : *
3 : * Copyright (C) 2008-2009 Jürg Billeter
4 : *
5 : * This library is free software; you can redistribute it and/or
6 : * modify it under the terms of the GNU Lesser General Public
7 : * License as published by the Free Software Foundation; either
8 : * version 2.1 of the License, or (at your option) any later version.
9 :
10 : * This library is distributed in the hope that it will be useful,
11 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 : * Lesser General Public License for more details.
14 :
15 : * You should have received a copy of the GNU Lesser General Public
16 : * License along with this library; if not, write to the Free Software
17 : * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 : *
19 : * Author:
20 : * Jürg Billeter <j@bitron.ch>
21 : */
22 :
23 : using GLib;
24 :
/**
 * Simple reader for a subset of XML.
 *
 * The reader walks a raw byte range (a memory-mapped file or a private copy
 * of an in-memory string) and hands out one token per call to
 * {@link read_token}: start elements together with their attributes, end
 * elements, text, and end of file. Comments are skipped. Processing
 * instructions and doctype declarations are not interpreted.
 *
 * Malformed markup is tolerated silently (no error is reported for a missing
 * `=', quote or `>'); only invalid UTF-8 is reported through Report.error.
 */
public class Vala.MarkupReader {
	/**
	 * Name of the input, as passed to the creation method.
	 */
	public string filename { get; private set; }

	/**
	 * Element name belonging to the last START_ELEMENT or END_ELEMENT token,
	 * null after any other token.
	 */
	public string name { get; private set; }

	/**
	 * Text of the last TEXT token, null after any other token.
	 */
	public string content { get; private set; }

	// Keeps the mapping alive while begin/current/end point into it.
	MappedFile mapped_file;

	// Private copy of the input for from_string (); begin/current/end point into it.
	string owned_content;

	// Raw byte range being read: [begin, end), with current as the read cursor.
	char* begin;
	char* current;
	char* end;

	int line;
	int column;

	Map<string,string> attributes = new HashMap<string,string> (str_hash, str_equal);

	// Set after reading `<foo/>' so that the next read_token () call reports
	// the matching END_ELEMENT without consuming any input.
	bool empty_element;

	/**
	 * Creates a reader for the file //filename//.
	 *
	 * If the file cannot be mapped an error is reported through Report.error
	 * and the reader behaves like a reader for empty input.
	 *
	 * @param filename path of the file to read
	 */
	public MarkupReader (string filename) {
		this.filename = filename;

		// Initialise the position up front so that it is sane even when
		// mapping the file fails.
		line = 1;
		column = 1;

		try {
			mapped_file = new MappedFile (filename, false);
			begin = mapped_file.get_contents ();
			end = begin + mapped_file.get_length ();

			current = begin;
		} catch (FileError e) {
			Report.error (null, "Unable to map file `%s': %s", filename, e.message);
		}
	}

	/**
	 * Creates a reader for markup held in memory.
	 *
	 * @param filename name used to identify the input
	 * @param content the markup to read
	 */
	public MarkupReader.from_string (string filename, string content) {
		this.filename = filename;

		// The parameter is only borrowed. Take a copy so that the raw
		// pointers below cannot dangle once the caller releases its string.
		owned_content = content;

		begin = (char*) owned_content;
		end = begin + owned_content.length;

		current = begin;

		line = 1;
		column = 1;
	}

	/**
	 * Checks whether the current start element carries the attribute //attr//.
	 *
	 * @param attr attribute name
	 * @return true if the attribute is present
	 */
	public bool has_attribute (string attr) {
		return attributes.contains (attr);
	}

	/**
	 * Looks up the value of attribute //attr// of the current start element.
	 *
	 * @param attr attribute name
	 * @return the result of the attribute map lookup for //attr//
	 */
	public string? get_attribute (string attr) {
		return attributes[attr];
	}

	/**
	 * Returns a copy of the current attributes.
	 *
	 * @return map of current attributes
	 */
	public Map<string,string> get_attributes () {
		var result = new HashMap<string,string> (str_hash, str_equal);
		foreach (var key in attributes.get_keys ()) {
			result.set (key, attributes.get (key));
		}
		return result;
	}

	/**
	 * Decodes the UTF-8 sequence at the cursor without advancing.
	 *
	 * Must only be called while current < end. An invalid or truncated
	 * sequence is reported and treated as a single byte, so callers always
	 * make progress.
	 *
	 * @param u receives the decoded character, or (unichar) -1 / -2 if invalid
	 * @return number of bytes the caller should advance by (at least 1)
	 */
	int peek_char (out unichar u) {
		u = ((string) current).get_char_validated ((long) (end - current));
		// -1: invalid sequence, -2: sequence cut off by the end of the input
		if (u == (unichar) (-1) || u == (unichar) (-2)) {
			Report.error (null, "invalid UTF-8 character");
			return 1;
		}
		return u.to_utf8 (null);
	}

	/**
	 * Advances the cursor by one byte unless it is already at the end of input.
	 */
	void skip_char () {
		if (current < end) {
			current++;
		}
	}

	/**
	 * Checks whether the bytes at //pos// start with //prefix//, without
	 * reading beyond the end of the input.
	 */
	bool starts_with (char* pos, string prefix) {
		return (long) (end - pos) >= prefix.length
			&& Memory.cmp (pos, (char*) prefix, prefix.length) == 0;
	}

	/**
	 * Recognises a predefined entity whose leading `&' precedes //pos//.
	 *
	 * @param pos first byte after the `&'
	 * @param replacement receives the character the entity stands for
	 * @return byte length of the entity after the `&' (including `;'),
	 *         or 0 if no known entity starts at //pos//
	 */
	int decode_entity (char* pos, out char replacement) {
		if (starts_with (pos, "amp;")) {
			replacement = '&';
			return 4;
		} else if (starts_with (pos, "quot;")) {
			replacement = '"';
			return 5;
		} else if (starts_with (pos, "apos;")) {
			replacement = '\'';
			return 5;
		} else if (starts_with (pos, "lt;")) {
			replacement = '<';
			return 3;
		} else if (starts_with (pos, "gt;")) {
			replacement = '>';
			return 3;
		} else if (starts_with (pos, "percnt;")) {
			replacement = '%';
			return 7;
		}

		replacement = '\0';
		return 0;
	}

	/**
	 * Reads an element or attribute name starting at the cursor.
	 *
	 * The name ends at the first space, tab, newline, `>', `/' or `=', or at
	 * the end of input.
	 *
	 * @return the name, which is empty if the cursor is already at a delimiter
	 */
	string read_name () {
		char* name_begin = current;
		while (current < end) {
			if (current[0] == ' ' || current[0] == '\t' || current[0] == '>'
			    || current[0] == '/' || current[0] == '=' || current[0] == '\n') {
				break;
			}
			unichar u;
			current += peek_char (out u);
		}
		return ((string) name_begin).substring (0, (int) (current - name_begin));
	}

	/**
	 * Skips a comment whose leading `<!' has already been consumed.
	 *
	 * An unterminated comment extends to the end of the input.
	 *
	 * @return true if the cursor was at `--' and a comment was skipped,
	 *         false (cursor untouched) otherwise
	 */
	bool skip_comment () {
		if (!(current < end - 1 && current[0] == '-' && current[1] == '-')) {
			return false;
		}
		current += 2;

		bool terminated = false;
		while (current < end - 2) {
			if (current[0] == '-' && current[1] == '-' && current[2] == '>') {
				// end of comment
				current += 3;
				terminated = true;
				break;
			} else if (current[0] == '\n') {
				line++;
				column = 0;
			}
			current++;
		}
		if (!terminated) {
			// Do not leave the tail of an unterminated comment behind to be
			// tokenized as text.
			current = end;
		}
		return true;
	}

	/**
	 * Reads the remainder of a start tag: the name, the attributes and the
	 * closing `>' or `/>'. The cursor must be just after the `<'.
	 *
	 * Sets name, attributes and empty_element.
	 */
	void read_start_element () {
		name = read_name ();
		space ();

		while (current < end && current[0] != '>' && current[0] != '/') {
			string attr_name = read_name ();
			space ();
			// An `=' is expected here; whatever byte is present is skipped.
			skip_char ();
			space ();
			if (current >= end) {
				// Input ends before the attribute value.
				break;
			}
			// A quote (`"' or `'') is expected; whatever byte is present is
			// used as the value delimiter.
			char quote = current[0];
			current++;

			string attr_value = text (quote, false);

			// Skip the closing quote, if the input did not end before it.
			skip_char ();
			attributes.set (attr_name, attr_value);
			space ();
		}

		if (current < end && current[0] == '/') {
			empty_element = true;
			current++;
			space ();
		} else {
			empty_element = false;
		}
		// A `>' is expected here; whatever byte is present is skipped.
		skip_char ();
	}

	/**
	 * Reads the next token.
	 *
	 * Comments are skipped. For `<foo/>' a START_ELEMENT is returned first
	 * and a synthesized END_ELEMENT on the following call. MarkupTokenType.NONE
	 * is returned for a `<' at the very end of the input and for `<?' and
	 * non-comment `<!' constructs, which are not consumed beyond those
	 * leading characters.
	 *
	 * @param token_begin receives the location where the token starts
	 * @param token_end receives the location where the token ends
	 * @return the type of the token that was read
	 */
	public MarkupTokenType read_token (out SourceLocation token_begin, out SourceLocation token_end) {
		attributes.clear ();

		if (empty_element) {
			// Second half of `<foo/>': nothing is consumed, so the token is
			// located at the current read position.
			empty_element = false;
			token_begin = SourceLocation (current, line, column);
			token_end = SourceLocation (current, line, column);
			return MarkupTokenType.END_ELEMENT;
		}

		content = null;
		name = null;

		space ();

		MarkupTokenType type = MarkupTokenType.NONE;
		token_begin = SourceLocation (current, line, column);

		if (current >= end) {
			type = MarkupTokenType.EOF;
		} else if (current[0] != '<') {
			// Leading whitespace was skipped above, so the text starts here.
			content = text ('<', true);
			type = MarkupTokenType.TEXT;
		} else {
			current++;
			if (current >= end) {
				// lone `<' at the end of the input
			} else if (current[0] == '?') {
				// processing instruction
			} else if (current[0] == '!') {
				// comment or doctype
				current++;
				if (skip_comment ()) {
					// ignore comment, read next token
					return read_token (out token_begin, out token_end);
				}
			} else if (current[0] == '/') {
				type = MarkupTokenType.END_ELEMENT;
				current++;
				name = read_name ();
				// A `>' is expected here; whatever byte is present is skipped.
				skip_char ();
			} else {
				type = MarkupTokenType.START_ELEMENT;
				read_start_element ();
			}
		}

		token_end = SourceLocation (current, line, column - 1);

		return type;
	}

	/**
	 * Reads text up to (not including) //end_char// or the end of input,
	 * replacing the entities `&amp;', `&quot;', `&apos;', `&lt;', `&gt;' and
	 * `&percnt;' by the characters they stand for.
	 *
	 * @param end_char byte that terminates the text
	 * @param rm_trailing_whitespace whether to strip trailing whitespace
	 * @return the decoded text
	 */
	string text (char end_char, bool rm_trailing_whitespace) {
		var builder = new StringBuilder ();
		// Start of the run of bytes not yet copied into builder.
		char* text_begin = current;
		char* last_linebreak = current;

		while (current < end && current[0] != end_char) {
			unichar u;
			int char_length = peek_char (out u);

			if (u == '&') {
				char replacement;
				int entity_length = decode_entity (current + char_length, out replacement);
				if (entity_length > 0) {
					// Flush the pending run, then emit the decoded character.
					builder.append_len ((string) text_begin, (ssize_t) (current - text_begin));
					builder.append_c (replacement);
					current += char_length + entity_length;
					text_begin = current;
				} else {
					// Unknown entity: keep the `&' literally.
					current += char_length;
				}
			} else {
				if (u == '\n') {
					line++;
					column = 0;
					last_linebreak = current;
				}

				current += char_length;
				column++;
			}
		}

		if (text_begin != current) {
			builder.append_len ((string) text_begin, (ssize_t) (current - text_begin));
		}

		// NOTE(review): column was already advanced per character in the loop
		// above, so this looks like it counts the text twice. It is left
		// unchanged to keep reported positions stable; confirm the intended
		// semantics before altering it.
		column += (int) (current - last_linebreak);

		if (rm_trailing_whitespace) {
			ssize_t keep = builder.len;
			while (keep > 0 && builder.str[keep - 1].isspace ()) {
				keep--;
			}
			builder.erase (keep, -1);
		}

		return builder.str;
	}

	/**
	 * Skips whitespace at the cursor, keeping line and column up to date.
	 */
	void space () {
		while (current < end && current[0].isspace ()) {
			if (current[0] == '\n') {
				line++;
				column = 0;
			}
			current++;
			column++;
		}
	}
}
316 :
/**
 * Kinds of token reported by Vala.MarkupReader.read_token.
 */
public enum Vala.MarkupTokenType {
	NONE,
	START_ELEMENT,
	END_ELEMENT,
	TEXT,
	EOF;

	/**
	 * Returns a short human-readable description of this token type.
	 *
	 * @return a constant string; "unknown token type" for NONE and for any
	 *         value without a dedicated description
	 */
	public unowned string to_string () {
		unowned string description;

		switch (this) {
		case START_ELEMENT:
			description = "start element";
			break;
		case END_ELEMENT:
			description = "end element";
			break;
		case TEXT:
			description = "text";
			break;
		case EOF:
			description = "end of file";
			break;
		default:
			description = "unknown token type";
			break;
		}

		return description;
	}
}
334 :
|