2 Copyright (C) 2000-2004 SKYRIX Software AG
4 This file is part of OpenGroupware.org.
6 OGo is free software; you can redistribute it and/or modify it under
7 the terms of the GNU Lesser General Public License as published by the
8 Free Software Foundation; either version 2, or (at your option) any
11 OGo is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
14 License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with OGo; see the file COPYING. If not, write to the
18 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
23 #include "WOHTMLParser.h"
24 #include <NGObjWeb/WODynamicElement.h>
25 #include <NGObjWeb/WOElement.h>
31 The root parse function is _parseElement() which calls either
32 _parseWOElement() or _parseHashElement() if it finds a NGObjWeb tag at the
33 beginning of the buffer.
34 If it doesn't, it collects all content until it encounters an NGObjWeb tag,
35 and reports that content as "static text" to the callback.
37 Parsing a dynamic element is:
39 - parse the attributes
40 - parse the contents, static strings and elements
41 - add content to a children array
42 - produce WOElement by calling
43 -dynamicElementWithName:attributes:contentElements:
/* Declares the initializer that this file sends to freshly allocated
   WOStringClass instances when it wraps raw template bytes
   (see -_makeConstantStringElementWithBuffer:length:). */
47 @interface WOElement(StaticStringElement)
48 - (id)initWithBuffer:(const char *)_buffer length:(unsigned)_len;
51 @implementation WOHTMLParser
/* Forward declaration of the root parse function. It is defined near the end
   of this file and is called by the tag parsers that appear before it. */
53 static WOElement *_parseElement(NSZone *_zone,
54 const char *_buffer, unsigned *_idx,
55 unsigned _len, NSException **_exception,
/* File-scope class caches and parser switches used by the parse functions. */
58 static Class StrClass = Nil;
59 static Class DictClass = Nil;
60 static Class NumberClass = Nil;
61 static Class WOStringClass = Nil;
62 static BOOL skipPlainTags = NO; /* do process markers inside HTML tags ? */
63 static BOOL compressHTMLWhitespace = YES;
64 static BOOL useUTF8 = NO;
/* Setup: fills the class caches and reads the UTF-8 switch from the
   "WOParsersUseUTF8" user default.
   NOTE(review): the header of the enclosing method is not visible here -
   presumably +initialize; confirm. */
67 NSUserDefaults *ud = [NSUserDefaults standardUserDefaults];
69 StrClass = [NSString class];
70 DictClass = [NSMutableDictionary class];
71 NumberClass = [NSNumber class];
72 WOStringClass = NSClassFromString(@"_WOStaticHTMLElement");
74 useUTF8 = [ud boolForKey:@"WOParsersUseUTF8"];
/* Initializer taking the handler that is later asked to build dynamic
   elements; the handler is retained and kept in self->callback. */
77 - (id)initWithHandler:(id<NSObject,WOHTMLParserHandler>)_handler {
78 self->callback = [_handler retain];
/* Releases the stored parsing exception and the retained callback.
   NOTE(review): presumably the body of -dealloc; the method header is not
   visible here. */
82 [self->parsingException release];
83 [self->callback release];
/* Factory for the base exception used for template errors: an NSException
   named "SyntaxError" with the generic reason "template syntax error". */
89 - (NSException *)_makeSyntaxErrorException {
90 return [NSException exceptionWithName:@"SyntaxError"
91 reason:@"template syntax error"
/* Builds a dynamic element by forwarding the element name, the raw tag
   attributes and the already parsed child elements to the callback handler;
   returns whatever the handler returns. */
95 - (WOElement *)dynamicElementWithName:(NSString *)_element
96 attributes:(NSDictionary *)_attributes // not the associations !
97 contentElements:(NSArray *)_subElements
99 return [self->callback dynamicElementWithName:_element
100 attributes:_attributes
101 contentElements:_subElements];
/* Wraps _len bytes starting at _buf into a new WOStringClass instance.
   The object comes from alloc/init, so the caller owns it and has to
   release it. */
104 - (id)_makeConstantStringElementWithBuffer:(const unsigned char *)_buf
105 length:(unsigned)_len
107 return [[WOStringClass allocWithZone:NULL] initWithBuffer:_buf length:_len];
/* Builds an NSString from _len bytes at _buf. Two construction paths are
   visible: -initWithCString:length:, and decoding as UTF-8 through a
   temporary NSData that references the bytes without copying them.
   NOTE(review): the condition selecting the path is not visible here -
   presumably the useUTF8 switch; confirm. */
110 - (NSString *)_makeStringForBuffer:(const unsigned char *)_buf
111 length:(unsigned)_len
120 return [[StrClass alloc] initWithCString:_buf length:_len];
122 // Note: we cast the pointer because we are not going to modify _buf for the
123 // duration and we are never going to write the data - should work
124 // with any Foundation, but isn't strictly API compatible
125 data = [[NSData alloc] initWithBytesNoCopy:(void *)_buf length:_len
127 r = [[StrClass alloc] initWithData:data encoding:NSUTF8StringEncoding];
/* Accessor for the exception stored in self->parsingException (set by
   -parseHTMLData:). */
134 - (NSException *)parsingException {
135 return self->parsingException;
/* Parses template data into an array of top-level elements.

   Visible flow: the callback is asked via -parser:willParseHTMLData:, any
   previously stored exception is dropped, then _parseElement() is called
   repeatedly until the buffer is consumed or an exception is set. Each
   returned element is added to the result and released (the array keeps it).
   The exception is stored in self->parsingException and the callback is
   told about the failure or the completion.
   NOTE(review): the conditions guarding the two notifications are not
   visible here - presumably "exception set" vs. "no exception"; confirm.

   Returns nil when a parsing exception was recorded, the element array
   otherwise. */
140 - (NSArray *)parseHTMLData:(NSData *)_html {
141 NSMutableArray *topLevel;
144 NSException *exception = nil;
146 if (![self->callback parser:self willParseHTMLData:_html])
149 [self->parsingException release]; self->parsingException = nil;
154 topLevel = [NSMutableArray arrayWithCapacity:64];
156 len = [_html length];
157 html = [_html bytes];
159 while ((idx < len) && (exception == nil)) {
162 if ((element = _parseElement(NULL, html, &idx, len, &exception, self))) {
163 [topLevel addObject:element];
164 [element release]; element = nil;
168 ASSIGN(self->parsingException, exception);
171 [self->callback parser:self
172 failedParsingHTMLData:_html exception:exception];
175 [self->callback parser:self
176 finishedParsingHTMLData:_html elements:topLevel];
179 return self->parsingException ? nil : topLevel;
182 /* internal parsing */
/* Line counter used for error messages: scans _buffer from index 0 up to
   (not including) _lastIdx, stopping early at a NUL byte, and tests each byte
   for '\n'. The count starts at 1. */
184 static int _numberOfLines(const char *_buffer, unsigned _lastIdx) {
185 register int pos, lineCount = 1;
187 for (pos = 0; (pos < (int)_lastIdx) && (_buffer[pos] != '\0'); pos++) {
188 if (_buffer[pos] == '\n')
/* Character-class test for the whitespace bytes the parser skips:
   space, tab, carriage return and line feed. */
194 static inline BOOL _isHTMLSpace(char c) {
196 case ' ': case '\t': case '\r': case '\n':
/* Produces the exception describing a template error at byte offset _idx.

   Visible behaviour: a base exception is obtained from
   -_makeSyntaxErrorException, _text is prefixed with either
   "Unexpected end: " or "Syntax error in line N: " and set as the reason, and
   a userInfo dictionary is attached with the keys "line", "size", "position",
   "handler" and, when it can be extracted, "lastLine" (text of the line
   around _idx).
   NOTE(review): presumably an already set _exception is passed through
   unchanged and the prefix is chosen by atEof; the deciding lines are not
   visible here.
   NOTE(review): endPos points at the last byte of the line, so the length
   (endPos - startPos) excludes that byte - confirm this is intended. */
204 static NSException *_makeHtmlException(NSException *_exception,
205 const char *_buffer, unsigned _idx,
206 unsigned _len, NSString *_text,
209 NSMutableDictionary *ui = nil;
210 NSException *exception = nil;
211 int numLines = _numberOfLines(_buffer, _idx);
212 BOOL atEof = (_idx >= _len) ? YES : NO;
215 // error resulted from a previous error (exception already set)
218 exception = [self _makeSyntaxErrorException];
221 _text = [@"Unexpected end: " stringByAppendingString:[_text stringValue]];
223 _text = [StrClass stringWithFormat:@"Syntax error in line %i: %@",
227 [exception setReason:_text];
231 ui = [[exception userInfo] mutableCopy];
233 ui = [[DictClass alloc] initWithCapacity:8];
235 [ui setObject:[NumberClass numberWithInt:numLines] forKey:@"line"];
236 [ui setObject:[NumberClass numberWithInt:_len] forKey:@"size"];
237 [ui setObject:[NumberClass numberWithInt:_idx] forKey:@"position"];
240 [ui setObject:self forKey:@"handler"];
242 if (!atEof && (_idx > 0)) {
243 register unsigned pos;
244 const unsigned char *startPos, *endPos;
/* NOTE(review): pos is unsigned, so (pos >= 0) is always true. If no '\n'
   precedes _idx the decrement wraps around and _buffer[pos] is read out of
   bounds - the backward scan needs an explicit stop at index 0. */
246 for (pos = _idx; (pos >= 0) && (_buffer[pos] != '\n'); pos--)
248 startPos = &(_buffer[pos + 1]);
250 for (pos = _idx; ((pos < _len) && (_buffer[pos] != '\n')); pos++)
252 endPos = &(_buffer[pos - 1]);
254 if (startPos < endPos) {
257 ll = [self _makeStringForBuffer:startPos length:(endPos - startPos)];
258 [ui setObject:ll forKey:@"lastLine"];
263 //NSLog(@"startPos=0x%08X endPos=0x%08X", startPos, endPos);
268 #if NeXT_Foundation_LIBRARY || APPLE_FOUNDATION_LIBRARY || \
269 COCOA_Foundation_LIBRARY
270 exception = [NSException exceptionWithName:[exception name] reason:[exception reason] userInfo:ui];
272 [exception setUserInfo:ui];
275 [ui release]; ui = nil;
/* Tests whether an HTML comment opener ("<!--") starts at _idx.
   The length guard requires at least 8 bytes to remain in the buffer; after
   that the four opener bytes are compared one by one. */
282 _isComment(const char *_buffer, unsigned _idx, unsigned _len)
285 if ((_idx + 7) >= _len) // check whether it is long enough
287 if (_buffer[_idx] != '<') // check whether it is a tag
290 _idx++; if (_buffer[_idx] != '!') return NO;
291 _idx++; if (_buffer[_idx] != '-') return NO;
292 _idx++; if (_buffer[_idx] != '-') return NO;
/* Tests whether a hash open tag ("<#") starts at _idx. At least 4 bytes
   (the shortest tag, "<#.>") must remain in the buffer; only the first two
   bytes are compared. */
297 static inline BOOL _isHashTag(const char *_buf, unsigned _idx, unsigned _len) {
298 /* check for "<#.>" (len 4) */
299 if ((_idx + 3) >= _len) // check whether it is long enough
301 return (_buf[_idx] == '<' && _buf[_idx + 1] == '#') ? YES : NO;
/*
  Tests whether a hash close tag ("</#" ...) starts at _idx.

  _buf - template bytes (need not be NUL-terminated)
  _idx - position to test
  _len - total number of bytes in _buf

  Returns YES when at least 5 bytes remain (the shortest close tag is
  "</#.>") and the first three of them are "</#", NO otherwise.

  The length guard used to demand 6 remaining bytes, so a close tag that
  ended exactly at the end of the buffer was not recognised. The comparison
  is written as a subtraction so it cannot wrap for large _idx values.
*/
static inline BOOL _isHashCloseTag(const char *_buf,
                                   unsigned _idx, unsigned _len)
{
  /* check for "</#.>" (len 5) */
  if ((_idx >= _len) || ((_len - _idx) < 5)) // check whether it is long enough
    return NO;

  return (_buf[_idx] == '<' && _buf[_idx + 1] == '/' && _buf[_idx + 2] == '#')
    ? YES : NO;
}
/* Tests whether a "<WEBOBJECT" open tag starts at _idx. At least 19 bytes
   must remain in the buffer; the 10-byte prefix is compared with
   strncasecmp(), so upper and lower case both match. */
313 static inline BOOL _isWOTag(const char *_buf, unsigned _idx, unsigned _len) {
314 /* check for "<WEBOBJECT .......>" (len 19) (lowercase is allowed) */
315 if ((_idx + 18) >= _len) // check whether it is long enough
317 if (_buf[_idx] != '<') // check whether it is a tag
320 // now check for '<WEBOBJECT'
321 return (strncasecmp(&(_buf[_idx]), "<WEBOBJECT", 10) == 0) ? YES : NO;
/* Tests whether the exact close tag "</WEBOBJECT>" (12 bytes, compared
   case-insensitively) starts at _idx. At least 12 bytes must remain. */
325 _isWOCloseTag(const char *_buf, unsigned _idx, unsigned _len)
327 /* check for </WEBOBJECT> (len=12) */
328 if ((_idx + 12) > _len) // check whether it is long enough
330 if (_buf[_idx] != '<') // check whether it is a tag
333 return (strncasecmp(&(_buf[_idx]), "</WEBOBJECT>", 12) == 0) ? YES : NO;
/* Starting at *_idx, scans forward over bytes for which _isHTMLSpace() is
   true, never going past _len. Does nothing when the cursor is already at
   or beyond the end of the buffer.
   NOTE(review): presumably the new position is written back to *_idx (callers
   test *_idx right after the call); the store itself is not visible here. */
336 static inline void _skipSpaces(register const char *_buffer, unsigned *_idx,
339 register unsigned pos = *_idx;
341 if (pos >= _len) return; // EOF
343 while ((pos < _len) && _isHTMLSpace(_buffer[pos]))
/*
  Parses one attribute token (a tag name, an attribute key or an attribute
  value) starting at *_idx.

  Two forms are accepted:
    "quoted text"   - everything between the double quotes
    bareword        - everything up to '>', '=', '/', whitespace or EOF

  _zone      - unused, kept for signature compatibility
  _buffer    - template bytes (need not be NUL-terminated)
  _idx       - in/out cursor; advanced past the token (and closing quote)
  _len       - total number of bytes in _buffer
  _exception - out; set when a quoted string is not closed before EOF
  self       - parser used to create strings and exceptions

  Returns a string the caller owns (callers release it), or nil when no token
  starts at the cursor (EOF, '>', '/', '=') or on a syntax error.

  Changes against the previous version:
   - the local cursor is read AFTER leading whitespace was skipped; it used
     to be read before, so skipped whitespace was ignored
   - both scan loops test the bounds before touching the buffer, so the byte
     at _buffer[_len] is never read
*/
static NSString *_parseStringValue(NSZone *_zone,
                                   register const char *_buffer,
                                   unsigned *_idx, unsigned _len,
                                   NSException **_exception,
                                   WOHTMLParser *self)
{
  register unsigned pos;

  _skipSpaces(_buffer, _idx, _len);
  pos = *_idx; // pick up the cursor only after whitespace was skipped
  if (pos >= _len) return nil; // EOF

  if (_buffer[pos] == '>') return nil;
  if (_buffer[pos] == '/') return nil;
  if (_buffer[pos] == '=') return nil;

  if (_buffer[pos] == '"') { // quoted string
    register unsigned len = 0;
    unsigned startPos = pos + 1;

    pos++; // skip starting quote ('"')

    // loop until closing quote (bounds first, then the byte)
    while ((pos < _len) && (_buffer[pos] != '"')) {
      pos++;
      len++;
    }

    if (pos == _len) { // syntax error, quote not closed
      *_idx = pos; // report the error at EOF
      *_exception = _makeHtmlException(*_exception, _buffer, *_idx, _len,
                                       @"quoted string not closed (expected '\"')",
                                       self);
      return nil;
    }

    NSCAssert(_buffer[pos] == '"', @"invalid parser state ..");
    pos++;       // skip closing quote
    *_idx = pos; // store pointer

    if (len == 0) // empty string
      return @""; // constant string, a later release by the caller is harmless

    return [self _makeStringForBuffer:&(_buffer[startPos]) length:len];
  }
  else {
    unsigned startPos = pos;

    // loop until '>' or '=' or '/' or space (bounds first, then the byte)
    while ((pos < _len) &&
           (_buffer[pos] != '>') &&
           (_buffer[pos] != '=') &&
           (_buffer[pos] != '/') &&
           (!_isHTMLSpace(_buffer[pos]))) {
      pos++;
    }
    *_idx = pos; // store pointer

    if ((pos - startPos) == 0) // wasn't a string ..
      return nil;

    return [self _makeStringForBuffer:&(_buffer[startPos])
                 length:(pos - startPos)];
  }
}
/* Parses a hash element, either "<#name>...</#name>" or the self-closing
   "<#name/>", starting at *_idx.

   Visible flow: bail out at EOF or when no hash tag starts here; read the
   element name with _parseStringValue(); expect '>' or '/'; for '>' parse
   child elements through _parseElement() until a hash close tag is seen,
   for '/' expect an immediately following '>'. The element is then built via
   -dynamicElementWithName:attributes:contentElements: with a one-entry
   dictionary (key "NAME"), and the close tag is skipped.

   Errors (missing name, missing '>', missing close tag, build failure) are
   reported through *_exc using _makeHtmlException().
   NOTE(review): the return statement is not visible here - presumably the
   built element, or nil on error. */
416 static WOElement *_parseHashElement(NSZone *_zone, const char *_buffer,
417 unsigned *_idx, unsigned _len,
423 <#dynelem>....</#dynelem>
427 static NSString *nameKey = @"NAME";
428 WOElement *element = nil;
429 BOOL foundEndTag = NO;
430 BOOL isAutoClose = NO;
431 NSMutableArray *children = nil;
433 NSDictionary *nameDict;
435 if (*_idx >= _len) return nil; // EOF
437 if (!_isHashTag(_buffer, *_idx, _len))
438 return nil; // not a hash tag ..
443 if ((name = _parseStringValue(_zone, _buffer, _idx,_len,_exc,self)) == nil) {
445 NSLog(@"ERROR: got no name for hash tag '<#NAME>'");
/* NOTE(review): this tests the out-pointer itself, which is dereferenced
   unconditionally elsewhere in this function, so the test is effectively
   always true - presumably (*_exc) was intended; confirm. */
447 if (_exc) // if there was an error ..
451 _skipSpaces(_buffer, _idx, _len);
454 _makeHtmlException(*_exc, _buffer, *_idx, _len,
455 @"unexpected EOF: missing '>' in hash element tag (EOF).",
457 [name release]; name = nil;
458 return nil; // unexpected EOF
460 if (_buffer[*_idx] != '>' && _buffer[*_idx] != '/') {
461 *_exc = _makeHtmlException(*_exc, _buffer, *_idx, _len,
462 @"missing '>' in hash element tag.", self);
463 [name release]; name = nil;
464 return nil; // syntax error: tag is neither closed ('>') nor self-closed ('/')
467 if (_buffer[*_idx] == '>') {
468 /* has sub-elements (<#name>...</#name>) */
469 *_idx += 1; // skip '>'
471 while ((*_idx < _len) && (*_exc == nil)) {
475 NSLog(@"subelement at '%c'[%i] ..", _buffer[*_idx], *_idx);
478 if (_isHashCloseTag(_buffer, *_idx, _len)) {
483 subElement = _parseElement(_zone, _buffer, _idx, _len, _exc, self);
486 NSLog(@" parsed subelement '%@' ..", subElement);
491 children = [NSMutableArray arrayWithCapacity:10];
492 [children addObject:subElement];
493 [subElement release]; subElement = nil;
498 /* has no sub-elements (<#name/>) */
499 *_idx += 1; // skip '/'
/* NOTE(review): no bounds check is visible between skipping '/' and the read
   below; if '/' was the last byte this reads _buffer[_len]. */
501 if (_buffer[*_idx] != '>') {
502 *_exc = _makeHtmlException(*_exc, _buffer, *_idx, _len,
503 @"missing '>' in hash element tag.", self);
504 [name release]; name = nil;
505 return nil; // syntax error: '/' is not followed by '>'
507 *_idx += 1; // skip '>'
510 /* produce elements */
512 if ([name length] < 1) {
514 *_exc = _makeHtmlException(*_exc, NULL, 0, 0,
515 @"missing name in hash element tag.",
521 nameDict = [[NSDictionary alloc] initWithObjects:&name forKeys:&nameKey
523 element = [self dynamicElementWithName:name
525 contentElements:children];
526 [name release]; name = nil;
527 [nameDict release]; nameDict = nil;
529 if (element == nil) { // build error
530 *_exc = _makeHtmlException(*_exc, _buffer, *_idx, _len,
531 @"could not build hash element !.", self);
535 if (!foundEndTag && !isAutoClose) {
536 *_exc = _makeHtmlException(*_exc, _buffer, *_idx, _len,
537 @"did not find hash end tag (</#...>) ..",
539 [element release]; element = nil;
542 else if (!isAutoClose) {
543 /* skip close tag ('</#name>') */
544 NSCAssert(_isHashCloseTag(_buffer, *_idx, _len),
545 @"invalid parser state ..");
547 *_idx += 3; // skip '</#'
548 while ((*_idx < _len) && (_buffer[*_idx] != '>'))
550 *_idx += 1; // skip '>'
552 NSLog(@"parsed close tag, now at '%c'[%i] ..", _buffer[*_idx], *_idx);
/* Parses the `key = value` pairs of a tag into a mutable dictionary.

   Each round skips whitespace, reads a key with _parseStringValue() (no key
   ends the attribute list), requires '=' and then reads the value the same
   way. Key and value are stored in the dictionary and then released. The
   loop runs while the cursor is inside the buffer.

   Missing '=' or a missing value is reported through *_exception.
   The dictionary is created with alloc/init, so the caller owns it.
   NOTE(review): the return statement and the condition under which the
   dictionary is created are not visible here - presumably it is created
   lazily for the first pair and nil is returned for an attribute-less tag. */
558 static NSMutableDictionary *
559 _parseTagAttributes(NSZone *_zone, const char *_buffer,
560 unsigned *_idx, unsigned _len,
561 NSException **_exception, WOHTMLParser *self)
563 NSMutableDictionary *dict = nil;
565 _skipSpaces(_buffer, _idx, _len);
566 if (*_idx >= _len) return nil; // EOF
569 NSLog(@"parsing attributes at '%c'[%i] ..", _buffer[*_idx], *_idx);
574 NSString *value = nil;
576 _skipSpaces(_buffer, _idx, _len);
577 if (*_idx >= _len) break; // EOF
580 key = _parseStringValue(_zone, _buffer, _idx, _len, _exception, self);
581 if (key == nil) // ended
584 /* The following parses: space* '=' space* */
586 _skipSpaces(_buffer, _idx, _len);
588 *_exception = _makeHtmlException(*_exception, _buffer, *_idx, _len,
589 @"expected '=' after key in attributes ..",
591 break; // unexpected EOF
593 if (_buffer[*_idx] != '=') {
594 *_exception = _makeHtmlException(*_exception, _buffer, *_idx, _len,
595 @"expected '=' after key in attributes ..",
599 NSCAssert(_buffer[*_idx] == '=', @"invalid parser state ..");
600 *_idx += 1; // skip '='
601 _skipSpaces(_buffer, _idx, _len);
603 *_exception = _makeHtmlException(*_exception, _buffer, *_idx, _len,
604 @"expected value after key in attributes ..",
606 break; // unexpected EOF
610 value = _parseStringValue(_zone, _buffer, _idx, _len, _exception, self);
612 *_exception = _makeHtmlException(*_exception, _buffer, *_idx, _len,
613 @"expected value after key in attributes ..",
615 break; // unexpected EOF
618 NSCAssert(key, @"invalid key ..");
619 NSCAssert(value, @"invalid value ..");
622 dict = [[DictClass allocWithZone:_zone] init];
623 NSCAssert(dict, @"no attributes dictionary ?");
624 [dict setObject:value forKey:key];
626 [key release]; key = nil;
627 [value release]; value = nil;
629 while (*_idx < _len);
/* Parses a "<WEBOBJECT NAME=...> ... </WEBOBJECT>" element starting at *_idx.

   Visible flow: bail out at EOF or when no WEBOBJECT tag starts here; parse
   the tag attributes; require the closing '>'; parse child elements through
   _parseElement() until a WEBOBJECT close tag is seen; look up the element
   name under "NAME" (falling back to "name"); build the element via
   -dynamicElementWithName:attributes:contentElements:; finally skip the
   close tag.

   Errors (EOF inside the tag, missing '>', missing name, build failure,
   missing close tag) are reported through *_exception.
   NOTE(review): the return statement is not visible here - presumably the
   built element, or nil on error. */
633 static WOElement *_parseWOElement(NSZone *_zone, const char *_buffer,
634 unsigned *_idx, unsigned _len,
635 NSException **_exception,
638 WOElement *element = nil;
639 NSMutableDictionary *attrs = nil;
640 BOOL foundEndTag = NO;
641 NSMutableArray *children = nil;
643 if (*_idx >= _len) return nil; // EOF
645 if (!_isWOTag(_buffer, *_idx, _len))
646 return nil; // not a WO tag ..
648 NSCAssert(strncasecmp("<WEBOBJECT", &(_buffer[*_idx]), 10) == 0,
649 @"invalid parser state ..");
654 attrs = _parseTagAttributes(_zone, _buffer, _idx, _len, _exception, self);
656 //NSLog(@"ERROR: got no attributes for WO tag (need at least 'NAME')..");
/* NOTE(review): this tests the out-pointer itself, which is dereferenced
   unconditionally elsewhere in this function, so the test is effectively
   always true - presumably (*_exception) was intended; confirm. */
658 if (_exception) // if there was an error ..
662 _skipSpaces(_buffer, _idx, _len);
665 _makeHtmlException(*_exception, _buffer, *_idx, _len,
666 @"unexpected EOF: missing '>' in WEBOBJECT tag.",
668 [attrs release]; attrs = nil;
669 return nil; // unexpected EOF
671 if (_buffer[*_idx] != '>') {
672 *_exception = _makeHtmlException(*_exception, _buffer, *_idx, _len,
673 @"missing '>' in WEBOBJECT tag.", self);
674 [attrs release]; attrs = nil;
675 return nil; // syntax error: attribute list is not followed by '>'
677 NSCAssert(_buffer[*_idx] == '>', @"invalid parser state ..");
679 *_idx += 1; // skip '>'
681 // parse sub-elements
683 while ((*_idx < _len) && (*_exception == nil)) {
686 //NSLog(@"subelement at '%c'[%i] ..", _buffer[*_idx], *_idx);
688 if (_isWOCloseTag(_buffer, *_idx, _len)) {
693 subElement = _parseElement(_zone, _buffer, _idx, _len, _exception, self);
695 //NSLog(@" parsed subelement '%@' ..", subElement);
699 children = [NSMutableArray arrayWithCapacity:10];
700 [children addObject:subElement];
701 [subElement release]; subElement = nil;
705 /* produce elements */
709 if ((name = [attrs objectForKey:@"NAME"]) == nil)
710 name = [attrs objectForKey:@"name"];
/* NOTE(review): as visible here the warning below is logged when the "name"
   lookup SUCCEEDS; the surrounding lines (possibly a preprocessor branch) are
   not visible - confirm the intended condition. */
712 if ((name = [attrs objectForKey:@"name"])) {
713 NSLog(@"%s: missing 'name' attribute !",
714 __PRETTY_FUNCTION__);
718 if ([name length] < 1) {
720 *_exception = _makeHtmlException(*_exception, NULL, 0, 0,
721 @"no NAME attribute in WEBOBJECT tag.",
726 element = [self dynamicElementWithName:name
728 contentElements:children];
731 [attrs release]; attrs = nil;
733 if (element == nil) { // build error
734 *_exception = _makeHtmlException(*_exception, _buffer, *_idx, _len,
735 @"could not build WEBOBJECT.", self);
740 *_exception = _makeHtmlException(*_exception, _buffer, *_idx, _len,
741 @"did not find WEBOBJECT end tag ..",
743 [element release]; element = nil;
747 NSCAssert(_isWOCloseTag(_buffer, *_idx, _len), @"invalid parser state ..");
749 // skip close tag ('</WEBOBJECT>')
750 *_idx += 11; // skip '</WEBOBJECT'
751 while ((*_idx < _len) && (_buffer[*_idx] != '>'))
753 *_idx += 1; // skip '>'
755 //NSLog(@"parsed close tag, now at '%c'[%i] ..", _buffer[*_idx], *_idx);
/* Builds a string from _len bytes of static template text.

   When compressHTMLWhitespace is off the bytes are used verbatim. Otherwise
   they are copied byte by byte into a temporary malloc'ed buffer (freed before
   returning) with special handling for runs of spaces and tabs.
   NOTE(review): presumably such runs are collapsed; the body of the loop is
   not fully visible here.
   NOTE(review): the malloc() result is written to inside the loop, but the
   only NULL test visible is the one guarding free(). */
760 static inline NSString *_makeTextString(NSZone *_zone, const char *_buffer,
761 unsigned _len, WOHTMLParser *self)
763 NSString *result = nil;
764 register unsigned char *buffer;
765 register unsigned pos, bufPos;
767 if (_len == 0) // empty string
770 if (!compressHTMLWhitespace)
771 /* deliver whitespace as in template */
772 return [self _makeStringForBuffer:_buffer length:_len];
774 buffer = malloc(_len + 3);
776 for (pos = 0, bufPos = 0; pos < _len; ) {
777 buffer[bufPos] = _buffer[pos];
779 if ((_buffer[pos] == ' ') || (_buffer[pos] == '\t')) {
783 while (((_buffer[pos] == ' ') || (_buffer[pos] =='\t')) &&
794 result = [self _makeStringForBuffer:buffer length:bufPos];
795 if (buffer) free(buffer);
/* Root parse function: returns the next element found at *_idx.

   If a hash tag or WEBOBJECT tag starts at the cursor, parsing is delegated
   to _parseHashElement() / _parseWOElement(). A close tag at the cursor is
   reported as a syntax error through *_exception. Anything else is static
   content: the function scans forward over text, HTML comments and ordinary
   HTML tags until it meets one of the four NGObjWeb tag forms or EOF, and
   wraps the scanned bytes [startPos, pos) in a constant string element via
   -_makeConstantStringElementWithBuffer:length: (caller owns the result).
   NOTE(review): several loop headers and the store back into *_idx are not
   visible here; the description of the scan is based on the visible tests. */
799 static WOElement *_parseElement(NSZone *_zone,
800 const char *_buffer, unsigned *_idx,
801 unsigned _len, NSException **_exception,
804 register unsigned pos = *_idx;
805 unsigned startPos = pos;
807 if (*_idx >= _len) // EOF
810 if (_isHashTag(_buffer, *_idx, _len)) {
811 /* start parsing of dynamic content */
812 return _parseHashElement(_zone, _buffer, _idx, _len, _exception, self);
814 if (_isHashCloseTag(_buffer, *_idx, _len)) {
815 /* check for a common template syntax error */
816 *_exception = _makeHtmlException(*_exception, _buffer, *_idx, _len,
817 @"unexpected hash close tag (</#...>).",
822 if (_isWOTag(_buffer, *_idx, _len)) {
823 /* start parsing of dynamic content */
824 return _parseWOElement(_zone, _buffer, _idx, _len, _exception, self);
826 if (_isWOCloseTag(_buffer, *_idx, _len)) {
827 /* check for a common template syntax error */
828 *_exception = _makeHtmlException(*_exception, _buffer, *_idx, _len,
829 @"unexpected WEBOBJECT close tag "
830 @"(</WEBOBJECT...>).",
835 /* parse text/tag content */
/* NOTE(review): the byte is read before the bounds test, so _buffer[_len] is
   touched when the text runs to the end of the buffer. */
837 while ((_buffer[pos] != '<') && (pos < _len))
840 if (pos >= _len) // EOF was reached
843 NSCAssert(_buffer[pos] == '<', @"invalid parser state ..");
845 if (_isHashTag(_buffer, pos, _len)) /* found Hash */
847 if (_isHashCloseTag(_buffer, pos, _len))
849 if (_isWOTag(_buffer, pos, _len)) /* found WEBOBJECT tag */
851 if (_isWOCloseTag(_buffer, pos, _len))
855 NSLog(@"is comment ? from '%c%c%c'[%i]",
856 _buffer[pos], _buffer[pos+1], _buffer[pos+2], pos);
858 if (_isComment(_buffer, pos, _len)) {
859 pos += 3; // skip '<!-' (first three bytes of the '<!--' opener)
862 if (_buffer[pos] == '-') {
863 if (pos + 2 < _len) {
864 if ((_buffer[pos + 1] == '-') && (_buffer[pos + 2] == '>')) {
866 pos += 3; // skip '-->'
874 if (pos >= _len) // EOF was reached
878 // skip '<', read usual tag
880 if (pos >= _len) { // EOF was reached with opening '<'
881 NSLog(@"WARNING: reached EOF with '<' at end !");
886 /* skip until end of HTML tag (not #-tag) */
/* NOTE(review): same ordering as above - the byte is read before the bounds
   test. */
890 while ((_buffer[pos] != '>') && (pos < _len));
891 if (pos >= _len) break; // EOF
899 /* store back position */
903 NSLog(@"Debug: stopped parsing at '%c'[%i]", _buffer[pos], pos);
906 if ((pos - startPos) > 0) {
907 return [self _makeConstantStringElementWithBuffer:&(_buffer[startPos])
908 length:(pos - startPos)];
914 @end /* WOHTMLParser */