/*
  Copyright (C) 2000-2005 SKYRIX Software AG

  This file is part of SOPE.

  SOPE is free software; you can redistribute it and/or modify it under
  the terms of the GNU Lesser General Public License as published by the
  Free Software Foundation; either version 2, or (at your option) any
  later version.

  SOPE is distributed in the hope that it will be useful, but WITHOUT ANY
  WARRANTY; without even the implied warranty of MERCHANTABILITY or
  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with SOPE; see the file COPYING.  If not, write to the
  Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
  02111-1307, USA.
*/

22 #import "libxmlHTMLSAXDriver.h"
23 #import "libxmlSAXLocator.h"
24 #include "TableCallbacks.h"
25 #include <SaxObjC/SaxObjC.h>
26 #include <SaxObjC/SaxException.h>
29 #include <libxml/HTMLparser.h>
30 #include <libxml/HTMLtree.h>
/*
  Private helpers of the driver, declared up front so that they can be
  messaged before their definitions further down in this file.
*/
@interface libxmlHTMLSAXDriver(PrivateMethods)

/* releases the libxml document and the parser context, if present */
- (void)tearDownParser;

/* tree walking: turns the parsed libxml node tree into SAX events */
- (BOOL)walkDocumentTree:(xmlDocPtr)_doc;
- (BOOL)processNode:(xmlNodePtr)_node;
- (BOOL)processChildren:(xmlNodePtr)children;

@end

/*
  UTF-8 to UTF-16 conversion helper; its definition is not part of the
  visible section of this file. The callers below treat a non-zero result
  as a conversion failure and use the advanced *targetStart pointer to
  work out how many UTF-16 units were written.
*/
static int _UTF8ToUTF16(unsigned char **sourceStart, unsigned char *sourceEnd,
                        unichar **targetStart, const unichar *targetEnd);

/* switches (the two "report" flags are read from the defaults) */
static BOOL       logUnsupportedFeatures = NO;
static BOOL       reportInvalidTags      = NO;
static BOOL       reportUnclosedEntities = NO;

/* cache of NSString objects keyed by their C string */
static NSMapTable *uniqueStrings = NULL; // THREAD
static Class      NSStringClass  = Nil;

/* error string detection */
/*
  TODO: obviously this may change between libxml versions or even
        localisations ... why doesn't libxml support error codes ?
*/

static const char *tagInvalidMsg = "tag %s invalid";
static const char *unclosedEntityInvalidMsg =
  "htmlParseEntityRef: expecting ';'";

static const char *unexpectedNobrCloseMsg =
  "Unexpected end tag : %s";

/*
  Returns an NSString for the given UTF-8 libxml string, using the
  file-level 'uniqueStrings' map as a cache.

  The result is returned retained (+1): callers in this file send it
  -release when done. Returns nil if _s is NULL or cannot be decoded as
  UTF-8.

  Note: the cache is a plain static and is never emptied, the function is
  not thread safe (see the THREAD marker on 'uniqueStrings').
*/
static inline NSString *xmlCharsToString(const xmlChar *_s) {
  NSString *string;
  char     *key;
  size_t   keySize;

  if (_s == NULL) return nil;

  if (uniqueStrings == NULL) {
    /* the last argument is only an initial capacity hint */
    uniqueStrings = NSCreateMapTable(libxmlNonOwnedCStringMapKeyCallBacks,
                                     NSObjectMapValueCallBacks,
                                     128);
  }
  else if ((string = NSMapGet(uniqueStrings, _s)) != nil) {
    /* found a string in cache ... */
    return [string retain];
  }

  if (NSStringClass == Nil)
    NSStringClass = [NSString class];

  string = [[NSStringClass alloc] initWithUTF8String:(const char *)_s];
  if (string == nil) {
    /* not valid UTF-8: do not store a nil value in the map */
    return nil;
  }

  /* the map does not own its keys, so it gets a private copy */
  keySize = strlen((const char *)_s) + 1;
  if ((key = malloc(keySize)) == NULL) {
    /* out of memory: hand out the string without caching it */
    return string;
  }
  memcpy(key, _s, keySize);

  /* the map retains the value, the alloc/init reference goes to the caller */
  NSMapInsert(uniqueStrings, key, string);
  return string;
}

/* SAX property names understood by -setProperty:to: and -property: */
static NSString * const SaxDeclHandlerProperty =
  @"http://xml.org/sax/properties/declaration-handler";
static NSString * const SaxLexicalHandlerProperty =
  @"http://xml.org/sax/properties/lexical-handler";

/* namespace reported for every element and attribute */
static NSString * const XMLNS_XHTML = @"http://www.w3.org/1999/xhtml";

@implementation libxmlHTMLSAXDriver

/*
  The parser context is created with NULL user data, so the libxml
  callbacks below find the driver that is currently parsing through this
  file-level variable.
*/
static libxmlHTMLSAXDriver *activeDriver = nil;

/* libxml SAX callbacks, defined at the end of the implementation */
static void warning(void *udata, const char *msg, ...);
static void error(void *udata, const char *msg, ...);
static void fatalError(void *udata, const char *msg, ...);
static void setLocator(void *udata, xmlSAXLocatorPtr _locator);

/* Reads the reporting switches from the user defaults. */
+ (void)initialize {
  NSUserDefaults *defaults = [NSUserDefaults standardUserDefaults];

  reportUnclosedEntities =
    [defaults boolForKey:@"libxmlHTMLSAXDriverReportUnclosedEntityRefs"];
  reportInvalidTags =
    [defaults boolForKey:@"libxmlHTMLSAXDriverReportInvalidTags"];
}

/* Sets up a driver reporting in the XHTML namespace, entity encoding off. */
- (id)init {
  self = [super init];
  if (self == nil)
    return nil;

  self->namespaceURI   = [XMLNS_XHTML copy];
  self->encodeEntities = NO;
  return self;
}

/*
  Frees the libxml state and releases everything the driver retains.
  The locator is created (and therefore owned) by the setLocator()
  callback, so it has to be released here as well.
*/
- (void)dealloc {
  [self->locator release];
  self->locator = nil;

  [self tearDownParser];

  [self->attributes     release];
  [self->namespaceURI   release];
  [self->lexicalHandler release];
  [self->declHandler    release];
  [self->contentHandler release];
  [self->dtdHandler     release];
  [self->errorHandler   release];
  [self->entityResolver release];
  [super dealloc];
}

/* features & properties */

/* No feature is handled here; the request is logged if logging is on. */
- (void)setFeature:(NSString *)_name to:(BOOL)_value {
  if (!logUnsupportedFeatures)
    return;

  NSLog(@"%s: don't know feature %@", __PRETTY_FUNCTION__, _name);
}
/* No feature is handled here: logs the query (if enabled), answers NO. */
- (BOOL)feature:(NSString *)_name {
  if (logUnsupportedFeatures) {
    NSLog(@"%s: don't know feature %@", __PRETTY_FUNCTION__, _name);
  }
  return NO;
}

/*
  Stores the lexical or the declaration handler; any other property name
  raises a SaxNotRecognizedException.
*/
- (void)setProperty:(NSString *)_name to:(id)_value {
  if ([_name isEqualToString:SaxLexicalHandlerProperty]) {
    ASSIGN(self->lexicalHandler, _value);
  }
  else if ([_name isEqualToString:SaxDeclHandlerProperty]) {
    ASSIGN(self->declHandler, _value);
  }
  else {
    [SaxNotRecognizedException raise:@"PropertyException"
                               format:@"don't know property %@", _name];
  }
}
/*
  Returns the lexical or the declaration handler; any other property name
  raises a SaxNotRecognizedException.
*/
- (id)property:(NSString *)_name {
  BOOL isLexical, isDecl;

  isLexical = [_name isEqualToString:SaxLexicalHandlerProperty];
  if (isLexical)
    return self->lexicalHandler;

  isDecl = [_name isEqualToString:SaxDeclHandlerProperty];
  if (isDecl)
    return self->declHandler;

  [SaxNotRecognizedException raise:@"PropertyException"
                             format:@"don't know property %@", _name];
  return nil;
}

/* handlers (each setter retains the new handler via ASSIGN) */

- (void)setDTDHandler:(id<NSObject,SaxDTDHandler>)_handler {
  ASSIGN(self->dtdHandler, _handler);
}
- (id<NSObject,SaxDTDHandler>)dtdHandler {
  return self->dtdHandler;
}

- (void)setErrorHandler:(id<NSObject,SaxErrorHandler>)_handler {
  ASSIGN(self->errorHandler, _handler);
}
- (id<NSObject,SaxErrorHandler>)errorHandler {
  return self->errorHandler;
}

- (void)setEntityResolver:(id<NSObject,SaxEntityResolver>)_handler {
  ASSIGN(self->entityResolver, _handler);
}
- (id<NSObject,SaxEntityResolver>)entityResolver {
  return self->entityResolver;
}

- (void)setContentHandler:(id<NSObject,SaxContentHandler>)_handler {
  ASSIGN(self->contentHandler, _handler);
}
- (id<NSObject,SaxContentHandler>)contentHandler {
  return self->contentHandler;
}

/*
  Creates the libxml HTML push parser context for a document and makes
  this driver the active one for the C callbacks.

  _path is only handed to libxml as the document's file name.
*/
- (void)setupParserWithDocumentPath:(NSString *)_path {
  xmlSAXHandler sax;
  size_t        defaultsSize;

  if (self->ctxt != NULL) {
    NSLog(@"WARNING(%s): HTML parser context already setup !",
          __PRETTY_FUNCTION__);
    [self tearDownParser];
  }

  /*
    Start from libxml's default HTML handlers. htmlDefaultSAXHandler can be
    a smaller (SAX1) structure than xmlSAXHandler, so clear ours first and
    copy no more than the source actually provides.
  */
  memset(&sax, 0, sizeof(sax));
  defaultsSize = sizeof(htmlDefaultSAXHandler);
  if (defaultsSize > sizeof(sax))
    defaultsSize = sizeof(sax);
  memcpy(&sax, &htmlDefaultSAXHandler, defaultsSize);

  sax.error              = error;
  sax.warning            = warning;
  sax.fatalError         = fatalError;
  sax.setDocumentLocator = setLocator;

  if (activeDriver != nil) {
    NSLog(@"WARNING(%s): %@ there is an active driver set (%@), override !",
          __PRETTY_FUNCTION__, self, activeDriver);
  }
  activeDriver = self;

  /* UTF8String (unlike cString) does not raise on unconvertible paths */
  self->ctxt = htmlCreatePushParserCtxt(&sax             /* sax      */,
                                        NULL /*self*/    /* userdata */,
                                        NULL             /* chunk    */,
                                        0                /* size     */,
                                        [_path UTF8String] /* filename */,
                                        XML_CHAR_ENCODING_8859_1
                                        /* encoding */);
  if (self->ctxt == NULL) {
    NSLog(@"ERROR(%s): could not create HTML parser context !",
          __PRETTY_FUNCTION__);
  }
}
/*
  Releases the parsed document and the parser context and gives up the
  "active driver" role. Safe to call when nothing is set up.
*/
- (void)tearDownParser {
  if (activeDriver == self)
    activeDriver = nil;

  if (self->doc != NULL) {
    xmlFreeDoc(self->doc);
    self->doc = NULL;
  }
  if (self->ctxt != NULL) {
    htmlFreeParserCtxt(self->ctxt);
    self->ctxt = NULL;
  }
}

/* IO */

/* Feeds a chunk of document bytes to the push parser (no-op if empty). */
- (void)pushBytes:(const char *)_bytes count:(unsigned)_len {
  if (_len > 0) {
    NSAssert(self->ctxt, @"missing HTML parser context");
    htmlParseChunk(self->ctxt, _bytes, _len, 0);
  }
}
/*
  Tells the push parser that the input is complete and picks up the
  resulting document tree from the parser context.
*/
- (void)pushEOF {
  char dummyByte = 0;

  if (self->ctxt == NULL) {
    /* nothing to finish, self->doc stays unset */
    NSLog(@"ERROR(%s): missing HTML parser context", __PRETTY_FUNCTION__);
    return;
  }

  htmlParseChunk(self->ctxt, &dummyByte, 0, 1 /* terminate */);
  self->doc = ((xmlParserCtxtPtr)self->ctxt)->myDoc;
}

/* parsing */

/*
  Reports the SAX events of an empty <html><body></body></html> document
  to the content handler, without involving libxml.
*/
- (void)_handleEmptyDataInSystemId:(NSString *)_sysId {
  /*
     An empty HTML file _is_ valid?!
     I guess it equals to <html><body></body></html>, wrong? => hh
  */
  id handler = self->contentHandler;

  [handler startDocument];
  [handler startPrefixMapping:@"" uri:self->namespaceURI];

  [handler startElement:@"html" namespace:XMLNS_XHTML
           rawName:@"html" attributes:nil];
  [handler startElement:@"body" namespace:XMLNS_XHTML
           rawName:@"body" attributes:nil];

  [handler endElement:@"body" namespace:XMLNS_XHTML rawName:@"body"];
  [handler endElement:@"html" namespace:XMLNS_XHTML rawName:@"html"];

  [handler endPrefixMapping:@""];
  [handler endDocument];
}

/*
  Parses the given bytes into a libxml tree and replays that tree as SAX
  events. Empty (or nil) data is reported as an empty HTML document.
*/
- (void)_parseFromData:(NSData *)_data systemId:(NSString *)_sysId {
  NSAutoreleasePool *pool;

  if ([_data length] == 0) {
    [self _handleEmptyDataInSystemId:_sysId];
    return;
  }

  pool = [[NSAutoreleasePool alloc] init];

  /* parse into structure */
  [self setupParserWithDocumentPath:_sysId];
  [self pushBytes:[_data bytes] count:[_data length]];
  [self pushEOF];

  /* replay the structure as SAX events */
  if (self->doc != NULL)
    [self walkDocumentTree:self->doc];
  else
    NSLog(@"Could not parse HTML file: %@", _sysId);

  [self tearDownParser];
  [pool release];
}

/*
  Parses an NSData, NSString (converted to ISO Latin 1) or NSURL source.
  Any other kind of source is reported to the error handler as a fatal
  "SaxIOException".

  All branches run through the common exit so that the autorelease pool
  is released on every path.
*/
- (void)parseFromSource:(id)_source systemId:(NSString *)_sysId {
  NSAutoreleasePool *pool;

  pool = [[NSAutoreleasePool alloc] init];

  if ([_source isKindOfClass:[NSData class]]) {
    [self _parseFromData:_source systemId:_sysId];
  }
  else if ([_source isKindOfClass:[NSString class]]) {
    [self _parseFromData:[_source dataUsingEncoding:NSISOLatin1StringEncoding]
          systemId:_sysId];
  }
  else if ([_source isKindOfClass:[NSURL class]]) {
    NSData *data;

    data = [_source isFileURL]
      ? [NSData dataWithContentsOfMappedFile:[_source path]]
      : [_source resourceDataUsingCache:YES];

    [self _parseFromData:data systemId:[_source absoluteString]];
  }
  else {
    SaxParseException *e;
    NSDictionary      *ui;

    ui = [NSDictionary dictionaryWithObjectsAndKeys:
                         _source ? _source : @"<nil>", @"source",
                         self,                         @"parser",
                         nil];

    e = (id)[SaxParseException exceptionWithName:@"SaxIOException"
                               reason:@"can't handle data-source"
                               userInfo:ui];

    [self->errorHandler fatalError:e];
  }

  /* a no-op if the parse above already cleaned up */
  [self tearDownParser];
  [pool release];
}

/* Parses a source, deriving a placeholder system id from its class. */
- (void)parseFromSource:(id)_source {
  NSString *systemId;

  if ([_source isKindOfClass:[NSString class]])
    systemId = @"<string>";
  else if ([_source isKindOfClass:[NSData class]])
    systemId = @"<data>";
  else if ([_source isKindOfClass:[NSURL class]])
    systemId = [_source absoluteString];
  else
    systemId = @"<memory>";

  [self parseFromSource:_source systemId:systemId];
}

/*
  Loads the file named by a "file://" system id and parses its contents.
*/
- (void)parseFromSystemId:(NSString *)_sysId {
  NSAutoreleasePool *pool;
  NSData            *data;

  if (![_sysId hasPrefix:@"file://"]) {
    /*
      NOTE(review): handling of other system ids is not visible in the
      reviewed source; they are assumed to be ignored - please confirm.
    */
    return;
  }

  pool = [[NSAutoreleasePool alloc] init];

  /* cut off file:// */
  _sysId = [_sysId substringFromIndex:7];

  /* load data */
  data = [NSData dataWithContentsOfFile:_sysId];

  [self _parseFromData:data systemId:_sysId];

  [pool release];
}

/* process attribute nodes */

/*
  Fills the reusable attribute set of the driver from a libxml attribute
  list. Names are lower-cased and put into the driver's namespace, all
  attributes are reported with type CDATA.
*/
- (void)processAttributes:(xmlAttrPtr)_attributes {
  xmlAttrPtr attribute;

  /* setup or clear attribute cache */
  if (self->attributes == nil)
    self->attributes = [[SaxAttributes alloc] init];
  else
    [self->attributes clear];

  if (_attributes == NULL)
    /* nothing to process */
    return;

  for (attribute = _attributes; attribute != NULL;
       attribute = attribute->next) {
    NSString *name, *xhtmlName;
    NSString *value = nil;

    name      = xmlCharsToString(attribute->name);
    xhtmlName = [name lowercaseString];

    if (attribute->children != NULL) {
      xmlChar *t;

      if ((t = xmlNodeListGetString(self->doc, attribute->children, 0))) {
        value = xmlCharsToString(t);
        /* memory handed out by libxml belongs to libxml's allocator */
        xmlFree(t);
      }
    }

    /* attributes without a value are reported with an empty string */
    [self->attributes addAttribute:xhtmlName
                      uri:self->namespaceURI
                      rawName:xhtmlName
                      type:@"CDATA"
                      value:(value != nil ? value : (NSString *)@"")];

    /* xmlCharsToString() hands out retained strings */
    [name  release]; name  = nil;
    [value release]; value = nil;
  }
}

/* walking the tree, generating SAX events */

/* Entity references are not reported, only logged. */
- (BOOL)processEntityRefNode:(xmlNodePtr)node {
  const xmlChar *entityName = node->name;

  NSLog(@"Ignoring entity ref: '%s'\n", entityName);
  return YES;
}

/*
  Wraps the events of the document's children into the document and
  default-prefix-mapping start/end events.
*/
- (BOOL)processDocumentNode:(xmlNodePtr)node {
  id   handler = self->contentHandler;
  BOOL childrenOk;

  [handler startDocument];
  [handler startPrefixMapping:@"" uri:self->namespaceURI];

  childrenOk = [self processChildren:node->children];

  [handler endPrefixMapping:@""];
  [handler endDocument];

  return childrenOk;
}

/*
  Reports the content of a text (or CDATA) node to the content handler as
  UTF-16 characters. Missing or empty content is reported as a
  zero-length characters event. Returns NO if the content could not be
  converted.
*/
- (BOOL)processTextNode:(xmlNodePtr)_node {
  static unichar emptyChar = '\0';
  xmlChar        *encoded = NULL; /* owned, only set when re-encoding */
  const xmlChar  *utf8;
  size_t         utf8Length;
  unichar        *buffer, *cursor;
  unsigned char  *source;
  unsigned       unitCount;
  BOOL           ok = YES;

  if (self->contentHandler == nil)
    return YES;

  if (_node->content == NULL) {
    [self->contentHandler characters:&emptyChar length:0];
    return YES;
  }

  if (self->encodeEntities) {
    /* should use the HTML encoding routine (htmlEncodeEntities) ??? */
    encoded = xmlEncodeEntitiesReentrant(self->doc, _node->content);
    utf8    = encoded;
  }
  else
    utf8 = _node->content;

  utf8Length = (utf8 != NULL) ? strlen((const char *)utf8) : 0;
  if (utf8Length == 0) {
    [self->contentHandler characters:&emptyChar length:0];
    if (encoded != NULL) xmlFree(encoded);
    return YES;
  }

  /* n bytes of UTF-8 never need more than n UTF-16 units */
  buffer = calloc(utf8Length + 2, sizeof(unichar));
  if (buffer == NULL) {
    NSLog(@"ERROR(%s:%i): out of memory !", __PRETTY_FUNCTION__, __LINE__);
    if (encoded != NULL) xmlFree(encoded);
    return NO;
  }

  /* the converter advances both cursors, so it works on copies */
  source = (unsigned char *)utf8;
  cursor = buffer;

  if (_UTF8ToUTF16(&source, source + utf8Length,
                   &cursor, buffer + utf8Length)) {
    NSLog(@"ERROR(%s:%i): couldn't convert UTF8 to UTF16 !",
          __PRETTY_FUNCTION__, __LINE__);
    ok = NO;
  }
  else {
    unitCount = (unsigned)(cursor - buffer);
    [self->contentHandler characters:buffer length:unitCount];
  }

  free(buffer);
  /* the re-encoded copy was allocated by libxml */
  if (encoded != NULL) xmlFree(encoded);
  return ok;
}

/*
  Reports the (entity-encoded) content of a comment node to the lexical
  handler as UTF-16 characters. Missing or empty content is reported as a
  zero-length comment. Returns NO if the content could not be converted.
*/
- (BOOL)processCommentNode:(xmlNodePtr)_node {
  unichar       emptyChar = '\0';
  xmlChar       *encoded;
  size_t        utf8Length;
  unichar       *buffer, *cursor;
  unsigned char *source;
  unsigned      unitCount;
  BOOL          ok = YES;

  if (self->lexicalHandler == nil)
    return YES;

  if (_node->content == NULL) {
    [self->lexicalHandler comment:&emptyChar length:0];
    return YES;
  }

  /* uses the HTML encoding routine !!!!!!!!!! */
  encoded = xmlEncodeEntitiesReentrant(self->doc, _node->content);

  utf8Length = (encoded != NULL) ? strlen((const char *)encoded) : 0;
  if (utf8Length == 0) {
    [self->lexicalHandler comment:&emptyChar length:0];
    if (encoded != NULL) xmlFree(encoded);
    return YES;
  }

  /* n bytes of UTF-8 never need more than n UTF-16 units */
  buffer = calloc(utf8Length + 1, sizeof(unichar));
  if (buffer == NULL) {
    NSLog(@"ERROR(%s:%i): out of memory !", __PRETTY_FUNCTION__, __LINE__);
    xmlFree(encoded);
    return NO;
  }

  /* the converter advances both cursors, so it works on copies */
  source = (unsigned char *)encoded;
  cursor = buffer;

  if (_UTF8ToUTF16(&source, source + utf8Length,
                   &cursor, buffer + utf8Length)) {
    NSLog(@"ERROR(%s:%i): couldn't convert UTF8 to UTF16 !",
          __PRETTY_FUNCTION__, __LINE__);
    ok = NO;
  }
  else {
    unitCount = (unsigned)(cursor - buffer);
    [self->lexicalHandler comment:buffer length:unitCount];
  }

  free(buffer);
  /* the encoded copy was allocated by libxml */
  xmlFree(encoded);
  return ok;
}

/* DTD nodes produce no SAX events. */
- (BOOL)processDTDNode:(xmlNodePtr)node {
  /* do nothing with DTD nodes .. */
  BOOL handled = YES;
  return handled;
}
/* Entity nodes produce no SAX events, they are only logged. */
- (BOOL)processEntityNode:(xmlNodePtr)node {
  BOOL handled = YES;

  /* do nothing with entity nodes .. */
  NSLog(@"%s:%i: ignoring entity node ..", __PRETTY_FUNCTION__, __LINE__);
  return handled;
}
/* Processing instruction nodes produce no SAX events. */
- (BOOL)processPINode:(xmlNodePtr)node {
  /* do nothing with PI nodes .. */
  BOOL handled = YES;
  return handled;
}

/*
  Reports an element: start event (lower-cased name, driver namespace,
  converted attributes), the events of its children, then the end event.
  Returns the result of processing the children.
*/
- (BOOL)processElementNode:(xmlNodePtr)node {
  NSString *tagName, *xhtmlName;
  BOOL     result;

  self->depth++;

  tagName   = xmlCharsToString(node->name);
  xhtmlName = [tagName lowercaseString];

  [self processAttributes:node->properties];

  [self->contentHandler
       startElement:xhtmlName
       namespace:self->namespaceURI
       rawName:xhtmlName
       attributes:self->attributes];

  /* the attribute set is reused by the child elements */
  [self->attributes clear];

  result = [self processChildren:node->children];

  [self->contentHandler
       endElement:xhtmlName
       namespace:self->namespaceURI
       rawName:xhtmlName];

  self->depth--;

  /* xmlCharsToString() hands out a retained string */
  [tagName release];
  return result;
}

/*
  Processes a node and all of its following siblings. The results of the
  individual nodes are not evaluated, the method always answers YES.
*/
- (BOOL)processChildren:(xmlNodePtr)children {
  xmlNodePtr current = children;

  while (current != NULL) {
    [self processNode:current];
    current = current->next;
  }
  return YES;
}

/* Dispatches a libxml node to the handler method for its node type. */
- (BOOL)processNode:(xmlNodePtr)_node {
  BOOL result = NO;

  switch(_node->type) {
    case XML_ELEMENT_NODE:
      result = [self processElementNode:_node];
      break;

    case XML_ATTRIBUTE_NODE:
      NSLog(@"invalid place for attribute-node !");
      break;

    case HTML_TEXT_NODE:
    case XML_CDATA_SECTION_NODE:
      /* CDATA sections are reported like plain text */
      result = [self processTextNode:_node];
      break;

    case HTML_ENTITY_REF_NODE:
      result = [self processEntityRefNode:_node];
      break;

    case XML_ENTITY_NODE:
      result = [self processEntityNode:_node];
      break;

    case HTML_PI_NODE:
      result = [self processPINode:_node];
      break;

    case HTML_COMMENT_NODE:
      result = [self processCommentNode:_node];
      break;

    case XML_HTML_DOCUMENT_NODE:
      result = [self processDocumentNode:_node];
      break;

    case XML_DTD_NODE:
      result = [self processDTDNode:_node];
      break;

    default:
      NSLog(@"WARNING: UNKNOWN node type %i\n", _node->type);
      break;
  }
  return result;
}

/*
  Replays the given libxml document as SAX events. Returns NO if no
  document is passed in.

  The document's node type is temporarily set to XML_HTML_DOCUMENT_NODE
  so that -processNode: dispatches it to -processDocumentNode:; the
  previous type is restored afterwards.
*/
- (BOOL)walkDocumentTree:(xmlDocPtr)_doc {
  xmlElementType savedType;
  BOOL           result;

  if (_doc == NULL)
    return NO;

  savedType  = _doc->type;
  _doc->type = XML_HTML_DOCUMENT_NODE;

  result = [self processNode:(xmlNodePtr)_doc];

  _doc->type = savedType;

  return result;
}

/* callbacks */

/*
  Builds the exception object for a libxml message.

  msg is a printf-style format which is expanded with the arguments in
  va; only its first line is used as the exception reason. The user info
  carries the parser, the element depth, the full message and whatever
  position information the locator provides.
*/
static SaxParseException *
mkException(libxmlHTMLSAXDriver *self, NSString *key,
            const char *msg, va_list va)
{
  NSString          *s, *reason;
  NSDictionary      *ui;
  SaxParseException *e;
  int               count = 0;
  id                keys[7], values[7];
  id                tmp;
  NSRange           r;
  int               i;

  /* libxml may call without a message, a nil format must not be expanded */
  s = (msg != NULL) ? [NSString stringWithUTF8String:msg] : nil;
  s = (s != nil)
    ? [[[NSString alloc] initWithFormat:s arguments:va] autorelease]
    : (NSString *)@"";

  r = [s rangeOfString:@"\n"];
  reason = (r.length > 0)
    ? [s substringToIndex:r.location]
    : s;

  if ([reason length] == 0)
    reason = @"unknown reason";

  keys[0] = @"parser"; values[0] = self; count++;
  keys[1] = @"depth";
  values[1] = [NSNumber numberWithInt:self->depth]; count++;

  if ([s length] > 0) {
    keys[count]   = @"errorMessage";
    values[count] = s;
    count++;
  }

  // NSLog(@"locator: %@", self->locator);

  if ((i = [self->locator lineNumber]) >= 0) {
    keys[count]   = @"line";
    values[count] = [NSNumber numberWithInt:i];
    count++;
  }
  if ((i = [self->locator columnNumber]) >= 0) {
    keys[count]   = @"column";
    values[count] = [NSNumber numberWithInt:i];
    count++;
  }
  if ((tmp = [self->locator publicId])) {
    keys[count]   = @"publicId";
    values[count] = tmp;
    count++;
  }
  if ((tmp = [self->locator systemId])) {
    keys[count]   = @"systemId";
    values[count] = tmp;
    count++;
  }

  ui = [NSDictionary dictionaryWithObjects:values forKeys:keys count:count];

  e = (id)[SaxParseException exceptionWithName:key
                             reason:reason
                             userInfo:ui];
  return e;
}

/* libxml warning callback: forwards to the active driver's error handler */
static void warning(void *udata, const char *msg, ...) {
  libxmlHTMLSAXDriver *driver = activeDriver;
  SaxParseException   *e;
  va_list             args;

  if (driver == nil) {
    NSLog(@"ERROR(%s): no driver is active !", __PRETTY_FUNCTION__);
    return;
  }

  va_start(args, msg);
  e = mkException(driver, @"SAXWarning", msg, args);
  va_end(args);

  [driver->errorHandler warning:e];
}

/*
  libxml error callback: forwards to the active driver's error handler.

  Messages about invalid tags / unexpected end tags and about unclosed
  entity references are dropped unless the respective "report" switch is
  set. The messages are recognised by comparing the start of the format
  string libxml passes in (case-insensitively) with the known texts.
*/
static void error(void *udata, const char *msg, ...) {
  va_list           args;
  SaxParseException *e;

  if (!reportInvalidTags && msg != NULL) {
    if (toupper(msg[0]) == 'T') {
      if (strncasecmp(tagInvalidMsg, msg, strlen(tagInvalidMsg)) == 0)
        return;
    }
    else if (toupper(msg[0]) == 'U') {
      if (strncasecmp(unexpectedNobrCloseMsg, msg,
                      strlen(unexpectedNobrCloseMsg)) == 0)
        return;
    }
  }

  if (!reportUnclosedEntities && msg != NULL && toupper(msg[0]) == 'H') {
    if (strncasecmp(unclosedEntityInvalidMsg, msg,
                    strlen(unclosedEntityInvalidMsg)) == 0)
      return;
  }

  if (activeDriver == nil) {
    NSLog(@"ERROR(%s): no driver is active !", __PRETTY_FUNCTION__);
    return;
  }

  /* msg is a format, eg 'tag %s is invalid' */

  va_start(args, msg);
  e = mkException(activeDriver, @"SAXError", msg, args);
  va_end(args);

  [activeDriver->errorHandler error:e];
}

/* libxml fatal-error callback: forwards to the active driver's handler */
static void fatalError(void *udata, const char *msg, ...) {
  libxmlHTMLSAXDriver *driver = activeDriver;
  SaxParseException   *e;
  va_list             args;

  if (driver == nil) {
    NSLog(@"ERROR(%s): no driver is active !", __PRETTY_FUNCTION__);
    return;
  }

  va_start(args, msg);
  e = mkException(driver, @"SAXFatalError", msg, args);
  va_end(args);

  [driver->errorHandler fatalError:e];
}

/*
  libxml locator callback: replaces the active driver's locator object
  with a fresh one wrapping the libxml locator and announces it to the
  content handler.
*/
static void setLocator(void *udata, xmlSAXLocatorPtr _locator) {
  libxmlHTMLSAXDriver *driver = activeDriver;

  if (driver == nil) {
    NSLog(@"ERROR(%s): no driver is active !", __PRETTY_FUNCTION__);
    return;
  }

  [driver->locator release];

  driver->locator = [[libxmlSAXLocator alloc]
                      initWithSaxLocator:_locator
                      parser:driver];
  driver->locator->ctx = driver->ctxt;

  [driver->contentHandler setDocumentLocator:driver->locator];
}

832 @end /* libxmlHTMLSAXDriver */