2 Copyright (C) 2000-2004 SKYRIX Software AG
4 This file is part of OpenGroupware.org.
6 OGo is free software; you can redistribute it and/or modify it under
7 the terms of the GNU Lesser General Public License as published by the
8 Free Software Foundation; either version 2, or (at your option) any
11 OGo is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
14 License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with OGo; see the file COPYING. If not, write to the
18 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
21 // $Id: libxmlHTMLSAXDriver.m,v 1.5 2004/05/07 16:31:22 helge Exp $
23 #import "libxmlHTMLSAXDriver.h"
24 #import "libxmlSAXLocator.h"
25 #include "TableCallbacks.h"
26 #include <SaxObjC/SaxObjC.h>
27 #include <SaxObjC/SaxException.h>
30 #include <libxml/HTMLparser.h>
31 #include <libxml/HTMLtree.h>
/*
  Private category declaring helper methods that are implemented further
  down in this file: the parser teardown and the three methods that walk
  the parsed libxml tree (whole document, single node, sibling list).
*/
33 @interface libxmlHTMLSAXDriver(PrivateMethods)
35 - (void)tearDownParser;
37 - (BOOL)walkDocumentTree:(xmlDocPtr)_doc;
38 - (BOOL)processNode:(xmlNodePtr)_node;
39 - (BOOL)processChildren:(xmlNodePtr)children;
/*
  Forward declaration of the UTF-8 to UTF-16 conversion routine called by
  the text and comment node handlers; it is defined elsewhere. Both callers
  treat a non-zero result as a conversion failure.
*/
43 static int _UTF8ToUTF16(unsigned char **sourceStart, unsigned char *sourceEnd,
44 unichar **targetStart, const unichar *targetEnd);
/*
  File-level state:
  - logUnsupportedFeatures gates the NSLog in -setFeature:to: / -feature:
  - reportInvalidTags / reportUnclosedEntities are read from user defaults
    and consulted by the error() callback
  - uniqueStrings is the cache (C string key, NSString value) used by
    xmlCharsToString; the original author flagged it THREAD
  - NSStringClass caches [NSString class] on first use in xmlCharsToString
*/
46 static BOOL logUnsupportedFeatures = NO;
47 static BOOL reportInvalidTags = NO;
48 static BOOL reportUnclosedEntities = NO;
49 static NSMapTable *uniqueStrings = NULL; // THREAD
50 static Class NSStringClass = Nil;
52 /* error string detection */
54 TODO: obviously this may change between libxml versions or even
55 localisations ... why doesn't libxml support error codes ?
/*
  Message prefixes that the error() callback compares (case-insensitively,
  with strncasecmp over the length of the prefix) against the message it
  receives. The tag/end-tag prefixes are only checked while
  reportInvalidTags is off, the entity prefix only while
  reportUnclosedEntities is off.
  NOTE(review): declared as unsigned char pointers but handed to
  strlen/strncasecmp, which take plain char pointers - confirm this builds
  without sign warnings on the target compilers.
*/
58 static const unsigned char *tagInvalidMsg = "tag %s invalid";
59 static const unsigned char *unclosedEntityInvalidMsg =
60 "htmlParseEntityRef: expecting ';'";
62 static const unsigned char *unexpectedNobrCloseMsg =
63 "Unexpected end tag : %s";
/*
  Maps a NUL-terminated libxml string to an NSString, going through the
  file-level uniqueStrings cache.

  _s - the libxml string; NULL yields nil.

  On a cache hit the cached NSString is used. On a miss a new NSString is
  created from the bytes interpreted as UTF-8 and inserted into the cache.
  NOTE(review): the cache is a plain static without locking (see the
  THREAD marker on its declaration) - confirm single-threaded use.
*/
66 static inline NSString *xmlCharsToString(const xmlChar *_s) {
70 if (_s == NULL) return nil;
/* the cache table is created lazily on the first non-NULL call */
72 if (uniqueStrings == NULL) {
73 uniqueStrings = NSCreateMapTable(libxmlNonOwnedCStringMapKeyCallBacks,
74 NSObjectMapValueCallBacks,
77 else if ((s = NSMapGet(uniqueStrings, _s))) {
78 /* found a string in cache ... */
/* cache miss: the map key lives in its own buffer sized for _s plus NUL */
82 newkey = malloc(strlen(_s) + 1);
85 if (NSStringClass == Nil)
86 NSStringClass = [NSString class];
/* build the NSString from UTF-8 and register it under the new key */
88 s = [[NSStringClass alloc] initWithUTF8String:_s];
89 NSMapInsert(uniqueStrings, newkey, s);
/*
  Property URIs recognized by -setProperty:to: and -property: (declaration
  handler and lexical handler), plus the XHTML namespace URI that is
  reported as the namespace of the generated element events.
*/
93 static NSString *SaxDeclHandlerProperty =
94 @"http://xml.org/sax/properties/declaration-handler";
95 static NSString *SaxLexicalHandlerProperty =
96 @"http://xml.org/sax/properties/lexical-handler";
98 static NSString *XMLNS_XHTML = @"http://www.w3.org/1999/xhtml";
/*
  Start of the driver implementation.

  activeDriver is the driver instance the C-level libxml callbacks below
  (warning, error, fatalError, setLocator) operate on; they read this
  global instead of their udata argument. The four prototypes
  forward-declare those callbacks.
*/
100 @implementation libxmlHTMLSAXDriver
102 static libxmlHTMLSAXDriver *activeDriver = nil;
103 static void warning(void *udata, const char *msg, ...);
104 static void error(void *udata, const char *msg, ...);
105 static void fatalError(void *udata, const char *msg, ...);
106 static void setLocator(void *udata, xmlSAXLocatorPtr _locator);
/*
  Loads the two error-reporting switches from the standard user defaults:
  libxmlHTMLSAXDriverReportInvalidTags and
  libxmlHTMLSAXDriverReportUnclosedEntityRefs.
  NOTE(review): presumably the body of +initialize - confirm against the
  method header.
*/
109 NSUserDefaults *ud = [NSUserDefaults standardUserDefaults];
111 reportInvalidTags = [ud boolForKey:@"libxmlHTMLSAXDriverReportInvalidTags"];
112 reportUnclosedEntities =
113 [ud boolForKey:@"libxmlHTMLSAXDriverReportUnclosedEntityRefs"];
/*
  When the superclass initializer succeeds: default the element namespace
  to a copy of the XHTML namespace URI and switch off entity encoding of
  text node content (see -processTextNode:).
  NOTE(review): presumably the body of -init - confirm against the header.
*/
117 if ((self = [super init])) {
118 self->namespaceURI = [XMLNS_XHTML copy];
119 self->encodeEntities = NO;
/*
  Cleanup: tears down any parser state first, then releases the attribute
  cache, the namespace URI and every handler reference held by the driver.
  NOTE(review): presumably the body of -dealloc - confirm against the
  header.
*/
125 [self tearDownParser];
127 [self->attributes release];
128 [self->namespaceURI release];
129 [self->lexicalHandler release];
130 [self->declHandler release];
131 [self->contentHandler release];
132 [self->dtdHandler release];
133 [self->errorHandler release];
134 [self->entityResolver release];
138 /* features & properties */
/*
  SAX feature setter. The code below acts on no feature; it only logs the
  requested name when logUnsupportedFeatures is enabled. _value is not used
  by the statements shown.
*/
140 - (void)setFeature:(NSString *)_name to:(BOOL)_value {
141 if (logUnsupportedFeatures)
142 NSLog(@"%s: don't know feature %@", __PRETTY_FUNCTION__, _name);
/*
  SAX feature getter. Logs the requested name when logUnsupportedFeatures
  is enabled; no feature name is inspected by the statements shown.
*/
144 - (BOOL)feature:(NSString *)_name {
145 if (logUnsupportedFeatures)
146 NSLog(@"%s: don't know feature %@", __PRETTY_FUNCTION__, _name);
/*
  SAX property setter.

  _name  - property URI; the lexical-handler and declaration-handler URIs
           are recognized
  _value - handler object, retained via ASSIGN into the matching ivar

  Control that reaches the end of the method raises a
  SaxNotRecognizedException named PropertyException.
*/
150 - (void)setProperty:(NSString *)_name to:(id)_value {
151 if ([_name isEqualToString:SaxLexicalHandlerProperty]) {
152 ASSIGN(self->lexicalHandler, _value);
155 if ([_name isEqualToString:SaxDeclHandlerProperty]) {
156 ASSIGN(self->declHandler, _value);
160 [SaxNotRecognizedException raise:@"PropertyException"
161 format:@"don't know property %@", _name];
/*
  SAX property getter: returns the lexical handler or the declaration
  handler for the corresponding property URI. Any other name raises a
  SaxNotRecognizedException named PropertyException.
*/
163 - (id)property:(NSString *)_name {
164 if ([_name isEqualToString:SaxLexicalHandlerProperty])
165 return self->lexicalHandler;
166 if ([_name isEqualToString:SaxDeclHandlerProperty])
167 return self->declHandler;
169 [SaxNotRecognizedException raise:@"PropertyException"
170 format:@"don't know property %@", _name];
/* Sets the DTD handler; ASSIGN stores the new handler in the ivar. */
176 - (void)setDTDHandler:(id<NSObject,SaxDTDHandler>)_handler {
177 ASSIGN(self->dtdHandler, _handler);
/* Returns the currently set DTD handler (may be nil). */
179 - (id<NSObject,SaxDTDHandler>)dtdHandler {
180 return self->dtdHandler;
/*
  Sets the handler that receives the warning, error and fatalError
  exceptions built by the libxml callbacks in this file.
*/
183 - (void)setErrorHandler:(id<NSObject,SaxErrorHandler>)_handler {
184 ASSIGN(self->errorHandler, _handler);
/* Returns the currently set error handler (may be nil). */
186 - (id<NSObject,SaxErrorHandler>)errorHandler {
187 return self->errorHandler;
/* Sets the entity resolver; ASSIGN stores the new resolver in the ivar. */
190 - (void)setEntityResolver:(id<NSObject,SaxEntityResolver>)_handler {
191 ASSIGN(self->entityResolver, _handler);
/* Returns the currently set entity resolver (may be nil). */
193 - (id<NSObject,SaxEntityResolver>)entityResolver {
194 return self->entityResolver;
/*
  Sets the handler that receives the document, element and character
  events generated while walking the parsed tree.
*/
197 - (void)setContentHandler:(id<NSObject,SaxContentHandler>)_handler {
198 ASSIGN(self->contentHandler, _handler);
/* Returns the currently set content handler (may be nil). */
200 - (id<NSObject,SaxContentHandler>)contentHandler {
201 return self->contentHandler;
/*
  Creates the libxml HTML push-parser context for a new parse.

  _path - used as the filename handed to libxml

  An already existing context is torn down first (with a warning). The SAX
  callback table starts as a copy of libxml's default HTML handler with the
  warning, fatalError and setDocumentLocator slots replaced by the static
  functions of this file. The context is created with NULL user data and
  ISO-8859-1 as the initial encoding.
  NOTE(review): because the user data is NULL the callbacks depend on the
  activeDriver global - confirm where it is assigned and that two drivers
  never parse concurrently.
*/
206 - (void)setupParserWithDocumentPath:(NSString *)_path {
209 if (self->ctxt != NULL) {
210 NSLog(@"WARNING(%s): HTML parser context already setup !",
211 __PRETTY_FUNCTION__);
212 [self tearDownParser];
215 memcpy(&sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandler));
217 sax.warning = warning;
218 sax.fatalError = fatalError;
219 sax.setDocumentLocator = setLocator;
/* another driver being registered is only reported, not treated as fatal */
221 if (activeDriver != nil) {
222 NSLog(@"WARNING(%s): %@ there is an active driver set (%@), override !",
223 __PRETTY_FUNCTION__, self, activeDriver);
227 self->ctxt = htmlCreatePushParserCtxt(&sax /* sax */,
228 NULL /*self*/ /* userdata */,
231 [_path cString] /* filename */,
232 XML_CHAR_ENCODING_8859_1
/*
  Releases the per-parse libxml state: checks whether this instance is the
  active driver, frees the parsed document and frees the HTML parser
  context.
  NOTE(review): the guards around the two free calls and the resetting of
  the ivars are not part of the statements shown - confirm that doc and
  ctxt are cleared so a second call is harmless (it is called repeatedly,
  e.g. from the parse methods and from cleanup).
*/
236 - (void)tearDownParser {
237 if (activeDriver == self)
241 xmlFreeDoc(self->doc);
245 htmlFreeParserCtxt(self->ctxt);
/*
  Feeds a chunk of raw bytes to the push parser.

  _bytes - the data to parse
  _len   - number of bytes; a zero length is ignored

  Asserts that a parser context exists. The chunk is passed with the
  terminate flag set to 0, so the parse stays open for further chunks.
*/
252 - (void)pushBytes:(const char *)_bytes count:(unsigned)_len {
253 if (_len == 0) return;
254 NSAssert(self->ctxt, @"missing HTML parser context");
255 htmlParseChunk(self->ctxt, _bytes, _len, 0);
/*
  Finishes the push parse: sends a zero-length chunk with the terminate
  flag set, then takes over the document tree the parser context built.
  NOTE(review): presumably the body of an end-of-input method - confirm
  against the method header.
*/
259 htmlParseChunk(self->ctxt, &dummyByte, 0, 1 /* terminate */);
260 self->doc = ((xmlParserCtxtPtr)ctxt)->myDoc;
/*
  Reports an empty input as if it were an empty html/body document: emits
  startDocument, the default-prefix namespace mapping, start and end events
  for html and body (in the XHTML namespace, without attributes), then the
  matching end-prefix-mapping and endDocument events.
  _sysId is not used by the statements shown.
*/
265 - (void)_handleEmptyDataInSystemId:(NSString *)_sysId {
267 An empty HTML file _is_ valid?!
268 I guess it equals to <html><body></body></html>, wrong? => hh
270 [self->contentHandler startDocument];
271 [self->contentHandler startPrefixMapping:@"" uri:self->namespaceURI];
273 [self->contentHandler
274 startElement:@"html" namespace:XMLNS_XHTML
275 rawName:@"html" attributes:nil];
276 [self->contentHandler
277 startElement:@"body" namespace:XMLNS_XHTML
278 rawName:@"body" attributes:nil];
280 [self->contentHandler
281 endElement:@"body" namespace:XMLNS_XHTML rawName:@"body"];
282 [self->contentHandler
283 endElement:@"html" namespace:XMLNS_XHTML rawName:@"html"];
285 [self->contentHandler endPrefixMapping:@""];
286 [self->contentHandler endDocument];
/*
  Parses a complete in-memory document and replays it as SAX events.

  _data  - the raw HTML bytes; empty data is handled by
           -_handleEmptyDataInSystemId:
  _sysId - passed to the parser setup as document path and used in the
           failure log message

  Flow: set up the parser, push all bytes, then either log a failure when
  no document tree resulted or walk the tree; the parser is torn down in
  both cases. The work runs inside a local autorelease pool.
*/
289 - (void)_parseFromData:(NSData *)_data systemId:(NSString *)_sysId {
290 NSAutoreleasePool *pool;
292 if ([_data length] == 0) {
293 [self _handleEmptyDataInSystemId:_sysId];
297 pool = [[NSAutoreleasePool alloc] init];
299 /* parse into structure */
300 [self setupParserWithDocumentPath:_sysId];
301 [self pushBytes:[_data bytes] count:[_data length]];
304 if (self->doc == NULL) {
305 NSLog(@"Could not parse HTML file: %@", _sysId);
306 [self tearDownParser];
309 [self walkDocumentTree:self->doc];
310 [self tearDownParser];
/*
  Parses from an arbitrary source object.

  _source - NSData (parsed as is), NSString (converted to ISO Latin 1
            bytes) or NSURL (file URLs are read as a mapped file, other
            URLs through resourceDataUsingCache:)
  _sysId  - system id forwarded to the data parser for NSData sources; for
            NSURL sources the absolute URL string is forwarded instead

  For a source that matches none of these classes a SaxParseException
  named SaxIOException is built and passed to the error handler as a fatal
  error.
  NOTE(review): the NSString branch encodes with ISO Latin 1; characters
  outside that encoding may not survive - confirm this is intended.
*/
316 - (void)parseFromSource:(id)_source systemId:(NSString *)_sysId {
317 NSAutoreleasePool *pool;
319 pool = [[NSAutoreleasePool alloc] init];
321 if ([_source isKindOfClass:[NSData class]]) {
322 [self _parseFromData:_source systemId:_sysId];
325 if ([_source isKindOfClass:[NSString class]]) {
326 [self _parseFromData:[_source dataUsingEncoding:NSISOLatin1StringEncoding]
330 if ([_source isKindOfClass:[NSURL class]]) {
333 data = [_source isFileURL]
334 ? [NSData dataWithContentsOfMappedFile:[_source path]]
335 : [_source resourceDataUsingCache:YES];
337 [self _parseFromData:data systemId:[_source absoluteString]];
342 SaxParseException *e;
345 ui = [NSDictionary dictionaryWithObjectsAndKeys:
346 _source ? _source : @"<nil>", @"source",
350 e = (id)[SaxParseException exceptionWithName:@"SaxIOException"
351 reason:@"can't handle data-source"
354 [self->errorHandler fatalError:e];
357 [self tearDownParser];
/*
  Convenience entry point that derives a system id from the class of the
  source and forwards to -parseFromSource:systemId: - the placeholder
  <string> for NSString, <data> for NSData, the absolute URL string for
  NSURL and <memory> in the remaining case.
*/
361 - (void)parseFromSource:(id)_source {
362 if ([_source isKindOfClass:[NSString class]])
363 [self parseFromSource:_source systemId:@"<string>"];
364 else if ([_source isKindOfClass:[NSData class]])
365 [self parseFromSource:_source systemId:@"<data>"];
366 else if ([_source isKindOfClass:[NSURL class]])
367 [self parseFromSource:_source systemId:[_source absoluteString]];
369 [self parseFromSource:_source systemId:@"<memory>"];
/*
  Parses the document named by a system id.

  _sysId - ids with a file:// prefix take the local path below: the prefix
           (7 characters) is cut off, the file is loaded into an NSData and
           handed to -_parseFromData:systemId: with the remaining path as
           system id.

  NOTE(review): how ids without the file:// prefix are handled, and what
  happens when the file cannot be read (data is nil), is not covered by
  the statements shown - confirm.
*/
372 - (void)parseFromSystemId:(NSString *)_sysId {
373 NSAutoreleasePool *pool;
376 if (![_sysId hasPrefix:@"file://"]) {
381 pool = [[NSAutoreleasePool alloc] init];
383 /* cut off file:// */
384 _sysId = [_sysId substringFromIndex:7];
387 data = [NSData dataWithContentsOfFile:_sysId];
389 [self _parseFromData:data systemId:_sysId];
394 /* process attribute nodes */
/*
  Fills the reusable SaxAttributes cache from a libxml attribute list.

  _attributes - head of the attribute list of an element; NULL means there
                is nothing to add

  Each attribute is added under its lower-cased name, in the namespace of
  the driver, with type CDATA. The value is the string content of the
  attribute's child nodes when it has any.
*/
396 - (void)processAttributes:(xmlAttrPtr)_attributes {
397 xmlAttrPtr attribute;
399 /* setup or clear attribute cache */
400 if (self->attributes == nil)
401 attributes = [[SaxAttributes alloc] init];
405 if (_attributes == NULL)
406 /* nothing to process */
411 for (attribute = _attributes; attribute; attribute = attribute->next) {
412 NSString *name, *xhtmlName;
415 printf("attr name '%s' has NS '%s'\n",
416 attribute->name, attribute->ns ? "yes" : "no");
419 name = xmlCharsToString(attribute->name);
420 xhtmlName = [name lowercaseString];
423 if (attribute->children) {
/*
  NOTE(review): libxml documents that the string returned by
  xmlNodeListGetString has to be released with xmlFree; plain free is only
  equivalent while libxml uses the default allocator.
*/
426 if ((t = xmlNodeListGetString(doc, attribute->children, 0))) {
427 value = xmlCharsToString(t);
428 free(t); /* should be xmlFree ?? */
432 [attributes addAttribute:xhtmlName
433 uri:self->namespaceURI
435 type:@"CDATA" value:value];
/*
  NOTE(review): name and value come from xmlCharsToString, which also
  stores them in the uniqueStrings cache - confirm the ownership these
  release calls assume.
*/
437 [name release]; name = nil;
438 [value release]; value = nil;
444 /* walking the tree, generating SAX events */
/* Entity reference nodes are not turned into events; the name is logged. */
446 - (BOOL)processEntityRefNode:(xmlNodePtr)node {
447 NSLog(@"Ignoring entity ref: '%s'\n", node->name);
/*
  Handles the document node: brackets the processing of its children with
  startDocument plus the default-prefix namespace mapping on the way in and
  the matching end events on the way out. result holds what
  -processChildren: reported.
*/
451 - (BOOL)processDocumentNode:(xmlNodePtr)node {
454 [self->contentHandler startDocument];
455 [self->contentHandler startPrefixMapping:@"" uri:self->namespaceURI];
456 result = [self processChildren:node->children];
457 [self->contentHandler endPrefixMapping:@""];
458 [self->contentHandler endDocument];
/*
  Reports the content of a text (or CDATA) node through the characters
  event of the content handler.

  _node - the libxml node; missing or empty content is reported as a
          zero-length characters event

  The node content (entity-encoded first when encodeEntities is set) is
  converted from UTF-8 into a temporary UTF-16 buffer, which is freed after
  the handler call and also on conversion failure.
  NOTE(review): the target-end argument (ts + len * sizeof(unichar)) and
  the later division of the pointer difference by 2 are only correct if ts
  and data are byte-addressed pointers (e.g. void *); their declarations
  are not among the statements shown - confirm, since with unichar
  pointers the bound would exceed the len + 2 elements allocated.
  NOTE(review): the result of xmlEncodeEntitiesReentrant is newly
  allocated by libxml - confirm it is released on every path.
*/
463 - (BOOL)processTextNode:(xmlNodePtr)_node {
464 static unichar c = '\0';
468 if (self->contentHandler == nil)
471 if (_node->content == NULL) {
472 [self->contentHandler characters:&c length:0];
476 if (self->encodeEntities) {
477 /* should use the HTML encoding routine (htmlEncodeEntities) ??? */
479 chars = xmlEncodeEntitiesReentrant(self->doc, _node->content);
482 chars = _node->content;
485 [self->contentHandler characters:&c length:0];
488 if ((len = strlen(chars)) == 0) {
490 [self->contentHandler characters:&c length:0];
497 data = ts = calloc(len + 2, sizeof(unichar)); /* GC ?! */
499 if (_UTF8ToUTF16((void *)&chars, (void *)(chars + len),
500 (void *)&ts, ts + (len * sizeof(unichar)))) {
501 NSLog(@"ERROR(%s:%i): couldn't convert UTF8 to UTF16 !",
502 __PRETTY_FUNCTION__, __LINE__);
503 if (data) free(data);
/* ts was advanced by the converter; the difference gives the output size */
507 len = (ts - data) / 2;
508 [self->contentHandler characters:data length:len];
510 if (data) free(data);
/*
  Reports a comment node through the comment event of the lexical handler.

  _node - the libxml comment node

  Content is entity-encoded with xmlEncodeEntitiesReentrant, converted
  from UTF-8 to a temporary UTF-16 buffer and passed to the handler; the
  remaining paths report a zero-length comment.
  NOTE(review): same pointer-arithmetic caveat as in -processTextNode: -
  the bound ts + len * sizeof(unichar) and the division by 2 depend on the
  declared type of ts/data; here the buffer holds len + 1 elements.
  NOTE(review): freeing of the conversion buffer and of the encoded string
  is not among the statements shown - confirm neither leaks.
*/
516 - (BOOL)processCommentNode:(xmlNodePtr)_node {
519 if (self->lexicalHandler == nil)
522 if (_node->content) {
525 /* uses the HTML encoding routine !!!!!!!!!! */
526 chars = xmlEncodeEntitiesReentrant(self->doc, _node->content);
529 [self->lexicalHandler comment:&c length:0];
534 if ((len = strlen(chars)) > 0) {
537 data = ts = calloc(len + 1, sizeof(unichar)); /* GC ?! */
539 if (_UTF8ToUTF16((void *)&chars, (void *)(chars + len),
540 (void *)&ts, ts + (len * sizeof(unichar)))) {
542 NSLog(@"ERROR(%s:%i): couldn't convert UTF8 to UTF16 !",
543 __PRETTY_FUNCTION__, __LINE__);
547 len = (ts - data) / 2;
548 [self->lexicalHandler comment:data length:len];
554 [self->lexicalHandler comment:&c length:0];
559 [self->lexicalHandler comment:&c length:0];
/* DTD nodes produce no SAX events. */
564 - (BOOL)processDTDNode:(xmlNodePtr)node {
565 /* do nothing with DTD nodes .. */
/* Entity nodes produce no SAX events; a log line records the skip. */
568 - (BOOL)processEntityNode:(xmlNodePtr)node {
569 /* do nothing with entity nodes .. */
570 NSLog(@"%s:%i: ignoring entity node ..", __PRETTY_FUNCTION__, __LINE__);
/* Processing-instruction nodes produce no SAX events. */
573 - (BOOL)processPINode:(xmlNodePtr)node {
574 /* do nothing with PI nodes .. */
/*
  Turns an element node into a startElement / children / endElement event
  sequence.

  node - the libxml element node

  The element name is lower-cased before it is reported, and the namespace
  is always the namespace URI of the driver. The attribute cache is filled
  before startElement and cleared right after it, so that nested elements
  can reuse it. result holds what -processChildren: reported.
  tagInfo is looked up via htmlTagLookup; it is not used by any of the
  statements shown.
*/
578 - (BOOL)processElementNode:(xmlNodePtr)node {
579 const htmlElemDesc *tagInfo;
580 NSString *tagName, *xhtmlName;
585 tagInfo = htmlTagLookup(node->name);
586 tagName = xmlCharsToString(node->name);
587 xhtmlName = [tagName lowercaseString];
589 [self processAttributes:node->properties];
591 [self->contentHandler
592 startElement:xhtmlName
593 namespace:self->namespaceURI
595 attributes:self->attributes];
597 [self->attributes clear];
599 result = [self processChildren:node->children];
601 [self->contentHandler
603 namespace:self->namespaceURI
/*
  Processes a sibling list in document order by following the next
  pointers from the given first child. The per-node result of
  -processNode: is not checked inside the loop, so a failing node does not
  stop the walk.
*/
612 - (BOOL)processChildren:(xmlNodePtr)children {
615 if (children == NULL)
618 for (node = children; node; node = node->next) {
619 [self processNode:node];
/*
  Dispatches a libxml node to the handler method for its node type
  (element, text/CDATA, entity reference, entity, processing instruction,
  comment, document, DTD). An attribute node at this level is reported as
  misplaced, and a type without a case is logged as unknown.
*/
625 - (BOOL)processNode:(xmlNodePtr)_node {
626 switch(_node->type) {
627 case XML_ELEMENT_NODE:
628 return [self processElementNode:_node];
630 case XML_ATTRIBUTE_NODE:
631 NSLog(@"invalid place for attribute-node !");
635 return [self processTextNode:_node];
/* CDATA sections are reported exactly like plain text */
637 case XML_CDATA_SECTION_NODE:
638 return [self processTextNode:_node];
640 case HTML_ENTITY_REF_NODE:
641 return [self processEntityRefNode:_node];
643 case XML_ENTITY_NODE:
644 return [self processEntityNode:_node];
647 return [self processPINode:_node];
649 case HTML_COMMENT_NODE:
650 return [self processCommentNode:_node];
652 case XML_HTML_DOCUMENT_NODE:
653 return [self processDocumentNode:_node];
656 return [self processDTDNode:_node];
659 NSLog(@"WARNING: UNKNOWN node type %i\n", _node->type);
/*
  Walks the parsed document and generates the SAX events for it.

  The document's type field is forced to XML_HTML_DOCUMENT_NODE for the
  duration of the walk, so that -processNode: dispatches the root to
  -processDocumentNode:, and is restored afterwards.
  NOTE(review): the statements shown operate on self->doc rather than on
  the _doc parameter; the only caller in this file passes self->doc, so
  the two coincide there - confirm before calling with another document.
*/
665 - (BOOL)walkDocumentTree:(xmlDocPtr)_doc {
669 type = ((xmlDocPtr)self->doc)->type;
670 ((xmlDocPtr)self->doc)->type = XML_HTML_DOCUMENT_NODE;
672 result = [self processNode:(xmlNodePtr)self->doc];
674 ((xmlDocPtr)self->doc)->type = type;
/*
  Builds the SaxParseException for a libxml diagnostic.

  self - the driver the diagnostic belongs to
  key  - exception name (SAXWarning, SAXError or SAXFatalError at the
         call sites in this file)
  msg  - printf-style format string supplied by libxml
  va   - the arguments for msg

  The formatted message is cut at its first newline to form the reason;
  an empty reason becomes "unknown reason". The user info collects the
  parser, the depth, the error message (when non-empty) and - as far as
  the locator provides them - line, column, public id and system id. That
  is at most seven entries, matching the size of the keys/values arrays.
  NOTE(review): msg is turned into an NSString format; C-only conversion
  specifiers would be interpreted by the Foundation formatter - confirm
  the libxml messages only use specifiers both understand.
*/
681 static SaxParseException *
682 mkException(libxmlHTMLSAXDriver *self, NSString *key,
683 const char *msg, va_list va)
685 NSString *s, *reason;
687 SaxParseException *e;
689 id keys[7], values[7];
693 s = [NSString stringWithCString:msg];
694 s = [[[NSString alloc]
695 initWithFormat:s arguments:va]
698 r = [s rangeOfString:@"\n"];
699 reason = (r.length > 0)
700 ? [s substringToIndex:r.location]
703 if ([reason length] == 0)
704 reason = @"unknown reason";
706 keys[0] = @"parser"; values[0] = self; count++;
708 values[1] = [NSNumber numberWithInt:self->depth]; count++;
710 if ([s length] > 0) {
711 keys[count] = @"errorMessage";
716 // NSLog(@"locator: %@", self->locator);
/* position information is only added when the locator reports it */
718 if ((i = [self->locator lineNumber]) >= 0) {
719 keys[count] = @"line";
720 values[count] = [NSNumber numberWithInt:i];
723 if ((i = [self->locator columnNumber]) >= 0) {
724 keys[count] = @"column";
725 values[count] = [NSNumber numberWithInt:i];
728 if ((tmp = [self->locator publicId])) {
729 keys[count] = @"publicId";
733 if ((tmp = [self->locator systemId])) {
734 keys[count] = @"systemId";
739 ui = [NSDictionary dictionaryWithObjects:values forKeys:keys count:count];
741 e = (id)[SaxParseException exceptionWithName:key
/*
  libxml warning callback: wraps the message in a SAXWarning exception and
  passes it to the warning method of the active driver's error handler.
  Logs an error when no driver is active. The driver is taken from the
  activeDriver global; udata is not consulted by the statements shown.
*/
747 static void warning(void *udata, const char *msg, ...) {
749 SaxParseException *e;
751 if (activeDriver == nil) {
752 NSLog(@"ERROR(%s): no driver is active !", __PRETTY_FUNCTION__);
757 e = mkException(activeDriver, @"SAXWarning", msg, args);
760 [activeDriver->errorHandler warning:e];
/*
  libxml error callback.

  Before reporting, the raw format string is compared against known
  message prefixes: the invalid-tag and unexpected-end-tag prefixes while
  reportInvalidTags is off, the unclosed-entity prefix while
  reportUnclosedEntities is off. The first character is tested first as a
  cheap pre-check before the strncasecmp.
  Otherwise the message is wrapped in a SAXError exception and passed to
  the error method of the active driver's error handler; an error is
  logged when no driver is active.
  NOTE(review): the printf of the message in the U branch writes to stdout
  - looks like leftover debugging output, confirm.
  NOTE(review): the matching depends on the exact libxml message texts
  (see the TODO next to the prefix constants).
*/
763 static void error(void *udata, const char *msg, ...) {
765 SaxParseException *e;
767 if (!reportInvalidTags && msg != NULL) {
768 if (toupper(msg[0]) == 'T') {
769 if (strncasecmp(tagInvalidMsg, msg, strlen(tagInvalidMsg)) == 0)
773 else if (toupper(msg[0]) == 'U') {
774 if (strncasecmp(unexpectedNobrCloseMsg, msg,
775 strlen(unexpectedNobrCloseMsg)) == 0)
777 printf("MSG: '%s'\n", msg);
781 if (!reportUnclosedEntities && msg != NULL && toupper(msg[0]) == 'H') {
782 if (strncasecmp(unclosedEntityInvalidMsg, msg,
783 strlen(unclosedEntityInvalidMsg)) == 0)
787 if (activeDriver == nil) {
788 NSLog(@"ERROR(%s): no driver is active !", __PRETTY_FUNCTION__);
792 /* msg is a format, eg 'tag %s is invalid' */
795 e = mkException(activeDriver, @"SAXError", msg, args);
798 [activeDriver->errorHandler error:e];
/*
  libxml fatal-error callback: wraps the message in a SAXFatalError
  exception and passes it to the fatalError method of the active driver's
  error handler. Logs an error when no driver is active.
*/
801 static void fatalError(void *udata, const char *msg, ...) {
803 SaxParseException *e;
805 if (activeDriver == nil) {
806 NSLog(@"ERROR(%s): no driver is active !", __PRETTY_FUNCTION__);
811 e = mkException(activeDriver, @"SAXFatalError", msg, args);
814 [activeDriver->errorHandler fatalError:e];
/*
  libxml document-locator callback: replaces the active driver's locator
  with a new libxmlSAXLocator wrapping the libxml locator, links it to the
  driver's parser context and announces it to the content handler.
  Logs an error when no driver is active.
*/
817 static void setLocator(void *udata, xmlSAXLocatorPtr _locator) {
818 if (activeDriver == nil) {
819 NSLog(@"ERROR(%s): no driver is active !", __PRETTY_FUNCTION__);
/* drop the locator of a previous parse before installing the new one */
823 [activeDriver->locator release];
825 activeDriver->locator = [[libxmlSAXLocator alloc]
826 initWithSaxLocator:_locator
827 parser:activeDriver];
828 activeDriver->locator->ctx = activeDriver->ctxt;
830 [activeDriver->contentHandler setDocumentLocator:activeDriver->locator];
833 @end /* libxmlHTMLSAXDriver */