2 Copyright (C) 2000-2004 SKYRIX Software AG
4 This file is part of OpenGroupware.org.
6 OGo is free software; you can redistribute it and/or modify it under
7 the terms of the GNU Lesser General Public License as published by the
8 Free Software Foundation; either version 2, or (at your option) any
11 OGo is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
14 License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with OGo; see the file COPYING. If not, write to the
18 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
23 #import "libxmlHTMLSAXDriver.h"
24 #import "libxmlSAXLocator.h"
25 #include "TableCallbacks.h"
26 #include <SaxObjC/SaxObjC.h>
27 #include <SaxObjC/SaxException.h>
30 #include <libxml/HTMLparser.h>
31 #include <libxml/HTMLtree.h>
/*
  Private methods of the driver, declared up front so they can be
  messaged before their implementation appears in this file.
*/
@interface libxmlHTMLSAXDriver(PrivateMethods)

/* frees the libxml document and parser context held by the driver */
- (void)tearDownParser;

/* tree walking, each method reports a BOOL result */
- (BOOL)walkDocumentTree:(xmlDocPtr)_doc;
- (BOOL)processNode:(xmlNodePtr)_node;
- (BOOL)processChildren:(xmlNodePtr)children;

@end
/*
  UTF-8 => UTF-16 conversion helper; its definition is not part of this
  section. Callers in this file treat a non-zero result as a failure and
  read the advanced target cursor to find out how much was written.
*/
static int _UTF8ToUTF16(unsigned char **sourceStart,
                        unsigned char *sourceEnd,
                        unichar **targetStart,
                        const unichar *targetEnd);

/* behaviour switches */
static BOOL logUnsupportedFeatures = NO;
static BOOL reportInvalidTags      = NO;
static BOOL reportUnclosedEntities = NO;

/* cache used by xmlCharsToString(): C string key => NSString */
static NSMapTable *uniqueStrings = NULL; // THREAD

/* NSString class object, looked up once by xmlCharsToString() */
static Class NSStringClass = Nil;
/* error string detection */
/*
  TODO: obviously this may change between libxml versions or even
        localisations ... why doesn't libxml support error codes ?
*/
/*
  Prefixes of libxml error formats which the error() callback may suppress.
  They are plain 'char' strings because they are handed to strlen() and
  strncasecmp(), both of which take 'const char *'.
*/
static const char *tagInvalidMsg = "tag %s invalid";
static const char *unclosedEntityInvalidMsg =
  "htmlParseEntityRef: expecting ';'";
/*
  Converts a UTF-8 libxml string into an NSString, going through the
  process-wide 'uniqueStrings' cache so that repeated names map to the
  same object.

  _s      - UTF-8 C string owned by libxml, may be NULL
  returns - nil for NULL input or if the bytes cannot be decoded,
            otherwise a string the caller owns one reference to
            (the caller must release it)

  The cache is a plain static and is not protected against concurrent use.
*/
static inline NSString *xmlCharsToString(const xmlChar *_s) {
  const char *cstr = (const char *)_s; /* xmlChar is unsigned char */
  NSString   *s;
  char       *newkey;
  size_t     keySize;

  if (cstr == NULL) return nil;

  if (uniqueStrings == NULL) {
    /* NOTE(review): the initial capacity is an assumed value - confirm */
    uniqueStrings = NSCreateMapTable(libxmlNonOwnedCStringMapKeyCallBacks,
                                     NSObjectMapValueCallBacks,
                                     128);
  }
  else if ((s = NSMapGet(uniqueStrings, cstr)) != nil) {
    /* found a string in cache ... */
    return [s retain];
  }

  if (NSStringClass == Nil)
    NSStringClass = [NSString class];

  s = [[NSStringClass alloc] initWithUTF8String:cstr];
  if (s == nil)
    /* undecodable input, never put a nil value into the map */
    return nil;

  /* the map does not own its keys, so it gets a private copy of the key */
  keySize = strlen(cstr) + 1;
  if ((newkey = malloc(keySize)) == NULL)
    /* out of memory: still hand out the string, just do not cache it */
    return s;
  memcpy(newkey, cstr, keySize);

  NSMapInsert(uniqueStrings, newkey, s);
  return s;
}
/* SAX property names understood by -setProperty:to: and -property: */
static NSString *SaxDeclHandlerProperty
  = @"http://xml.org/sax/properties/declaration-handler";
static NSString *SaxLexicalHandlerProperty
  = @"http://xml.org/sax/properties/lexical-handler";

/* namespace reported for all elements and attributes */
static NSString *XMLNS_XHTML
  = @"http://www.w3.org/1999/xhtml";
96 @implementation libxmlHTMLSAXDriver
/*
  The driver the libxml callbacks below report to. The callbacks use this
  static instead of their 'udata' argument.
*/
static libxmlHTMLSAXDriver *activeDriver = nil;

/* libxml SAX callbacks, implemented at the end of the class */
static void warning   (void *udata, const char *msg, ...);
static void error     (void *udata, const char *msg, ...);
static void fatalError(void *udata, const char *msg, ...);
static void setLocator(void *udata, xmlSAXLocatorPtr _locator);
/*
  Reads the two reporting switches from the user defaults.
  NOTE(review): the method header is not visible in this section, the body
  is assumed to belong to +initialize - confirm.
*/
+ (void)initialize {
  NSUserDefaults *defaults;

  defaults = [NSUserDefaults standardUserDefaults];

  reportInvalidTags =
    [defaults boolForKey:@"libxmlHTMLSAXDriverReportInvalidTags"];
  reportUnclosedEntities =
    [defaults boolForKey:@"libxmlHTMLSAXDriverReportUnclosedEntityRefs"];
}
/*
  Sets the XHTML namespace URI and turns entity encoding off.
  NOTE(review): the method header is not visible in this section, the body
  is assumed to belong to -init - confirm.
*/
- (id)init {
  self = [super init];
  if (self == nil)
    return nil;

  self->namespaceURI   = [XMLNS_XHTML copy];
  self->encodeEntities = NO;
  return self;
}
/*
  Frees the libxml state and releases all retained objects.
  NOTE(review): the method header is not visible in this section, the body
  is assumed to belong to -dealloc - confirm.
*/
- (void)dealloc {
  [self tearDownParser];

  /* the locator is created with alloc/init in setLocator(), so we own it */
  [self->locator        release];

  [self->attributes     release];
  [self->namespaceURI   release];
  [self->lexicalHandler release];
  [self->declHandler    release];
  [self->contentHandler release];
  [self->dtdHandler     release];
  [self->errorHandler   release];
  [self->entityResolver release];
  [super dealloc];
}
134 /* features & properties */
/* No features are supported; the request is only logged when enabled. */
- (void)setFeature:(NSString *)_name to:(BOOL)_value {
  if (!logUnsupportedFeatures)
    return;

  NSLog(@"%s: don't know feature %@", __PRETTY_FUNCTION__, _name);
}
/*
  No features are supported; the query is only logged when enabled.
  NOTE(review): the result is assumed to be NO - confirm.
*/
- (BOOL)feature:(NSString *)_name {
  if (logUnsupportedFeatures) {
    NSLog(@"%s: don't know feature %@", __PRETTY_FUNCTION__, _name);
  }
  return NO;
}
/*
  Stores the lexical or declaration handler. Any other property name
  raises a SaxNotRecognizedException.
*/
- (void)setProperty:(NSString *)_name to:(id)_value {
  BOOL isLexical, isDecl;

  isLexical = [_name isEqualToString:SaxLexicalHandlerProperty];
  isDecl    = isLexical ? NO : [_name isEqualToString:SaxDeclHandlerProperty];

  if (isLexical) {
    ASSIGN(self->lexicalHandler, _value);
  }
  else if (isDecl) {
    ASSIGN(self->declHandler, _value);
  }
  else {
    [SaxNotRecognizedException raise:@"PropertyException"
                               format:@"don't know property %@", _name];
  }
}
/*
  Returns the lexical or declaration handler. Any other property name
  raises a SaxNotRecognizedException.
*/
- (id)property:(NSString *)_name {
  if ([_name isEqualToString:SaxLexicalHandlerProperty]) {
    return self->lexicalHandler;
  }
  else if ([_name isEqualToString:SaxDeclHandlerProperty]) {
    return self->declHandler;
  }

  [SaxNotRecognizedException raise:@"PropertyException"
                             format:@"don't know property %@", _name];
  /* only reached if the raise above returns */
  return nil;
}
/* Replaces the DTD handler (stored via ASSIGN). */
- (void)setDTDHandler:(id<NSObject,SaxDTDHandler>)_handler {
  ASSIGN(dtdHandler, _handler);
}
/* Returns the current DTD handler, may be nil. */
- (id<NSObject,SaxDTDHandler>)dtdHandler {
  return dtdHandler;
}
/* Replaces the handler which receives warnings, errors and fatal errors. */
- (void)setErrorHandler:(id<NSObject,SaxErrorHandler>)_handler {
  ASSIGN(errorHandler, _handler);
}
/* Returns the current error handler, may be nil. */
- (id<NSObject,SaxErrorHandler>)errorHandler {
  return errorHandler;
}
/* Replaces the entity resolver (stored via ASSIGN). */
- (void)setEntityResolver:(id<NSObject,SaxEntityResolver>)_handler {
  ASSIGN(entityResolver, _handler);
}
/* Returns the current entity resolver, may be nil. */
- (id<NSObject,SaxEntityResolver>)entityResolver {
  return entityResolver;
}
/* Replaces the handler which receives the document/element/text events. */
- (void)setContentHandler:(id<NSObject,SaxContentHandler>)_handler {
  ASSIGN(contentHandler, _handler);
}
/* Returns the current content handler, may be nil. */
- (id<NSObject,SaxContentHandler>)contentHandler {
  return contentHandler;
}
/*
  Creates the libxml HTML push parser context for the given path.

  A copy of libxml's default HTML SAX handler table is used, with the
  error reporting and locator callbacks replaced by the functions of this
  file. An already existing context is torn down first (with a warning).
*/
- (void)setupParserWithDocumentPath:(NSString *)_path {
  xmlSAXHandler sax;

  if (self->ctxt != NULL) {
    NSLog(@"WARNING(%s): HTML parser context already setup !",
          __PRETTY_FUNCTION__);
    [self tearDownParser];
  }

  memcpy(&sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandler));

  /* NOTE(review): the 'error' hookup is assumed (not visible) - confirm */
  sax.error              = error;
  sax.warning            = warning;
  sax.fatalError         = fatalError;
  sax.setDocumentLocator = setLocator;

  if (activeDriver != nil) {
    NSLog(@"WARNING(%s): %@ there is an active driver set (%@), override !",
          __PRETTY_FUNCTION__, self, activeDriver);
  }
  /* the callbacks find the driver through this static, not via userdata */
  activeDriver = self;

  /* NOTE(review): initial chunk/size are assumed to be NULL/0 - confirm */
  self->ctxt = htmlCreatePushParserCtxt(&sax            /* sax      */,
                                        NULL /*self*/   /* userdata */,
                                        NULL            /* chunk    */,
                                        0               /* size     */,
                                        [_path cString] /* filename */,
                                        XML_CHAR_ENCODING_8859_1
                                        /* encoding */);
}
/*
  Unregisters the driver as the active one and frees the libxml document
  and parser context, if present.
*/
- (void)tearDownParser {
  if (activeDriver == self) {
    activeDriver = nil;
  }

  if (self->doc != NULL) {
    xmlFreeDoc(self->doc);
    self->doc = NULL;
  }

  if (self->ctxt != NULL) {
    htmlFreeParserCtxt(self->ctxt);
    self->ctxt = NULL;
  }
}
/*
  Feeds a chunk of input into the push parser. Empty chunks are ignored,
  a parser context must have been set up before.
*/
- (void)pushBytes:(const char *)_bytes count:(unsigned)_len {
  if (_len > 0) {
    NSAssert(self->ctxt, @"missing HTML parser context");
    htmlParseChunk(self->ctxt, _bytes, _len, 0 /* do not terminate */);
  }
}
/*
  Terminates the push parsing and takes over the document tree built by
  libxml.
  NOTE(review): the method header is not visible in this section, the
  name 'pushEOF' is assumed - confirm.
*/
- (void)pushEOF {
  char dummyByte;

  /* zero bytes plus the terminate flag finish the parse */
  htmlParseChunk(self->ctxt, &dummyByte, 0, 1 /* terminate */);

  self->doc = ((xmlParserCtxtPtr)ctxt)->myDoc;
}
/*
  Parses the data into a libxml tree and then walks that tree to generate
  the SAX events. The parser state is torn down in any case.
*/
- (void)_parseFromData:(NSData *)_data systemId:(NSString *)_sysId {
  NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];

  /* parse into structure */
  [self setupParserWithDocumentPath:_sysId];
  [self pushBytes:[_data bytes] count:[_data length]];
  /* NOTE(review): the terminating call is assumed (not visible) - confirm */
  [self pushEOF];

  if (self->doc != NULL)
    [self walkDocumentTree:self->doc];
  else
    NSLog(@"Could not parse HTML file: %@", _sysId);

  [self tearDownParser];
  [pool release];
}
/*
  Parses an NSData directly or an NSString after converting it to
  ISO Latin 1 bytes. Any other kind of source is reported to the error
  handler as a fatal "SaxIOException".
*/
- (void)parseFromSource:(id)_source systemId:(NSString *)_sysId {
  NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
  NSData *data = nil;

  if ([_source isKindOfClass:[NSData class]])
    data = _source;
  else if ([_source isKindOfClass:[NSString class]])
    data = [_source dataUsingEncoding:NSISOLatin1StringEncoding];

  if (data != nil) {
    [self _parseFromData:data systemId:_sysId];
  }
  else if ([_source isKindOfClass:[NSString class]]) {
    /* conversion failed, still run the parse method like before */
    [self _parseFromData:nil systemId:_sysId];
  }
  else {
    SaxParseException *e;
    NSDictionary      *ui;

    /* NOTE(review): the "parser" entry is assumed (not visible) - confirm */
    ui = [NSDictionary dictionaryWithObjectsAndKeys:
                         _source ? _source : @"<nil>", @"source",
                         self,                         @"parser",
                         nil];

    e = (id)[SaxParseException exceptionWithName:@"SaxIOException"
                               reason:@"can't handle data-source"
                               userInfo:ui];

    [self->errorHandler fatalError:e];
  }

  [self tearDownParser];
  [pool release];
}
/* Parses an in-memory source, using "<memory>" as its system id. */
- (void)parseFromSource:(id)_source {
  NSString *memorySystemId = @"<memory>";

  [self parseFromSource:_source systemId:memorySystemId];
}
/*
  Loads the file behind a "file://" system id and parses its contents.
  NOTE(review): other system ids are assumed to be ignored silently (the
  body of that branch is not visible) - confirm.
*/
- (void)parseFromSystemId:(NSString *)_sysId {
  NSAutoreleasePool *pool;
  NSString *path;
  NSData   *data;

  if (![_sysId hasPrefix:@"file://"]) {
    /* exception */
    return;
  }

  pool = [[NSAutoreleasePool alloc] init];

  /* cut off file:// */
  path = [_sysId substringFromIndex:7];

  /* load data */
  data = [NSData dataWithContentsOfFile:path];

  [self _parseFromData:data systemId:path];

  [pool release];
}
341 /* process attribute nodes */
/*
  Fills the reusable 'attributes' object from a libxml attribute list.

  The attribute cache is created on first use and cleared otherwise. Each
  attribute is added with its lowercased name, the driver's namespace URI
  and the type "CDATA".

  _attributes - head of the libxml attribute list, may be NULL
*/
- (void)processAttributes:(xmlAttrPtr)_attributes {
  xmlAttrPtr attribute;

  /* setup or clear attribute cache */
  if (self->attributes == nil)
    self->attributes = [[SaxAttributes alloc] init];
  else
    [self->attributes clear];

  if (_attributes == NULL)
    /* nothing to process */
    return;

  for (attribute = _attributes; attribute; attribute = attribute->next) {
    NSString *name, *xhtmlName;
    NSString *value = nil;
    xmlChar  *t     = NULL;
#if 0
    printf("attr name '%s' has NS '%s'\n",
           attribute->name, attribute->ns ? "yes" : "no");
#endif

    /* names repeat a lot, so they go through the unique-string cache */
    name      = xmlCharsToString(attribute->name);
    xhtmlName = [name lowercaseString];

    if (attribute->children)
      t = xmlNodeListGetString(doc, attribute->children, 0);

    if (t != NULL) {
      /*
        Values are arbitrary text, so they are not put into the
        unique-string cache (which is never purged).
      */
      value = [[NSString alloc] initWithUTF8String:(const char *)t];
      /* memory handed out by libxml must go back through xmlFree() */
      xmlFree(t);
    }

    /*
      NOTE(review): the raw name argument and the empty-string fallback
      for attributes without text are assumed (not visible) - confirm.
    */
    [self->attributes addAttribute:xhtmlName
                      uri:self->namespaceURI
                      rawName:name
                      type:@"CDATA"
                      value:(value != nil) ? value : (NSString *)@""];

    [name  release]; name  = nil;
    [value release]; value = nil;
  }
}
391 /* walking the tree, generating SAX events */
/*
  Entity references are not turned into events, they are only logged.
  NOTE(review): the result is assumed to be YES - confirm.
*/
- (BOOL)processEntityRefNode:(xmlNodePtr)node {
  const xmlChar *refName = node->name;

  NSLog(@"Ignoring entity ref: '%s'\n", refName);
  return YES;
}
/*
  Brackets the events of the document's children with the start/end
  document events and the mapping of the empty prefix to the driver's
  namespace URI. Returns the result of processing the children.
*/
- (BOOL)processDocumentNode:(xmlNodePtr)node {
  id<NSObject,SaxContentHandler> handler = self->contentHandler;
  BOOL childrenOk;

  [handler startDocument];
  [handler startPrefixMapping:@"" uri:self->namespaceURI];

  childrenOk = [self processChildren:node->children];

  /* re-read the ivar, the handler could have been replaced meanwhile */
  handler = self->contentHandler;
  [handler endPrefixMapping:@""];
  [handler endDocument];

  return childrenOk;
}
/*
  Reports the content of a text (or CDATA) node to the content handler.

  The UTF-8 content is converted to UTF-16 first; if 'encodeEntities' is
  set it is entity-encoded by libxml before the conversion. Missing or
  empty content is reported as a zero-length characters event.

  NOTE(review): the results (YES, and NO if the conversion fails) are
  assumed, the return statements are not visible - confirm.
*/
- (BOOL)processTextNode:(xmlNodePtr)_node {
  static unichar c = '\0';
  xmlChar       *encoded = NULL; /* owned by us, only set when encoding */
  xmlChar       *chars;
  unsigned char *source;
  unichar       *data, *ts;
  size_t        len;
  BOOL          ok;

  if (self->contentHandler == nil)
    return YES;

  if (_node->content == NULL) {
    [self->contentHandler characters:&c length:0];
    return YES;
  }

  if (self->encodeEntities) {
    /* should use the HTML encoding routine (htmlEncodeEntities) ??? */
    encoded = xmlEncodeEntitiesReentrant(self->doc, _node->content);
    chars   = encoded;
  }
  else
    chars = _node->content;

  len = (chars != NULL) ? strlen((const char *)chars) : 0;
  if (len == 0) {
    [self->contentHandler characters:&c length:0];
    if (encoded != NULL) xmlFree(encoded);
    return YES;
  }

  /* UTF-16 never needs more units than UTF-8 needs bytes */
  data = calloc(len + 2, sizeof(unichar)); /* GC ?! */
  if (data == NULL) {
    if (encoded != NULL) xmlFree(encoded);
    return NO;
  }

  /* the converter advances both cursors, so work on copies */
  source = (unsigned char *)chars;
  ts     = data;

  if (_UTF8ToUTF16(&source, source + len, &ts, data + len)) {
    NSLog(@"ERROR(%s:%i): couldn't convert UTF8 to UTF16 !",
          __PRETTY_FUNCTION__, __LINE__);
    ok = NO;
  }
  else {
    [self->contentHandler characters:data length:(int)(ts - data)];
    ok = YES;
  }

  free(data);
  /* the entity-encoded copy belongs to us and comes from libxml */
  if (encoded != NULL) xmlFree(encoded);
  return ok;
}
/*
  Reports the content of a comment node to the lexical handler.

  The content is entity-encoded by libxml and converted from UTF-8 to
  UTF-16. Missing or empty content is reported as a zero-length comment.
  Nothing happens if no lexical handler is set.

  NOTE(review): the results (YES, and NO if the conversion fails) are
  assumed, the return statements are not visible - confirm.
*/
- (BOOL)processCommentNode:(xmlNodePtr)_node {
  unichar c = 0;
  xmlChar *chars;
  size_t  len;
  BOOL    ok = YES;

  if (self->lexicalHandler == nil)
    return YES;

  if (_node->content == NULL) {
    [self->lexicalHandler comment:&c length:0];
    return YES;
  }

  /* uses the HTML encoding routine !!!!!!!!!! */
  chars = xmlEncodeEntitiesReentrant(self->doc, _node->content);
  len   = (chars != NULL) ? strlen((const char *)chars) : 0;

  if (len == 0) {
    [self->lexicalHandler comment:&c length:0];
  }
  else {
    unsigned char *source;
    unichar       *data, *ts;

    /* UTF-16 never needs more units than UTF-8 needs bytes */
    data = calloc(len + 1, sizeof(unichar)); /* GC ?! */
    if (data == NULL) {
      ok = NO;
    }
    else {
      /* the converter advances both cursors, so work on copies */
      source = (unsigned char *)chars;
      ts     = data;

      if (_UTF8ToUTF16(&source, source + len, &ts, data + len)) {
        NSLog(@"ERROR(%s:%i): couldn't convert UTF8 to UTF16 !",
              __PRETTY_FUNCTION__, __LINE__);
        ok = NO;
      }
      else
        [self->lexicalHandler comment:data length:(int)(ts - data)];

      free(data);
    }
  }

  /* the encoded copy is allocated by libxml and owned by us */
  if (chars != NULL) xmlFree(chars);
  return ok;
}
/*
  DTD nodes produce no events.
  NOTE(review): the result is assumed to be YES - confirm.
*/
- (BOOL)processDTDNode:(xmlNodePtr)node {
  /* do nothing with DTD nodes .. */
  BOOL handled = YES;
  return handled;
}
/*
  Entity nodes produce no events, a note is logged instead.
  NOTE(review): the result is assumed to be YES - confirm.
*/
- (BOOL)processEntityNode:(xmlNodePtr)node {
  /* do nothing with entity nodes .. */
  NSLog(@"%s:%i: ignoring entity node ..", __PRETTY_FUNCTION__, __LINE__);

  return YES;
}
/*
  Processing instructions produce no events.
  NOTE(review): the result is assumed to be YES - confirm.
*/
- (BOOL)processPINode:(xmlNodePtr)node {
  /* do nothing with PI nodes .. */
  BOOL handled = YES;
  return handled;
}
/*
  Generates the startElement event for an element node, processes its
  children and generates the matching endElement event.

  The element is reported under its lowercased name in the driver's
  namespace. Returns the result of processing the children.

  NOTE(review): the 'depth' bookkeeping, the raw-name arguments and the
  release of the tag name are assumed (not visible) - confirm.
*/
- (BOOL)processElementNode:(xmlNodePtr)node {
  NSString *tagName, *xhtmlName;
  BOOL     result;

  self->depth++;

  /* owned by us, see the release below */
  tagName   = xmlCharsToString(node->name);
  xhtmlName = [tagName lowercaseString];

  /* fills self->attributes */
  [self processAttributes:node->properties];

  [self->contentHandler
       startElement:xhtmlName
       namespace:self->namespaceURI
       rawName:tagName
       attributes:self->attributes];

  /* the attribute object is shared, empty it before descending */
  [self->attributes clear];

  result = [self processChildren:node->children];

  [self->contentHandler
       endElement:xhtmlName
       namespace:self->namespaceURI
       rawName:tagName];

  self->depth--;

  [tagName release];
  return result;
}
/*
  Processes a sibling list of nodes in document order. The results of the
  individual nodes are not evaluated.
  NOTE(review): the result is assumed to be YES in all cases - confirm.
*/
- (BOOL)processChildren:(xmlNodePtr)children {
  xmlNodePtr current = children;

  while (current != NULL) {
    [self processNode:current];
    current = current->next;
  }
  return YES;
}
/*
  Dispatches a node to the processing method matching its type.

  NOTE(review): the case labels for text, PI and DTD nodes and the NO
  result for misplaced attribute nodes and unknown types are assumed
  (not visible) - confirm.
*/
- (BOOL)processNode:(xmlNodePtr)_node {
  BOOL result = NO;

  switch (_node->type) {
    case XML_ELEMENT_NODE:
      result = [self processElementNode:_node];
      break;

    case XML_ATTRIBUTE_NODE:
      NSLog(@"invalid place for attribute-node !");
      break;

    case HTML_TEXT_NODE:
    case XML_CDATA_SECTION_NODE:
      result = [self processTextNode:_node];
      break;

    case HTML_ENTITY_REF_NODE:
      result = [self processEntityRefNode:_node];
      break;

    case XML_ENTITY_NODE:
      result = [self processEntityNode:_node];
      break;

    case HTML_PI_NODE:
      result = [self processPINode:_node];
      break;

    case HTML_COMMENT_NODE:
      result = [self processCommentNode:_node];
      break;

    case XML_HTML_DOCUMENT_NODE:
      result = [self processDocumentNode:_node];
      break;

    case XML_DTD_NODE:
      result = [self processDTDNode:_node];
      break;

    default:
      NSLog(@"WARNING: UNKNOWN node type %i\n", _node->type);
      break;
  }
  return result;
}
/*
  Walks the driver's document tree and generates the SAX events.

  The document's node type is switched to XML_HTML_DOCUMENT_NODE for the
  duration of the walk and restored afterwards. Note that the tree in
  self->doc is walked, the _doc argument is not used.
*/
- (BOOL)walkDocumentTree:(xmlDocPtr)_doc {
  xmlDocPtr document = (xmlDocPtr)self->doc;
  int       savedType;
  BOOL      result;

  savedType      = document->type;
  document->type = XML_HTML_DOCUMENT_NODE;

  result = [self processNode:(xmlNodePtr)self->doc];

  /* self->doc is re-read like in the steps above */
  ((xmlDocPtr)self->doc)->type = savedType;

  return result;
}
/*
  Builds the exception object handed to the error handler.

  self - driver providing the depth and the locator
  key  - name of the exception
  msg  - printf style format coming from libxml, may be NULL
  va   - arguments of the format

  The first line of the formatted message becomes the reason ("unknown
  reason" if there is none). The user info carries the parser, the depth
  and, if available, the full message and the locator's position and ids.
*/
static SaxParseException *
mkException(libxmlHTMLSAXDriver *self, NSString *key,
            const char *msg, va_list va)
{
  NSString          *s = nil, *reason = nil;
  SaxParseException *e;
  NSDictionary      *ui;
  id                keys[7], values[7]; /* at most 7 entries are added */
  int               count = 0, i;
  id                tmp;

  /* a NULL format must not reach +stringWithCString: */
  if (msg != NULL) {
    s = [NSString stringWithCString:msg];
    s = [[[NSString alloc]
                    initWithFormat:s arguments:va]
                    autorelease];
  }

  if (s != nil) {
    /* only message a real string for the range (struct result) */
    NSRange r = [s rangeOfString:@"\n"];

    reason = (r.length > 0)
      ? [s substringToIndex:r.location]
      : s;
  }
  if ([reason length] == 0)
    reason = @"unknown reason";

  keys[0] = @"parser"; values[0] = self; count++;
  /* NOTE(review): the "depth" key name is assumed (not visible) - confirm */
  keys[1] = @"depth";
  values[1] = [NSNumber numberWithInt:self->depth]; count++;

  if ([s length] > 0) {
    keys[count]   = @"errorMessage";
    values[count] = s;
    count++;
  }

  // NSLog(@"locator: %@", self->locator);

  if ((i = [self->locator lineNumber]) >= 0) {
    keys[count]   = @"line";
    values[count] = [NSNumber numberWithInt:i];
    count++;
  }
  if ((i = [self->locator columnNumber]) >= 0) {
    keys[count]   = @"column";
    values[count] = [NSNumber numberWithInt:i];
    count++;
  }
  if ((tmp = [self->locator publicId])) {
    keys[count]   = @"publicId";
    values[count] = tmp;
    count++;
  }
  if ((tmp = [self->locator systemId])) {
    keys[count]   = @"systemId";
    values[count] = tmp;
    count++;
  }

  ui = [NSDictionary dictionaryWithObjects:values forKeys:keys count:count];

  e = (id)[SaxParseException exceptionWithName:key
                             reason:reason
                             userInfo:ui];
  return e;
}
/*
  libxml warning callback: wraps the message into a "SAXWarning" exception
  and passes it to the active driver's error handler. 'udata' is not used.
*/
static void warning(void *udata, const char *msg, ...) {
  libxmlHTMLSAXDriver *driver = activeDriver;
  SaxParseException   *e;
  va_list             args;

  if (driver == nil) {
    NSLog(@"ERROR(%s): no driver is active !", __PRETTY_FUNCTION__);
    return;
  }

  va_start(args, msg);
  e = mkException(driver, @"SAXWarning", msg, args);
  va_end(args);

  [driver->errorHandler warning:e];
}
/*
  libxml error callback: wraps the message into a "SAXError" exception and
  passes it to the active driver's error handler. 'udata' is not used.

  Unless enabled by the matching report* switch, "invalid tag" and
  "unclosed entity reference" errors are dropped. They are recognized by
  a case-insensitive prefix comparison of the format string.
*/
static void error(void *udata, const char *msg, ...) {
  va_list           args;
  SaxParseException *e;

  if (msg != NULL) {
    /*
      The casts keep strlen()/strncasecmp() happy regardless of the
      signedness the message constants are declared with.
    */
    const char *tagMsg    = (const char *)tagInvalidMsg;
    const char *entityMsg = (const char *)unclosedEntityInvalidMsg;

    if (!reportInvalidTags) {
      if (strncasecmp(tagMsg, msg, strlen(tagMsg)) == 0)
        return;
    }
    if (!reportUnclosedEntities) {
      if (strncasecmp(entityMsg, msg, strlen(entityMsg)) == 0)
        return;
    }
  }

  if (activeDriver == nil) {
    NSLog(@"ERROR(%s): no driver is active !", __PRETTY_FUNCTION__);
    return;
  }

  /* msg is a format, eg 'tag %s is invalid' */

  va_start(args, msg);
  e = mkException(activeDriver, @"SAXError", msg, args);
  va_end(args);

  [activeDriver->errorHandler error:e];
}
/*
  libxml fatal error callback: wraps the message into a "SAXFatalError"
  exception and passes it to the active driver's error handler. 'udata'
  is not used.
*/
static void fatalError(void *udata, const char *msg, ...) {
  libxmlHTMLSAXDriver *driver = activeDriver;
  SaxParseException   *e;
  va_list             args;

  if (driver == nil) {
    NSLog(@"ERROR(%s): no driver is active !", __PRETTY_FUNCTION__);
    return;
  }

  va_start(args, msg);
  e = mkException(driver, @"SAXFatalError", msg, args);
  va_end(args);

  [driver->errorHandler fatalError:e];
}
/*
  libxml locator callback: replaces the active driver's locator object
  with a new wrapper around the libxml locator and announces it to the
  content handler. 'udata' is not used.
*/
static void setLocator(void *udata, xmlSAXLocatorPtr _locator) {
  libxmlHTMLSAXDriver *driver = activeDriver;
  libxmlSAXLocator    *newLocator;

  if (driver == nil) {
    NSLog(@"ERROR(%s): no driver is active !", __PRETTY_FUNCTION__);
    return;
  }

  /* drop the previous locator before creating the new one */
  [driver->locator release];

  newLocator = [[libxmlSAXLocator alloc]
                                  initWithSaxLocator:_locator
                                  parser:driver];
  driver->locator      = newLocator;
  driver->locator->ctx = driver->ctxt;

  [driver->contentHandler setDocumentLocator:driver->locator];
}
770 @end /* libxmlHTMLSAXDriver */