*/ private boolean center2Div(Lexer lexer, Node node, MutableObject pnode) { if (node.tag == tt.tagCenter) { if (lexer.configuration.DropFontTags) { if (node.content != null) { Node last = node.last; Node parent = node.parent; discardContainer(node, pnode); node = lexer.inferredTag("br"); if (last.next != null) last.next.prev = node; node.next = last.next; last.next = node; node.prev = last; if (parent.last == last) parent.last = node; node.parent = parent; } else { Node prev = node.prev; Node next = node.next; Node parent = node.parent; discardContainer(node, pnode); node = lexer.inferredTag("br"); node.next = next; node.prev = prev; node.parent = parent; if (next != null) next.prev = node; else parent.last = node; if (prev != null) prev.next = node; else parent.content = node; } return true; } node.tag = tt.tagDiv; node.element = "div"; addStyleProperty(node, "text-align: center"); return true; } return false; } /* Symptom

<div><div>...</div></div>

Action: merge the two divs. This is useful after nested <dir>s used by Word for indenting have been converted to

s */ private boolean mergeDivs(Lexer lexer, Node node, MutableObject pnode) { Node child; if (node.tag != tt.tagDiv) return false; child = node.content; if (child == null) return false; if (child.tag != tt.tagDiv) return false; if (child.next != null) return false; mergeStyles(node, child); stripOnlyChild(node); return true; } /* Symptom:

Action: discard outer list */ private boolean nestedList(Lexer lexer, Node node, MutableObject pnode) { Node child, list; if (node.tag == tt.tagUl || node.tag == tt.tagOl) { child = node.content; if (child == null) return false; /* check child has no peers */ if (child.next != null) return false; list = child.content; if (list == null) return false; if (list.tag != node.tag) return false; pnode.setObject(node.next); /* move inner list node into position of outer node */ list.prev = node.prev; list.next = node.next; list.parent = node.parent; fixNodeLinks(list); /* get rid of outer ul and its li */ child.content = null; node.content = null; node.next = null; /* If prev node was a list the chances are this node should be appended to that list. Word has no way of recognizing nested lists and just uses indents */ if (list.prev != null) { node = list; list = node.prev; if (list.tag == tt.tagUl || list.tag == tt.tagOl) { list.next = node.next; if (list.next != null) list.next.prev = list; child = list.last; /*

*/ node.parent = child; node.next = null; node.prev = child.last; fixNodeLinks(node); } } cleanNode(lexer, node); return true; } return false; } /* Symptom: the only child of a block-level element is a presentation element such as B, I or FONT Action: add style "font-weight: bold" to the block and strip the element, leaving its children. example:
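<p><b><font face="Arial" size="6">Draft Recommended Practice</font></b></p>
becomes:
<p style="font-weight: bold; font-family: Arial; font-size: 6">Draft Recommended Practice</p>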
This code also replaces the align attribute by a style attribute. However, to avoid CSS problems with Navigator 4, this isn't done for the elements: caption, tr and table */ private boolean blockStyle(Lexer lexer, Node node, MutableObject pnode) { Node child; if ((node.tag.model & (Dict.CM_BLOCK | Dict.CM_LIST | Dict.CM_DEFLIST | Dict.CM_TABLE)) != 0) { if (node.tag != tt.tagTable && node.tag != tt.tagTr && node.tag != tt.tagLi) { /* check for align attribute */ if (node.tag != tt.tagCaption) textAlign(lexer, node); child = node.content; if (child == null) return false; /* check child has no peers */ if (child.next != null) return false; if (child.tag == tt.tagB) { mergeStyles(node, child); addStyleProperty(node, "font-weight: bold"); stripOnlyChild(node); return true; } if (child.tag == tt.tagI) { mergeStyles(node, child); addStyleProperty(node, "font-style: italic"); stripOnlyChild(node); return true; } if (child.tag == tt.tagFont) { mergeStyles(node, child); addFontStyles(node, child.attributes); stripOnlyChild(node); return true; } } } return false; } /* the only child of table cell or an inline element such as em */ private boolean inlineStyle(Lexer lexer, Node node, MutableObject pnode) { Node child; if (node.tag != tt.tagFont && (node.tag.model & (Dict.CM_INLINE | Dict.CM_ROW)) != 0) { child = node.content; if (child == null) return false; /* check child has no peers */ if (child.next != null) return false; if (child.tag == tt.tagB && lexer.configuration.LogicalEmphasis) { mergeStyles(node, child); addStyleProperty(node, "font-weight: bold"); stripOnlyChild(node); return true; } if (child.tag == tt.tagI && lexer.configuration.LogicalEmphasis) { mergeStyles(node, child); addStyleProperty(node, "font-style: italic"); stripOnlyChild(node); return true; } if (child.tag == tt.tagFont) { mergeStyles(node, child); addFontStyles(node, child.attributes); stripOnlyChild(node); return true; } } return false; } /* Replace font elements by span elements, deleting the 
font element's attributes and replacing them by a single style attribute. */ private boolean font2Span(Lexer lexer, Node node, MutableObject pnode) { AttVal av, style, next; if (node.tag == tt.tagFont) { if (lexer.configuration.DropFontTags) { discardContainer(node, pnode); return false; } /* if FONT is only child of parent element then leave alone */ if (node.parent.content == node && node.next == null) return false; addFontStyles(node, node.attributes); /* extract style attribute and free the rest */ av = node.attributes; style = null; while (av != null) { next = av.next; if (av.attribute.equals("style")) { av.next = null; style = av; } av = next; } node.attributes = style; node.tag = tt.tagSpan; node.element = "span"; return true; } return false; } /* Applies all matching rules to a node. */ private Node cleanNode(Lexer lexer, Node node) { Node next = null; MutableObject o = new MutableObject(); boolean b = false; for (next = node; node.isElement(); node = next) { o.setObject(next); b = dir2Div(lexer, node, o); next = (Node) o.getObject(); if (b) continue; b = nestedList(lexer, node, o); next = (Node) o.getObject(); if (b) continue; b = center2Div(lexer, node, o); next = (Node) o.getObject(); if (b) continue; b = mergeDivs(lexer, node, o); next = (Node) o.getObject(); if (b) continue; b = blockStyle(lexer, node, o); next = (Node) o.getObject(); if (b) continue; b = inlineStyle(lexer, node, o); next = (Node) o.getObject(); if (b) continue; b = font2Span(lexer, node, o); next = (Node) o.getObject(); if (b) continue; break; } return next; } private Node createStyleProperties(Lexer lexer, Node node) { Node child; if (node.content != null) { for (child = node.content; child != null; child = child.next) { child = createStyleProperties(lexer, child); } } return cleanNode(lexer, node); } private void defineStyleRules(Lexer lexer, Node node) { Node child; if (node.content != null) { for (child = node.content; child != null; child = child.next) { defineStyleRules(lexer, 
child); } } style2Rule(lexer, node); } public void cleanTree(Lexer lexer, Node doc) { doc = createStyleProperties(lexer, doc); if (!lexer.configuration.MakeClean) { defineStyleRules(lexer, doc); createStyleElement(lexer, doc); } } /* simplifies ... ... etc. */ public void nestedEmphasis(Node node) { MutableObject o = new MutableObject(); Node next; while (node != null) { next = node.next; if ((node.tag == tt.tagB || node.tag == tt.tagI) && node.parent != null && node.parent.tag == node.tag) { /* strip redundant inner element */ o.setObject(next); discardContainer(node, o); next = (Node) o.getObject(); node = next; continue; } if (node.content != null) nestedEmphasis(node.content); node = next; } } /* replace i by em and b by strong */ public void emFromI(Node node) { while (node != null) { if (node.tag == tt.tagI) { node.element = tt.tagEm.name; node.tag = tt.tagEm; } else if (node.tag == tt.tagB) { node.element = tt.tagStrong.name; node.tag = tt.tagStrong; } if (node.content != null) emFromI(node.content); node = node.next; } } /* Some people use dir or ul without an li to indent the content. The pattern to look for is a list with a single implicit li. This is recursively replaced by an implicit blockquote. 
*/ public void list2BQ(Node node) { while (node != null) { if (node.content != null) list2BQ(node.content); if (node.tag != null && node.tag.parser == ParserImpl.getParseList() && node.hasOneChild() && node.content.implicit) { stripOnlyChild(node); node.element = tt.tagBlockquote.name; node.tag = tt.tagBlockquote; node.implicit = true; } node = node.next; } } /* Replace implicit blockquote by div with an indent taking care to reduce nested blockquotes to a single div with the indent set to match the nesting depth */ public void bQ2Div(Node node) { int indent; String indent_buf; while (node != null) { if (node.tag == tt.tagBlockquote && node.implicit) { indent = 1; while (node.hasOneChild() && node.content.tag == tt.tagBlockquote && node.implicit) { ++indent; stripOnlyChild(node); } if (node.content != null) bQ2Div(node.content); indent_buf = "margin-left: " + (new Integer(2 * indent)).toString() + "em"; node.element = tt.tagDiv.name; node.tag = tt.tagDiv; node.addAttribute("style", indent_buf); } else if (node.content != null) bQ2Div(node.content); node = node.next; } } /* node is prune up to */ public Node pruneSection(Lexer lexer, Node node) { for (;;) { /* discard node and returns next */ node = Node.discardElement(node); if (node == null) return null; if (node.type == Node.SectionTag) { if ((Lexer.getString(node.textarray, node.start, 2)).equals("if")) { node = pruneSection(lexer, node); continue; } if ((Lexer.getString(node.textarray, node.start, 5)).equals("endif")) { node = Node.discardElement(node); break; } } } return node; } public void dropSections(Lexer lexer, Node node) { while (node != null) { if (node.type == Node.SectionTag) { /* prune up to matching endif */ if ((Lexer.getString(node.textarray, node.start, 2)).equals("if")) { node = pruneSection(lexer, node); continue; } /* discard others as well */ node = Node.discardElement(node); continue; } if (node.content != null) dropSections(lexer, node.content); node = node.next; } } // gschadow patch 
start /** Get rid of all this pseudo-XML crap, sections, Asp tags, JSP tags, etc. **/ public void dropPseudoXMLCrap(Lexer lexer, Node node) { while (node != null) { switch (node.type) { case Node.AspTag : case Node.JsteTag : case Node.PhpTag : case Node.SectionTag : node = Node.discardElement(node); break; default : if (node.content != null) dropPseudoXMLCrap(lexer, node.content); node = node.next; break; } } } // gschadow patch end public void purgeAttributes(Node node) { AttVal attr = node.attributes; AttVal next = null; AttVal prev = null; while (attr != null) { next = attr.next; /* special check for class="Code" denoting pre text */ if (attr.attribute != null && attr.value != null && attr.attribute.equals("class") && attr.value.equals("Code")) { prev = attr; } else if ( attr.attribute != null && (attr.attribute.equals("class") || attr.attribute.equals("style") || attr.attribute.equals("lang") || attr.attribute.startsWith("x:") || ((attr.attribute.equals("height") || attr.attribute.equals("width")) && (node.tag == tt.tagTd || node.tag == tt.tagTr || node.tag == tt.tagTh)))) { if (prev != null) prev.next = next; else node.attributes = next; } else prev = attr; attr = next; } } /* Word2000 uses span excessively, so we strip span out */ public Node stripSpan(Lexer lexer, Node span) { Node node; Node prev = null; Node content; /* deal with span elements that have content by splicing the content in place of the span after having processed it */ cleanWord2000(lexer, span.content); content = span.content; if (span.prev != null) prev = span.prev; else if (content != null) { node = content; content = content.next; Node.removeNode(node); Node.insertNodeBeforeElement(span, node); prev = node; } while (content != null) { node = content; content = content.next; Node.removeNode(node); Node.insertNodeAfterElement(prev, node); prev = node; } if (span.next == null) span.parent.last = prev; node = span.next; span.content = null; Node.discardElement(span); return node; } /* map 
non-breaking spaces to regular spaces */
    private void normalizeSpaces(Lexer lexer, Node node) {
        while (node != null) {
            if (node.content != null) {
                normalizeSpaces(lexer, node.content);
            }

            if (node.type == Node.TextNode) {
                int i;
                MutableInteger c = new MutableInteger();
                int p = node.start;

                /* re-encode the text in place; p is the write position */
                for (i = node.start; i < node.end; ++i) {
                    /*
                      NOTE(review): if textarray is a (signed) byte[], this
                      value is negative for bytes above 0x7F and the
                      multibyte test below never fires - TODO confirm the
                      element type and mask with 0xFF if so.
                    */
                    c.value = (int) node.textarray[i];

                    /* look for UTF-8 multibyte character */
                    if (c.value > 0x7F) {
                        i += PPrint.getUTF8(node.textarray, i, c);
                    }

                    if (c.value == 160) {
                        c.value = ' ';
                    }

                    p = PPrint.putUTF8(node.textarray, p, c.value);
                }
                /*
                  NOTE(review): node.end is not moved back to p, so a text
                  that shrinks keeps its old end - verify whether that is
                  intended.
                */
            }

            node = node.next;
        }
    }

    /*
      This is a major clean up to strip out all the extra stuff you get
      when you save as web page from Word 2000. It doesn't yet know what
      to do with VML tags, but these will appear as errors unless you
      declare them as new tags, such as o:p which needs to be declared
      as inline.
    */
    public void cleanWord2000(Lexer lexer, Node node) {
        /* used to build a list from a sequence of bulleted p's */
        Node list = null;

        while (node != null) {
            /* discard Word's style verbiage */
            if (node.tag == tt.tagStyle || node.tag == tt.tagMeta || node.type == Node.CommentTag) {
                node = Node.discardElement(node);
                continue;
            }

            /* strip out all span tags Word scatters so liberally! */
            if (node.tag == tt.tagSpan) {
                node = stripSpan(lexer, node);
                continue;
            }

            if (node.tag == tt.tagHtml) {
                /* check that it's a Word 2000 document */
                if (node.getAttrByName("xmlns:o") == null) {
                    return;
                }
            }

            if (node.tag == tt.tagLink) {
                AttVal attr = node.getAttrByName("rel");

                if (attr != null && attr.value != null && attr.value.equals("File-List")) {
                    node = Node.discardElement(node);
                    continue;
                }
            }

            /* discard empty paragraphs */
            if (node.content == null && node.tag == tt.tagP) {
                node = Node.discardElement(node);
                continue;
            }

            if (node.tag == tt.tagP) {
                AttVal attr = node.getAttrByName("class");

                /* map sequence of <p class="MsoListBullet"> to <ul><li>...</li></ul> */
                if (attr != null && attr.value != null && attr.value.equals("MsoListBullet")) {
                    Node.coerceNode(lexer, node, tt.tagLi);

                    if (list == null || list.tag != tt.tagUl) {
                        list = lexer.inferredTag("ul");
                        Node.insertNodeBeforeElement(node, list);
                    }

                    purgeAttributes(node);

                    if (node.content != null) {
                        cleanWord2000(lexer, node.content);
                    }

                    /* remove node and append to contents of list */
                    Node.removeNode(node);
                    Node.insertNodeAtEnd(list, node);

                    /*
                      Carry on with the node after the list. It has to go
                      through the checks at the top of the loop itself:
                      falling through from here failed on a null node when
                      the paragraph was the last sibling, and otherwise
                      purged the next paragraph's class before it could be
                      recognized as the next bullet.
                    */
                    node = list.next;
                    continue;
                }

                /* map sequence of <p class="Code"> to <pre>...</pre> */
                if (attr != null && attr.value != null && attr.value.equals("Code")) {
                    Node br = lexer.newLineNode();
                    normalizeSpaces(lexer, node);

                    if (list == null || list.tag != tt.tagPre) {
                        list = lexer.inferredTag("pre");
                        Node.insertNodeBeforeElement(node, list);
                    }

                    /* remove node and append to contents of list */
                    Node.removeNode(node);
                    Node.insertNodeAtEnd(list, node);
                    stripSpan(lexer, node);
                    Node.insertNodeAtEnd(list, br);

                    /* same as above: the next node gets a full pass of its own */
                    node = list.next;
                    continue;
                }
            }

            /* anything else ends the current run of bullet/code paragraphs */
            list = null;

            /* strip out style and class attributes */
            if (node.type == Node.StartTag || node.type == Node.StartEndTag) {
                purgeAttributes(node);
            }

            if (node.content != null) {
                cleanWord2000(lexer, node.content);
            }

            node = node.next;
        }
    }

    /*
      Reports whether the document looks like it was saved by Word 2000:
      its html element exists and carries an xmlns:o attribute.
    */
    public boolean isWord2000(Node root, TagTable tt) {
        Node html = root.findHTML(tt);
        return (html != null && html.getAttrByName("xmlns:o") != null);
    }
}