Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenwrite.git

Imports head element into XHTML namespace

Author: Dave Jarvis <email>
Date: 2025-08-26 13:25:55 GMT-0700
Commit: 49a1fb54e32f4f8441164c0e7e07a10753520472
Parent: 19943af
Delta: 401 lines added, 378 lines removed, 23-line increase
src/main/java/com/keenwrite/dom/DocumentConverter.java
import java.util.LinkedHashMap;
import java.util.Map;
+import java.util.Set;
import static com.keenwrite.dom.DocumentParser.sDomImplementation;
entry( "fi", "fi" ),
entry( "fl", "fl" )
+ );
+
+ private static final Set<String> CODE_BLOCKS = Set.of(
+ "pre",
+ "code",
+ "kbd",
+ "script",
+ "style",
+ "samp",
+ "blockcode",
+ "var",
+ "tex",
+ "tt"
);
private static final NodeVisitor LIGATURE_VISITOR = new NodeVisitor() {
@Override
public void head( final @NotNull Node node, final int depth ) {
if( node instanceof final TextNode textNode ) {
final var parent = node.parentNode();
final var name = parent == null ? "root" : parent.nodeName();
- final var codeBlock =
- "pre".equalsIgnoreCase( name ) ||
- "code".equalsIgnoreCase( name ) ||
- "kbd".equalsIgnoreCase( name ) ||
- "var".equalsIgnoreCase( name ) ||
- "tex".equalsIgnoreCase( name ) ||
- "tt".equalsIgnoreCase( name );
- if( !codeBlock ) {
+ if( !CODE_BLOCKS.contains( name.toLowerCase() ) ) {
// Obtaining the whole text will return newlines, which must be kept
// to ensure that preformatted text maintains its formatting.
src/main/java/com/keenwrite/dom/DocumentParser.java
import static javax.xml.xpath.XPathConstants.NODE;
import static javax.xml.xpath.XPathConstants.NODESET;
-
-/**
- * Responsible for initializing an XML parser.
- */
-public class DocumentParser {
- private static final String LOAD_EXTERNAL_DTD =
- "http://apache.org/xml/features/nonvalidating/load-external-dtd";
- private static final String INDENT_AMOUNT =
- "{http://xml.apache.org/xslt}indent-amount";
- private static final String NAMESPACE = "http://www.w3.org/1999/xhtml";
-
- private static final XPath XPATH = XPathFactory.newInstance().newXPath();
-
- private static final ByteArrayOutputStream sWriter =
- new ByteArrayOutputStream( 65536 );
- private static final OutputStreamWriter sOutput =
- new OutputStreamWriter( sWriter, UTF_8 );
-
- /**
- * Caches {@link XPathExpression}s to avoid re-compiling.
- */
- private static final Map<String, XPathExpression> sXpaths = new HashMap<>();
-
- private static final DocumentBuilderFactory sDocumentFactory;
- private static DocumentBuilder sDocumentBuilder;
- private static Transformer sTransformer;
- private static final XPath sXpath = XPathFactory.newInstance().newXPath();
-
- public static final DOMImplementation sDomImplementation;
-
- static {
- sDocumentFactory = DocumentBuilderFactory.newInstance();
-
- sDocumentFactory.setValidating( false );
- sDocumentFactory.setAttribute( LOAD_EXTERNAL_DTD, false );
- sDocumentFactory.setNamespaceAware( true );
- sDocumentFactory.setIgnoringComments( true );
- sDocumentFactory.setIgnoringElementContentWhitespace( true );
-
- DOMImplementation domImplementation;
-
- try {
- sDocumentBuilder = sDocumentFactory.newDocumentBuilder();
- domImplementation = sDocumentBuilder.getDOMImplementation();
- sTransformer = TransformerFactory.newInstance().newTransformer();
-
- // Ensure Unicode characters (emojis) are encoded correctly.
- sTransformer.setOutputProperty( ENCODING, UTF_16.toString() );
- sTransformer.setOutputProperty( OMIT_XML_DECLARATION, "yes" );
- sTransformer.setOutputProperty( METHOD, "xml" );
- sTransformer.setOutputProperty( INDENT, "no" );
- sTransformer.setOutputProperty( INDENT_AMOUNT, "2" );
- }
- catch( final Exception ex ) {
- clue( ex );
- domImplementation = sDocumentBuilder.getDOMImplementation();
- }
-
- sDomImplementation = domImplementation;
- }
-
- public static Document newDocument() {
- return sDocumentBuilder.newDocument();
- }
-
- /**
- * Creates a new document object model based on the given XML document
- * string. This will return an empty document if the document could not
- * be parsed.
- *
- * @param xml The document text to convert into a DOM.
- * @return The DOM that represents the given XML data.
- */
- public static Document parse( final String xml ) {
- assert xml != null;
-
- if( !xml.isBlank() ) {
- try( final var reader = new StringReader( xml ) ) {
- final var input = new InputSource();
-
- input.setEncoding( UTF_8.toString() );
- input.setCharacterStream( reader );
-
- return sDocumentBuilder.parse( input );
- }
- catch( final Throwable t ) {
- clue( t );
- }
- }
-
- return sDocumentBuilder.newDocument();
- }
-
- /**
- * Creates a well-formed XHTML document from a standard HTML document.
- *
- * @param source The HTML source document to transform.
- * @param metadata The metadata contained within the head element.
- * @param locale The localization information for the lang attribute.
- * @return The well-formed XHTML document.
- */
- public static Document create(
- final Document source,
- final Map<String, String> metadata,
- final Locale locale,
- final String pageTitle
- ) throws XPathExpressionException {
- final var target = createXhtmlDocument();
- final var html = target.getDocumentElement();
- final var sourceHead = evaluate( "//head", source );
- final var head = target.importNode( sourceHead, true );
-
- html.setAttribute( "lang", locale.getLanguage() );
-
- final var encoding = createEncoding( target, "UTF-8" );
- head.appendChild( encoding );
-
- for( final var entry : metadata.entrySet() ) {
- final var node = createMeta( target, entry );
- head.appendChild( node );
- }
-
- final var titleText = Strings.sanitize( pageTitle );
-
- // Empty titles result in <title/>, which some browsers cannot parse.
- if( !titleText.isBlank() ) {
- final var title = createElement( target, "title", titleText );
- head.appendChild( title );
- }
-
- html.appendChild( head );
-
- final var body = createElement( target, "body", null );
- final var sourceBody = source.getElementsByTagName( "body" ).item( 0 );
- final var children = sourceBody.getChildNodes();
- final var count = children.getLength();
-
- for( var i = 0; i < count; i++ ) {
- body.appendChild( importNode( target, children.item( i ) ) );
- }
-
- html.appendChild( body );
-
- return target;
- }
-
- public static Node evaluate( final String xpath, final Document doc ) throws XPathExpressionException {
- return (Node) XPATH.evaluate( xpath, doc, NODE );
- }
-
- /**
- * Parses the given file contents into a document object model.
- *
- * @param doc The source XML document to parse.
- * @return The file as a document object model.
- * @throws IOException Could not open the document.
- * @throws SAXException Could not read the XML file content.
- */
- public static Document parse( final File doc )
- throws IOException, SAXException {
- assert doc != null;
-
- try( final var in = new FileInputStream( doc ) ) {
- return parse( in );
- }
- }
-
- /**
- * Parses the given file contents into a document object model. Callers
- * must close the stream.
- *
- * @param doc The source XML document to parse.
- * @return The {@link InputStream} converted to a document object model.
- * @throws IOException Could not open the document.
- * @throws SAXException Could not read the XML file content.
- */
- public static Document parse( final InputStream doc )
- throws IOException, SAXException {
- assert doc != null;
-
- return sDocumentBuilder.parse( doc );
- }
-
- /**
- * Allows an operation to be applied for every node in the document that
- * matches a given tag name pattern.
- *
- * @param document Document to traverse.
- * @param xpath Document elements to find via {@link XPath} expression.
- * @param consumer The consumer to call for each matching document node.
- */
- public static void visit(
- final Document document,
- final CharSequence xpath,
- final Consumer<Node> consumer ) {
- assert document != null;
- assert consumer != null;
-
- try {
- final var expr = compile( xpath );
- final var nodeSet = expr.evaluate( document, NODESET );
-
- if( nodeSet instanceof NodeList nodes ) {
- for( int i = 0, len = nodes.getLength(); i < len; i++ ) {
- consumer.accept( nodes.item( i ) );
- }
- }
- }
- catch( final Exception ex ) {
- clue( ex );
- }
- }
-
- public static Node createMeta(
- final Document document, final Map.Entry<String, String> entry ) {
- assert document != null;
- assert entry != null;
-
- final var node = createElement( document, "meta", null );
-
- node.setAttribute( "name", entry.getKey() );
- node.setAttribute( "content", entry.getValue() );
-
- return node;
- }
-
- public static Node createEncoding(
- final Document document, final String encoding
- ) {
- assert document != null;
- assert encoding != null;
-
- final var node = createElement( document, "meta", null );
-
- node.setAttribute( "http-equiv", "Content-Type" );
- node.setAttribute( "content", "text/html; charset=" + encoding );
-
- return node;
- }
-
- public static Element createElement(
- final Document document, final String nodeName, final String nodeValue
- ) {
- assert document != null;
- assert nodeName != null;
- assert !nodeName.isBlank();
-
- final var node = document.createElement( nodeName );
-
- if( nodeValue != null ) {
- node.setTextContent( nodeValue );
- }
-
- return node;
- }
-
- public static String toString( final Node xhtml ) {
- assert xhtml != null;
-
- String result = "";
-
- try( final var writer = new StringWriter() ) {
- final var stream = new StreamResult( writer );
-
- transform( xhtml, stream );
-
- result = writer.toString();
- }
- catch( final Exception ex ) {
- clue( ex );
- }
-
- return result;
- }
-
- public static String transform( final Element root )
- throws IOException, TransformerException {
- assert root != null;
-
- try( final var writer = new StringWriter() ) {
- transform( root.getOwnerDocument(), new StreamResult( writer ) );
-
- return writer.toString();
- }
- }
-
- /**
- * Remove whitespace, comments, and XML/DOCTYPE declarations to make
- * processing work with ConTeXt.
- *
- * @param path The SVG file to process.
- * @throws Exception The file could not be processed.
- */
- public static void sanitize( final Path path ) throws Exception {
- assert path != null;
-
- // Preprocessing the SVG image is a single-threaded operation, no matter
- // how many SVG images are in the document to typeset.
- sWriter.reset();
-
- final var target = new StreamResult( sOutput );
- final var source = sDocumentBuilder.parse( toFile( path ) );
-
- transform( source, target );
- write( path, sWriter.toByteArray() );
- }
-
- /**
- * Converts a string into an {@link XPathExpression}, which may be used to
- * extract elements from a {@link Document} object model.
- *
- * @param cs The string to convert to an {@link XPathExpression}.
- * @return {@code null} if there was an error compiling the xpath.
- */
- public static XPathExpression compile( final CharSequence cs ) {
- assert cs != null;
-
- final var xpath = cs.toString();
-
- return sXpaths.computeIfAbsent(
- xpath, _ -> {
- try {
- return sXpath.compile( xpath );
- }
- catch( final XPathExpressionException ex ) {
- clue( ex );
- return null;
- }
- }
- );
- }
-
- /**
- * Merges a source document into a target document. This avoids adding an
- * empty XML namespace attribute to elements.
- *
- * @param target The document to envelop the source document.
- * @param source The source document to embed.
- * @return The target document with the source document included.
- */
- private static Node importNode( final Document target, final Node source ) {
- assert target != null;
- assert source != null;
-
- Node result;
- final var nodeType = source.getNodeType();
-
- if( nodeType == Node.ELEMENT_NODE ) {
- final var element = createElement( target, source.getNodeName(), null );
- final var attrs = source.getAttributes();
-
- if( attrs != null ) {
- final var attrLength = attrs.getLength();
-
- for( var i = 0; i < attrLength; i++ ) {
- final var attr = attrs.item( i );
- element.setAttribute( attr.getNodeName(), attr.getNodeValue() );
- }
- }
-
- final var children = source.getChildNodes();
- final var childLength = children.getLength();
-
- for( var i = 0; i < childLength; i++ ) {
- element.appendChild( importNode( target, children.item( i ) ) );
- }
-
- result = element;
- }
- else if( nodeType == Node.TEXT_NODE ) {
+import static org.w3c.dom.Node.*;
+
+/**
+ * Responsible for initializing an XML parser.
+ */
+public class DocumentParser {
+ private static final String LOAD_EXTERNAL_DTD =
+ "http://apache.org/xml/features/nonvalidating/load-external-dtd";
+ private static final String INDENT_AMOUNT =
+ "{http://xml.apache.org/xslt}indent-amount";
+ private static final String NAMESPACE = "http://www.w3.org/1999/xhtml";
+
+ private static final XPath XPATH = XPathFactory.newInstance().newXPath();
+
+ private static final ByteArrayOutputStream sWriter =
+ new ByteArrayOutputStream( 65536 );
+ private static final OutputStreamWriter sOutput =
+ new OutputStreamWriter( sWriter, UTF_8 );
+
+ /**
+ * Caches {@link XPathExpression}s to avoid re-compiling.
+ */
+ private static final Map<String, XPathExpression> sXpaths = new HashMap<>();
+
+ private static final DocumentBuilderFactory sDocumentFactory;
+ private static DocumentBuilder sDocumentBuilder;
+ private static Transformer sTransformer;
+ private static final XPath sXpath = XPathFactory.newInstance().newXPath();
+
+ public static final DOMImplementation sDomImplementation;
+
+ static {
+ sDocumentFactory = DocumentBuilderFactory.newInstance();
+
+ sDocumentFactory.setValidating( false );
+ sDocumentFactory.setAttribute( LOAD_EXTERNAL_DTD, false );
+ sDocumentFactory.setNamespaceAware( true );
+ sDocumentFactory.setIgnoringComments( true );
+ sDocumentFactory.setIgnoringElementContentWhitespace( true );
+
+ DOMImplementation domImplementation;
+
+ try {
+ sDocumentBuilder = sDocumentFactory.newDocumentBuilder();
+ domImplementation = sDocumentBuilder.getDOMImplementation();
+ sTransformer = TransformerFactory.newInstance().newTransformer();
+
+ // Ensure Unicode characters (emojis) are encoded correctly.
+ sTransformer.setOutputProperty( ENCODING, UTF_16.toString() );
+ sTransformer.setOutputProperty( OMIT_XML_DECLARATION, "yes" );
+ sTransformer.setOutputProperty( METHOD, "xml" );
+ sTransformer.setOutputProperty( INDENT, "no" );
+ sTransformer.setOutputProperty( INDENT_AMOUNT, "2" );
+ }
+ catch( final Exception ex ) {
+ clue( ex );
+ domImplementation = sDocumentBuilder.getDOMImplementation();
+ }
+
+ sDomImplementation = domImplementation;
+ }
+
+ public static Document newDocument() {
+ return sDocumentBuilder.newDocument();
+ }
+
+ /**
+ * Creates a new document object model based on the given XML document
+ * string. This will return an empty document if the document could not
+ * be parsed.
+ *
+ * @param xml The document text to convert into a DOM.
+ * @return The DOM that represents the given XML data.
+ */
+ public static Document parse( final String xml ) {
+ assert xml != null;
+
+ if( !xml.isBlank() ) {
+ try( final var reader = new StringReader( xml ) ) {
+ final var input = new InputSource();
+
+ input.setEncoding( UTF_8.toString() );
+ input.setCharacterStream( reader );
+
+ return sDocumentBuilder.parse( input );
+ }
+ catch( final Throwable t ) {
+ clue( t );
+ }
+ }
+
+ return sDocumentBuilder.newDocument();
+ }
+
+ /**
+ * Creates a well-formed XHTML document from a standard HTML document.
+ *
+ * @param source The HTML source document to transform.
+ * @param metadata The metadata contained within the head element.
+ * @param locale The localization information for the lang attribute.
+ * @return The well-formed XHTML document.
+ */
+ public static Document create(
+ final Document source,
+ final Map<String, String> metadata,
+ final Locale locale,
+ final String pageTitle
+ ) throws XPathExpressionException {
+ final var target = createXhtmlDocument();
+ final var html = target.getDocumentElement();
+ final var hSource = evaluate( "//head", source );
+ final var head = createElement( target, "head", null );
+ final var hChildren = hSource.getChildNodes();
+ final var hCount = hChildren.getLength();
+
+ for( var i = 0; i < hCount; i++ ) {
+ final var imported = target.importNode( hChildren.item( i ), true );
+
+ if( imported.getNodeType() == ELEMENT_NODE ) {
+ final var nodeName = imported.getNodeName();
+ final var node = target.renameNode( imported, NAMESPACE, nodeName );
+
+ head.appendChild( node );
+ }
+ }
+
+ html.setAttribute( "lang", locale.getLanguage() );
+
+ final var encoding = createEncoding( target, "UTF-8" );
+ head.appendChild( encoding );
+
+ for( final var entry : metadata.entrySet() ) {
+ final var node = createMeta( target, entry );
+
+ head.appendChild( node );
+ }
+
+ final var titleText = Strings.sanitize( pageTitle );
+
+ // Empty titles result in <title/>, which some browsers cannot parse.
+ if( !titleText.isBlank() ) {
+ final var title = createElement( target, "title", titleText );
+ head.appendChild( title );
+ }
+
+ html.appendChild( head );
+
+ final var body = createElement( target, "body", null );
+ final var bSource = source.getElementsByTagName( "body" ).item( 0 );
+ final var bChildren = bSource.getChildNodes();
+ final var bCount = bChildren.getLength();
+
+ for( var i = 0; i < bCount; i++ ) {
+ body.appendChild( importNode( target, bChildren.item( i ) ) );
+ }
+
+ html.appendChild( body );
+
+ return target;
+ }
+
+ public static Node evaluate( final String xpath, final Document doc ) throws XPathExpressionException {
+ return (Node) XPATH.evaluate( xpath, doc, NODE );
+ }
+
+ /**
+ * Parses the given file contents into a document object model.
+ *
+ * @param doc The source XML document to parse.
+ * @return The file as a document object model.
+ * @throws IOException Could not open the document.
+ * @throws SAXException Could not read the XML file content.
+ */
+ public static Document parse( final File doc )
+ throws IOException, SAXException {
+ assert doc != null;
+
+ try( final var in = new FileInputStream( doc ) ) {
+ return parse( in );
+ }
+ }
+
+ /**
+ * Parses the given file contents into a document object model. Callers
+ * must close the stream.
+ *
+ * @param doc The source XML document to parse.
+ * @return The {@link InputStream} converted to a document object model.
+ * @throws IOException Could not open the document.
+ * @throws SAXException Could not read the XML file content.
+ */
+ public static Document parse( final InputStream doc )
+ throws IOException, SAXException {
+ assert doc != null;
+
+ return sDocumentBuilder.parse( doc );
+ }
+
+ /**
+ * Allows an operation to be applied for every node in the document that
+ * matches a given tag name pattern.
+ *
+ * @param document Document to traverse.
+ * @param xpath Document elements to find via {@link XPath} expression.
+ * @param consumer The consumer to call for each matching document node.
+ */
+ public static void visit(
+ final Document document,
+ final CharSequence xpath,
+ final Consumer<Node> consumer ) {
+ assert document != null;
+ assert consumer != null;
+
+ try {
+ final var expr = compile( xpath );
+ final var nodeSet = expr.evaluate( document, NODESET );
+
+ if( nodeSet instanceof NodeList nodes ) {
+ for( int i = 0, len = nodes.getLength(); i < len; i++ ) {
+ consumer.accept( nodes.item( i ) );
+ }
+ }
+ }
+ catch( final Exception ex ) {
+ clue( ex );
+ }
+ }
+
+ public static Node createMeta(
+ final Document document, final Map.Entry<String, String> entry ) {
+ assert document != null;
+ assert entry != null;
+
+ final var node = createElement( document, "meta", null );
+
+ node.setAttribute( "name", entry.getKey() );
+ node.setAttribute( "content", entry.getValue() );
+
+ return node;
+ }
+
+ public static Node createEncoding(
+ final Document document, final String encoding
+ ) {
+ assert document != null;
+ assert encoding != null;
+
+ final var node = createElement( document, "meta", null );
+
+ node.setAttribute( "http-equiv", "Content-Type" );
+ node.setAttribute( "content", "text/html; charset=" + encoding );
+
+ return node;
+ }
+
+ public static Element createElement(
+ final Document document, final String nodeName, final String nodeValue
+ ) {
+ assert document != null;
+ assert nodeName != null;
+ assert !nodeName.isBlank();
+
+ // Recreate elements in the target document with namespace.
+ final var node = document.createElementNS( NAMESPACE, nodeName );
+
+ if( nodeValue != null ) {
+ node.setTextContent( nodeValue );
+ }
+
+ return node;
+ }
+
+ public static String toString( final Node xhtml ) {
+ assert xhtml != null;
+
+ String result = "";
+
+ try( final var writer = new StringWriter() ) {
+ final var stream = new StreamResult( writer );
+
+ transform( xhtml, stream );
+
+ result = writer.toString();
+ }
+ catch( final Exception ex ) {
+ clue( ex );
+ }
+
+ return result;
+ }
+
+ public static String transform( final Element root )
+ throws IOException, TransformerException {
+ assert root != null;
+
+ try( final var writer = new StringWriter() ) {
+ transform( root.getOwnerDocument(), new StreamResult( writer ) );
+
+ return writer.toString();
+ }
+ }
+
+ /**
+ * Remove whitespace, comments, and XML/DOCTYPE declarations to make
+ * processing work with ConTeXt.
+ *
+ * @param path The SVG file to process.
+ * @throws Exception The file could not be processed.
+ */
+ public static void sanitize( final Path path ) throws Exception {
+ assert path != null;
+
+ // Preprocessing the SVG image is a single-threaded operation, no matter
+ // how many SVG images are in the document to typeset.
+ sWriter.reset();
+
+ final var target = new StreamResult( sOutput );
+ final var source = sDocumentBuilder.parse( toFile( path ) );
+
+ transform( source, target );
+ write( path, sWriter.toByteArray() );
+ }
+
+ /**
+ * Converts a string into an {@link XPathExpression}, which may be used to
+ * extract elements from a {@link Document} object model.
+ *
+ * @param cs The string to convert to an {@link XPathExpression}.
+ * @return {@code null} if there was an error compiling the xpath.
+ */
+ public static XPathExpression compile( final CharSequence cs ) {
+ assert cs != null;
+
+ final var xpath = cs.toString();
+
+ return sXpaths.computeIfAbsent(
+ xpath, _ -> {
+ try {
+ return sXpath.compile( xpath );
+ }
+ catch( final XPathExpressionException ex ) {
+ clue( ex );
+ return null;
+ }
+ }
+ );
+ }
+
+ /**
+ * Merges a source document into a target document. This avoids adding an
+ * empty XML namespace attribute to elements.
+ *
+ * @param target The document to envelop the source document.
+ * @param source The source document to embed.
+ * @return The target document with the source document included.
+ */
+ private static Node importNode( final Document target, final Node source ) {
+ assert target != null;
+ assert source != null;
+
+ Node result;
+ final var nodeType = source.getNodeType();
+
+ if( nodeType == ELEMENT_NODE ) {
+ final var element = createElement( target, source.getNodeName(), null );
+ final var attrs = source.getAttributes();
+
+ if( attrs != null ) {
+ final var attrLength = attrs.getLength();
+
+ for( var i = 0; i < attrLength; i++ ) {
+ final var attr = attrs.item( i );
+ element.setAttribute( attr.getNodeName(), attr.getNodeValue() );
+ }
+ }
+
+ final var children = source.getChildNodes();
+ final var childLength = children.getLength();
+
+ for( var i = 0; i < childLength; i++ ) {
+ element.appendChild( importNode( target, children.item( i ) ) );
+ }
+
+ result = element;
+ }
+ else if( nodeType == TEXT_NODE ) {
result = target.createTextNode( source.getNodeValue() );
}