XML to plain text

Text Output

1. Formatted plain Text Output from XML source

1.

Formatted plain Text Output from XML source

Eric van der Vlist



[Ednote] More of an announcement than anything: Eric has posted
a Java extension which works with XT to deliver clean text
with line breaks and indented lists. I have included the
Java source and an example below; I hope Eric doesn't
mind. I found it works beautifully.

Principle.

Use standard XSLT constructs to transform any element from
the source document into itself, a p element or a small tree
of ul li (as per lists in HTML). Any element content which
is not a p element is output as plain text. Any p element in
the output is output with a line break before and after. Any
ul li elements are outdented correctly with a hyphen to
identify them.  An additional pre element may be used if no
formatting is needed.

Additionally the <br/> empty element may be used at any
point to create a single line break.  Eric has done this by
tweaking the xsl:output handling, as you can see from the
xslt stylesheet. Don't forget that addition to the
attributes.

I put the .class file in the same directory as the source files, and 
ran the command line
java -classpath .;f:\myjava\xt.jar;f:\myjava\xp.jar;f:\myjava\sax.jar \
-Dcom.jclark.xsl.sax.parser=com.jclark.xml.sax.CommentDriver \
com.jclark.xsl.sax.Driver   xml-file xsl-file output-file
(all on one line)

This ensures I pick up the extension!




Eric keeps his code at 

http://downloads.dyomedea.com/java/examples/outputhandlers/formatedtext/

There are also a couple of examples.
This stylesheet uses the pre format, and includes a specific element,
the para element, which I want formatted as paragraphs with line breaks.


<?xml version="1.0" encoding='iso-8859-1'?>
<!--
  Example stylesheet for the FormatedTextOutputHandler XT extension.
  It copies the source document through unchanged, except that para
  elements are rewritten as p elements and p elements with class 'pre'
  are replaced by their content.
-->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
  xmlns:xt="http://www.jclark.com/xt"
    >

<!-- Hand the result tree to the Java class FormatedTextOutputHandler
     instead of a built-in output method. The java prefix is bound to
     XT's Java extension namespace on this element itself, so the
     xmlns:java attribute must not be dropped. -->
<xsl:output encoding="iso-8859-1" 
method="java:FormatedTextOutputHandler" 
xmlns:java="http://www.jclark.com/xt/java"/>

<!-- Identity rule: copy every attribute and element, then recurse into
     its attributes and child nodes. -->
<xsl:template match="@*|*">
	<xsl:copy>
		<xsl:apply-templates select="@*|node()"/>
	</xsl:copy>
</xsl:template>

<!-- Rewrite para as p so that the output handler sees a paragraph. -->
<xsl:template match="para">
 <p><xsl:apply-templates/></p>
 </xsl:template>

<!-- Drop the p wrapper of p elements whose class is 'pre'; only their
     processed content reaches the output. -->
<xsl:template match="p[@class ='pre']">
  <xsl:apply-templates/>
</xsl:template>

</xsl:stylesheet>


Java source. Note that it needs access to SAX, and that this
is targeted at the XT processor from James Clark.

import com.jclark.xsl.sax.*;

import org.xml.sax.*;
import java.io.*;
import java.util.StringTokenizer;

/**
 * Output handler for the XT XSLT processor that renders a result tree as
 * word-wrapped plain text.
 *
 * <p>Only a handful of element names are interpreted; the tags of every other
 * element are ignored and only their character content is written:
 * <ul>
 *   <li><code>p</code>   - line break before, line break plus a pending blank
 *       line after;</li>
 *   <li><code>br</code>  - a single line break;</li>
 *   <li><code>ul</code>  - increases the indentation by {@link #HTAB} columns;</li>
 *   <li><code>li</code>  - starts a new line with a hyphen and indents the
 *       item text by two further columns;</li>
 *   <li><code>pre</code> - character data is written verbatim, without
 *       word wrapping.</li>
 * </ul>
 *
 * <p>Outside of <code>pre</code>, character data is split on whitespace and
 * the resulting words are re-flowed so that a line break is inserted before
 * a word that would reach column {@link #LINE_SIZE}.
 */
public class FormatedTextOutputHandler
            extends HandlerBase implements OutputDocumentHandler {
  /** Destination of the text; null until one is supplied and after endDocument(). */
  private Writer writer;
  /** When true the writer is flushed rather than closed at end of document. */
  private boolean keepOpen;
  /** Column at which a word is moved to the next line. */
  private static final int LINE_SIZE = 72;
  /** Number of columns added per <code>ul</code> nesting level. */
  private static final int HTAB = 2;
  /** Current left margin, in columns. */
  private int indent = 0;
  /** Number of characters already written on the current line. */
  private int hpos = 0;
  /**
   * True when the output is positioned at the start of a line, i.e. when
   * {@link #lineBreak()} has nothing to do.
   */
  private boolean newline = true;
  /** True while inside a <code>pre</code> element. */
  private boolean pre = false;

  /** Creates a handler without a writer; {@link #init} must supply one. */
  public FormatedTextOutputHandler() {
  }

  /**
   * Creates a handler that writes to the given writer.
   *
   * @param writer destination of the formatted text
   */
  public FormatedTextOutputHandler(Writer writer) {
    this.writer = writer;
  }

  /**
   * Returns a copy of the given string in which every space character is
   * replaced by <code>%20</code>. All other characters are copied unchanged.
   *
   * @param string the text to encode; must not be null
   * @return the text with spaces replaced by <code>%20</code>
   */
  public static String urlMailEncode(String string) {
    StringBuffer res = new StringBuffer();
    for (int i = 0; i < string.length(); i++) {
      char c = string.charAt(i);
      if (c == ' ') {
        res.append("%20");
      } else {
        res.append(c);
      }
    }
    return res.toString();
  }

  /**
   * Obtains the writer from the destination. The media type defaults to
   * <code>text/plain</code> when the <code>media-type</code> attribute is
   * absent; the <code>encoding</code> attribute is passed through as is.
   *
   * @param dest destination supplying the writer
   * @param atts output attributes
   * @return this handler
   * @throws IOException if the destination cannot provide a writer
   */
  public DocumentHandler init(Destination dest, AttributeList atts)
    throws IOException {
    String mediaType = atts.getValue("media-type");
    if (mediaType == null) {
      mediaType = "text/plain";
    }
    writer = dest.getWriter(mediaType, atts.getValue("encoding"));
    keepOpen = dest.keepOpen();
    return this;
  }

  /**
   * Terminates the last line and then flushes the writer (when it must be
   * kept open) or closes it. Does nothing when there is no writer.
   *
   * @throws SAXException if writing, flushing or closing fails
   */
  public void endDocument() throws SAXException {
    if (writer == null) {
      return;
    }
    try {
      try {
        lineBreak();
      } finally {
        // Release the writer even when the final line break could not be
        // written, so that a failed write does not leak the stream.
        Writer out = writer;
        writer = null;
        if (keepOpen) {
          out.flush();
        } else {
          out.close();
        }
      }
    } catch (IOException e) {
      throw new SAXException(e);
    }
  }

  /**
   * Writes the string to the output without any formatting and without
   * touching the line state.
   *
   * @param string the text to write
   * @throws SAXException if no writer is available or the write fails
   */
  public void write(String string) throws SAXException {
    if (writer == null) {
      // Report a missing writer as a SAX error instead of a bare
      // NullPointerException.
      throw new SAXException("FormatedTextOutputHandler has no output writer");
    }
    try {
      writer.write(string);
    } catch (IOException e) {
      throw new SAXException(e);
    }
  }

  /**
   * Writes one word followed by a space. A line break is inserted first when
   * the word would reach {@link #LINE_SIZE}, and the line is padded with
   * spaces up to the current indentation.
   *
   * @param word the word to write
   * @throws SAXException if the write fails
   */
  public void writeWord(String word) throws SAXException {
    if (hpos + word.length() >= LINE_SIZE) {
      lineBreak();
    }
    while (hpos < indent) {
      write(" ");
      hpos++;
    }
    write(word);
    hpos += word.length();
    // The separating space is omitted once the line is full.
    if (hpos < LINE_SIZE) {
      write(" ");
      hpos++;
    }
    newline = false;
  }

  /**
   * Ends the current line unless the output is already at the start of one.
   *
   * @throws SAXException if the write fails
   */
  public void lineBreak() throws SAXException {
    if (!newline) {
      write("\n");
      hpos = 0;
      newline = true;
    }
  }

  /**
   * Ends the current line and, when <code>forceNewline</code> is true, marks
   * the line as not yet terminated so that the next {@link #lineBreak()}
   * writes one more line feed, producing a blank line.
   *
   * @param forceNewline whether the next line break must emit a line feed
   * @throws SAXException if the write fails
   */
  public void lineBreak(boolean forceNewline) throws SAXException {
    lineBreak();
    if (forceNewline) {
      newline = false;
    }
  }

  /**
   * Increases the left margin by {@link #HTAB} and ends the current line.
   *
   * @throws SAXException if the write fails
   */
  public void indent() throws SAXException {
    indent += HTAB;
    lineBreak();
  }

  /**
   * Starts a list item: new line, a hyphen, then a margin two columns wider
   * so that the item text lines up after the hyphen.
   *
   * @throws SAXException if the write fails
   */
  public void line() throws SAXException {
    lineBreak();
    writeWord("-");
    indent += 2;
  }

  /**
   * Decreases the left margin by {@link #HTAB} and ends the current line.
   *
   * @throws SAXException if the write fails
   */
  public void unindent() throws SAXException {
    indent -= HTAB;
    lineBreak();
  }

  /**
   * Ends a list item: removes the two columns added by {@link #line()} and
   * ends the current line.
   *
   * @throws SAXException if the write fails
   */
  public void unline() throws SAXException {
    indent -= 2;
    lineBreak();
  }

  /**
   * Applies the layout action for the start of a <code>p</code>,
   * <code>br</code>, <code>ul</code>, <code>li</code> or <code>pre</code>
   * element; other element names are ignored.
   *
   * @param name element name
   * @param atts element attributes (unused)
   * @throws SAXException if the write fails
   */
  public void startElement(String name, AttributeList atts)
                  throws SAXException {
    if (name.equals("p") || name.equals("br")) {
      lineBreak();
    } else if (name.equals("ul")) {
      indent();
    } else if (name.equals("li")) {
      line();
    } else if (name.equals("pre")) {
      pre = true;
      lineBreak();
    }
  }

  /**
   * Applies the layout action for the end of a <code>p</code>,
   * <code>ul</code>, <code>li</code> or <code>pre</code> element; other
   * element names are ignored.
   *
   * @param name element name
   * @throws SAXException if the write fails
   */
  public void endElement(String name) throws SAXException {
    if (name.equals("p")) {
      lineBreak(true);
    } else if (name.equals("ul")) {
      unindent();
    } else if (name.equals("li")) {
      unline();
    } else if (name.equals("pre")) {
      pre = false;
      lineBreak(true);
    }
  }

  /**
   * Writes character data. Inside <code>pre</code> the text is written
   * verbatim; elsewhere it is split on whitespace and re-flowed word by word.
   *
   * @param cbuf character buffer
   * @param off  index of the first character to use
   * @param len  number of characters to use
   * @throws SAXException if the write fails
   */
  public void characters(char cbuf[], int off, int len) throws SAXException {
    if (pre) {
      if (len == 0) {
        return;
      }
      String text = new String(cbuf, off, len);
      write(text);
      // Keep the line state in step with the verbatim text; otherwise the
      // line break at the end of the pre element is skipped and whatever
      // follows is appended to the last preformatted line.
      int lastBreak = text.lastIndexOf('\n');
      if (lastBreak < 0) {
        hpos += len;
      } else {
        hpos = len - lastBreak - 1;
      }
      newline = (hpos == 0);
    } else {
      StringTokenizer words =
          new StringTokenizer(new String(cbuf, off, len));
      while (words.hasMoreTokens()) {
        writeWord(words.nextToken());
      }
    }
  }
}