Java HTML Tidy

Updated 3 Sep 1999


This is a Java version of HTML Tidy Release 26 Jul 1999, Copyright © 1999 W3C. See Tidy.java for the copyright notice.

I have made available:

To use the Tidy Java Bean, just include JTidy\lib\Tidy.jar in your classpath.

To build Tidy from the source, you need a Java compiler/runtime environment, supporting Java 1.1 or higher. First, download and expand the archive. For Win 9x/NT, build it using the batch file JTidy\make\build.bat as follows:


    cd JTidy\make

    build c: 26jul1999

Where c: is the root where you expanded the JTidy archive, and 26jul1999 is the directory under JTidy\src where the source is located. NOTE: build.bat assumes that the environment variable java_home points to your JDK installation, and that the JDK tools are in your path.

For Unix environments there is no build script yet; perform the steps listed in build.bat by hand. Sorry about that, I will try to provide a more generic build procedure in the future.

The main class is: org.w3c.tidy.Tidy


Release News


Code example of how to use the Tidy Java Bean


import java.io.IOException;

import java.net.URL;

import java.io.BufferedInputStream;

import java.io.FileOutputStream;

import java.io.PrintWriter;

import java.io.FileWriter;

import org.w3c.tidy.Tidy;





/**
 * This program shows how HTML could be tidied directly from
 * a URL stream, and running on separate threads.  Note the use
 * of the 'parse' method to parse from an InputStream, and send
 * the pretty-printed result to an OutputStream.
 * In this example thread th1 outputs XML, and thread th2 outputs
 * HTML.  This shows that properties are per instance of Tidy.
 *
 * Usage: java Test16 url1 out1 errout1 url2 out2 errout2
 */
public class Test16 implements Runnable {

    /** Location of the HTML document to tidy. */
    private final String url;

    /** Name of the file that receives the pretty-printed result. */
    private final String outFileName;

    /** Name of the file that receives Tidy's error output. */
    private final String errOutFileName;

    /** Value handed to Tidy.setXmlOut for this instance. */
    private final boolean xmlOut;

    /**
     * Creates a tidy job for a single document.
     *
     * @param url            location of the document to read
     * @param outFileName    file to write the tidied document to
     * @param errOutFileName file to write Tidy's error output to
     * @param xmlOut         value passed to Tidy.setXmlOut
     */
    public Test16(String url, String outFileName,
                  String errOutFileName, boolean xmlOut)
    {
        this.url = url;
        this.outFileName = outFileName;
        this.errOutFileName = errOutFileName;
        this.xmlOut = xmlOut;
    }

    /**
     * Reads the document from the URL, runs it through a private Tidy
     * instance and writes the result to the output file.  I/O failures
     * are reported on System.out; every stream that was opened is closed
     * before the method returns, whether or not parsing succeeded.
     */
    public void run()
    {
        BufferedInputStream in = null;
        FileOutputStream out = null;
        PrintWriter errout = null;
        Tidy tidy = new Tidy();

        tidy.setXmlOut(xmlOut);
        try {
            errout = new PrintWriter(new FileWriter(errOutFileName), true);
            tidy.setErrout(errout);
            in = new BufferedInputStream(new URL(url).openStream());
            out = new FileOutputStream(outFileName);
            tidy.parse(in, out);
        }
        catch ( IOException e ) {
            System.out.println( this.toString() + e.toString() );
        }
        finally {
            // Release whatever was actually opened, even when an earlier
            // step failed; a failing close must not prevent the others.
            if (in != null) {
                try {
                    in.close();
                }
                catch ( IOException e ) {
                    System.out.println( this.toString() + e.toString() );
                }
            }
            if (out != null) {
                try {
                    out.close();
                }
                catch ( IOException e ) {
                    System.out.println( this.toString() + e.toString() );
                }
            }
            if (errout != null) {
                errout.close();
            }
        }
    }

    /**
     * Starts two tidy jobs on separate threads: the first produces XML
     * output, the second HTML output.
     *
     * @param args url1, out1, errout1, url2, out2, errout2
     */
    public static void main( String[] args ) {
        if (args.length < 6) {
            System.err.println(
                "Usage: java Test16 url1 out1 errout1 url2 out2 errout2");
            return;
        }

        Test16 t1 = new Test16(args[0], args[1], args[2], true);
        Test16 t2 = new Test16(args[3], args[4], args[5], false);
        Thread th1 = new Thread(t1);
        Thread th2 = new Thread(t2);

        th1.start();
        th2.start();
    }

}

Code example of using Java Tidy as a parser


import java.io.IOException;

import java.io.FileInputStream;

import org.w3c.tidy.Tidy;

import org.w3c.tidy.Node;



/**
 * This program shows how to use Tidy as an HTML parser.
 * It creates an instance of Tidy, calls the parse method
 * to parse a file input stream, and dumps a text representation
 * of the parse tree to System.out.
 *
 * Usage: java Test17 html-file
 */
public class Test17 {

    /** Pre-built padding from which short indentation prefixes are cut. */
    private static final String spaces =
        "                                                             ";

    /**
     * Returns a string of exactly 'indent' blanks.  The shared padding
     * constant is used when it is long enough; deeper levels get a freshly
     * built string so that deeply nested documents do not fail with a
     * StringIndexOutOfBoundsException.
     */
    private static String padding(int indent)
    {
        if (indent <= spaces.length()) {
            return spaces.substring(0, indent);
        }
        StringBuffer sb = new StringBuffer(indent);
        for (int i = 0; i < indent; i++) {
            sb.append(' ');
        }
        return sb.toString();
    }

    /**
     * Prints the given node, all of its following siblings and,
     * recursively, their content to System.out.
     *
     * @param node   first node of a sibling list; may be null
     * @param indent number of blanks to print in front of every line
     */
    private static void dump(Node node, int indent)
    {
        String prefix = padding(indent);
        Node n = node;

        while (n != null) {
            System.out.println( prefix + "------Node-------");
            // Switch on the node currently being visited (n), not on the
            // head of the sibling list, so each sibling reports its own type.
            switch (n.type) {
            case Node.RootNode:
                System.out.println( prefix + "type: RootNode");
                break;
            case Node.DocTypeTag:
                System.out.println( prefix + "type: DocTypeTag");
                break;
            case Node.CommentTag:
                System.out.println( prefix + "type: CommentTag");
                break;
            case Node.ProcInsTag:
                System.out.println( prefix + "type: ProcInsTag");
                break;
            case Node.TextNode:
                System.out.println( prefix + "type: TextNode");
                // Concatenating a null String prints "null", so a missing
                // value shows up as "value: null" without a special case.
                String v = n.getNodeValue();
                System.out.println( prefix + "value: " + v);
                break;
            case Node.StartTag:
                System.out.println( prefix + "type: StartTag");
                break;
            case Node.EndTag:
                System.out.println( prefix + "type: EndTag");
                break;
            case Node.StartEndTag:
                System.out.println( prefix + "type: StartEndTag");
                break;
            case Node.AspTag:
                System.out.println( prefix + "type: AspTag");
                break;
            default:
                System.out.println( prefix + "invalid type");
                break;
            }
            if (n.element != null) {
                System.out.println( prefix + "element: " + n.element);
            }
            dump(n.content, indent+4);
            n = n.next;
        }
    }

    /**
     * Parses the file named by the first argument and dumps the
     * resulting parse tree.
     *
     * @param args args[0] is the name of the HTML file to parse
     */
    public static void main( String[] args )
    {
        if (args.length < 1) {
            System.err.println("Usage: java Test17 html-file");
            return;
        }

        FileInputStream in = null;
        Tidy tidy = new Tidy();
        Node root = null;

        try {
            in = new FileInputStream(args[0]);
            tidy.setMakeClean(true);
            // No output stream is supplied; only the returned tree is used.
            root = tidy.parse(in, null);
            dump(root, 0);
        }
        catch ( IOException e ) {
            System.out.println( e.toString() );
        }
        finally {
            // Close the input file whether or not parsing succeeded.
            if (in != null) {
                try {
                    in.close();
                }
                catch ( IOException e ) {
                    System.out.println( e.toString() );
                }
            }
        }
    }

}


Send questions, comments, or bug reports to Andy Quick.