java JDK自带的HTML解析器 (HTMLEditorKit.Parser) 示例

2018-07-20    来源:open-open

容器云强势上线!快速搭建集群,上万Linux镜像随意使用

HtmlParseDemo.java

import java.io.*;
import java.net.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
import javax.swing.text.html.parser.*;

/**
 * This small demo program shows how to use the
 * HTMLEditorKit.Parser and its implementing class
 * ParserDelegator in the Swing system.
 */

public class HtmlParseDemo {
    /**
     * Parses the HTML named by the first command-line argument and prints a
     * structural listing of it through {@code HTMLParseLister}.
     *
     * <p>An argument containing {@code "://"} after its first character is
     * treated as a URL; anything else is treated as a local file name. Any
     * failure while opening, parsing or closing the input is reported on
     * standard error together with its stack trace.
     *
     * @param args command-line arguments; {@code args[0]} is the URL or file
     *             to parse. With no arguments a usage line is printed and the
     *             JVM exits.
     */
    public static void main(String [] args) {
        if (args.length == 0) {
            // The usage text must name the real (case-sensitive) class.
            System.err.println("Usage: java HtmlParseDemo [url | file]");
            System.exit(0);
        }
        String spec = args[0];

        // try-with-resources guarantees the reader is closed even when
        // parsing throws; a plain close() after parse() would be skipped.
        try (Reader r = openReader(spec)) {
            System.out.println("About to parse " + spec);
            HTMLEditorKit.Parser parser = new ParserDelegator();
            // 'true' tells the parser to ignore charset declarations found
            // in the document instead of failing on them.
            parser.parse(r, new HTMLParseLister(), true);
        }
        catch (Exception e) {
            System.err.println("Error: " + e);
            e.printStackTrace(System.err);
        }
    }

    /**
     * Opens a character stream for the given URL or file name.
     *
     * @param spec a URL (contains {@code "://"} past index 0) or a file name
     * @return a reader over the content; the caller is responsible for closing it
     * @throws IOException if the source cannot be opened, or if the URL content
     *                     is neither an {@code InputStream} nor a {@code Reader}
     */
    private static Reader openReader(String spec) throws IOException {
        if (spec.indexOf("://") > 0) {
            Object content = new URL(spec).getContent();
            if (content instanceof InputStream) {
                // Decodes with the platform default charset, as before.
                return new InputStreamReader((InputStream) content);
            }
            if (content instanceof Reader) {
                return (Reader) content;
            }
            throw new IOException("Bad URL content type.");
        }
        return new FileReader(spec);
    }
}

HTMLParseLister.java

/**
 * HTML parsing proceeds by calling a callback for
 * each and every piece of the HTML document.  This
 * simple callback class simply prints an indented
 * structural listing of the HTML data.
 */
class HTMLParseLister extends HTMLEditorKit.ParserCallback
{
    /** Number of spaces currently printed in front of each listed item. */
    int indentSize = 0;

    /** Moves the listing three columns to the right. */
    protected void indent() {
        indentSize = indentSize + 3;
    }

    /** Moves the listing three columns to the left, never going below zero. */
    protected void unIndent() {
        indentSize = Math.max(0, indentSize - 3);
    }

    /** Writes the current indentation to standard output, without a newline. */
    protected void pIndent() {
        StringBuilder padding = new StringBuilder();
        int remaining = indentSize;
        while (remaining > 0) {
            padding.append(" ");
            remaining--;
        }
        System.out.print(padding);
    }

    /**
     * Lists a run of text by its length.
     *
     * @param data the characters of the text run
     * @param pos  position reported by the parser (not used)
     */
    @Override
    public void handleText(char[] data, int pos) {
        pIndent();
        int charCount = data.length;
        System.out.println("Text(" + charCount + " chars)");
    }

    /**
     * Lists a comment by its length.
     *
     * @param data the characters of the comment
     * @param pos  position reported by the parser (not used)
     */
    @Override
    public void handleComment(char[] data, int pos) {
        pIndent();
        int charCount = data.length;
        System.out.println("Comment(" + charCount + " chars)");
    }

    /**
     * Lists an opening tag with its attribute count, then indents so that
     * whatever follows is shown nested beneath it.
     *
     * @param t   the tag being opened
     * @param a   the attributes attached to the tag
     * @param pos position reported by the parser (not used)
     */
    @Override
    public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
        pIndent();
        String tagName = t.toString();
        int attrCount = a.getAttributeCount();
        System.out.println("Tag start(<" + tagName + ">, " + attrCount + " attrs)");
        indent();
    }

    /**
     * Un-indents first, so the closing tag is listed at the same depth as
     * its opening tag, then lists it.
     *
     * @param t   the tag being closed
     * @param pos position reported by the parser (not used)
     */
    @Override
    public void handleEndTag(HTML.Tag t, int pos) {
        unIndent();
        pIndent();
        String tagName = t.toString();
        System.out.println("Tag end(</" + tagName + ">)");
    }

    /**
     * Lists a tag reported through the simple-tag callback; the indentation
     * depth is left unchanged.
     *
     * @param t   the tag
     * @param a   the attributes attached to the tag
     * @param pos position reported by the parser (not used)
     */
    @Override
    public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
        pIndent();
        String tagName = t.toString();
        int attrCount = a.getAttributeCount();
        System.out.println("Tag(<" + tagName + ">, " + attrCount + " attrs)");
    }

    /**
     * Reports a parser error, unindented, together with its position.
     *
     * @param errorMsg the message supplied by the parser
     * @param pos      the position the parser associates with the error
     */
    @Override
    public void handleError(String errorMsg, int pos){
        String report = "Parsing error: " + errorMsg + " at " + pos;
        System.out.println(report);
    }
}

标签:

版权申明:本站文章部分来自网络,如有侵权,请联系:west999com@outlook.com
特别注意:本站所有转载文章言论不代表本站观点!
本站所提供的图片等素材,版权归原作者所有,如需使用,请与原作者联系。

上一篇:编程算法 - 迷宫的最短路径 代码(C++)

下一篇:Java实现zip解压缩目录中的所有文件