<rt id="bn8ez"></rt>
<label id="bn8ez"></label>

  • <span id="bn8ez"></span>

    <label id="bn8ez"><meter id="bn8ez"></meter></label>

    posts - 495,comments - 227,trackbacks - 0

    為了支持全文檢索,有必要將HTML格式的文章轉化為純文本格式,因此我設計了一個基本的WebFormatter類,提供一個簡單的public static String html2text(String html),將HTML格式轉化為Text:

    /*
     * File: WebFormatter.java
     * Created on 2005-6-24
     * Author: Liao Xuefeng, asklxf@163.com
     * Copyright (C) 2005, Liao Xuefeng.
     */
    package com.mboker.blog.web.util;

    import java.util.*;
    import java.text.SimpleDateFormat;

    /**
    ?* Do some format on web display.
    ?*
    ?* @author Xuefeng
    ?*/
    public class WebFormatter {

    ??? public static String html2text(String html) {
    ??????? StringBuffer sb = new StringBuffer(html.length());
    ??????? char[] data = html.toCharArray();
    ??????? int start = 0;
    ??????? boolean previousIsPre = false;
    ??????? Token token = null;
    ??????? for(;;) {
    ??????????? token = parse(data, start, previousIsPre);
    ??????????? if(token==null)
    ??????????????? break;
    ??????????? previousIsPre = token.isPreTag();
    ??????????? sb = sb.append(token.getText());
    ??????????? start += token.getLength();
    ??????? }
    ??????? return sb.toString();
    ??? }

    ??? private static Token parse(char[] data, int start, boolean previousIsPre) {
    ??????? if(start>=data.length)
    ??????????? return null;
    ??????? // try to read next char:
    ??????? char c = data[start];
    ??????? if(c=='<') {
    ??????????? // this is a tag or comment or script:
    ??????????? int end_index = indexOf(data, start+1, '>');
    ??????????? if(end_index==(-1)) {
    ??????????????? // the left is all text!
    ??????????????? return new Token(Token.TOKEN_TEXT, data, start, data.length, previousIsPre);
    ??????????? }
    ??????????? String s = new String(data, start, end_index-start+1);
    ??????????? // now we got s="<...>":
    ??????????? if(s.startsWith("<!--")) { // this is a comment!
    ??????????????? int end_comment_index = indexOf(data, start+1, "-->");
    ??????????????? if(end_comment_index==(-1)) {
    ??????????????????? // illegal end, but treat as comment:
    ??????????????????? return new Token(Token.TOKEN_COMMENT, data, start, data.length, previousIsPre);
    ??????????????? }
    ??????????????? else
    ??????????????????? return new Token(Token.TOKEN_COMMENT, data, start, end_comment_index+3, previousIsPre);
    ??????????? }
    ??????????? String s_lowerCase = s.toLowerCase();
    ??????????? if(s_lowerCase.startsWith("<script")) { // this is a script:
    ??????????????? int end_script_index = indexOf(data, start+1, "</script>");
    ??????????????? if(end_script_index==(-1))
    ??????????????????? // illegal end, but treat as script:
    ??????????????????? return new Token(Token.TOKEN_SCRIPT, data, start, data.length, previousIsPre);
    ??????????????? else
    ??????????????????? return new Token(Token.TOKEN_SCRIPT, data, start, end_script_index+9, previousIsPre);
    ??????????? }
    ??????????? else { // this is a tag:
    ??????????????? return new Token(Token.TOKEN_TAG, data, start, start+s.length(), previousIsPre);
    ??????????? }
    ??????? }
    ??????? // this is a text:
    ??????? int next_tag_index = indexOf(data, start+1, '<');
    ??????? if(next_tag_index==(-1))
    ??????????? return new Token(Token.TOKEN_TEXT, data, start, data.length, previousIsPre);
    ??????? return new Token(Token.TOKEN_TEXT, data, start, next_tag_index, previousIsPre);
    ??? }

    ??? private static int indexOf(char[] data, int start, String s) {
    ??????? char[] ss = s.toCharArray();
    ??????? // TODO: performance can improve!
    ??????? for(int i=start; i<(data.length-ss.length); i++) {
    ??????????? // compare from data[i] with ss[0]:
    ??????????? boolean match = true;
    ??????????? for(int j=0; j<ss.length; j++) {
    ??????????????? if(data[i+j]!=ss[j]) {
    ??????????????????? match = false;
    ??????????????????? break;
    ??????????????? }
    ??????????? }
    ??????????? if(match)
    ??????????????? return i;
    ??????? }
    ??????? return (-1);
    ??? }

    ??? private static int indexOf(char[] data, int start, char c) {
    ??????? for(int i=start; i<data.length; i++) {
    ??????????? if(data[i]==c)
    ??????????????? return i;
    ??????? }
    ??????? return (-1);
    ??? }

    }

    /**
     * One piece of an HTML document (a text run, tag, comment or script block)
     * together with the plain text it contributes to the converted output.
     */
    class Token {

        public static final int TOKEN_TEXT    = 0; // html text.
        public static final int TOKEN_COMMENT = 1; // comment like <!-- comments... -->
        public static final int TOKEN_TAG     = 2; // tag like <pre>, <font>, etc.
        public static final int TOKEN_SCRIPT  = 3; // script block

        private static final char[] TAG_BR  = "<br".toCharArray();
        private static final char[] TAG_P   = "<p".toCharArray();
        private static final char[] TAG_LI  = "<li".toCharArray();
        private static final char[] TAG_PRE = "<pre".toCharArray();
        private static final char[] TAG_HR  = "<hr".toCharArray();

        private static final char[] END_TAG_TD  = "</td>".toCharArray();
        private static final char[] END_TAG_TR  = "</tr>".toCharArray();
        private static final char[] END_TAG_LI  = "</li>".toCharArray();
        private static final char[] END_TAG_PRE = "</pre".toCharArray();

        // Longest "&...;" sequence that is looked up, counting '&' and ';'.
        private static final int MAX_ENTITY_LENGTH = 10;

        // Named character references and their plain-text replacements.
        private static final Map SPECIAL_CHARS = new HashMap();

        private int type;
        private String html;           // original html
        private String text = null;    // converted text, null means "nothing"
        private int length = 0;        // html length
        private boolean isPre = false; // preformatted mode after this token?

        static {
            SPECIAL_CHARS.put("&quot;", "\"");
            SPECIAL_CHARS.put("&lt;",   "<");
            SPECIAL_CHARS.put("&gt;",   ">");
            SPECIAL_CHARS.put("&amp;",  "&");
            SPECIAL_CHARS.put("&reg;",  "(r)");
            SPECIAL_CHARS.put("&copy;", "(c)");
            SPECIAL_CHARS.put("&nbsp;", " ");
            SPECIAL_CHARS.put("&pound;", "\u00A3"); // pound sign
        }

        /**
         * Creates a token from {@code data[start..end)} and converts it to text.
         *
         * @param type          one of the {@code TOKEN_*} constants
         * @param data          the whole document
         * @param start         index of the first character of the token
         * @param end           index just past the last character of the token
         * @param previousIsPre whether the token sits inside preformatted content
         */
        public Token(int type, char[] data, int start, int end, boolean previousIsPre) {
            this.type = type;
            this.length = end - start;
            this.html = new String(data, start, length);
            // Preformatted mode stays on until a closing </pre> is seen, so it
            // is inherited by default and only changed by <pre> / </pre> tags.
            this.isPre = previousIsPre;
            parseText(previousIsPre);
        }

        /** Returns the number of source characters this token covers. */
        public int getLength() {
            return length;
        }

        /**
         * Returns whether content following this token is preformatted: true
         * after a {@code <pre>} tag and for every later token up to the matching
         * {@code </pre>}.
         */
        public boolean isPreTag() {
            return isPre;
        }

        // Fills in 'text' (and 'isPre' for <pre> / </pre>) according to the type.
        private void parseText(boolean previousIsPre) {
            if (type == TOKEN_TAG) {
                char[] cs = html.toCharArray();
                if (compareTag(TAG_BR, cs) || compareTag(TAG_P, cs)) {
                    text = "\n";
                } else if (compareTag(TAG_LI, cs)) {
                    text = "\n* ";
                } else if (compareTag(TAG_PRE, cs)) {
                    isPre = true;
                } else if (compareTag(END_TAG_PRE, cs)) {
                    isPre = false;
                } else if (compareTag(TAG_HR, cs)) {
                    text = "\n--------\n";
                } else if (compareString(END_TAG_TD, cs)) {
                    text = "\t";
                } else if (compareString(END_TAG_TR, cs) || compareString(END_TAG_LI, cs)) {
                    text = "\n";
                }
            } else if (type == TOKEN_TEXT) {
                text = toText(html, previousIsPre);
            }
            // Comments and scripts contribute no text.
        }

        /** Returns the plain text of this token, or "" if it produces none. */
        public String getText() {
            return text == null ? "" : text;
        }

        /**
         * Converts a run of HTML text into plain text.
         *
         * <p>Inside preformatted content spaces and line breaks are kept (CR LF,
         * CR and LF each become one '\n'). Elsewhere a run of spaces and line
         * breaks collapses into a single space, so that words on adjacent source
         * lines are not fused together. Character references are decoded.
         *
         * @param html  the text run
         * @param isPre whether the run is preformatted
         * @return the converted text
         */
        private String toText(String html, final boolean isPre) {
            char[] cs = html.toCharArray();
            StringBuffer buffer = new StringBuffer(cs.length);
            boolean continueSpace = false;
            int pos = 0;
            while (pos < cs.length) {
                char current = cs[pos];
                if (current == ' ') {
                    if (isPre || !continueSpace) {
                        buffer.append(' ');
                    }
                    continueSpace = true;
                    pos++;
                } else if (current == '\r' || current == '\n') {
                    // CR LF is a single line break.
                    boolean crlf = current == '\r' && pos + 1 < cs.length && cs[pos + 1] == '\n';
                    if (isPre) {
                        buffer.append('\n');
                    } else if (!continueSpace) {
                        buffer.append(' ');
                    }
                    continueSpace = true;
                    pos += crlf ? 2 : 1;
                } else if (current == '&') {
                    continueSpace = false;
                    pos += appendEntity(cs, pos, buffer);
                } else {
                    continueSpace = false;
                    buffer.append(current);
                    pos++;
                }
            }
            return buffer.toString();
        }

        /**
         * Decodes the character reference starting at {@code cs[pos]} (which is
         * '&') into {@code buffer}. Anything that is not a known named reference
         * or a decimal reference in the range 1..65535 is emitted as a literal
         * '&'.
         *
         * @return the number of source characters consumed
         */
        private int appendEntity(char[] cs, int pos, StringBuffer buffer) {
            int len = readUtil(cs, pos, ';', MAX_ENTITY_LENGTH);
            if (len != -1) {
                String specChar = (String) SPECIAL_CHARS.get(new String(cs, pos, len));
                if (specChar != null) {
                    buffer.append(specChar);
                    return len;
                }
                if (len > 3 && cs[pos + 1] == '#') {
                    try {
                        int code = Integer.parseInt(new String(cs, pos + 2, len - 3));
                        if (code > 0 && code < 65536) {
                            buffer.append((char) code);
                            // Skip the whole "&#NNN;" sequence, not just the '&'.
                            return len;
                        }
                    } catch (NumberFormatException ignored) {
                        // Not a decimal number: fall through and keep the text.
                    }
                }
            }
            buffer.append('&');
            return 1;
        }

        /**
         * Scans at most {@code maxLength} characters from {@code cs[start]},
         * never past the end of the array, for the character {@code util}.
         *
         * @return the number of characters from {@code start} up to and
         *         including {@code util}, or -1 if it was not found
         */
        private int readUtil(final char[] cs, final int start, final char util, final int maxLength) {
            int end = Math.min(start + maxLength, cs.length);
            for (int i = start; i < end; i++) {
                if (cs[i] == util) {
                    return i - start + 1;
                }
            }
            return -1;
        }

        /**
         * Compares a lower-case tag prefix such as "&lt;input" with an actual tag
         * such as "&lt;INPUT value=aa&gt;", ignoring letter case. The character
         * after the prefix must not be a letter a-z, so "&lt;p" does not match
         * "&lt;pre&gt;".
         */
        private boolean compareTag(final char[] ori_tag, char[] tag) {
            if (ori_tag.length >= tag.length) {
                return false;
            }
            if (!compareString(ori_tag, tag)) {
                return false;
            }
            char c = Character.toLowerCase(tag[ori_tag.length]);
            return c < 'a' || c > 'z';
        }

        /** Tells whether {@code comp} starts with {@code ori}, ignoring the case of {@code comp}. */
        private boolean compareString(final char[] ori, char[] comp) {
            if (ori.length > comp.length) {
                return false;
            }
            for (int i = 0; i < ori.length; i++) {
                if (Character.toLowerCase(comp[i]) != ori[i]) {
                    return false;
                }
            }
            return true;
        }

        /** Returns the original HTML of this token. */
        public String toString() {
            return html;
        }
    }

    注意,請先將html中的<body>...</body>部分提取出來,再交給WebFormatter處理,因為html->text轉換實質是刪除所有標簽(某些標簽如<br>被轉化為'\n')、Script和注釋,對于JavaScript生成的動態內容(例如document.write)無能為力。

    posted on 2006-04-07 16:33 SIMONE 閱讀(808) 評論(0)  編輯  收藏 所屬分類: JAVA
    主站蜘蛛池模板: a级毛片毛片免费观看久潮喷| 色屁屁在线观看视频免费| 亚洲精品无码Av人在线观看国产 | 亚洲福利视频一区二区三区| 国产一级婬片A视频免费观看| 色se01短视频永久免费| 亚洲真人日本在线| 一级成人生活片免费看| 成人无遮挡裸免费视频在线观看| 亚洲人色大成年网站在线观看| 麻豆视频免费观看| 亚洲人成综合网站7777香蕉| 9i9精品国产免费久久| 亚洲欧洲日产国码av系列天堂| 99久久免费国产精品热| 亚洲麻豆精品果冻传媒| 2022免费国产精品福利在线| 永久亚洲成a人片777777| 免费观看在线禁片| 亚洲伊人久久综合影院| 男人都懂www深夜免费网站| 亚洲日韩乱码久久久久久| 成年午夜视频免费观看视频 | 永久免费av无码网站大全| 久久精品国产亚洲av瑜伽| 最近免费中文字幕高清大全| 亚洲不卡中文字幕| 特级精品毛片免费观看| 中文亚洲成a人片在线观看| 在线观看亚洲免费视频| 国产亚洲AV手机在线观看| 99re6热视频精品免费观看| 亚洲深深色噜噜狠狠爱网站| 无码A级毛片免费视频内谢| 亚洲av产在线精品亚洲第一站| 国产色爽女小说免费看| 日韩精品无码免费专区午夜不卡| 亚洲免费电影网站| 亚洲国产天堂久久久久久| 日韩在线一区二区三区免费视频| 亚洲AV永久青草无码精品|