锘??xml version="1.0" encoding="utf-8" standalone="yes"?>亚洲人成亚洲精品,亚洲国产夜色在线观看,亚洲AV日韩精品久久久久久http://m.tkk7.com/willpower88/category/37510.html瀵笿AVA鏈夌偣鐞嗚В浜?jiǎn)鈥︹?/description>zh-cnMon, 09 Feb 2009 08:36:13 GMTMon, 09 Feb 2009 08:36:13 GMT60lucene2.0+heritrix紺轟緥琛ュ厖http://m.tkk7.com/willpower88/archive/2009/02/09/253914.html涓鍑?/dc:creator>涓鍑?/author>Mon, 09 Feb 2009 07:44:00 GMThttp://m.tkk7.com/willpower88/archive/2009/02/09/253914.htmlhttp://m.tkk7.com/willpower88/comments/253914.htmlhttp://m.tkk7.com/willpower88/archive/2009/02/09/253914.html#Feedback0http://m.tkk7.com/willpower88/comments/commentRss/253914.htmlhttp://m.tkk7.com/willpower88/services/trackbacks/253914.html search鐨凟xtractor浠g爜濡備笅錛岋紙鍒拰涔︿笂瀹炰緥鐩稿悓錛変緵澶у鍙傝冿細(xì)闄勪歡閲屾湁瀹屾暣浠g爜
package com.luceneheritrixbook.extractor.younet;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Date;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;

import com.luceneheritrixbook.extractor.Extractor;
import com.luceneheritrixbook.util.StringUtils;

/**
 * <p></p>
 * 
@author cnyqiao@hotmail.com
 * @date   Feb 6, 2009 
 
*/

public class ExtractYounetMoblie extends Extractor {

    @Override
    
public void extract() {
        BufferedWriter bw 
= null;
        NodeFilter title_filter 
= new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class""mo_tit"));
        NodeFilter attribute_filter 
= new AndFilter(new TagNameFilter("p"), new HasChildFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class""gn_sp1 blue1"))));
        NodeFilter img_filter 
= new AndFilter(new TagNameFilter("span"), new HasChildFilter(new TagNameFilter("img")));
        
        
//鎻愬彇鏍囬淇℃伅
        try {
            
//Parser鏍規(guī)嵁榪囨護(hù)鍣ㄨ繑鍥炴墍鏈夋弧瓚寵繃婊ゆ潯浠剁殑鑺傜偣
            
// 榪唬閫愭笎鏌ユ壘
            NodeList nodeList=this.getParser().parse(title_filter);
            NodeIterator it 
= nodeList.elements();
            StringBuffer title 
= new StringBuffer();
            
while (it.hasMoreNodes()) {
                Node node 
= (Node) it.nextNode();
                String[] names 
= node.toPlainTextString().split(" ");
                
for(int i = 0; i < names.length; i++)
                    title.append(names[i]).append(
"-");
                title.append(
new Date().getTime());
                
//鍒涘緩瑕佺敓鎴愮殑鏂囦歡
                bw = new BufferedWriter(new FileWriter(new File(this.getOutputPath() + title + ".txt")));
                
//鑾峰彇褰撳墠鎻愬彇欏電殑瀹屾暣URL鍦板潃
                int startPos = this.getInuputFilePath().indexOf("mirror"+ 6;
                String url_seg 
= this.getInuputFilePath().substring(startPos);
                url_seg 
= url_seg.replaceAll("\\\\""/");
                String url 
= "http:/" + url_seg;
                
//鍐欏叆褰撳墠鎻愬彇欏電殑瀹屾暣URL鍦板潃
                bw.write(url + NEWLINE);
                bw.write(names[
0+ NEWLINE);
                bw.write(names[
1+ NEWLINE);
                
            }
            
// 閲嶇疆Parser
            this.getParser().reset();
            Parser attNameParser 
= null;
            Parser attValueParser 
= null;
            
//Parser parser=new Parser("http://www.sina.com.cn");
            NodeFilter attributeName_filter = new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class""gn_sp1 blue1"));
            NodeFilter attributeValue_filter 
= new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class""gn_sp2"));
            String attName 
= "";
            String attValue 
= "";
            
// 榪唬閫愭笎鏌ユ壘
            nodeList=this.getParser().parse(attribute_filter);
            it 
= nodeList.elements();
            
while (it.hasMoreNodes()) {                
                Node node 
= (Node) it.nextNode();
                attNameParser 
= new Parser();
                attNameParser.setEncoding(
"GB2312");
                attNameParser.setInputHTML(node.toHtml());
                NodeList attNameNodeList 
= attNameParser.parse(attributeName_filter);
                attName 
= attNameNodeList.elements().nextNode().toPlainTextString();
                
                attValueParser 
= new Parser();
                attValueParser.setEncoding(
"GB2312");
                attValueParser.setInputHTML(node.toHtml());
                NodeList attValueNodeList 
= attValueParser.parse(attributeValue_filter);
                attValue 
= attValueNodeList.elements().nextNode().toPlainTextString();
                bw.write(attName.trim() 
+ attValue.trim());
                bw.newLine();
            }
            
// 閲嶇疆Parser
            this.getParser().reset();
            String imgUrl 
= "";
            String fileType 
="";
            
// 榪唬閫愭笎鏌ユ壘
            nodeList=this.getParser().parse(img_filter);
            it 
= nodeList.elements();
            
while (it.hasMoreNodes()) {                
                Node node 
= (Node) it.nextNode();
                
                ImageTag imgNode 
= (ImageTag)node.getChildren().elements().nextNode();
                imgUrl 
= imgNode.getAttribute("src");                
                fileType 
= imgUrl.trim().substring(imgUrl
                        .lastIndexOf(
"."+ 1);
                
//鐢熸垚鏂扮殑鍥劇墖鐨勬枃浠跺悕
                String new_iamge_file = StringUtils.encodePassword(imgUrl, HASH_ALGORITHM) + "." + fileType;
                
//imgUrl = new HtmlPaserFilterTest().replace(new_iamge_file, "+", " ");
                
//鍒╃敤miorr鐩綍涓嬬殑鍥劇墖鐢熸垚鐨勬柊鐨勫浘鐗?/span>
                this.copyImage(imgUrl, new_iamge_file);
                bw.write(SEPARATOR 
+ NEWLINE);
                bw.write(new_iamge_file 
+ NEWLINE);
            }
            
            
        } 
catch(Exception e) {
            e.printStackTrace();
        } 
finally {
            
try{
                
if (bw != null)
                    bw.close();
            }
catch(IOException e){
                e.printStackTrace();
            }
        }
        
    }
}
榪愯涔︿笂鐨刪eritrix瀹炰緥錛屽茍鎸変功涓婄殑榛樿璁劇疆榪涜鎶撳彇濡備笅錛碉疾錛╋細(xì)錛堣鑷繁鍒嗘瀽鏁寸悊錛?br />
http://mobile.younet.com/files/list_1.html
http://mobile.younet.com/files/list_2.html
http://mobile.younet.com/files/list_3.html



]]>
主站蜘蛛池模板: 特级做a爰片毛片免费看| 色噜噜狠狠色综合免费视频| 免费无码一区二区三区蜜桃| 亚洲日韩国产一区二区三区| 国产成人高清亚洲一区91| 国产成人涩涩涩视频在线观看免费 | 99在线免费观看视频| 亚洲精品无码久久久久sm| 在线看片免费人成视频久网下载 | 四虎一区二区成人免费影院网址| 国产国产人免费视频成69大陆| 性色av极品无码专区亚洲| 国产99视频精品免费视频7| 国产成人亚洲毛片| MM131亚洲国产美女久久| 97超高清在线观看免费视频| 国产成A人亚洲精V品无码| 污视频在线免费观看| 亚洲性线免费观看视频成熟| 免费无码成人AV片在线在线播放| 国产精品亚洲AV三区| 日日噜噜噜噜夜夜爽亚洲精品| 国产在线精品观看免费观看| 亚洲欧洲日产国码久在线观看| 国产精彩免费视频| 国产综合成人亚洲区| 国产精品xxxx国产喷水亚洲国产精品无码久久一区 | 看成年女人免费午夜视频| 亚洲综合国产一区二区三区| 免费女人高潮流视频在线观看| ASS亚洲熟妇毛茸茸PICS| www国产亚洲精品久久久| 日韩免费电影网站| 亚洲狠狠成人综合网| 亚洲成a人片在线观看播放| 一色屋成人免费精品网站| 老司机免费午夜精品视频| 亚洲av永久无码精品网站 | 边摸边脱吃奶边高潮视频免费| 国产亚洲精品资源在线26u| 一二三四影视在线看片免费|