锘??xml version="1.0" encoding="utf-8" standalone="yes"?> 浠婂ぉ鐢ㄤ簡涓婰ucene錛屽彂鐜扮綉涓婅櫧鐒朵篃鏈変笉灝戜粙緇嶅畠鐨勬枃妗o紝涓嶈繃寰堝閮藉亸鍚戜粙緇嶆蹇靛憖銆佽璁℃垨鑰呮槸涓浜涙洿涓烘繁鍏ョ殑涓滆タ錛屽浜庡叾鍏ラ棬浣跨敤鐨勪粙緇嶆х殑鏂囨。騫朵笉澶氾紝灝卞啓浜嗚繖涔堜竴綃囥?br />
Lucene
基本使用介绍
鏈枃鐨勭洰鐨勪笉鍦ㄤ簬瀵筁ucene鐨勬蹇靛拰璁捐榪欎簺榪涜浠嬬粛錛屼粎鍦ㄤ簬浠嬬粛鎬庝箞鏍峰幓浣跨敤Lucene鏉ヨ揪鍒拌嚜宸辨兂瑕佺殑鍑犵甯歌鐨勫叏鏂囨绱㈢殑闇姹傦紝濡傛灉鎯蟲繁鍏ヤ簡瑙ucene鐨勮瘽鏈枃涓嶄細(xì)甯︾粰浣犱粈涔堟敹鑾風(fēng)殑銆傜湅瀹屾湰鏂囧悗鎯蟲洿娣卞叆鐨勪簡瑙ucene璇瘋闂細(xì)http://lucene.apache.org 聽 涓.聽 姒傝堪 闅忕潃緋葷粺淇℃伅鐨勮秺鏉ヨ秺澶氾紝鎬庝箞鏍蜂粠榪欎簺淇℃伅嫻鋒磱涓崬璧瘋嚜宸辨兂瑕佺殑閭d竴鏍歸拡灝卞彉寰楅潪甯擱噸瑕佷簡錛屽叏鏂囨绱㈡槸閫氬父鐢ㄤ簬瑙e喅姝ょ被闂鐨勬柟妗堬紝鑰孡ucene鍒欎負(fù)瀹炵幇鍏ㄦ枃媯(gè)绱㈢殑宸ュ叿錛屼換浣曞簲鐢ㄩ兘鍙氳繃宓屽叆瀹冩潵瀹炵幇鍏ㄦ枃媯(gè)绱€?/p>
浜?聽 鐜鎼緩 浠巐ucene.apache.org涓婁笅杞芥渶鏂扮増鏈殑lucene.jar錛屽皢姝ar浣滀負(fù)欏圭洰鐨刡uild path錛岄偅涔堝湪欏圭洰涓氨鍙互鐩存帴浣跨敤lucene浜嗐?/p>
涓?聽 浣跨敤璇存槑 3.1.聽聽聽聽聽聽 鍩烘湰姒傚康 榪欓噷浠嬬粛鐨勪富瑕佷負(fù)鍦ㄤ嬌鐢ㄤ腑緇忓父紕板埌涓浜涙蹇碉紝浠ュぇ瀹墮兘姣旇緝鐔熸?zhèn)夌殑鏁版嵁搴撴潵杩涜绫绘瘮鐨勮瑙eQ屼嬌鐢↙ucene榪涜鍏ㄦ枃媯(gè)绱㈢殑榪囩▼鏈夌偣綾諱技鏁版嵁搴撶殑榪欎釜榪囩▼錛宼able---脿鏌ヨ鐩稿簲鐨勫瓧孌墊垨鏌ヨ鏉′歡----脿榪斿洖鐩稿簲鐨勮褰曪紝棣栧厛鏄疘ndexWriter錛岄氳繃瀹冨緩绔嬬浉搴旂殑绱㈠紩琛紝鐩稿綋浜庢暟鎹簱涓殑table錛屽湪鏋勫緩姝ょ儲(chǔ)寮曡〃鏃墮渶鎸囧畾鐨勪負(fù)璇ョ儲(chǔ)寮曡〃閲囩敤浣曠鏂瑰紡榪涜鏋勫緩錛屼篃灝辨槸璇村浜庡叾涓殑璁板綍鐨勫瓧孌典互浠涔堟柟寮忔潵榪涜鏍煎紡鐨勫垝鍒嗭紝榪欎釜鍦↙ucene涓О涓篈nalyzer錛孡ucene鎻愪緵浜嗗嚑縐嶇幆澧冧笅浣跨敤鐨凙nalyzer錛歋impleAnalyzer銆丼tandardAnalyzer銆丟ermanAnalyzer絳夛紝鍏朵腑StandardAnalyzer鏄粡甯鎬嬌鐢ㄧ殑錛屽洜涓哄畠鎻愪緵浜嗗浜庝腑鏂囩殑鏀寔錛屽湪琛ㄥ緩濂藉悗鎴戜滑灝遍渶瑕佸線閲岄潰鎻掑叆鐢ㄤ簬绱㈠紩鐨勮褰曪紝鍦↙ucene涓繖涓О涓篋ocument錛屾湁鐐圭被浼兼暟鎹簱涓璽able鐨勪竴琛岃褰曪紝璁板綍涓殑瀛楁鐨勬坊鍔犳柟娉曪紝鍦↙ucene涓О涓篎ield錛岃繖涓拰鏁版嵁搴撲腑鍩烘湰涓鏍鳳紝瀵逛簬Field Lucene鍒嗕負(fù)鍙绱㈠紩鐨勶紝鍙垏鍒嗙殑錛屼笉鍙鍒囧垎鐨勶紝涓嶅彲琚儲(chǔ)寮曠殑鍑犵緇勫悎綾誨瀷錛岄氳繃榪欏嚑涓厓绱犲熀鏈笂灝卞彲浠ュ緩绔嬭搗绱㈠紩浜嗐傚湪鏌ヨ鏃剁粡甯哥鍒扮殑涓哄彟澶栧嚑涓蹇碉紝棣栧厛鏄疩uery錛孡ucene鎻愪緵浜嗗嚑縐嶇粡甯稿彲浠ョ敤鍒扮殑Query錛歍ermQuery銆丮ultiTermQuery銆丅ooleanQuery銆乄ildcardQuery銆丳hraseQuery銆丳refixQuery銆丳hrasePrefixQuery銆丗uzzyQuery銆丷angeQuery銆丼panQuery錛孮uery鍏跺疄涔熷氨鏄寚瀵逛簬闇瑕佹煡璇㈢殑瀛楁閲囩敤浠涔堟牱鐨勬柟寮忚繘琛屾煡璇紝濡傛ā緋婃煡璇€佽涔夋煡璇€佺煭璇煡璇€佽寖鍥存煡璇€佺粍鍚堟煡璇㈢瓑錛岃繕鏈夊氨鏄疩ueryParser錛孮ueryParser鍙敤浜庡垱寤轟笉鍚岀殑Query錛岃繕鏈変竴涓狹ultiFieldQueryParser鏀寔瀵逛簬澶氫釜瀛楁榪涜鍚屼竴鍏抽敭瀛楃殑鏌ヨ錛孖ndexSearcher姒傚康鎸囩殑涓洪渶瑕佸浣曠洰褰曚笅鐨勭儲(chǔ)寮曟枃浠惰繘琛屼綍縐嶆柟寮忕殑鍒嗘瀽鐨勬煡璇紝鏈夌偣璞″鏁版嵁搴撶殑鍝绱㈠紩琛ㄨ繘琛屾煡璇㈠茍鎸変竴瀹氭柟寮忚繘琛岃褰曚腑瀛楁鐨勫垎瑙f煡璇㈢殑姒傚康錛岄氳繃IndexSearcher浠ュ強(qiáng)Query鍗沖彲鏌ヨ鍑洪渶瑕佺殑緇撴灉錛孡ucene榪斿洖鐨勪負(fù)Hits.閫氳繃閬嶅巻Hits鍙幏鍙栬繑鍥炵殑緇撴灉鐨凞ocument錛岄氳繃Document鍒欏彲鑾峰彇Field涓殑鐩稿叧淇℃伅浜嗐?/p>
閫氳繃瀵逛簬涓婇潰鍦ㄥ緩绔嬬儲(chǔ)寮曞拰鍏ㄦ枃媯(gè)绱㈢殑鍩烘湰姒傚康鐨勪粙緇嶅笇鏈涜兘璁╀綘瀵筁ucene寤虹珛涓瀹氱殑浜嗚В銆?/p>
3.2.聽聽聽聽聽聽 鍏ㄦ枃媯(gè)绱㈤渶姹傜殑瀹炵幇 绱㈠紩寤虹珛閮ㄥ垎鐨勪唬鐮侊細(xì)
聽聽聽聽聽聽聽 IndexWriter iwriter=getWriter(indexFilePath); 聽聽聽聽聽聽聽 Document doc=new Document(); 聽聽聽聽聽聽聽 doc.add(Field.Keyword("name","jerry")); 聽聽聽聽聽聽聽 doc.add(Field.Text("sender","bluedavy@gmail.com")); 聽聽聽聽聽聽聽 doc.add(Field.Text("receiver","google@gmail.com")); 聽聽聽聽聽聽聽 doc.add(Field.Text("title","鐢ㄤ簬绱㈠紩鐨勬爣棰?)); 聽聽聽聽聽聽聽 doc.add(Field.UnIndexed("content","涓嶅緩绔嬬儲(chǔ)寮曠殑鍐呭")); 聽聽聽聽聽聽聽 Document doc2=new Document(); 聽聽聽聽聽聽聽 doc2.add(Field.Keyword("name","jerry.lin")); 聽聽聽聽聽聽聽 doc2.add(Field.Text("sender","bluedavy@hotmail.com")); 聽聽聽聽聽聽聽 doc2.add(Field.Text("receiver","msn@hotmail.com")); 聽聽聽聽聽聽聽 doc2.add(Field.Text("title","鐢ㄤ簬绱㈠紩鐨勭浜屼釜鏍囬")); 聽聽聽聽聽聽聽 doc2.add(Field.Text("content","寤虹珛绱㈠紩鐨勫唴瀹?)); 聽聽聽聽聽聽聽 iwriter.addDocument(doc); 聽聽聽聽聽聽聽 iwriter.addDocument(doc2); 聽聽聽聽聽聽聽 iwriter.optimize(); 聽聽聽聽聽聽聽 iwriter.close(); 聽聽聽 } 聽聽聽 聽聽聽 private IndexWriter getWriter(String indexFilePath) throws Exception{ 聽聽聽聽聽聽聽 boolean append=true; 聽聽聽聽聽聽聽 File file=new File(indexFilePath+File.separator+"segments"); 聽聽聽聽聽聽聽 if(file.exists()) 聽聽聽聽聽聽聽聽聽聽聽 append=false; 聽聽聽聽聽聽聽 return new IndexWriter(indexFilePath,analyzer,append); 聽聽聽 }
聽聽聽聽聽聽聽 聽聽聽聽聽聽聽 Searcher searcher=new IndexSearcher(indexFilePath); 聽聽聽聽聽聽聽 Hits hits=searcher.search(query); 聽聽聽聽聽聽聽 for (int i = 0; i < hits.length(); i++) { 聽聽聽聽聽聽聽聽聽聽聽 System.out.println(hits.doc(i).get("name")); 聽聽聽聽聽聽聽 }
聽聽聽聽聽聽聽 聽聽聽聽聽聽聽 Searcher searcher=new IndexSearcher(indexFilePath); 聽聽聽聽聽聽聽 Hits hits=searcher.search(query); 聽聽聽聽聽聽聽 for (int i = 0; i < hits.length(); i++) { 聽聽聽聽聽聽聽聽聽聽聽 System.out.println(hits.doc(i).get("name")); 聽聽聽聽聽聽聽 }
聽聽聽聽聽聽聽 聽聽聽聽聽聽聽 Searcher searcher=new IndexSearcher(indexFilePath); 聽聽聽聽聽聽聽 Hits hits=searcher.search(query); 聽聽聽聽聽聽聽 for (int i = 0; i < hits.length(); i++) { 聽聽聽聽聽聽聽聽聽聽聽 System.out.println(hits.doc(i).get("name")); 聽聽聽聽聽聽聽 }
聽聽聽聽聽聽聽 Query mquery=new WildcardQuery(new Term("sender","bluedavy*")); 聽聽聽聽聽聽聽 TermQuery tquery=new TermQuery(new Term("name","jerry")); 聽聽聽聽聽聽聽 聽聽聽聽聽聽聽 BooleanQuery bquery=new BooleanQuery(); 聽聽聽聽聽聽聽 bquery.add(query,true,false); 聽聽聽聽聽聽聽 bquery.add(mquery,true,false); 聽聽聽聽聽聽聽 bquery.add(tquery,true,false); 聽聽聽聽聽聽聽 聽聽聽聽聽聽聽 Searcher searcher=new IndexSearcher(indexFilePath); 聽聽聽聽聽聽聽 Hits hits=searcher.search(bquery); 聽聽聽聽聽聽聽 for (int i = 0; i < hits.length(); i++) { 聽聽聽聽聽聽聽聽聽聽聽 System.out.println(hits.doc(i).get("name")); 聽聽聽聽聽聽聽 }
鐩鎬俊澶у閫氳繃涓婇潰鐨勮鏄庤兘鐭ラ亾Lucene鐨勪竴涓熀鏈殑浣跨敤鏂規(guī)硶錛屽湪鍏ㄦ枃媯(gè)绱㈡椂寤鴻澶у鍏堥噰鐢ㄨ涔夋椂鐨勬悳绱紝鍏堟悳绱㈠嚭鏈夋剰涔夌殑鍐呭錛屼箣鍚庡啀榪涜妯$硦涔嬬被鐨勬悳绱紝^_^錛岃繖涓繕鏄渶瑕佹牴鎹悳绱㈢殑闇姹傛墠鑳藉畾浜嗭紝Lucene榪樻彁渚涗簡寰堝鍏朵粬鏇村ソ鐢ㄧ殑鏂規(guī)硶錛岃繖涓氨絳夊緟澶у鍦ㄤ嬌鐢ㄧ殑榪囩▼涓嚜宸卞幓榪涗竴姝ョ殑鎽哥儲(chǔ)浜嗭紝姣斿瀵逛簬Lucene鏈韓鎻愪緵鐨凲uery鐨勬洿鐔熺粌鐨勬帉鎻★紝瀵逛簬Filter銆丼orter鐨勪嬌鐢紝鑷繁鎵╁睍瀹炵幇Analyzer錛岃嚜宸卞疄鐜癚uery絳夌瓑錛岀敋鑷沖彲浠ュ幓浜嗚В涓浜涘叧浜庢悳绱㈠紩鎿庣殑鎶鏈?鍒囪瘝銆佺儲(chǔ)寮曟帓搴?etc)絳夌瓑銆?br />
]]>
]]>
private void createIndex(String indexFilePath) throws Exception{
3.2.1.聽聽聽聽聽聽 瀵逛簬鏌愬瓧孌電殑鍏抽敭瀛楃殑妯$硦鏌ヨ
Query query=new WildcardQuery(new Term("sender","*davy*"));
3.2.2.聽聽聽聽聽聽 瀵逛簬鏌愬瓧孌電殑鍏抽敭瀛楃殑璇箟鏌ヨ
Query query=QueryParser.parse("绱㈠紩","title",analyzer);
3.2.3.聽聽聽聽聽聽 瀵逛簬澶氬瓧孌電殑鍏抽敭瀛楃殑鏌ヨ
Query query=MultiFieldQueryParser.parse("绱㈠紩",new String[]{"title","content"},analyzer);
3.2.4.聽聽聽聽聽聽 澶嶅悎鏌ヨ(澶氱鏌ヨ鏉′歡鐨勭患鍚堟煡璇?
Query query=MultiFieldQueryParser.parse("绱㈠紩",new String[]{"title","content"},analyzer);
四. 总结
]]>
鍦╥ndexing榪囩▼涓?瑕佹妸闇瑕乮ndexing鐨則ext鍒嗘瀽澶勭悊涓涓? 緇忚繃澶勭悊鍜屽垏璇?鐒跺悗寤虹珛index. 鑰屼笉閫氱殑Analyzer鏈変笉鍚岀殑鍒嗘瀽瑙勫垯, 鍥犳鍦ㄧ▼搴忎腑浣跨敤Lucene鏃?閫夋嫨姝g‘鐨凙nalyzer鏄緢閲嶈鐨?
1.Using Analyzers
鍦ㄤ嬌鐢ˋnalyzer浠ュ墠 鍏堟潵鐪嬬湅text緇忚繃Analyzer鍒嗘瀽鍚庣殑鏁堟灉鍚?
Listing 4.1 Visualizing analyzer effects
Analyzing "The quick brown fox jumped over the lazy dogs"
聽 WhitespaceAnalyzer:
聽聽聽 [The] [quick] [brown] [fox] [jumped] [over] [the] [lazy] [dogs]
聽 SimpleAnalyzer:
聽聽聽 [the] [quick] [brown] [fox] [jumped] [over] [the] [lazy] [dogs]
聽 StopAnalyzer:
聽聽聽 [quick] [brown] [fox] [jumped] [over] [lazy] [dogs]
聽 StandardAnalyzer:
聽聽聽 [quick] [brown] [fox] [jumped] [over] [lazy] [dogs]
銆
Analyzing "XY&Z Corporation - xyz@example.com"
聽 WhitespaceAnalyzer:
聽聽聽 [XY&Z] [Corporation] [-] [xyz@example.com]
聽 SimpleAnalyzer:
聽聽聽 [xy] [z] [corporation] [xyz] [example] [com]
聽 StopAnalyzer:
聽聽聽 [xy] [z] [corporation] [xyz] [example] [com]
聽 StandardAnalyzer:
聽聽聽 [xy&z] [corporation] [xyz@example.com]
涓婇潰鏄湪涓嬮潰鎴戜滑瑕佹彁鍒扮殑涓涓緥瀛愮殑榪愯緇撴灉. 鍙互鐪嬪嚭涓嶅悓鐨凙nalyzer 鏄浣曟潵鍒嗘瀽text鐨?鍦ㄥ垎鏋怲he quick brown fox jumped over the lazy dogs 鏃? WhitespaceAnalyzer鍜?SimpleAnalyzer鍙槸綆鍗曠殑鎶婅瘝鍒嗗紑,寤虹珛Term灝卞彲浠ヤ簡;鑰屽彟澶栦袱涓狝nalyzer鍒欏幓鎺変簡stop word. 鑰屽湪鍒嗘瀽XY&Z Corporation - xyz@example.com 鐨勬椂鍊?涓嶅悓鐨凙nalyzer 瀵瑰緟 & 鍜?- 鐨勬柟寮忎篃鏄笉涓鏍風(fēng)殑 . 鐜板湪瀵笰nalysis鏈変釜鎰熸х殑浜嗚В,涓嬮潰鏉ョ湅鐪嬩笉鍚屽鐞嗛樁孌電殑鍒嗘瀽榪囩▼.
I. Indexing Analysis
榪樿寰楀湪ch2 indexing 涓?璁插埌 ,鍦ㄥ緩绔媔ndex鏃?浣跨敤IndexWriter 鍦ㄦ瀯閫營ndexWriter鏃?瑕佷嬌鐢ㄥ埌Analyser.濡備笅鎵紺?
Analyzer analyzer = new StandardAnalyzer();
IndexWriter writer = new IndexWriter(directory,
analyzer, true);聽鐒跺悗灝卞彲浠ヤ嬌鐢╳riter瀵?document 鏉ndexing浜?濡備笅
聽Document doc = new Document();
doc.add(
Field.Text("title", "This is the title"));doc.add(
Field.UnStored("contents", "...document contents..."));writer.addDocument(doc);
聽浣跨敤鐨勬槸鍦ㄦ瀯閫營ndexWriter鏃?鎸囧畾鐨凙nalyzer. 濡傛灉瑕佺粰涓涓枃妗e崟鐙寚瀹氫竴涓狝nalyzer 鍙互鐢ㄤ笅闈㈢殑涓涓柟娉?
聽writer.addDocument(doc,analyzer);
II.QueryParser Analysis
聽 Analysis 鏄痶erm鎼滅儲(chǔ)鐨勫叧閿?瑕佺‘淇濈粡榪嘇nalyzer鍒嗘瀽鍚庣殑term鍜岃绱㈠紩鐨勪竴鏍?榪欐牱鎵嶅彲浠ュ緱鍒版悳绱㈢粨鏋?鍦ㄤ嬌鐢≦ueryParser parse 鐢ㄦ埛杈撳叆鐨勬悳绱㈣〃杈懼紡鏃跺彲浠?鎸囧畾涓涓狝nalyzer 濡備笅鎵紺?
Query query = QueryParser.parse(expression, "contents",
analyzer);聽閫氳繃QueryParser鐨勯潤鎬佹柟娉曞疄鐜? 濡傛灉浣跨敤QueryParser瀹炰緥, 鍒欏彲浠ュ湪鏋勯燪ueryParser鏃跺?鎻愪緵涓涓狝nalyzer 濡備笅:
QueryParser parser = new QueryParser("contents",
analyzer);query = parser.parse(expression);
聽QueryParser
analyzes individual pieces of the expression, not the expression as a whole, which may include operators, parentheses, and other special expression
syntax to denote range, wildcard, and fuzzy searches.
QueryParser 騫崇瓑鐨勫垎鏋愭墍鏈夌殑text,濂瑰茍涓嶇煡閬撲粬浠槸濡備綍姣廼ndxed, 榪欐椂濡傛灉褰撴悳绱竴涓绱㈠紩涓篕eyword鐨刦iled鏃?灝卞彲鑳戒細(xì)閬囧埌闂.
榪樻湁涓涓棶棰樺氨鏄湪鍒嗘瀽涓浜涘寘鍚叾浠栧厓绱犵殑text鏃惰濡備綍澶勭悊 ,濡?Html xml 鏂囨。, 浠栦滑閮藉甫鏈夊厓绱犳爣絳?鑰岃繖浜涙爣絳句竴鑸槸涓嶇儲(chǔ)寮曠殑.浠ュ強(qiáng)濡備綍澶勭悊鍒嗗煙(field)绱㈠紩, 濡?Html 鏈塇eader 鍜?Body鍩?濡備綍鍒嗗紑鎼滅儲(chǔ) 榪欎釜闂Analyzer鐜板湪涔熶笉鑳借В鍐崇殑, 鍥犱負(fù)鍦ㄦ瘡嬈nalyzer閮藉鐞嗗崟涓煙. 鍦ㄥ悗闈㈡垜浠湪榪涗竴姝ヨ璁鴻闂.
聽2. Analyzing the Analyzer
瑕佽緇嗕簡瑙ucene鍒嗘瀽鏂囨湰鐨勮繃紼嬪氨瑕佺煡閬揂nalyzer鏄浣曞伐浣滅殑,涓嬮潰灝辨潵鐪嬬湅Analyzer鏄庝箞宸ヤ綔鐨勫惂. Analyzer鏄悇涓猉XXAnalyzer鐨勫熀綾?,璇ョ被鍑哄鐨勭畝鍗?姣旀垜鎯寵薄鐨勮綆鍗曞浜? 鍙涓涓柟娉?tokenStream(String fieldName, Reader reader); fieldName 鍙傛暟瀵規(guī)湁浜汚nalyzer瀹炵幇鏄病鏈変綔鐢ㄧ殑,濡係impleAnalyzer, 璇ョ被鐨勪唬鐮佸涓?
public final class SimpleAnalyzer extends Analyzer {
聽 public TokenStream tokenStream(String fieldName, Reader reader) {
聽聽聽 return new LowerCaseTokenizer(reader);
聽 }
}
鍙互鐪嬪埌璇ョ被涔熸槸鍑哄鐨勭畝鍗? 鍙敤鍒頒簡LowerCaseTokenizer; 浣哃owerCaseTokenizer鏄共浠涔堢殑鍛? 鐪嬬湅鍚嶅瓧灝卞彲浠ョ寽涓樊涓嶅鍟?,璇ョ被鎶奣ext 涓潪瀛楁瘝(nonletters)鐨勫瓧絎﹀幓鎺?騫舵妸鎵鏈塗ext杞崲涓哄皬鍐?
鑰岃繑鍥炵殑
TokenStream 鏄竴涓?enumerator-like class ,閫氳繃濂瑰彲浠ュ緱鍒拌繛緇殑 Tokens,褰撳埌杈炬湯灝炬椂鍊欒繑鍥瀗ull.聽
1. 瀹炵幇涓涓畝鍗曠殑search feature
聽聽 鍦ㄦ湰绔犱腑鍙檺浜庤璁虹畝鍗昄ucene 鎼滅儲(chǔ)API, 鏈変笅闈㈠嚑涓浉鍏崇殑綾?
聽Lucene 鍩烘湰鎼滅儲(chǔ)API:
綾?/p> | 鍔熻兘 |
IndexSearcher | 鎼滅儲(chǔ)涓涓猧ndex鐨勫叆鍙?鎵鏈夌殑searches閮芥槸閫氳繃IndexSearcher 瀹炰緥鐨勫嚑涓噸杞界殑鏂規(guī)硶瀹炵幇鐨? |
Query (and subclasses) | 鍚勪釜瀛愮被灝佽浜嗙壒瀹氭悳绱㈢被鍨嬬殑閫昏緫(logic),Query瀹炰緥浼犻掔粰IndexSearcher鐨剆earch鏂規(guī)硶. |
QueryParser | 澶勭悊涓涓彲璇葷殑琛ㄨ揪寮?杞崲涓轟竴涓叿浣撶殑Query瀹炰緥. |
Hits | 鍖呭惈浜嗘悳绱㈢殑緇撴灉.鏈塈ndexSearcher鐨剆earch鍑芥暟榪斿洖. |
涓嬮潰鎴戜滑鏉ョ湅鍑犱釜涔︿腑鐨勪緥瀛?
LiaTestCase.java聽 涓涓戶鎵胯嚜TestCase 騫朵笖鎵╁睍浜員estCase鐨勭被, 涓嬮潰鐨勫嚑涓緥瀛愰兘緇ф壙鑷綾?
01聽package聽lia.common;
02聽
03聽import聽junit.framework.TestCase;
04聽import聽org.apache.lucene.store.FSDirectory;
05聽import聽org.apache.lucene.store.Directory;
06聽import聽org.apache.lucene.search.Hits;
07聽import聽org.apache.lucene.document.Document;
08聽
09聽import聽java.io.IOException;
10聽import聽java.util.Date;
11聽import聽java.text.ParseException;
12聽import聽java.text.SimpleDateFormat;
13聽
14聽/**
15聽聽*聽LIA聽base聽class聽for聽test聽cases.
16聽聽*/
17聽public聽abstract聽class聽LiaTestCase聽extends聽TestCase聽{
18聽聽聽private聽String聽indexDir聽=聽System.getProperty("index.dir");聽 // 嫻嬭瘯 index 宸茬粡寤虹珛濂戒簡
19聽聽聽protected聽Directory聽directory;
20聽
21聽聽聽protected聽void聽setUp()聽throws聽Exception聽{
22聽聽聽聽聽directory聽=聽FSDirectory.getDirectory(indexDir,聽false);
23聽聽聽}
24聽
25聽聽聽protected聽void聽tearDown()聽throws聽Exception聽{
26聽聽聽聽聽directory.close();
27聽聽聽}
28聽
29聽聽聽/**
30聽聽聽聽*聽For聽troubleshooting 涓轟簡 瑙e喅闂鐨勬柟娉?/font>
31聽聽聽聽*/
32聽聽聽protected聽final聽void聽dumpHits(Hits聽hits)聽throws聽IOException聽{
33聽聽聽聽聽if聽(hits.length()聽==聽0)聽{
34聽聽聽聽聽聽聽System.out.println("No聽hits");
35聽聽聽聽聽}
36聽
37聽聽聽聽聽for聽(int聽i=0;聽i聽<聽hits.length();聽i++)聽{
38聽聽聽聽聽聽聽Document聽doc聽=聽hits.doc(i);
39聽聽聽聽聽聽聽System.out.println(hits.score(i)聽+聽":"聽+聽doc.get("title"));
40聽聽聽聽聽}
41聽聽聽}
42聽
43聽聽聽protected聽final聽void聽assertHitsIncludeTitle(
44聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽Hits聽hits,聽String聽title)
45聽聽聽聽聽throws聽IOException聽{
46聽聽聽聽聽for聽(int聽i=0;聽i聽<聽hits.length();聽i++)聽{
47聽聽聽聽聽聽聽Document聽doc聽=聽hits.doc(i);
48聽聽聽聽聽聽聽if聽(title.equals(doc.get("title")))聽{
49聽聽聽聽聽聽聽聽聽assertTrue(true);
50聽聽聽聽聽聽聽聽聽return;
51聽聽聽聽聽聽聽}
52聽聽聽聽聽}
53聽
54聽聽聽聽聽fail("title聽'"聽+聽title聽+聽"'聽not聽found");
55聽聽聽}
56聽
57聽聽聽protected聽final聽Date聽parseDate(String聽s)聽throws聽ParseException聽{
58聽聽聽聽聽聽聽return聽new聽SimpleDateFormat("yyyy-MM-dd").parse(s);
59聽聽聽}
60聽}
聽 I.鎼滅儲(chǔ)涓涓壒瀹氱殑Term 鍜屽埄鐢≦ueryParser 瑙f瀽鐢ㄦ埛杈撳叆鐨勮〃杈懼紡
聽 瑕佸埄鐢ㄤ竴涓壒瀹氱殑term鎼滅儲(chǔ),浣跨敤QueryTerm灝卞彲浠ヤ簡,鍗曚釜term 灝ゅ叾閫傚悎Keyword鎼滅儲(chǔ). 瑙f瀽鐢ㄦ埛杈撳叆鐨勮〃杈懼紡鍙互鏇撮傚悎鐢ㄦ埛鐨勪嬌鐢ㄦ柟寮?鎼滅儲(chǔ)琛ㄨ揪寮忕殑瑙f瀽鏈塓ueryParser鏉ュ畬鎴?濡傛灉琛ㄨ揪寮忚В鏋愰敊璇?浼?xì)鏈夊紓甯告姏鍑? 鍙互鍙栧緱鐩鎬俊鐨勯敊璇俊鎭?浠ヤ究緇欑敤鎴烽傚綋鐨勬彁紺?鍦ㄨВ鏋愯〃杈懼紡鏃?榪橀渶瑕佷竴涓狝nalyzer 鏉ュ垎鏋愮敤鎴風(fēng)殑杈撳叆, 騫舵牴鎹笉鍚岀殑Analyzer鏉ョ敓浜х浉搴旂殑Term鐒跺悗鏋勬垚Query瀹炰緥.
涓嬮潰鐪嬩釜渚嬪瓙鍚?BasicSearchingTest.java
01聽package聽lia.searching;
02聽
03聽import聽lia.common.LiaTestCase;
04聽import聽org.apache.lucene.analysis.SimpleAnalyzer;
05聽import聽org.apache.lucene.document.Document;
06聽import聽org.apache.lucene.index.Term;
07聽import聽org.apache.lucene.queryParser.QueryParser;
08聽import聽org.apache.lucene.search.Hits;
09聽import聽org.apache.lucene.search.IndexSearcher;
10聽import聽org.apache.lucene.search.Query;
11聽import聽org.apache.lucene.search.TermQuery;
12聽
13聽public聽class聽BasicSearchingTest聽extends聽LiaTestCase聽{
14聽
15聽聽聽public聽void聽testTerm()聽throws聽Exception聽{
16聽聽聽聽聽IndexSearcher聽searcher聽=聽new聽IndexSearcher(directory);
17聽聽聽聽聽Term聽t聽=聽new聽Term("subject",聽"ant");聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽 // 鏋勯犱竴涓猅erm
18聽聽聽聽聽Query聽query聽=聽new聽TermQuery(t);
19聽聽聽聽聽Hits聽hits聽=聽searcher.search(query);聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽 // 鎼滅儲(chǔ)
20聽聽聽聽聽assertEquals("JDwA",聽1,聽hits.length());聽聽聽聽聽聽聽聽聽聽聽聽 //嫻嬭瘯緇撴灉
21聽
22聽聽聽聽聽t聽=聽new聽Term("subject",聽"junit");
23聽聽聽聽聽hits聽=聽searcher.search(new聽TermQuery(t));聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽
24聽聽聽聽聽assertEquals(2,聽hits.length());
25聽
26聽聽聽聽聽searcher.close();
27聽聽聽}
28聽
29聽聽聽public聽void聽testKeyword()聽throws聽Exception聽{聽 // 嫻嬭瘯鍏抽敭瀛楁悳绱?/font>
30聽聽聽聽聽IndexSearcher聽searcher聽=聽new聽IndexSearcher(directory);
31聽聽聽聽聽Term聽t聽=聽new聽Term("isbn",聽"1930110995");聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽 // 鍏抽敭瀛?term
32聽聽聽聽聽Query聽query聽=聽new聽TermQuery(t);
33聽聽聽聽聽Hits聽hits聽=聽searcher.search(query);
34聽聽聽聽聽assertEquals("JUnit聽in聽Action",聽1,聽hits.length());
35聽聽聽}
36聽
37聽聽聽public聽void聽testQueryParser()聽throws聽Exception聽{聽 // 嫻嬭瘯 QueryParser.
38聽聽聽聽聽IndexSearcher聽searcher聽=聽new聽IndexSearcher(directory);
39聽
40聽聽聽聽聽Query聽query聽=聽QueryParser.parse("+JUNIT聽+ANT聽-MOCK",
41聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽"contents",
42聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽new聽SimpleAnalyzer());聽 // 閫氳繃瑙f瀽鎼滅儲(chǔ)琛ㄨ揪寮?榪斿洖涓涓猀uery瀹炰緥
43聽聽聽聽聽Hits聽hits聽=聽searcher.search(query);
44聽聽聽聽聽assertEquals(1,聽hits.length());
45聽聽聽聽聽Document聽d聽=聽hits.doc(0);
46聽聽聽聽聽assertEquals("Java聽Development聽with聽Ant",聽d.get("title"));
47聽
48聽聽聽聽聽query聽=聽QueryParser.parse("mock聽OR聽junit",
49聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽"contents",
50聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽new聽SimpleAnalyzer());聽聽聽聽聽聽聽聽聽聽聽聽聽 // 閫氳繃瑙f瀽鎼滅儲(chǔ)琛ㄨ揪寮?榪斿洖涓涓猀uery瀹炰緥
51聽聽聽聽聽hits聽=聽searcher.search(query);
52聽聽聽聽聽assertEquals("JDwA聽and聽JIA",聽2,聽hits.length());
53聽聽聽}
54聽}
1,indexing鐨勫鐞嗚繃紼?
聽 棣栧厛瑕佹妸indexing鐨勬暟鎹漿鎹負(fù)text,鍥犱負(fù)Lucene鍙兘绱㈠紩text,鐒跺悗鐢盇nalysis鏉ヨ繃铏憈ext,鎶婁竴浜沜h1涓彁鍒扮殑鎵璋撶殑stop words 榪囨護(hù)鎺? 鐒跺悗寤虹珛index.寤虹珛鐨刬ndex涓?font face="NewBaskervilleITCbyBT-Italic" size="3">inverted index 涔熷氨鏄墍璋撶殑鍊掓帓绱㈠紩.
2,基本的index操作
聽聽 鍩烘湰鐨勬搷浣?鍖呮嫭 :娣誨姞 鍒犻櫎 鏇存柊.
I . 娣誨姞
涓嬮潰鎴戜滑鐪嬩釜渚嬪瓙浠g爜 BaseIndexingTestCase.class
01聽package聽lia.indexing; Heterogeneous Documents
|
榪欐槸涓涓祴璇曡秴綾?鍙互琚叾浠栫殑嫻嬭瘯鐢ㄤ緥緇ф壙 鏉ユ祴璇曚笉鍚岀殑鍔熻兘.涓婇潰甯︽湁璇︾粏鐨勬敞閲?
鍦ㄦ坊鍔燜ield鏃? 浼?xì)閬囧埌鍚屼箟璇嶇殑鎯呭?娣誨姞鍚屼箟璇嶇敱涓ょ鏂瑰紡:
 a.创建一个同义词词组,循环添加到Single String的不同Field中.
聽b.鎶婂悓涔夎瘝娣誨姞鍒頒竴涓狟ase word鐨刦ield涓?濡備笅:
聽
// Index a base word and all of its synonyms under the same field name "word".
String baseWord = "fast";
String[] synonyms = {"quick", "rapid", "speedy"};
Document doc = new Document();
doc.add(Field.Text("word", baseWord));
// Each synonym is added as another value of the same "word" field.
for (int i = 0; i < synonyms.length; i++) {
    doc.add(Field.Text("word", synonyms[i]));
}
聽
榪欐牱 鍦?/font>Lucene鍐呴儴鎶婃瘡涓瘝閮芥坊鍔犵殑涓涓悕涓簑ord鐨凢ield涓?鍦ㄦ悳绱㈡椂 浣犲彲浠ヤ嬌鐢ㄤ換浣曚竴涓粰瀹氱殑璇嶈.
鎴戜滑涓昏鏉ョ湅鐪?榪欎釜 indexing and searching 渚嬪瓙 鐒跺悗浜嗚В涓浜涘熀鏈蹇?
package聽lia.meetlucene;
import聽org.apache.lucene.index.IndexWriter;
import聽org.apache.lucene.analysis.standard.StandardAnalyzer;
import聽org.apache.lucene.document.Document;
import聽org.apache.lucene.document.Field;
import聽java.io.File;
import聽java.io.IOException;
import聽java.io.FileReader;
import聽java.util.Date;
/**
聽*聽This聽code聽was聽originally聽written聽for
聽*聽Erik's聽Lucene聽intro聽java.net聽article
聽*/
public聽class聽Indexer聽{
聽聽public聽static聽void聽main(String[]聽args)聽throws聽Exception聽{
聽聽聽聽if聽(args.length聽!=聽2)聽{
聽聽聽聽聽聽throw聽new聽Exception("Usage:聽java聽"聽+聽Indexer.class.getName()
聽聽聽聽聽聽聽聽+聽"聽<index聽dir>聽<data聽dir>");
聽聽聽聽}
聽聽聽聽File聽indexDir聽=聽new聽File(args[0]); // 鍦ㄨ鐩綍涓垱寤篖ucene Incex
聽聽聽聽File聽dataDir聽=聽new聽File(args[1]); // 璇ョ洰褰曚腑瀛樻斁澶囩儲(chǔ)寮曠殑鏂囦歡
聽聽聽聽long聽start聽=聽new聽Date().getTime();
聽聽聽聽int聽numIndexed聽=聽index(indexDir,聽dataDir);
聽聽聽聽long聽end聽=聽new聽Date().getTime();
聽聽聽聽System.out.println("Indexing聽"聽+聽numIndexed聽+聽"聽files聽took聽"
聽聽聽聽聽聽+聽(end聽-聽start)聽+聽"聽milliseconds");
聽聽}
聽聽public聽static聽int聽index(File聽indexDir,聽File聽dataDir)
聽聽聽聽throws聽IOException聽{
聽聽聽聽if聽(!dataDir.exists()聽||聽!dataDir.isDirectory())聽{
聽聽聽聽聽聽throw聽new聽IOException(dataDir
聽聽聽聽聽聽聽聽+聽"聽does聽not聽exist聽or聽is聽not聽a聽directory");
聽聽聽聽}
聽聽聽聽IndexWriter聽writer聽=聽new聽IndexWriter(indexDir,
聽聽聽聽聽聽new聽StandardAnalyzer(),聽true);聽聽聽聽聽聽聽聽聽聽聽聽聽聽 //(1)鍒涘緩 Lucene Index
聽聽聽聽writer.setUseCompoundFile(false);
聽聽聽聽indexDirectory(writer,聽dataDir);
聽聽聽聽int聽numIndexed聽=聽writer.docCount();
聽聽聽聽writer.optimize();
聽聽聽聽writer.close();聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽 // close index
聽聽聽聽return聽numIndexed;
聽聽}
聽聽private聽static聽void聽indexDirectory(IndexWriter聽writer,聽File聽dir)
聽聽聽聽throws聽IOException聽{
聽聽聽聽File[]聽files聽=聽dir.listFiles();
聽聽聽聽for聽(int聽i聽=聽0;聽i聽<聽files.length;聽i++)聽{
聽聽聽聽聽聽File聽f聽=聽files[i];
聽聽聽聽聽聽if聽(f.isDirectory())聽{
聽聽聽聽聽聽聽聽indexDirectory(writer,聽f);聽聽//(2)聽recurse
聽聽聽聽聽聽}聽else聽if聽(f.getName().endsWith(".txt"))聽{
聽聽聽聽聽聽聽聽indexFile(writer,聽f);
聽聽聽聽聽聽}
聽聽聽聽}
聽聽}
聽聽private聽static聽void聽indexFile(IndexWriter聽writer,聽File聽f)
聽聽聽聽throws聽IOException聽{
聽聽聽聽if聽(f.isHidden()聽||聽!f.exists()聽||聽!f.canRead())聽{
聽聽聽聽聽聽return;
聽聽聽聽}
聽聽聽聽System.out.println("Indexing聽"聽+聽f.getCanonicalPath());
聽聽聽聽Document聽doc聽=聽new聽Document();
聽聽聽聽doc.add(Field.Text("contents",聽new聽FileReader(f)));聽 // (3) index file content
聽聽聽聽doc.add(Field.Keyword("filename",聽f.getCanonicalPath())); // (4) index file name
聽聽聽聽writer.addDocument(doc);聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽聽 //(5) add document in Lucene index
聽聽}
}
涓婇潰鐨処ndexer 浣跨敤浜嗗嚑琛?Lucene鐨凙PI, 鏉ndexing 涓涓洰褰曚笅闈㈢殑鏂囦歡. 榪愯鏃跺?闇瑕佷袱涓弬鏁?, 涓涓繚瀛榠ndex鐨勭洰褰曞拰瑕佺儲(chǔ)寮曠殑鏂囦歡鐩綍.
鍦ㄤ笂闈㈢殑綾諱腑,闇瑕佷笅闈㈢殑涓浜汱ucene classes 鏉ユ墽琛?indexing 澶勭悊:
鈻?
IndexWriter鈻?
Directory鈻?
Analyzer鈻?
Document鈻?
FieldIndexWriter 鏄痠ndexing 澶勭悊鏃剁敤鍒扮殑涓績緇勪歡,璇ョ被create 鏂癷ndex 騫朵笖娣誨姞documents 鍒板凡緇忓瓨鍦ㄧ殑index, BTW,鍦↙ucene涓繕鏈夊埆鐨勬柟娉曟潵鏇存柊index.
Directory: 鐢ㄦ潵瀛樻斁index鏂囦歡鐨勬枃浠剁洰褰?璇ョ被鏄釜鎶借薄綾?鐢ㄥ嚑涓瓙綾誨彲浠ヤ嬌鐢?涓婇潰浣跨敤浜咶ile鏉ヤ唬琛ㄦ枃浠惰礬寰?鍦↙ucene涓敤涓や釜涓昏鐨凞irectory瀛愮被,涓涓狥SDirectory,涓涓?RAMDirectory,鍓嶈呮槸鎶奿ndex淇濆瓨鍒扮‖鐩樹腑鐨?鍚庤呮槸淇濆瓨鍦ㄥ唴瀛樹腑鐨?鍦ㄥ唴瀛樹腑澶勭悊鏁板害褰撶劧灝辯浉搴旂殑蹇竴浜?浜嗕絾鍙傚悎浜庡皬鏂囦歡.
Analyzer: 鍦ㄦ枃浠跺绱㈠紩浠ュ墠瑕佸厛閫氳繃Analyzer鍒嗘瀽,鍘繪帀涓浜涘search鏃犵敤鐨勮瘝璇?濡傝嫳璇腑 鐨勫皬璇?in at a 絳夌瓑,鍦↙ucene涓縐頒負(fù)stop words 鐨勮瘝),榪樺彲浠ュ鐞嗗ぇ灝忓啓鐨勯棶棰?鏄ぇ灝忓啓鐩稿叧鍟?榪樻槸涓嶇浉鍏?,浣跨敤Lucene鏃跺?閫夋嫨Analyzer鏄叧閿?
Document: 浠h〃涓浜汧ields鐨勯泦鍚?鍙互鎯寵薄涓轟竴浜涙暟鎹殑闆嗗悎.
Field: 鍦╥ndex涓殑姣忎竴涓狣ocument涓兘鍖呭惈涓浜?鍛藉悕鐨凢ields 鐢‵ield鏉ユ瀯閫? 姣忎竴涓猣ield閮芥槸鐨勬悳绱㈡槸絎﹀悎瑕佹眰鍜屼笉絎﹀悎瑕佹眰鐨刬ndex涓殑涓浜涙暟鎹?Lucene鎻愪緵浜嗗洓縐嶄笉鍚岀殑Field,
1,Keyword聽 涓嶅垎鏋?鍙儲(chǔ)寮曞拰淇濆瓨,璞′竴浜涚壒孌婁俊鎭?涓嶅彲浠ュ垎鍓茬殑 濡?鐢?shù)璇濆忥L(fēng)爜 緗戠珯 Email 絳?
2,UnIndexed 鏃笉绱㈠紩涔熶笉鍒嗘瀽,鍙槸鎶婂間繚瀛樺湪index涓?璇ョ被鍨嬮傚悎鐢ㄦ潵鏄劇ず鎼滅儲(chǔ)緇撴灉鐨刦ield,浣嗘槸浣犱粠鏉ヤ笉鎼滅儲(chǔ)璇ユ樉紺虹殑鏁版嵁,濡俇RL
3,UnStored UnIndexed鐨勫绔嬮潰, 鍒嗘瀽鍜岀儲(chǔ)寮曚絾鏄笉淇濆瓨鍦╥ndex涓?閫傚悎澶у瀷鏁版嵁 鍙悳绱絾鏄笉鏄劇ず鍘熷鏁版嵁.
4,Text 分析且索引,如果索引数据是String则也保存在index中, 如果是Reader则不保存.