<?xml version="1.0" encoding="utf-8" standalone="yes"?>
首先来看analysis包，这个包主要是提供一些简单的词汇化处理<br />
以<span style="color: #339966">Tokenizer结尾的类是将要处理的字符串进行分割成Token流，而根据分割的依据的又产生了以下几个Tokenizer类<br />
首先Tokenizer类是所有<span style="color: #000000">以Tokenizer结尾的类的基类<br />
然后是CharTokenizer，所有的以<span style="color: #339966">Tokenizer结尾的类都是从这个类继承的<br />
这个类中有一个抽象方法<br />
protected abstract boolean isTokenChar(char c);
另外一个需要被子类覆写的方法<br />
protected char normalize(char c) {}錛?/span>
是对单个字符进行处理的方法，譬如说将英文字母全部转化为小写
还有一个变量<br />
protected Reader input;
这个读取器是这些类所处理的数据的 数据源<br />
输入一个Reader ，产生一个Token流</span>
这个方法是是否进行切分的依据，依次读取char流，然后用这个方法对每个char进行检测，如果返回false则将预先存储在<br />
词汇缓冲区中的char数组作为一个Token返回
LetterTokenizer 錛?/span>
/**
 * Token rule shown for LetterTokenizer: a character is part of a token
 * exactly when {@link Character#isLetter(char)} accepts it.
 *
 * @param c the character being examined
 * @return {@code true} if {@code c} is a letter
 */
protected boolean isTokenChar(char c) {
return Character.isLetter(c);
}
WhitespaceTokenizer錛?/span>
/**
 * Token rule shown for WhitespaceTokenizer: every character that is not
 * whitespace according to {@link Character#isWhitespace(char)} is part of a token.
 *
 * @param c the character being examined
 * @return {@code true} if {@code c} is not whitespace
 */
protected boolean isTokenChar(char c) {
return !Character.isWhitespace(c);
}
LowerCaseTokenizer extends LetterTokenizer錛?/span>
/**
 * Normalization shown for LowerCaseTokenizer: maps the character to lower
 * case with {@link Character#toLowerCase(char)}.
 *
 * @param c the character to normalize
 * @return the lower-case form of {@code c}
 */
protected char normalize(char c) {
return Character.toLowerCase(c);
}
在构造函数中调用super(in);进行和&nbsp;LetterTokenizer同样的操作，但是在词汇化之前所有的词都转化为小写了
然后是以Filter结尾的类，这个类簇主要是对已经词汇化的Token流进行进一步的处理
输入是Token流 , 输出仍然是Token流。<br />
TokenFilter extends TokenStream 是所有这些类的父类<br />
protected TokenStream input;
在TokenFilter 中有一个TokenStream 变量，是Filter类簇处理的数据源，而Filter类簇又是继承了TokenStream 类的
有一个public final Token next()方法,这个方法以TokenStream.next()产生的Token流 为处理源，产生的仍然是Token流<br />
只不过中间有一些处理的过程
LowerCaseFilter：将所有的Token流的转化为小写<br />
t.termText = t.termText.toLowerCase();
StopFilter錛氳繃婊ゆ帀涓浜涘仠姝㈣瘝錛岃繖浜涘仠姝㈣瘝鐢辨瀯閫犲嚱鏁版寚瀹?/span>
for (Token token = input.next(); token != null; token = input.next())
if (!stopWords.contains(token.termText))
return token;
比较一下Tokenizer类簇和Filter类簇，可以知道<br />
Tokenizer类簇主要是对输入的Reader流，实际上是字符流按照一定的规则进行分割，产生出Token流<br />
其输入是字符串的Reader流形式，输出是Token流<br />
Filter类簇主要是对输入的Token流进行更进一步的处理，如去除停止词，转化为小写<br />
主要为一些格式化操作。<br />
由于Filter类簇的输入输出相同，所以可以嵌套几个不同的Filter类，以达到预期的处理目的。<br />
前一个Filter类的输出作为后一个Filter类的输入
而Tokenizer类簇由于输入输出不同，所以不能嵌套<br />
.fnm鐨勬枃浠舵牸寮忥細(xì) 錛團(tuán)ield鐨勪俊鎭級(jí)
int: Field鐨勪釜鏁幫紝鏈灝戜負(fù)1錛屾渶灝戞湁涓涓狥ield("",false)錛屽湪鍒濆鍖栫殑鏃跺欏啓鍏?鏆傛椂涓嶇煡閬撳師鍥?; 鍚嶇О涓虹┖瀛楃涓詫紝鏈儲(chǔ)寮曪紝 鏈?nbsp; 鍚?nbsp; 閲忓寲銆俽eadVInt()璇誨彇
String: byte String鏄?nbsp;Field鐨勫悕縐?nbsp; byte鎸囩ず璇ield 鏄惁琚儲(chǔ)寮曪紝鏄惁鍚戦噺鍖?錛堝兼湁錛?1錛?0錛?1錛夌涓涓?浠h〃琚儲(chǔ)寮曪紝絎簩涓唬琛ㄨ鍚戦噺鍖?br />
String: byte Field 鍚屼笂
.fdx鐨勬枃浠舵牸寮忥細(xì)涓昏鏄彁渚涘.fdt涓瓨鍌ㄧ殑document鐨勯殢鍗寵鍙?br />
long : 絎竴涓猟ocument鍦?fdt鏂囦歡涓殑浣嶇疆
long: 絎簩涓猟ocument鍦?fdt鏂囦歡涓殑浣嶇疆
.fdt鐨勬枃浠舵牸寮忥細(xì) .fdt鏂囦歡瀛樺偍浜?jiǎn)涓緋誨垪document鐨勪俊鎭?br /> VInt: 璇ocument涓殑isStored灞炴т負(fù)true鐨勫煙鐨勪釜鏁?br /> (VInt:) 濡傛灉璇ield鐨刬sStored灞炴т負(fù)true鍒欏緱鍒拌field鐨刦ieldNumber錛屾殏鏃朵笉鐭ラ亾榪欎釜fieldNumber鏄庝箞浜х敓鐨勶紝鏈変粈涔堢敤錛屽垵姝ヤ及璁℃槸鎸夌収field鍒涘緩鐨勯『搴忎駭鐢熺殑錛屾瘡嬈″啀涓婁竴涓猣ield鐨刦ieldNumber鍩虹涓婂姞1銆?br /> byte: 濡傛灉璇ield鐨刬sTokenized灞炴т負(fù)true鍐欏叆1鍚﹀垯鍐欏叆false銆?br /> String: 璇ield鐨剆tringValue()鍊箋?br /> 涓涓猟ocument緇撴潫錛屼笅闈㈢殑鏁版嵁灝嗕細(xì)寮濮嬩竴涓柊鐨刣ocument錛屾瘡涓柊鐨刣ocument鐨勫紑濮嬬偣鐨勬枃浠朵綅緗兘浼?xì)鍦?fdx涓湁璁拌澆錛屼究浜庨殢鍗寵闂?/span>
// Dumps the contents of the "segments" file found in C:/sf/snow to stdout:
// the header fields first, then one line per segment entry.
try {
    FSDirectory dir = FSDirectory.getDirectory("C:/sf/snow", false);
    InputStream input = dir.openFile("segments");
    try {
        // Format marker; per the original note it tells whether this is a regular segments file.
        System.out.println("Format:" + input.readInt());
        // Version number.
        System.out.println("version:" + input.readLong());
        // Int used for naming new segments (original author's note: purpose not yet understood).
        System.out.println("name:" + input.readInt());
        // Number of segment entries that follow.
        int n = input.readInt();
        System.out.println("SegmentNum:" + n);
        // Each entry is a string (the segment name) followed by an int.
        for (int i = 0; i < n; i++) {
            System.out.println("segment " + i + " - name:" + input.readString() + " num:" + input.readInt());
        }
    } finally {
        // Release the stream even when one of the reads above fails.
        input.close();
    }
} catch (Exception e) {
    // Report the failure instead of discarding it silently.
    System.err.println("Failed to read segments file: " + e);
}
褰撶劧,璇ョ被鎻愪緵浜?jiǎn)鏇翠负澶嶆潅鐨勮畨K棶鍜屾洿鏂皊egments鏂囦歡鐨勬柟娉?br />
final void read(Directory directory) 灝嗘墍鏈夌殑孌典俊鎭繚瀛樺湪鏈瑅ector涓?br />
final void write(Directory directory) 璺熸柊璇egment鏂囦歡鐨勫唴瀹癸紝涓昏鏄負(fù)浜?jiǎn)娣诲姞娈靛Q?br />
涓昏鏄洿鏂?鐗堟湰鍙?孌電殑鏁扮洰錛岃窡鏂板畬榪欎簺鍚庡嵆鍙線segment鏂囦歡鍚庢坊鍔犳柊孌電殑淇℃伅銆?br />
import org.apache.lucene.store.Directory;
/**
 * Plain holder for the facts recorded about one index segment: its name,
 * its document count and the Directory it is stored in.
 */
final class SegmentInfo {
    /** Name of the segment; unique within the index directory. */
    public String name;

    /** Number of documents contained in the segment. */
    public int docCount;

    /** The Directory in which the segment resides. */
    public Directory dir;

    /**
     * Stores the three arguments unchanged in the same-named fields.
     *
     * @param name     segment name
     * @param docCount number of documents in the segment
     * @param dir      directory holding the segment
     */
    public SegmentInfo(String name, int docCount, Directory dir) {
        this.dir = dir;
        this.docCount = docCount;
        this.name = name;
    }
}
/**
 * Copies the first {@code len} bytes of {@code src} into the file's buffer
 * list, starting at the current write position {@code pointer}.
 *
 * <p>The file is stored as a Vector of {@code BUFFER_SIZE}-byte arrays. The
 * write is split at buffer boundaries and a new array is appended whenever
 * the write reaches the end of the list. The copy loops over as many buffers
 * as needed, so {@code len} may exceed the space left in the current buffer
 * plus one further buffer (the previous two-step copy overran the second
 * array in that case).
 *
 * <p>Afterwards {@code pointer} has advanced by {@code len},
 * {@code file.length} has grown if the write went past the old end, and
 * {@code file.lastModified} is set to the current time.
 *
 * @param src source bytes
 * @param len number of bytes to copy from the start of {@code src}
 * @throws IndexOutOfBoundsException if {@code len} is negative or larger
 *         than {@code src.length}
 */
public void flushBuffer(byte[] src, int len) {
    if (len < 0) {
        // A negative length used to be rejected by System.arraycopy; keep rejecting it.
        throw new IndexOutOfBoundsException("len=" + len);
    }

    int srcOffset = 0;       // bytes of src already copied
    int writePos = pointer;  // absolute file position of the next byte to write

    while (srcOffset < len) {
        int bufferNumber = writePos / BUFFER_SIZE;       // index of the target buffer in the Vector
        int bufferOffset = writePos % BUFFER_SIZE;       // write offset inside that buffer
        int bytesInBuffer = BUFFER_SIZE - bufferOffset;  // free space left in that buffer
        int remaining = len - srcOffset;
        // Write everything that is left if it fits; otherwise fill this buffer
        // and let the next iteration continue in the following one.
        int bytesToCopy = bytesInBuffer >= remaining ? remaining : bytesInBuffer;

        if (bufferNumber == file.buffers.size()) {
            // Writing at the very end of the list: append a fresh buffer.
            file.buffers.addElement(new byte[BUFFER_SIZE]);
        }
        byte[] buffer = (byte[]) file.buffers.elementAt(bufferNumber);
        System.arraycopy(src, srcOffset, buffer, bufferOffset, bytesToCopy);

        srcOffset += bytesToCopy;
        writePos += bytesToCopy;
    }

    // Advance the file pointer by the total number of bytes written.
    pointer += len;
    if (pointer > file.length) {
        file.length = pointer;
    }
    // Stamp the file's last-modified time with the current time.
    file.lastModified = System.currentTimeMillis();
}
从指定的字节数组复制指定长度的字节到RAMFile中去。由于RAMFile中Vector的元素是byte[1024]所以可能存在做一次该操作
要操作两个Vector元素的情况。即先将当前byte[1024]数组填满，再新建一个元素装载剩余的字节。<br />
另外还有一个writeTo(OutputStream out)方法，将RAMFile中的数据输出到另一个输出流
UNICODE鍊?UTF-8緙栫爜
U-00000000 - U-0000007F: 0xxxxxxx
U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
鍙瀵逛簬鍦?0x00-0x7F鑼冨洿鍐呯殑UNICODE鍊鹼紙鏈澶ф湁鏁堟暟浣嶏細(xì)7浣嶏級(jí)錛屽皢浼?xì)缂栫爜鎴愬崟瀛楄妭鐨勫Q屼細(xì)澶уぇ鑺傜害瀛樺偍絀洪棿銆?br />
瀵逛簬鍦?nbsp; 0x80-0x7FF鑼冨洿鍐呯殑UNICODE錛堟渶澶ф湁鏁堟暟浣嶏細(xì)11浣嶏級(jí)錛屼細(xì)緙栫爜鎴愬弻瀛楄妭鐨勩傚厛瀛樺偍鍘熷瓧鑺備綆5浣嶇殑鏁頒綅錛屼笖灝嗘渶楂樹(shù)綅鍜屾楂樹(shù)綅閮界疆1錛屽啀嬈¢珮?shù)綅缃?錛坵riteByte((byte)(0xC0 | (code >> 6)));錛夈傜劧鍚庡瓨鍌ㄥ悗6浣嶇殑瀛楄妭錛屽皢鍓嶄袱浣嶇疆10錛坵riteByte((byte)(0x80 | (code & 0x3F)));錛?br />
瀵逛簬鍏朵粬鐨刄NICODE鍊煎垯
writeByte((byte)(0xE0 | (code >>> 12))); 4浣?br />
writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); 5浣?br />
writeByte((byte)(0x80 | (code & 0x3F))); 3- 5浣?br />
final void writeString(String s) throws IOException
该函数首先用s.length()判断该String总共有多少个字符
然后首先调用writeVInt写入这个字符长度
再调用writeChars(s,s.length())写入字符
在inputStream中的readString()方法则与其相反，首先用readVInt()方法读取字符长度len 然后读取len长度的字符<br />
protected final void flush() throws IOException
璇ユ柟娉曡皟鐢ㄥ彟澶栦竴涓柟娉昮lushBuffer灝嗙紦鍐插尯涓殑鏁版嵁杈撳嚭錛岀劧鍚庢竻絀虹紦鍐插尯錛?br />
abstract void flushBuffer(byte[] b, int len) throws IOException
可见flushBuffer方法是abstract的，即需要其子类对该方法进行覆写，以定位该输出流的输出方式。<br />
final long getFilePointer() throws IOException
得到文件指针的位置，即得到输出流已经输出的字节数。<br />
public void seek(long pos) throws IOException
输出缓冲区的内容，然后将文件指针定位到long所指示的文件位置<br />
abstract long length() throws IOException
返回文件中已有的字节数。需要子类实现。<br />