lucene的jar包自己去下載。
首先是建立索引的代碼:
public class TextFileIndexer {
public static void main(String[] args) throws Exception {
/* 指明要索引文件夾的位置,這里是d盤的s文件夾下 */
File fileDir = new File("d:\\s");
/* 這里放索引文件的位置 */
File indexDir = new File("d:\\index");
Analyzer luceneAnalyzer = new StandardAnalyzer();
IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer,
true);
File[] textFiles = fileDir.listFiles();
long startTime = new Date().getTime();
//增加document到索引去
System.out.println("File正在被索引
.");
/*
* 注意要變的就是這里,路徑和讀取文件的方法
* */
String path ="d:\\s\\2.doc";
String temp = ReadFile.readWord(path);
// String path ="d:\\s\\index.htm";
// String temp = ReadFile.readHtml(path);
Document document = new Document();
Field FieldPath = new Field("path",path,
Field.Store.YES, Field.Index.NO);
Field FieldBody = new Field("body", temp, Field.Store.YES,
Field.Index.TOKENIZED,
Field.TermVector.WITH_POSITIONS_OFFSETS);
document.add(FieldPath);
document.add(FieldBody);
indexWriter.addDocument(document);
//optimize()方法是對索引進行優化
indexWriter.optimize();
indexWriter.close();
//測試一下索引的時間
long endTime = new Date().getTime();
System.out
.println("這花費了"
+ (endTime - startTime)
+ " 毫秒來把文檔增加到索引里面去!"
+ fileDir.getPath());
}
}
public static void main(String[] args) throws Exception {
/* 指明要索引文件夾的位置,這里是d盤的s文件夾下 */
File fileDir = new File("d:\\s");
/* 這里放索引文件的位置 */
File indexDir = new File("d:\\index");
Analyzer luceneAnalyzer = new StandardAnalyzer();
IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer,
true);
File[] textFiles = fileDir.listFiles();
long startTime = new Date().getTime();
//增加document到索引去
System.out.println("File正在被索引

/*
* 注意要變的就是這里,路徑和讀取文件的方法
* */
String path ="d:\\s\\2.doc";
String temp = ReadFile.readWord(path);
// String path ="d:\\s\\index.htm";
// String temp = ReadFile.readHtml(path);
Document document = new Document();
Field FieldPath = new Field("path",path,
Field.Store.YES, Field.Index.NO);
Field FieldBody = new Field("body", temp, Field.Store.YES,
Field.Index.TOKENIZED,
Field.TermVector.WITH_POSITIONS_OFFSETS);
document.add(FieldPath);
document.add(FieldBody);
indexWriter.addDocument(document);
//optimize()方法是對索引進行優化
indexWriter.optimize();
indexWriter.close();
//測試一下索引的時間
long endTime = new Date().getTime();
System.out
.println("這花費了"
+ (endTime - startTime)
+ " 毫秒來把文檔增加到索引里面去!"
+ fileDir.getPath());
}
}
上面已經注釋了要換的地方,我們要做的就是換文件的路徑和讀取文件的方法。
下面來具體看下讀取文件的方法
1.首先來看WORD文檔:
我這里用的是 Apache POI,相關jar包自己去下載,然後加到工程中(以下所要用的jar包也是,不再重復說)。
來看相關代碼:
/**
 * Extracts the text of a Word document by concatenating the text of every paragraph.
 *
 * <p>Reading is best-effort: on failure the error is printed and whatever text was
 * collected so far (possibly the empty string) is returned.
 *
 * @param path path of the Word file to read
 * @return the concatenated paragraph text with leading/trailing whitespace trimmed
 */
public static String readWord(String path) {
    StringBuilder content = new StringBuilder(); // document text
    FileInputStream in = null;
    try {
        in = new FileInputStream(path);
        HWPFDocument doc = new HWPFDocument(in);
        Range range = doc.getRange();
        int paragraphCount = range.numParagraphs(); // number of paragraphs
        // Walk the paragraphs and collect their text.
        for (int i = 0; i < paragraphCount; i++) {
            content.append(range.getParagraph(i).text());
        }
    } catch (Exception e) {
        // Keep the best-effort contract, but report the failure instead of hiding it
        // (same reporting style as readHtml/readTxt).
        e.printStackTrace();
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing the input fails.
            }
        }
    }
    return content.toString().trim();
}
// NOTE(review): this span repeats the body of readWord(String path) shown just above,
// but without the method header — it looks like a copy/paste duplicate; confirm before compiling.
StringBuffer content = new StringBuffer("");// document text
try {
HWPFDocument doc = new HWPFDocument(new FileInputStream(path));
Range range = doc.getRange();
int paragraphCount = range.numParagraphs();// number of paragraphs
for (int i = 0; i < paragraphCount; i++) {// walk the paragraphs and collect their text
Paragraph pp = range.getParagraph(i);
content.append(pp.text());
}
} catch (Exception e) {
// Any failure is silently ignored here; the text collected so far is still returned.
}
return content.toString().trim();
}
2.PDF文件用的是PDFBox:
/**
 * Extracts the text of a PDF file using PDFParser and PDFTextStripper.
 *
 * @param path path of the PDF file to read
 * @return the extracted text with leading/trailing whitespace trimmed
 * @throws Exception if the file cannot be opened, parsed, or stripped
 */
public static String readPdf(String path) throws Exception {
    FileInputStream fis = new FileInputStream(path);
    try {
        PDFParser p = new PDFParser(fis);
        p.parse();
        PDFTextStripper ts = new PDFTextStripper();
        // NOTE(review): the document returned by getPDDocument() is not closed here;
        // PDFBox documents normally need close() — confirm against the PDFBox version in use.
        return ts.getText(p.getPDDocument()).trim();
    } finally {
        // Close the stream on every path, including parse/strip failures.
        fis.close();
    }
}
// NOTE(review): this span repeats the body of readPdf(String path) shown just above,
// but without the method header — it looks like a copy/paste duplicate; confirm before compiling.
StringBuffer content = new StringBuffer("");// document text
FileInputStream fis = new FileInputStream(path);
PDFParser p = new PDFParser(fis);
p.parse();
PDFTextStripper ts = new PDFTextStripper();
content.append(ts.getText(p.getPDDocument()));
// The stream is closed only when everything above succeeded.
fis.close();
return content.toString().trim();
}
3.html文件:
/**
 * Reads an HTML file from disk as UTF-8 text.
 *
 * <p>Every line is terminated with {@code "\n"} in the result. Reading is best-effort:
 * on an I/O failure the error is printed and the text read so far (possibly the
 * empty string) is returned.
 *
 * @param urlString path of the HTML file on the local file system
 * @return the file content, one {@code "\n"} after each line
 */
public static String readHtml(String urlString) {
    StringBuilder content = new StringBuilder();
    File file = new File(urlString);
    FileInputStream fis = null;
    try {
        fis = new FileInputStream(file);
        // The charset must match the one declared in the HTML header,
        // otherwise the text comes out garbled.
        BufferedReader reader = new BufferedReader(new InputStreamReader(fis, "utf-8"));
        String line;
        while ((line = reader.readLine()) != null) {
            content.append(line).append('\n');
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Closing the underlying stream on every path avoids leaking the file handle
        // when reading fails part-way.
        if (fis != null) {
            try {
                fis.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing the input fails.
            }
        }
    }
    return content.toString();
}
// NOTE(review): this span repeats the body of readHtml(String urlString) shown just above,
// but without the method header — it looks like a copy/paste duplicate; confirm before compiling.
StringBuffer content = new StringBuffer("");
File file = new File(urlString);
FileInputStream fis = null;
try {
fis = new FileInputStream(file);
// read the page
BufferedReader reader = new BufferedReader(new InputStreamReader(
fis,"utf-8"));// the charset here must match the one declared in the HTML header, otherwise the text is garbled
String line = null;
while ((line = reader.readLine()) != null) {
content.append(line + "\n");
}
reader.close();
} catch (Exception e) {
e.printStackTrace();
}
String contentString = content.toString();
return contentString;
}
4.txt文件:
/**
 * Reads a plain-text file line by line.
 *
 * <p>Lines are joined with {@code "\r"} and the result is trimmed. Reading is
 * best-effort: on an I/O failure the error is printed and the text read so far
 * (possibly the empty string) is returned.
 *
 * <p>NOTE(review): FileReader decodes with the platform default charset; confirm
 * that matches the encoding of the files being indexed.
 *
 * @param path path of the text file to read
 * @return the file content with lines separated by {@code "\r"}, trimmed
 */
public static String readTxt(String path) {
    StringBuilder content = new StringBuilder(); // document text
    BufferedReader br = null;
    try {
        br = new BufferedReader(new FileReader(path));
        String line;
        while ((line = br.readLine()) != null) {
            content.append(line).append('\r');
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Closing the BufferedReader also closes the wrapped FileReader; doing it here
        // releases the file handle even when reading fails part-way.
        if (br != null) {
            try {
                br.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing the input fails.
            }
        }
    }
    return content.toString().trim();
}
// NOTE(review): this span repeats the body of readTxt(String path) shown just above,
// but without the method header — it looks like a copy/paste duplicate; confirm before compiling.
StringBuffer content = new StringBuffer("");// document text
try {
FileReader reader = new FileReader(path);
BufferedReader br = new BufferedReader(reader);
String s1 = null;
while ((s1 = br.readLine()) != null) {
// each line is followed by "\r" in the collected text
content.append(s1 + "\r");
}
br.close();
reader.close();
} catch (IOException e) {
e.printStackTrace();
}
return content.toString().trim();
}
接下來是搜索代碼:
public class TestQuery {
/**
 * Searches the index at {@code d:\index} for a hard-coded phrase in the "body" field
 * and prints the stored body of every hit.
 *
 * @param args unused
 * @throws IOException if the index cannot be opened or read
 * @throws ParseException kept for signature compatibility; parse failures are reported on stdout
 */
public static void main(String[] args) throws IOException, ParseException {
    // Replace with the text you want to search for.
    String queryString = "根據國務院的決定";
    // Must point at the directory the index was written to.
    IndexSearcher searcher = new IndexSearcher("d:\\index");
    try {
        Analyzer analyzer = new StandardAnalyzer();
        // The field name given here has to be the one the text was indexed under:
        // the indexer creates the field as new Field("body", ...), so we search "body".
        QueryParser qp = new QueryParser("body", analyzer);
        Query query;
        try {
            query = qp.parse(queryString);
        } catch (ParseException e) {
            System.out.println("異常");
            // Without a parsed query there is nothing to search for.
            return;
        }
        Hits hits = searcher.search(query);
        int total = hits.length();
        if (total > 0) {
            System.out.println("找到:" + total + " 個結果!");
            // Print every hit.
            for (int i = 0; i < total; i++) {
                Document document = hits.doc(i);
                // document.get("body") returns the stored content of the "body" field;
                // use document.get("path") to print the file path instead.
                System.out.println("contents:"+document.get("body"));
            }
        } else {
            System.out.println("0個結果!");
        }
    } finally {
        // Release the index files on every path.
        searcher.close();
    }
}
/**
 * Searches the index at {@code d:\index} for a hard-coded phrase in the "body" field
 * and prints the stored body of every hit.
 *
 * @param args unused
 * @throws IOException if the index cannot be opened or read
 * @throws ParseException kept for signature compatibility; parse failures are reported on stdout
 */
public static void main(String[] args) throws IOException, ParseException {
    // Replace with the text you want to search for.
    String queryString = "根據國務院的決定";
    // Must point at the directory the index was written to.
    IndexSearcher searcher = new IndexSearcher("d:\\index");
    try {
        Analyzer analyzer = new StandardAnalyzer();
        // The field name given here has to be the one the text was indexed under:
        // the indexer creates the field as new Field("body", ...), so we search "body".
        QueryParser qp = new QueryParser("body", analyzer);
        Query query;
        try {
            query = qp.parse(queryString);
        } catch (ParseException e) {
            System.out.println("異常");
            // Without a parsed query there is nothing to search for.
            return;
        }
        Hits hits = searcher.search(query);
        int total = hits.length();
        if (total > 0) {
            System.out.println("找到:" + total + " 個結果!");
            // Print every hit.
            for (int i = 0; i < total; i++) {
                Document document = hits.doc(i);
                // document.get("body") returns the stored content of the "body" field;
                // use document.get("path") to print the file path instead.
                System.out.println("contents:"+document.get("body"));
            }
        } else {
            System.out.println("0個結果!");
        }
    } finally {
        // Release the index files on every path.
        searcher.close();
    }
}