初探Lucene
学习地址:
https://segmentfault.com/a/1190000003101607
http://yijun1171.github.io/2014/12/06/Lucene%E5%AD%A6%E4%B9%A0%E7%AC%94%E8%AE%B0/
http://www.cnblogs.com/forfuture1978/archive/2009/12/14/1623594.html
添加依赖:
```xml
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-core</artifactId>
    <version>4.3.1</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-queryparser</artifactId>
    <version>4.3.1</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-queries</artifactId>
    <version>4.3.1</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-highlighter</artifactId>
    <version>4.3.1</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-analyzers-smartcn</artifactId>
    <version>4.3.1</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-analyzers-common</artifactId>
    <version>4.3.1</version>
</dependency>
```
索引基本使用
1.创建索引和搜索
- import java.io.File;
- import java.io.IOException;
-
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field.Store;
- import org.apache.lucene.document.IntField;
- import org.apache.lucene.document.StringField;
- import org.apache.lucene.document.TextField;
- import org.apache.lucene.index.DirectoryReader;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.IndexWriterConfig;
- import org.apache.lucene.index.IndexWriterConfig.OpenMode;
- import org.apache.lucene.queryparser.classic.ParseException;
- import org.apache.lucene.queryparser.classic.QueryParser;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.TopDocs;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.FSDirectory;
- import org.apache.lucene.util.Version;
-
- public class Index {
- public static void main(String[] args) {
- Index index = new Index();
- index.createIndex();
- index.search();
- }
-
- public void createIndex() {
-
- // 创建一个分词器(指定Lucene版本)
- Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
- // IndexWriter配置信息(指定Lucene版本和分词器)
- IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer);
- // 设置索引的打开方式
- indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
- // 创建Directory对象和IndexWriter对象
- Directory directory = null;
- IndexWriter indexWriter = null;
- try {
- directory = FSDirectory.open(new File("Lucene_index/test"));
-
- // 检查Directory对象是否处于锁定状态(如果锁定则进行解锁)
- if (IndexWriter.isLocked(directory)) {
- IndexWriter.unlock(directory);
- }
-
- indexWriter = new IndexWriter(directory, indexWriterConfig);
- } catch (IOException e) {
- e.printStackTrace();
- }
-
- // 创建测试文档并为其添加域
- Document doc1 = new Document();
- doc1.add(new StringField("id", "abcde", Store.YES)); // 添加一个id域,域值为abcde
- doc1.add(new TextField("content", "使用Lucene实现全文检索", Store.YES)); // 文本域
- doc1.add(new IntField("num", 1, Store.YES)); // 添加数值域
-
- // 将文档写入索引
- try {
- indexWriter.addDocument(doc1);
- } catch (IOException e) {
- e.printStackTrace();
- }
-
- Document doc2 = new Document();
- doc2.add(new StringField("id", "yes", Store.YES));
- doc2.add(new TextField("content", "Docker容器技术简介", Store.YES));
- doc2.add(new IntField("num", 2, Store.YES));
- try {
- indexWriter.addDocument(doc2);
- } catch (IOException e) {
- e.printStackTrace();
- }
-
- // 将IndexWriter提交
- try {
- indexWriter.commit();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- try {
- indexWriter.close();
- directory.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
-
- public void search() {
- Directory directory = null;
- DirectoryReader dReader = null;
- try {
- directory = FSDirectory.open(new File("Lucene_index/test")); // 索引文件
- dReader = DirectoryReader.open(directory); // 读取索引文件
- IndexSearcher searcher = new IndexSearcher(dReader); // 创建IndexSearcher对象
-
- Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); // 指定分词技术(标准分词-与创建索引时使用的分词技术一致)
-
- // 创建查询字符串(指定搜索域和采用的分词技术)
- QueryParser parser = new QueryParser(Version.LUCENE_43, "content", analyzer);
- Query query = parser.parse("Docker"); // 创建Query对象(指定搜索词)
-
- // 检索索引(指定前10条)
- TopDocs topDocs = searcher.search(query, 10);
- if (topDocs != null) {
- System.out.println("符合条件的文档总数为:" + topDocs.totalHits);
- for (int i = 0; i < topDocs.scoreDocs.length; i++) {
- Document doc = searcher.doc(topDocs.scoreDocs[i].doc);
- System.out.println(
- "id = " + doc.get("id") + ",content = " + doc.get("content") + ",num = " + doc.get("num"));
- }
- }
- } catch (IOException e) {
- e.printStackTrace();
- } catch (ParseException e) {
- e.printStackTrace();
- } finally {
- try {
- dReader.close();
- directory.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- }
2.分词器对比
- import java.io.IOException;
- import java.io.StringReader;
-
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.TokenStream;
- import org.apache.lucene.analysis.cjk.CJKAnalyzer;
- import org.apache.lucene.analysis.core.KeywordAnalyzer;
- import org.apache.lucene.analysis.core.SimpleAnalyzer;
- import org.apache.lucene.analysis.core.StopAnalyzer;
- import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
- import org.apache.lucene.util.Version;
- import org.wltea.analyzer.lucene.IKAnalyzer;
-
- public class AnalyzerTest {
- public static void main(String[] args) {
- AnalyzerTest test=new AnalyzerTest();
- test.testAnalyzer();
- }
-
- public void testAnalyzer() {
-
- final String str = "今天的生活是因为你三年前的选择,而今天的选择,将决定你三年后的生活。";
- Analyzer analyzer = null;
-
- analyzer = new StandardAnalyzer(Version.LUCENE_43); // 标准分词
- print(analyzer, str);
- analyzer = new IKAnalyzer(); // 第三方中文分词
- print(analyzer, str);
- analyzer = new WhitespaceAnalyzer(Version.LUCENE_43); // 空格分词
- print(analyzer, str);
- analyzer = new SimpleAnalyzer(Version.LUCENE_43); // 简单分词
- print(analyzer, str);
- analyzer = new CJKAnalyzer(Version.LUCENE_43); // 二分法分词
- print(analyzer, str);
- analyzer = new KeywordAnalyzer(); // 关键字分词
- print(analyzer, str);
- analyzer = new StopAnalyzer(Version.LUCENE_43); // 被忽略词分词器
- print(analyzer, str);
-
- }
-
- /**
- * 该方法用于打印分词器及其分词结果
- *
- * @param analyzer
- * 分词器
- * @param str
- * 需要分词的字符串
- */
- public void print(Analyzer analyzer, String str) {
-
- StringReader stringReader = new StringReader(str);
- try {
- TokenStream tokenStream = analyzer.tokenStream("", stringReader); // 分词
- tokenStream.reset();
-
- CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class); // 获取分词结果的CharTermAttribute
- System.out.println("分词技术:" + analyzer.getClass());
- while (tokenStream.incrementToken()) {
- System.out.print(term.toString() + "|");
- }
- System.out.println();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
结果:
分词技术:class org.apache.lucene.analysis.standard.StandardAnalyzer
今|天|的|生|活|是|因|为|你|三|年|前|的|选|择|而|今|天|的|选|择|将|决|定|你|三|年|后|的|生|活|
分词技术:class org.wltea.analyzer.lucene.IKAnalyzer
今天|的|生活|是因为|因为|你|三年|三|年前|年|前|的|选择|而今|今天|的|选择|将|决定|你|三年|三|年后|年|后|的|生活|
分词技术:class org.apache.lucene.analysis.core.WhitespaceAnalyzer
今天的生活是因为你三年前的选择,而今天的选择,将决定你三年后的生活。|
分词技术:class org.apache.lucene.analysis.core.SimpleAnalyzer
今天的生活是因为你三年前的选择|而今天的选择|将决定你三年后的生活|
分词技术:class org.apache.lucene.analysis.cjk.CJKAnalyzer
今天|天的|的生|生活|活是|是因|因为|为你|你三|三年|年前|前的|的选|选择|而今|今天|天的|的选|选择|将决|决定|定你|你三|三年|年后|后的|的生|生活|
分词技术:class org.apache.lucene.analysis.core.KeywordAnalyzer
今天的生活是因为你三年前的选择,而今天的选择,将决定你三年后的生活。|
分词技术:class org.apache.lucene.analysis.core.StopAnalyzer
今天的生活是因为你三年前的选择|而今天的选择|将决定你三年后的生活|
正文到此结束