Lucene Part 6: Lucene Index Operations
- jack
- 2018-04-07 23:05:44 +0800
IndexWriter, the core object I introduced in Part 4, is the central component for index operations: it creates new indexes and opens existing ones for adding, updating, and deleting documents. To demonstrate these operations in a realistic setting, I will use the full-text search of this blog as the example. This post only covers the index operations; the next post will demonstrate searching the index.
- Index builder Indexer.java: creates new indexes and opens existing ones for add, update, and delete operations.
- Test class Test.java: verifies each of the operations above.
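For orientation, the IndexWriter calls that both classes rely on boil down to a handful of methods. Here is a minimal sketch, not part of the classes below; the directory path and field values are only illustrative:
// Open (or create) an index directory and an IndexWriter over it.
FSDirectory dir = FSDirectory.open(new File("d:/lucene/index101").toPath());
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));
// Build a document from fields.
Document doc = new Document();
doc.add(new StringField("id", "90001", Field.Store.YES));
doc.add(new TextField("title", "Lucene Part 2: How Lucene Indexing and Search Work", Field.Store.YES));
writer.addDocument(doc);                              // create
writer.updateDocument(new Term("id", "90001"), doc);  // update = delete by term, then add
writer.deleteDocuments(new Term("id", "90001"));      // delete by term
writer.commit();
writer.close();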
In a real project, for efficiency and performance it is best to share a single IndexWriter across all operations. For example, in a web project you can keep the IndexWriter in the application scope; before each index operation, fetch it from the application object and reuse it if it exists, creating a new one only if it does not.
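A minimal sketch of that get-or-create pattern, assuming a servlet-based web application; the attribute name "luceneIndexer" and the buildFieldConfig() helper are illustrative, not part of the code below:
// Reuse one application-wide Indexer (and thus one IndexWriter); create it on first use only.
public static synchronized Indexer getSharedIndexer(javax.servlet.ServletContext application) {
    Indexer indexer = (Indexer) application.getAttribute("luceneIndexer");
    if (indexer == null) {
        // "id", the index path, and buildFieldConfig() are illustrative values.
        indexer = new Indexer("id", "d:/lucene/index101", buildFieldConfig());
        application.setAttribute("luceneIndexer", indexer);
    }
    return indexer;
}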
When integrating Lucene full-text search into this blog, I realized I was facing the following requirements:
- The blog already had more than 20 published posts, so the index had to be built in a batch;
- New posts keep being published after the integration, so new index entries must be added;
- Posts are written bit by bit in my spare time and get revised during later reviews, so the corresponding index entries must be updated;
- When a post is deleted, its index entry must be deleted as well, so that readers are not confused by search hits for posts that no longer exist.
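Each of these needs maps onto a single call on the Indexer defined later in this post. A minimal sketch, where indexer, existingBlogs, newBlog, and revisedBlog are illustrative variables:
// 1. Batch-index the posts that already exist (a List<Map<String, Object>> of rows).
indexer.create(existingBlogs, true);
// 2. Index a newly published post.
indexer.create(newBlog, true);
// 3. Re-index a revised post; it is matched by its primary key "id".
indexer.update(revisedBlog, true);
// 4. Remove the index entry of a deleted post by its primary-key value.
indexer.delete("90002", true);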
Test class: Test.java
package lucene;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
public class Test {
public static void main(String[] args) throws IOException {
// Assume the database has a blog table with columns id, title, content, summary
String primaryKey = "id";
String indexDirecory = "d:/lucene/index101";
// The field config would normally come from a JSON structure; plain Maps are used in this demo to avoid extra jar dependencies
Map<String, String> pkConfig = new HashMap<String, String>();
pkConfig.put("fieldType", "StringField");
pkConfig.put("fieldStore", "yes");
Map<String, String> titleConfig = new HashMap<String, String>();
titleConfig.put("fieldType", "TextField");
titleConfig.put("fieldStore", "yes");
Map<String, String> contentConfig = new HashMap<String, String>();
contentConfig.put("fieldType", "TextField");
contentConfig.put("fieldStore", "no");
Map<String, String> summaryConfig = new HashMap<String, String>();
summaryConfig.put("fieldType", "TextField");
summaryConfig.put("fieldStore", "yes");
Map<String, Map<String, String>> fieldIndexConfig = new HashMap<String, Map<String, String>>();
fieldIndexConfig.put("id", pkConfig);
fieldIndexConfig.put("title", titleConfig);
fieldIndexConfig.put("content", contentConfig);
fieldIndexConfig.put("summary", summaryConfig);
Indexer indexer = new Indexer(primaryKey, indexDirecory, fieldIndexConfig);
try {
/** Batch-create the index; it only needs to be built once, so comment this call out after the first run */
//testCreateIndex(indexDirecory, indexer);
/** Update the index */
testUpdateIndex(indexDirecory, indexer);
/** Delete from the index */
testDeleteIndex(indexDirecory, indexer);
} catch (Exception e) {
e.printStackTrace();
} finally {
indexer.getIndexWriter().close();
}
}
public static void testUpdateIndex(String indexDirecory, Indexer indexer) throws Exception {
Map<String, Object> blogTwo = new HashMap<String, Object>();
blogTwo.put("id", "90002");
blogTwo.put("title", "Lucene第三篇:lucene实战代码示例 ##更新测试###");
indexer.update(blogTwo, true);
Map<String, Object> blogThree = new HashMap<String, Object>();
blogThree.put("id", "90003");
blogThree.put("title", "Lucene第四篇:Lucene创建索引常用的核心类 ##更新测试###");
indexer.update(blogThree, true);
search(indexDirecory, "##更新测试###");
System.out.println("####################更新索引,测试结束###########################");
}
public static void testDeleteIndex(String indexDirecory, Indexer indexer) throws Exception {
String primarykeyValue = "90002";
indexer.delete(primarykeyValue, true);
search(indexDirecory, "##更新测试###");
System.out.println("####################删除索引,测试结束###########################");
}
public static void testCreateIndex(String indexDirecory, Indexer indexer) throws Exception {
// To skip the database-query part, a few rows are constructed by hand here as the query result set List<Map<String, Object>>
Map<String, Object> blogOne = new HashMap<String, Object>();
blogOne.put("id", "90001");
blogOne.put("title", "Lucene Part 2: How Lucene Indexing and Search Work");
blogOne.put("content", "Lucene's excellent, easy-to-use API keeps us from getting lost in the details of complex search algorithms.");
blogOne.put("summary", "How Lucene indexing and search work");
Map<String, Object> blogTwo = new HashMap<String, Object>();
blogTwo.put("id", "90002");
blogTwo.put("title", "Lucene Part 3: Lucene Hands-on Code Examples");
blogTwo.put("content", "The best way to learn programming is to write code to practice, verify, and understand.");
blogTwo.put("summary", "Hands-on Lucene code examples for creating an index and searching");
Map<String, Object> blogThree = new HashMap<String, Object>();
blogThree.put("id", "90003");
blogThree.put("title", "Lucene Part 4: Core Classes for Creating a Lucene Index");
blogThree.put("content", "As you saw in Part 3, creating an index and running a search involves only a handful of public core classes.");
blogThree.put("summary", "The five core classes commonly used to create an index");
List<Map<String, Object>> blogs = new ArrayList<Map<String, Object>>();
blogs.add(blogOne);
blogs.add(blogTwo);
blogs.add(blogThree);
indexer.create(blogs, true);
search(indexDirecory, "Lucene");
System.out.println("#################### Create index: test finished ###########################");
}
public static void search(String indexDir, String queryText) throws IOException, ParseException {
// Point to the index file directory
File ifDir = new File(indexDir);
Path path = ifDir.toPath();
FSDirectory fSDirectory = FSDirectory.open(path);
IndexReader indexReader = DirectoryReader.open(fSDirectory);
IndexSearcher indexSearcher = new IndexSearcher(indexReader);
// Use the standard analyzer and search several fields (title, content, summary) at once
String[] multiFields = new String[] { "title", "content", "summary" };
MultiFieldQueryParser queryParser = new MultiFieldQueryParser(multiFields, new StandardAnalyzer());
Query query = queryParser.parse(queryText);
// Retrieve the 100 highest-scoring documents
TopDocs topdocs = indexSearcher.search(query, 100);
ScoreDoc[] scoreDocs = topdocs.scoreDocs;
System.out.println("搜索到 " + scoreDocs.length + " 篇blog:");
for (ScoreDoc scoreDoc : scoreDocs) {
Document document = indexSearcher.doc(scoreDoc.doc);
System.out.println(document.get("id"));
System.out.println(document.get("title"));
System.out.println(scoreDoc.score);
System.out.println("--");
}
System.out.println();
}
}
Index builder: Indexer.java
package lucene;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;
public class Indexer {
/** Type of the document field, e.g. StringField, TextField, etc. */
private static final String _FIELD_TYPE = "fieldType";
/** Whether to store the original field value; "yes" or "no" */
private static final String _FIELD_STORE = "fieldStore";
/** Name of the document's primary-key field, e.g. the database table's primary key "id" */
private String _PRIMARY_KEY;
/** Index directory, e.g. d:/lucene/index */
private String _indexDirecory;
/**
 * Per-field indexing configuration, in the form: {"title": {"fieldType": "StringField", "fieldStore": "yes"},
 * "content": {"fieldType": "TextField", "fieldStore": "no"}}
 */
private Map<String, Map<String, String>> _fieldIndexConfig;
private IndexWriter _indexWriter;
/**
 * Index builder.
 *
 * @param primaryKey
 *            name of the document's primary-key field
 * @param indexDirecory
 *            index directory, e.g. d:/lucene/index
 * @param fieldIndexConfig
 *            per-field indexing configuration, in the form: {"title": {"fieldType": "StringField", "fieldStore":
 *            "yes"}, "content": {"fieldType": "TextField", "fieldStore": "no"}}
 */
public Indexer(String primaryKey, String indexDirecory, Map<String, Map<String, String>> fieldIndexConfig) {
_PRIMARY_KEY = primaryKey;
_indexDirecory = indexDirecory;
_fieldIndexConfig = fieldIndexConfig;
createIndexWriter();
}
/**
 * Delete a document from the index.
 *
 * @param primarykeyValue
 *            value of the primary key
 * @param commit
 *            whether to commit the deletion immediately
 */
public void delete(String primarykeyValue, boolean commit) {
try {
Term term = new Term(_PRIMARY_KEY, primarykeyValue);
_indexWriter.deleteDocuments(term);
if (commit) {
_indexWriter.commit();
}
} catch (IOException e) {
// TODO Kept simple to avoid extra jar dependencies; adapt the error handling to your project
throw new RuntimeException("Failed to delete from the index.", e);
}
}
/**
 * Update a document in the index.
 *
 * @param docData
 *            field data of the document to update
 * @param commit
 *            whether to commit the update immediately
 */
public void update(Map<String, Object> docData, boolean commit) {
if (!docData.containsKey(_PRIMARY_KEY)) {
new RuntimeException("无主键,不能更新索引。");
}
String pkValue = docData.get(_PRIMARY_KEY).toString();
try {
Document document = createDocument(docData);
Term term = new Term(_PRIMARY_KEY, pkValue);
_indexWriter.updateDocument(term, document);
if (commit) {
_indexWriter.commit();
}
} catch (IOException e) {
// TODO Kept simple to avoid extra jar dependencies; adapt the error handling to your project
throw new RuntimeException("Failed to update the index.", e);
}
}
/**
 * Batch-create index entries.
 *
 * @param docDatas
 *            collection of document data to index
 * @param commit
 *            whether to commit once all documents have been added
 * @throws IOException
 */
public void create(List<Map<String, Object>> docDatas, boolean commit) throws IOException {
for (Map<String, Object> docData : docDatas) {
create(docData, false);
}
if (commit) {
_indexWriter.commit();
}
}
/**
 * Create an index entry for a single document.
 *
 * @param docData
 *            document data to index
 * @param commit
 *            whether to commit immediately
 */
public void create(Map<String, Object> docData, boolean commit) {
try {
Document document = createDocument(docData);
_indexWriter.addDocument(document);
if (commit) {
_indexWriter.commit();
}
} catch (IOException e) {
// TODO Kept simple to avoid extra jar dependencies; adapt the error handling to your project
throw new RuntimeException("Failed to index the document.", e);
}
}
private Document createDocument(Map<String, Object> docData) {
Document document = new Document();
for (Map.Entry<String, Object> entry : docData.entrySet()) {
String key = entry.getKey();
Object value = entry.getValue();
Field field = null;
if (_fieldIndexConfig.containsKey(key)) {
Map<String, String> config = _fieldIndexConfig.get(key);
String fieldType = config.get(_FIELD_TYPE);
String storeType = config.get(_FIELD_STORE);
if ("StringField".equals(fieldType)) {
field = new StringField(key, value.toString(), getStore(storeType));
} else if ("TextField".equals(fieldType)) {
field = new TextField(key, value.toString(), getStore(storeType));
} else if ("StoredField".equals(fieldType)) {
field = new StoredField(key, value.toString());
} else {
// TODO Add further field types here as needed
throw new RuntimeException("Unsupported field type: " + fieldType);
}
} else {
field = new TextField(key, value.toString(), Store.NO);
}
document.add(field);
}
return document;
}
public IndexWriter getIndexWriter() {
return _indexWriter;
}
private void createIndexWriter() {
try {
File ifDir = new File(_indexDirecory);
Path path = ifDir.toPath();
FSDirectory fsd = FSDirectory.open(path);
// Analyzer used for indexing
Analyzer analyzer = new StandardAnalyzer();
IndexWriterConfig config = new IndexWriterConfig(analyzer);
// Create the IndexWriter
_indexWriter = new IndexWriter(fsd, config);
} catch (IOException e) {
// TODO Kept simple to avoid extra jar dependencies; adapt the error handling to your project
throw new RuntimeException("Failed to create the IndexWriter.", e);
}
}
private Store getStore(String storeType) {
if (storeType != null && "yes".equalsIgnoreCase(storeType)) {
return Store.YES;
} else {
return Store.NO;
}
}
}
That is all for this post. The next post will demonstrate searching the index, including arbitrary full-text matching and paginated search results.