import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
public class Sugesstion {
private static final String GRAMMED_WORDS_FIELD = "words";
private static final String SOURCE_WORD_FIELD = "sourceWord";
private static final String COUNT_FIELD = "count";
private static final String[] ENGLISH_STOP_WORDS = {
"a", "an", "and", "are", "as", "at", "be", "but", "by",
"for", "i", "if", "in", "into", "is",
"no", "not", "of", "on", "or", "s", "such",
"t", "that", "the", "their", "then", "there", "these",
"they", "this", "to", "was", "will", "with"
};
private final Directory autoCompleteDirectory;
private IndexReader autoCompleteReader;
private IndexSearcher autoCompleteSearcher;
public Sugesstion(String autoCompleteDir) throws IOException {
this.autoCompleteDirectory = FSDirectory.getDirectory(autoCompleteDir,
null);
reOpenReader();
}
public List<String> suggestTermsFor(String term) throws IOException {
// get the top 5 terms for query
Query query = new TermQuery(new Term(GRAMMED_WORDS_FIELD, term));
Sort sort = new Sort(COUNT_FIELD, true);
TopDocs docs = autoCompleteSearcher.search(query, null, 5, sort);
List<String> suggestions = new ArrayList<String>();
for (ScoreDoc doc : docs.scoreDocs) {
suggestions.add(autoCompleteReader.document(doc.doc).get(
SOURCE_WORD_FIELD));
}
return suggestions;
}
@SuppressWarnings("unchecked")
public void reIndex(Directory sourceDirectory, String fieldToAutocomplete)
throws CorruptIndexException, IOException {
// build a dictionary (from the spell package)
IndexReader sourceReader = IndexReader.open(sourceDirectory);
LuceneDictionary dict = new LuceneDictionary(sourceReader,
fieldToAutocomplete);
// code from
// org.apache.lucene.search.spell.SpellChecker.indexDictionary(
// Dictionary)
IndexReader.unlock(autoCompleteDirectory);
// use a custom analyzer so we can do EdgeNGramFiltering
IndexWriter writer = new IndexWriter(autoCompleteDirectory,
new Analyzer() {
public TokenStream tokenStream(String fieldName,
Reader reader) {
TokenStream result = new StandardTokenizer(reader);
result = new StandardFilter(result);
result = new LowerCaseFilter(result);
result = new ISOLatin1AccentFilter(result);
result = new StopFilter(result,
ENGLISH_STOP_WORDS);
result = new EdgeNGramTokenFilter(
result, Side.FRONT,1, 20);
return result;
}
}, true);
writer.setMergeFactor(300);
writer.setMaxBufferedDocs(150);
// go through every word, storing the original word (incl. n-grams)
// and the number of times it occurs
Map<String, Integer> wordsMap = new HashMap<String, Integer>();
Iterator<String> iter = (Iterator<String>) dict.getWordsIterator();
while (iter.hasNext()) {
String word = iter.next();
int len = word.length();
if (len < 3) {
continue; // too short we bail but "too long" is fine...
}
if (wordsMap.containsKey(word)) {
throw new IllegalStateException(
"This should never happen in Lucene 2.3.2");
// wordsMap.put(word, wordsMap.get(word) + 1);
} else {
// use the number of documents this word appears in
wordsMap.put(word, sourceReader.docFreq(new Term(
fieldToAutocomplete, word)));
}
}
for (String word : wordsMap.keySet()) {
// ok index the word
Document doc = new Document();
doc.add(new Field(SOURCE_WORD_FIELD, word, Field.Store.YES,
Field.Index.UN_TOKENIZED)); // orig term
doc.add(new Field(GRAMMED_WORDS_FIELD, word, Field.Store.YES,
Field.Index.TOKENIZED)); // grammed
doc.add(new Field(COUNT_FIELD,
Integer.toString(wordsMap.get(word)), Field.Store.NO,
Field.Index.UN_TOKENIZED)); // count
writer.addDocument(doc);
}
sourceReader.close();
// close writer
writer.optimize();
writer.close();
// re-open our reader
reOpenReader();
}
private void reOpenReader() throws CorruptIndexException, IOException {
if (autoCompleteReader == null) {
autoCompleteReader = IndexReader.open(autoCompleteDirectory);
} else {
autoCompleteReader.reopen();
}
autoCompleteSearcher = new IndexSearcher(autoCompleteReader);
}
public static void main(String[] args) throws Exception {
Sugesstion autocomplete = new Sugesstion("/index/autocomplete");
// run this to re-index from the current index, shouldn't need to do
// this very often
// autocomplete.reIndex(FSDirectory.getDirectory("/index/live", null),
// "content");
String term = "steve";
System.out.println(autocomplete.suggestTermsFor(term));
// prints [steve, steven, stevens, stevenson, stevenage]
}
}
相关推荐
在前端开发中,我们可以使用jQuery来实现类似百度的自动补全功能,为网页搜索框添加自动补全特效。 首先,要实现自动补全功能,我们需要一个文本输入框,用户在这个输入框中输入内容。接下来,我们要通过JavaScript...
在项目中,经常会用到输入框的自动补全功能,就像百度、淘宝等搜索框一样:当用户输入首字母、关键词时,后台会迅速将与此相关的条目返回并显示到前台,以便用户选择,提升用户体验。当然本项目的补全功能和这些大厂...
6. **搜索算法**:虽然百度地图API会处理大部分搜索逻辑,但在前端,可能需要自定义一些搜索逻辑,比如实现模糊匹配、自动补全等功能,这就需要了解一些基础的字符串匹配和排序算法。 7. **地图渲染**:收到搜索...
标题中的“像百度一样 文本框输入文本后提示相关信息”是指实现一个搜索或者输入功能,当用户在文本框中输入文字时,系统能够实时显示与输入内容相关的建议或信息,这种技术通常被称为自动补全(Autocomplete)或者...
【标题】"和百度下拉框效果一样的"指的是在网页搜索框中实现类似百度搜索引擎的自动补全功能,即用户在输入关键词时,系统会实时显示与输入内容相关的建议搜索词,这种效果通常称为自动填充或自动建议。这种功能提高...
这种功能通常被称为自动补全或智能提示,能够提高用户搜索效率,减少输入错误。 描述中提到,这个功能是通过结合多种技术来实现的,包括`AJAX(异步JavaScript和XML)`、`JS(JavaScript)`、`DOM(文档对象模型)`...
这个功能通常称为自动补全或建议搜索,它能够根据用户输入的部分关键词提供相关的搜索建议,极大地提高了搜索效率和用户体验。 首先,我们需要理解智能检索的核心机制。百度搜索框的智能检索基于大量的历史搜索数据...
描述中提到的"像百度一样",意味着我们不仅要实现基本的边输入边筛选,还可能需要考虑额外的用户体验细节,如输入提示的排序(根据相关性或热度)、输入延迟(避免用户每键入一个字符都发送请求)以及错误处理等。...
通过这些插件,开发者可以在VS Code中实现HTML、Vue.js、JavaScript的高效开发,同时享受到中文界面和自动代码检查带来的便利。使用离线的vsix包安装这些插件,对于网络环境不佳或者需要批量部署工作环境的团队来说...
1、支持地图种类繁多:几乎包含所有主流在线地图,包括不仅限于谷歌、百度... 18、支持二次开发,可轻松搭建类似谷歌、百度地图一样的离线地图。 19、Windows原生语言开发,比Java/.NET等语言开发的程序效率高几倍。
6.这时软件自动添加基本的头文件,因为这个程序我们不需要其他的功能,所以 直接点击Next。 7.我们将base class 选为QDialog 对话框类。然后点击Next。 8.点击Finish,完成工程的建立。 9.我们可以看见工程中的所有...
精易模块 V5.15 what’s new:(2015XXXX) 1、新增“线程_枚举”枚举指定进程ID中所有线程列表,成功返回线程数量,失败...10、新增“进程_取IO写入计数”,功能与XP系统下任务管理器一样。 11、新增“进程_取IO写入...