Lucene 中获取没有存储的字段值的几种方法

2017-10-10 Eric Wang 更多博文 » 博客 » GitHub »

Lucene

原文链接 http://codepub.cn/2017/10/10/several-ways-to-get-the-field-values-that-are-not-stored-in-lucene/
注:以下为加速网络访问所做的原文缓存,经过重新格式化,可能存在格式方面的问题,或偶有遗漏信息,请以原文为准。


一般来说,如果想要从Lucene索引中获取Field的值,那么需要在索引阶段设置Field.Store.YES才可以,然后在搜索阶段得到TopDocs对象之后,用它去获取ScoreDoc再取出Document,使用Document获取存储在索引中的值。但是我们都知道,存储字段是需要硬盘空间的,如果既想尽量节省存储空间,又想获取Field的值,那么在不存储的情况下,如何获取呢?其实仔细思索一下,在我们只索引不存储的情况下,Lucene依然可以判断搜索是否命中,这说明在Lucene索引中依然以词项(Term)的形式保存着Field的内容(对于经过分词的字段,保存的是分词后的词项,而不是原始文本),这样在搜索阶段才能判断是否匹配。本文就是探讨在这种情形下,使用Lucene的核心包获取没有存储的Field的值的几种方法,如果你还有其它不同的方法请留言。

  • testGetFieldByStore 演示存储Field值时如何获取
  • testGetFieldByTerms 演示通过Terms获取没有存储的Field值
  • testGetFieldByFieldDocWithSorted 演示通过FieldDoc获取没有存储的值
  • testGetFieldByTermVector 演示通过TermVector获取没有存储的值
  • testGetFieldByTermVectors 演示通过TermVectors获取没有存储的值

这里补充一下,在lucene-suggest包中,有LuceneDictionary类,通过该类的getEntryIterator方法也能获取没有存储的Field的值,不过其本质和通过Terms获取方式一样,在此不再列举。源码示例如下:

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.junit.Test;

import java.io.IOException;

import static org.apache.lucene.search.SortField.Type.STRING;

/**
 * <p>
 * Created by wangxu on 2017/10/10 17:33.
 * </p>
 * <p>
 * Description: Lucene 6.5.0
 * </p>
 *
 * @author Wang Xu
 * @version V1.0.0
 * @since V1.0.0 <br/>
 * WebSite: http://codepub.cn <br>
 * Licence: Apache v2 License
 */
public class GetNonStoredFieldDemo {
    private RAMDirectory ramDirectory = new RAMDirectory();
    private IndexWriter indexWriter = new IndexWriter(ramDirectory, new IndexWriterConfig(new WhitespaceAnalyzer()));

    public GetNonStoredFieldDemo() throws IOException {
    }

    @Test
    public void testGetFieldByStore() throws IOException {
        initIndexForStore();
        IndexSearcher indexSearcher = new IndexSearcher(DirectoryReader.open(ramDirectory));
        int count = indexSearcher.count(new MatchAllDocsQuery());
        TopDocs search = indexSearcher.search(new MatchAllDocsQuery(), count);
        ScoreDoc[] scoreDocs = search.scoreDocs;
        for (ScoreDoc scoreDoc : scoreDocs) {
            Document doc = indexSearcher.doc(scoreDoc.doc);
            System.out.println(doc.get("IDX") + "=>" + doc.get("title"));
        }
        ramDirectory.close();
    }

    @Test
    public void testGetFieldByTerms() throws IOException {
        initIndexForTerms();
        Fields fields = MultiFields.getFields(DirectoryReader.open(ramDirectory));
        Terms idx = fields.terms("IDX");
        Terms title = fields.terms("title");
        //or you can use like this
        //TermsEnum idxIter = MultiFields.getTerms(DirectoryReader.open(ramDirectory), "IDX").iterator();
        TermsEnum idxIter = idx.iterator();
        TermsEnum titleIter = title.iterator();
        BytesRef bytesRef;
        while ((bytesRef = idxIter.next()) != null) {
            System.out.println(bytesRef.utf8ToString() + "=>" + titleIter.next().utf8ToString());
        }
        ramDirectory.close();
    }

    @Test
    public void testGetFieldByFieldDocWithSorted() throws IOException {
        initIndexForFieldDocWithSorted();
        IndexSearcher indexSearcher = new IndexSearcher(DirectoryReader.open(ramDirectory));
        int count = indexSearcher.count(new MatchAllDocsQuery());
        //must use method which returns TopFieldDocs
        TopFieldDocs search = indexSearcher.search(new MatchAllDocsQuery(), count, new Sort(new SortField("IDX", STRING)));
        ScoreDoc[] scoreDocs = search.scoreDocs;
        for (ScoreDoc scoreDoc : scoreDocs) {
            FieldDoc fieldDoc = (FieldDoc) scoreDoc;
            Object[] fields = fieldDoc.fields;
            if (fields[0] instanceof BytesRef) {
                BytesRef temp = (BytesRef) fields[0];
                System.out.println(temp.utf8ToString() + "=>" + indexSearcher.doc(scoreDoc.doc).get("title"));
            }
        }
        ramDirectory.close();
    }

    @Test
    public void testGetFieldByTermVector() throws IOException {
        initIndexForTermVector();
        IndexSearcher indexSearcher = new IndexSearcher(DirectoryReader.open(ramDirectory));
        int count = indexSearcher.count(new MatchAllDocsQuery());
        TopDocs search = indexSearcher.search(new MatchAllDocsQuery(), count);
        ScoreDoc[] scoreDocs = search.scoreDocs;
        for (ScoreDoc scoreDoc : scoreDocs) {
            int doc = scoreDoc.doc;
            Terms idx = indexSearcher.getIndexReader().getTermVector(doc, "IDX");
            TermsEnum iterator = idx.iterator();
            BytesRef bytesRef;
            while ((bytesRef = iterator.next()) != null) {
                System.out.println(bytesRef.utf8ToString() + "=>" + indexSearcher.doc(doc).get("title"));
            }
        }
        ramDirectory.close();
    }

    @Test
    public void testGetFieldByTermVectors() throws IOException {
        initIndexForTermVector();
        IndexSearcher indexSearcher = new IndexSearcher(DirectoryReader.open(ramDirectory));
        int count = indexSearcher.count(new MatchAllDocsQuery());
        TopDocs search = indexSearcher.search(new MatchAllDocsQuery(), count);
        ScoreDoc[] scoreDocs = search.scoreDocs;
        for (ScoreDoc scoreDoc : scoreDocs) {
            int doc = scoreDoc.doc;
            Fields termVectors = indexSearcher.getIndexReader().getTermVectors(doc);
            Terms idx = termVectors.terms("IDX");
            TermsEnum iterator = idx.iterator();
            BytesRef bytesRef;
            while ((bytesRef = iterator.next()) != null) {
                System.out.println(bytesRef.utf8ToString() + "=>" + indexSearcher.doc(doc).get("title"));
            }
        }
        ramDirectory.close();
    }

    private void initIndexForStore() throws IOException {
        Document document = new Document();
        document.add(new StringField("IDX", "TEST01", Field.Store.YES));
        document.add(new StringField("title", "TITLE01", Field.Store.YES));
        indexWriter.addDocument(document);

        document = new Document();
        document.add(new StringField("IDX", "TEST02", Field.Store.YES));
        document.add(new StringField("title", "TITLE02", Field.Store.YES));
        indexWriter.addDocument(document);

        document = new Document();
        document.add(new StringField("IDX", "TEST03", Field.Store.YES));
        document.add(new StringField("title", "TITLE03", Field.Store.YES));
        indexWriter.addDocument(document);

        document = new Document();
        document.add(new StringField("IDX", "TEST04", Field.Store.YES));
        document.add(new StringField("title", "TITLE04", Field.Store.YES));
        indexWriter.addDocument(document);
        indexWriter.close();
    }

    private void initIndexForTerms() throws IOException {
        Document document = new Document();
        document.add(new StringField("IDX", "TEST01", Field.Store.NO));
        document.add(new StringField("title", "TITLE01", Field.Store.NO));
        indexWriter.addDocument(document);

        document = new Document();
        document.add(new StringField("IDX", "TEST02", Field.Store.NO));
        document.add(new StringField("title", "TITLE02", Field.Store.NO));
        indexWriter.addDocument(document);

        document = new Document();
        document.add(new StringField("IDX", "TEST03", Field.Store.NO));
        document.add(new StringField("title", "TITLE03", Field.Store.NO));
        indexWriter.addDocument(document);

        document = new Document();
        document.add(new StringField("IDX", "TEST04", Field.Store.NO));
        document.add(new StringField("title", "TITLE04", Field.Store.NO));
        indexWriter.addDocument(document);
        indexWriter.close();
    }

    private void initIndexForTermVector() throws IOException {
        FieldType fieldType = new FieldType();
        fieldType.setStoreTermVectors(true);
        fieldType.setIndexOptions(IndexOptions.DOCS);
        Document document = new Document();
        document.add(new Field("IDX", "TEST01", fieldType));
        document.add(new StringField("title", "TITLE01", Field.Store.YES));
        indexWriter.addDocument(document);

        document = new Document();
        document.add(new Field("IDX", "TEST02", fieldType));
        document.add(new StringField("title", "TITLE02", Field.Store.YES));
        indexWriter.addDocument(document);

        document = new Document();
        document.add(new Field("IDX", "TEST03", fieldType));
        document.add(new StringField("title", "TITLE03", Field.Store.YES));
        indexWriter.addDocument(document);

        document = new Document();
        document.add(new Field("IDX", "TEST04", fieldType));
        document.add(new StringField("title", "TITLE04", Field.Store.YES));
        indexWriter.addDocument(document);
        indexWriter.close();
    }

    private void initIndexForFieldDocWithSorted() throws IOException {
        Document document = new Document();
        document.add(new SortedDocValuesField("IDX", new BytesRef("TEST01")));
        document.add(new StringField("title", "TITLE01", Field.Store.YES));
        indexWriter.addDocument(document);

        document = new Document();
        document.add(new SortedDocValuesField("IDX", new BytesRef("TEST02")));
        document.add(new StringField("title", "TITLE02", Field.Store.YES));
        indexWriter.addDocument(document);

        document = new Document();
        document.add(new SortedDocValuesField("IDX", new BytesRef("TEST03")));
        document.add(new StringField("title", "TITLE03", Field.Store.YES));
        indexWriter.addDocument(document);

        document = new Document();
        document.add(new SortedDocValuesField("IDX", new BytesRef("TEST04")));
        document.add(new StringField("title", "TITLE04", Field.Store.YES));
        indexWriter.addDocument(document);
        indexWriter.close();
    }

}