Сортировка индекса Lucene по StoredField или функции docID

Сортировка индекса Lucene по StoredField или функции docID ⇐ JAVA

1 сообщение • Страница 1 из 1

Anonymous

Сортировка индекса Lucene по StoredField или функции docID

Цитата

Сообщение Anonymous » 20 янв 2026, 18:36

У меня есть много статических (неизменных) индексов Lucene, где каждому документу присвоено целое число StoredField с именем «fileId» (и каждый индекс имеет только 1 сегмент). Теперь я хочу изменить эти индексы, чтобы документы сортировались по идентификатору файла.
Я нашел здесь два модульных теста для сортировки индексов, но если я использую new Sort(new SortField("fileId", INT)), как в тестах, то при вызове SortingCodecReader.wrap() я получаю исключение «IllegalStateException: unexpected docvalues type NONE for field 'fileId' (expected=NUMERIC). Re-index with correct docvalues type.»
Я думаю, это потому, что «fileId» не было полем DocValues. К счастью, я могу легко получить идентификатор файла из идентификатора документа, но, к сожалению, SortField не предоставляет конструктор для сортировки по чему-либо на основе идентификатора документа. Поэтому я создал DocSortField, который расширяет SortField. Кажется, это должно работать, но по какой-то причине после запуска кода индекс не был отсортирован правильно (на самом деле порядок, похоже, вообще не изменился). Сообщения об ошибках нет, поэтому я понятия не имею, где что-то идет не так. Я попробовал добавить вызов forceMerge(1) для принудительной сортировки, но это не помогло.
Моя версия Lucene — 9.11.1.
import org.apache.lucene.index.*;
import org.apache.lucene.search.Sort;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import static org.apache.lucene.index.PostingsEnum.POSITIONS;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;

public class LuceneIndexSorter {

/**
 * Copies the index at {@code path/to/index} into a sibling directory whose name is the
 * original name plus {@code " 2"}, using an index sort built from
 * {@link DocSortField} and {@code getFileId}.
 *
 * @param args unused
 * @throws IOException if opening, reading or writing either index fails
 * @throws RuntimeException if the source index is already in order, or if the rewritten
 *     index is still reported as unordered
 */
public static void main(String[] args) throws IOException {
    File oldDir = new File("path/to/index");
    if (areDocumentsInOrder(oldDir)) {
        throw new RuntimeException("Documents already in order");
    }
    File newDir = new File(oldDir.getParent(), oldDir.getName() + " 2");
    Sort indexSort = new Sort(new DocSortField("fileId", LuceneIndexSorter::getFileId));
    // The rest comes from https://github.com/apache/lucene-solr/blob/a7bdc6893e21954ed9f6d8bce256a4a9c917310b/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java
    IndexWriterConfig indexWriterConfig = new IndexWriterConfig().setIndexSort(indexSort);
    // Resources are closed in reverse declaration order: the reader and the writer are
    // released before the directories they were opened on.
    try (Directory newDirectory = FSDirectory.open(newDir.toPath());
            Directory oldDirectory = FSDirectory.open(oldDir.toPath());
            IndexWriter indexWriter = new IndexWriter(newDirectory, indexWriterConfig);
            DirectoryReader indexReader = DirectoryReader.open(oldDirectory)) {
        List<CodecReader> wrappedCodecReaders = new ArrayList<>();
        for (LeafReaderContext ctx : indexReader.leaves()) {
            CodecReader wrap =
                    SortingCodecReader.wrap(SlowCodecReaderWrapper.wrap(ctx.reader()), indexSort);
            assert wrap.toString().startsWith("SortingCodecReader(");
            wrappedCodecReaders.add(wrap);
        }
        indexWriter.addIndexes(wrappedCodecReaders.toArray(new CodecReader[0]));
        // Needed so a segments file exists before the new index is reopened below; without
        // it the author observed "IndexNotFoundException: no segments* file found in
        // MMapDirectory@...". The referenced unit tests do not call commit().
        indexWriter.commit();
    }
    // Check if it worked (the writer has been committed and closed at this point).
    if (!areDocumentsInOrder(newDir)) {
        // The author reported ending up here with no other output.
        throw new RuntimeException("Documents weren't correctly sorted");
    }
}

/**
 * Checks whether the documents containing the term {@code tokens:a} in the first segment
 * of the index appear in non-decreasing order of their stored {@code fileId} field.
 *
 * <p>The file ID is read from the stored field rather than derived from the doc ID:
 * doc IDs are renumbered when an index is rewritten in sorted order, so a value computed
 * from the doc ID of a rewritten index does not describe the document stored there.
 *
 * @param dir directory containing the index to inspect
 * @return {@code true} if no inspected document has a smaller {@code fileId} than the
 *     one before it (also when no document contains the probe term)
 * @throws IOException if the index cannot be opened or read
 * @throws IllegalStateException if an inspected document has no numeric stored
 *     {@code fileId} field
 */
private static boolean areDocumentsInOrder(File dir) throws IOException {
    // Close the directory and the top-level reader; leaf readers are owned by the
    // DirectoryReader and must not be closed on their own.
    try (FSDirectory directory = FSDirectory.open(dir.toPath());
            DirectoryReader indexReader = DirectoryReader.open(directory)) {
        LeafReader leafReader = indexReader.leaves().get(0).reader();
        PostingsEnum postings = leafReader.postings(new Term("tokens", "a"), POSITIONS);
        if (postings == null) {
            // The probe term is absent from this segment: nothing to compare.
            return true;
        }
        StoredFields storedFields = leafReader.storedFields();
        boolean hasPrevious = false;
        int previousFileId = 0;
        for (int docId = postings.nextDoc(); docId != NO_MORE_DOCS; docId = postings.nextDoc()) {
            IndexableField fileIdField = storedFields.document(docId).getField("fileId");
            if (fileIdField == null || fileIdField.numericValue() == null) {
                throw new IllegalStateException(
                        "Document " + docId + " has no numeric stored field 'fileId'");
            }
            int fileId = fileIdField.numericValue().intValue();
            if (hasPrevious && previousFileId > fileId) {
                return false;
            }
            previousFileId = fileId;
            hasPrevious = true;
        }
        return true;
    }
}

/**
 * Returns the file ID for the given document ID.
 *
 * <p>NOTE(review): the body is elided ("...") in the original post, so the mapping itself
 * is not visible here. Presumably it is only valid for doc IDs of the original, unsorted
 * index — confirm before applying it to doc IDs of a rewritten index.
 *
 * @param docId document ID to map
 * @return the file ID derived from {@code docId}
 */
private static int getFileId(int docId) {
...
}
}

import org.apache.lucene.index.IndexSorter;
import org.apache.lucene.search.SortField;

import java.util.function.Function;

/**
 * A {@link SortField} of type {@code DOC} whose index sorter is a {@link DocIdSorter}
 * driven by a caller-supplied function of the document ID.
 */
public class DocSortField extends SortField {
    /** Function handed to the {@link DocIdSorter}; maps a document ID to its sort value. */
    private final Function<Integer, Integer> docIdToValue;

    /**
     * Creates the sort field.
     *
     * @param field field name passed to {@link SortField}
     * @param docIdToValue function mapping a document ID to the value to sort by
     */
    public DocSortField(String field, Function<Integer, Integer> docIdToValue) {
        super(field, Type.DOC);
        this.docIdToValue = docIdToValue;
    }

    /**
     * Returns a new {@link DocIdSorter} built from {@code Provider.NAME} and the function
     * given at construction, in place of the sorter the superclass would supply.
     */
    @Override
    public IndexSorter getIndexSorter() {
        return new DocIdSorter(Provider.NAME, docIdToValue);
    }
}

import org.apache.lucene.index.IndexSorter;
import org.apache.lucene.index.LeafReader;

import java.util.List;
import java.util.function.Function;

public class DocIdSorter implements IndexSorter {
private final String providerName;
private final Function docIdToValue;

public DocIdSorter(String providerName, Function docIdToValue) {
this.providerName = providerName;
this.docIdToValue = docIdToValue;
}

@Override
public ComparableProvider[] getComparableProviders(List

Подробнее здесь: https://stackoverflow.com/questions/79870757/sort-a-lucene-index-by-a-storedfield-or-a-function-of-docid

1768923381

Anonymous

У меня есть много статических (неизменных) индексов Lucene, где каждому документу присвоено целое число StoredField с именем «fileId» (и каждый индекс имеет только 1 сегмент). Теперь я хочу изменить эти индексы, чтобы документы сортировались по идентификатору файла.
Я нашел здесь два модульных теста для сортировки индексов, но если я использую new Sort(new SortField("fileId", INT)), как в тестах, то при вызове SortingCodecReader.wrap() я получаю исключение «IllegalStateException: unexpected docvalues type NONE for field 'fileId' (expected=NUMERIC). Re-index with correct docvalues type.»
Я думаю, это потому, что «fileId» не было полем DocValues. К счастью, я могу легко получить идентификатор файла из идентификатора документа, но, к сожалению, SortField не предоставляет конструктор для сортировки по чему-либо на основе идентификатора документа. Поэтому я создал DocSortField, который расширяет SortField. Кажется, это должно работать, но по какой-то причине после запуска кода индекс не был отсортирован правильно (на самом деле порядок, похоже, вообще не изменился). Сообщения об ошибках нет, поэтому я понятия не имею, где что-то идет не так. Я попробовал добавить вызов forceMerge(1) для принудительной сортировки, но это не помогло.
Моя версия Lucene — 9.11.1.
import org.apache.lucene.index.*;
import org.apache.lucene.search.Sort;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import static org.apache.lucene.index.PostingsEnum.POSITIONS;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;

public class LuceneIndexSorter {

/**
 * Copies the index at {@code path/to/index} into a sibling directory whose name is the
 * original name plus {@code " 2"}, using an index sort built from
 * {@link DocSortField} and {@code getFileId}.
 *
 * @param args unused
 * @throws IOException if opening, reading or writing either index fails
 * @throws RuntimeException if the source index is already in order, or if the rewritten
 *     index is still reported as unordered
 */
public static void main(String[] args) throws IOException {
    File oldDir = new File("path/to/index");
    if (areDocumentsInOrder(oldDir)) {
        throw new RuntimeException("Documents already in order");
    }
    File newDir = new File(oldDir.getParent(), oldDir.getName() + " 2");
    Sort indexSort = new Sort(new DocSortField("fileId", LuceneIndexSorter::getFileId));
    // The rest comes from https://github.com/apache/lucene-solr/blob/a7bdc6893e21954ed9f6d8bce256a4a9c917310b/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java
    IndexWriterConfig indexWriterConfig = new IndexWriterConfig().setIndexSort(indexSort);
    // Resources are closed in reverse declaration order: the reader and the writer are
    // released before the directories they were opened on.
    try (Directory newDirectory = FSDirectory.open(newDir.toPath());
            Directory oldDirectory = FSDirectory.open(oldDir.toPath());
            IndexWriter indexWriter = new IndexWriter(newDirectory, indexWriterConfig);
            DirectoryReader indexReader = DirectoryReader.open(oldDirectory)) {
        List<CodecReader> wrappedCodecReaders = new ArrayList<>();
        for (LeafReaderContext ctx : indexReader.leaves()) {
            CodecReader wrap =
                    SortingCodecReader.wrap(SlowCodecReaderWrapper.wrap(ctx.reader()), indexSort);
            assert wrap.toString().startsWith("SortingCodecReader(");
            wrappedCodecReaders.add(wrap);
        }
        indexWriter.addIndexes(wrappedCodecReaders.toArray(new CodecReader[0]));
        // Needed so a segments file exists before the new index is reopened below; without
        // it the author observed "IndexNotFoundException: no segments* file found in
        // MMapDirectory@...". The referenced unit tests do not call commit().
        indexWriter.commit();
    }
    // Check if it worked (the writer has been committed and closed at this point).
    if (!areDocumentsInOrder(newDir)) {
        // The author reported ending up here with no other output.
        throw new RuntimeException("Documents weren't correctly sorted");
    }
}

/**
 * Checks whether the documents containing the term {@code tokens:a} in the first segment
 * of the index appear in non-decreasing order of their stored {@code fileId} field.
 *
 * <p>The file ID is read from the stored field rather than derived from the doc ID:
 * doc IDs are renumbered when an index is rewritten in sorted order, so a value computed
 * from the doc ID of a rewritten index does not describe the document stored there.
 *
 * @param dir directory containing the index to inspect
 * @return {@code true} if no inspected document has a smaller {@code fileId} than the
 *     one before it (also when no document contains the probe term)
 * @throws IOException if the index cannot be opened or read
 * @throws IllegalStateException if an inspected document has no numeric stored
 *     {@code fileId} field
 */
private static boolean areDocumentsInOrder(File dir) throws IOException {
    // Close the directory and the top-level reader; leaf readers are owned by the
    // DirectoryReader and must not be closed on their own.
    try (FSDirectory directory = FSDirectory.open(dir.toPath());
            DirectoryReader indexReader = DirectoryReader.open(directory)) {
        LeafReader leafReader = indexReader.leaves().get(0).reader();
        PostingsEnum postings = leafReader.postings(new Term("tokens", "a"), POSITIONS);
        if (postings == null) {
            // The probe term is absent from this segment: nothing to compare.
            return true;
        }
        StoredFields storedFields = leafReader.storedFields();
        boolean hasPrevious = false;
        int previousFileId = 0;
        for (int docId = postings.nextDoc(); docId != NO_MORE_DOCS; docId = postings.nextDoc()) {
            IndexableField fileIdField = storedFields.document(docId).getField("fileId");
            if (fileIdField == null || fileIdField.numericValue() == null) {
                throw new IllegalStateException(
                        "Document " + docId + " has no numeric stored field 'fileId'");
            }
            int fileId = fileIdField.numericValue().intValue();
            if (hasPrevious && previousFileId > fileId) {
                return false;
            }
            previousFileId = fileId;
            hasPrevious = true;
        }
        return true;
    }
}

/**
 * Returns the file ID for the given document ID.
 *
 * <p>NOTE(review): the body is elided ("...") in the original post, so the mapping itself
 * is not visible here. Presumably it is only valid for doc IDs of the original, unsorted
 * index — confirm before applying it to doc IDs of a rewritten index.
 *
 * @param docId document ID to map
 * @return the file ID derived from {@code docId}
 */
private static int getFileId(int docId) {
...
}
}

import org.apache.lucene.index.IndexSorter;
import org.apache.lucene.search.SortField;

import java.util.function.Function;

/**
 * A {@link SortField} of type {@code DOC} whose index sorter is a {@link DocIdSorter}
 * driven by a caller-supplied function of the document ID.
 */
public class DocSortField extends SortField {
    /** Function handed to the {@link DocIdSorter}; maps a document ID to its sort value. */
    private final Function<Integer, Integer> docIdToValue;

    /**
     * Creates the sort field.
     *
     * @param field field name passed to {@link SortField}
     * @param docIdToValue function mapping a document ID to the value to sort by
     */
    public DocSortField(String field, Function<Integer, Integer> docIdToValue) {
        super(field, Type.DOC);
        this.docIdToValue = docIdToValue;
    }

    /**
     * Returns a new {@link DocIdSorter} built from {@code Provider.NAME} and the function
     * given at construction, in place of the sorter the superclass would supply.
     */
    @Override
    public IndexSorter getIndexSorter() {
        return new DocIdSorter(Provider.NAME, docIdToValue);
    }
}

import org.apache.lucene.index.IndexSorter;
import org.apache.lucene.index.LeafReader;

import java.util.List;
import java.util.function.Function;

public class DocIdSorter implements IndexSorter {
private final String providerName;
private final Function  docIdToValue;

public DocIdSorter(String providerName, Function docIdToValue) {
this.providerName = providerName;
this.docIdToValue = docIdToValue;
}

@Override
public ComparableProvider[] getComparableProviders(List

Подробнее здесь: [url]https://stackoverflow.com/questions/79870757/sort-a-lucene-index-by-a-storedfield-or-a-function-of-docid[/url]