添加时间:[2010-7-22 18:03:36]
|
Java Lucene排重实现group by
作者无: 加入时间:2010-7-22 18:03:36 点击次数:488 次

package com.loongtao.lucene.test;

import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.search.DuplicateFilter;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

/**
 * Small demo that emulates a SQL-style "group by" on search results using
 * Lucene's {@link DuplicateFilter}.
 *
 * <p>Four documents are indexed into an in-memory directory; two of them
 * share the same value in the {@code duplicate} field. A query that matches
 * all four documents is then run through a {@code DuplicateFilter} built on
 * that field, and for every returned hit the program prints the hit's id,
 * its duplicate key, and the number of indexed documents carrying that key.
 */
public class DuplicateFilterTest {

    /** Name of the field whose value is used as the de-duplication key. */
    private static final String DUPLICATE_FIELD = "duplicate";

    /**
     * Builds the sample index and prints the de-duplicated hits.
     *
     * <p>Any {@link IOException} raised by Lucene (including its
     * {@code CorruptIndexException} and {@code LockObtainFailedException}
     * subclasses) is reported via its stack trace, as before.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Directory dir = new RAMDirectory();
        try {
            buildIndex(dir);
            printDistinctHits(dir);
        } catch (IOException e) {
            // CorruptIndexException and LockObtainFailedException both extend
            // IOException, so a single handler covers all three original cases.
            e.printStackTrace();
        }
    }

    /**
     * Creates one sample document. Every field is stored and indexed without
     * analysis; the {@code string} and {@code time} fields carry the same
     * value in every sample document.
     *
     * @param id        value for the {@code id} field
     * @param duplicate value for the de-duplication key field
     * @return the populated document
     */
    private static Document newDocument(String id, String duplicate) {
        Document doc = new Document();
        doc.add(new Field("id", id, Store.YES, Index.NOT_ANALYZED));
        doc.add(new Field("string", "haha", Store.YES, Index.NOT_ANALYZED));
        doc.add(new Field("time", "20100801", Store.YES, Index.NOT_ANALYZED));
        doc.add(new Field(DUPLICATE_FIELD, duplicate, Store.YES, Index.NOT_ANALYZED));
        return doc;
    }

    /**
     * Writes the four sample documents into {@code dir}, replacing any
     * existing index (the writer is opened with {@code create = true}).
     *
     * @param dir directory that receives the index
     * @throws IOException if the index cannot be created or written
     */
    private static void buildIndex(Directory dir) throws IOException {
        IndexWriter indexWriter = new IndexWriter(dir,
                new StandardAnalyzer(Version.LUCENE_30), true,
                MaxFieldLength.LIMITED);
        try {
            // "binbin" and "yaoyao" deliberately share the same duplicate key.
            indexWriter.addDocument(newDocument("binbin", "123456"));
            indexWriter.addDocument(newDocument("yaoyao", "123456"));
            indexWriter.addDocument(newDocument("zhangjian", "123455"));
            indexWriter.addDocument(newDocument("liweicheng", "123451"));
        } finally {
            // Close even when an add fails so the writer is always released.
            indexWriter.close();
        }
    }

    /**
     * Searches for every document whose {@code string} field is "haha",
     * applying a {@link DuplicateFilter} on the duplicate key field (which,
     * per Lucene's documentation, keeps a single document per distinct key),
     * and prints one line per returned hit.
     *
     * @param dir directory holding the index built by {@link #buildIndex}
     * @throws IOException if the index cannot be opened or read
     */
    private static void printDistinctHits(Directory dir) throws IOException {
        Query query = new TermQuery(new Term("string", "haha"));
        Filter filter = new DuplicateFilter(DUPLICATE_FIELD);
        IndexSearcher indexSearcher = new IndexSearcher(dir);
        try {
            TopDocs top = indexSearcher.search(query, filter, 200);
            for (ScoreDoc scoreDoc : top.scoreDocs) {
                Document rdoc = indexSearcher.doc(scoreDoc.doc);
                String duplicateId = rdoc.get(DUPLICATE_FIELD);
                System.out.print("id:" + rdoc.get("id") + " 排重ID:" + duplicateId);

                // Unfiltered count of documents sharing this hit's key.
                Query queryDuplicate = new TermQuery(new Term(DUPLICATE_FIELD, duplicateId));
                System.out.println("转载:" + indexSearcher.search(queryDuplicate, 100).totalHits);
            }
        } finally {
            // The original never closed the searcher; release it here.
            indexSearcher.close();
        }
    }
}

上一篇: |
|||||