`

lucene跨度域查询(一)

 
阅读更多

一、跨度域查询

       基类是:SpanQuery

       子类:SpanTermQuery;     //测试用的是这个子类,当然还有其他子类

 

二、测试

/**
 * Demo of Lucene 4.2 span queries: indexes two short documents into a
 * RAMDirectory, runs a {@link SpanTermQuery} for "you", and prints each
 * matching document with the matched span bracketed as {@code <word>}.
 *
 * NOTE(review): the span-to-word mapping below counts tokens from 0 and
 * only works with analyzers (like WhitespaceAnalyzer) that never drop
 * tokens; analyzers with stop words would need position increments.
 */
public class SpanTest {

	Directory dir;
	IndexWriter writer;
	IndexReader reader;
	IndexSearcher search;
	String[] ceshi; // test documents, one per string

	/** Initialize: keep the index in an in-memory directory for testing. */
	public void init() throws IOException {
		dir = new RAMDirectory();
		writer = writer(dir);
		ceshi = new String[] { "i like you", "are you ok" };
	}

	/** Build an IndexWriter over {@code dir} using the whitespace analyzer. */
	public IndexWriter writer(Directory dir) throws IOException {
		Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_42);
		IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_42, analyzer);
		return new IndexWriter(dir, config);
	}

	/** Add one document per test string, then close the writer to commit. */
	public void createWrite() throws IOException {
		try {
			for (String text : ceshi) {
				Document doc = new Document();
				doc.add(new TextField("contents", text, Store.YES));
				writer.addDocument(doc);
			}
		} finally {
			// FIX: close in finally so the writer is released even if
			// addDocument throws (original leaked it on failure).
			writer.close();
		}
	}

	/**
	 * Run the span query and print each hit with the span highlighted.
	 *
	 * @throws IOException on index access failure
	 */
	public void test() throws IOException {
		reader = DirectoryReader.open(dir);
		search = new IndexSearcher(reader);

		SpanTermQuery query = new SpanTermQuery(new Term("contents", "you"));
		Map<Term, TermContext> termContexts = new HashMap<Term, TermContext>();
		TermContext termContext =
				TermContext.build(search.getTopReaderContext(), query.getTerm(), false);
		termContexts.put(query.getTerm(), termContext);

		// Accept-all live-docs filter. FIX: size it to maxDoc() — the
		// original used MatchAllBits(0), which only worked by accident
		// because MatchAllBits.get() ignores its length.
		Bits bits = new Bits.MatchAllBits(reader.maxDoc());

		// FIX: analyzer is loop-invariant — create it once, not per hit.
		Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_42);

		// Single-segment index in this demo, so leaves().get(0) suffices.
		Spans spans = query.getSpans(reader.getContext().leaves().get(0), bits, termContexts);
		int num = 0;
		System.out.println(query.getTerm());
		while (spans.next()) {
			num++;
			Document d = reader.document(spans.doc());

			TokenStream ts = analyzer.tokenStream("contents", new StringReader(d.get("contents")));
			try {
				CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
				// reset() is mandatory before incrementToken(); without it
				// Lucene throws ArrayIndexOutOfBoundsException.
				ts.reset();

				// FIX: StringBuilder — no synchronization needed here.
				StringBuilder buffer = new StringBuilder();
				int k = 0; // token position, counted from 0
				while (ts.incrementToken()) {
					if (k == spans.start()) {
						buffer.append("<");
					}
					buffer.append(termAttribute.toString());
					if (k + 1 == spans.end()) {
						buffer.append(">");
					}
					buffer.append("  ");
					k++;
				}
				ts.end();
				System.out.println(spans.start() + "        " + spans.end());
				System.out.println(buffer);
			} finally {
				// FIX: the original never closed the TokenStream.
				ts.close();
			}
		}
		if (num == 0) {
			System.out.println("no spans");
		}
		// FIX: the original leaked the reader.
		reader.close();
	}

	public static void main(String[] args) throws IOException {
		SpanTest s = new SpanTest();
		s.init();
		s.createWrite();
		s.test();
	}
}

 运行结果:

 最后:

int k=0;
  	 while(ts.incrementToken()){
       	    
       		if(k==spans.start()){			
       		buffer.append("<");
       		}
       		buffer.append(termAttribute.toString());
       		if(k+1==spans.end()){
       			
       			buffer.append(">");
       		}
   
       		buffer.append("  ");
       		k++;
       	}
但是换用其他的分词器后,上面的匹配写法就会出错:其他分词器在分词时可能会按需求去掉一些词
(例如 StopWords 停用词),而上面的匹配是按从 0 开始的 token 计数来定位的,去掉词之后这个
计数就不再对应真正的单词位置(正确做法应当使用 PositionIncrementAttribute 来累计位置)。

 

 

 

 

 

 

 

 

 

 

 

 

  • 大小: 8.4 KB
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics