Sunday, March 30, 2008

Groovy Plus Lucene

The Lucene index and search library from Apache is a beautiful thing. Lucene allows arbitrary text (or structured data) to be indexed, after which searches can be performed in milliseconds. Here are two scripts: one to create the index, the other to search it. I thought it might be interesting to see the results of searching the Java API docs, so the path to the JavaDocs is the value assigned to datasrc. The names of the scripts are not significant.

indexdocs.groovy

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.document.Document;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.demo.FileDocument;

// Builds a Lucene index in ./jdk_api_index from every readable file found
// under the datasrc directory tree.

// Root of the directory tree to index (here, the JDK API JavaDocs).
datasrc = "/dev/java/jdk1.6.0_02/docs/api";

// Directory that will hold the index files.
indexDir = new File("jdk_api_index");

// The final 'true' argument asks Lucene to create a fresh index, replacing
// any index already present in indexDir.
IndexWriter writer = new IndexWriter(indexDir, new StandardAnalyzer(), true);
writer.setUseCompoundFile(false);

try {
  new File(datasrc).eachFileRecurse {
    // Skip directories and unreadable entries; canRead() is false for
    // non-existent files, so no separate exists() check is needed.
    if (!it.isDirectory() && it.canRead()) {
       writer.addDocument(FileDocument.Document(it));
    }
  }
} finally {
  // Closing the writer flushes buffered documents to disk and releases the
  // index write lock; without it the index may be left incomplete.
  writer.close();
}

searchdocs.groovy

import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.*;
import org.apache.lucene.index.*;

// Searches the index in ./jdk_api_index for the term "writable" and prints
// the stored "path" value of every matching document.

String index = "jdk_api_index";
// Field to search. NOTE(review): presumably "contents" and "path" are the
// field names populated by FileDocument in the indexing script -- confirm.
String field = "contents";

reader = IndexReader.open(index);
searcher = null;
try {
   searcher = new IndexSearcher(reader);
   analyzer = new StandardAnalyzer();
   parser = new QueryParser(field, analyzer);
   query = parser.parse("writable");

   hits = searcher.search(query);

   for (hit in hits) {
      println( hit.getDocument().get("path") )
   }
} finally {
   // A searcher constructed from an existing reader does not close that
   // reader, so both are closed explicitly to release the index files.
   searcher?.close();
   reader.close();
}