// CustomAnalyzer.java
package sk.iway.iwcm.system.fulltext.lucene;
import java.io.IOException;
import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.ASCIIFoldingFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.PorterStemFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cz.CzechStemFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
import sk.iway.iwcm.system.fulltext.FulltextSearch;
/**
 * CustomAnalyzer
 * Language-aware analyzer: the filter chain that is built depends on the
 * language code passed to the constructor.
 * <p>
 * Every field is tokenized with {@link StandardTokenizer}, passed through
 * {@link StandardFilter} and lower-cased. After that:
 * <ul>
 * <li>"en" - stop word removal, then {@link PorterStemFilter}</li>
 * <li>"cz" - {@link CzechStemFilter}, {@link ASCIIFoldingFilter}, stop word removal</li>
 * <li>any other language - {@link ASCIIFoldingFilter}, stop word removal,
 *     {@link LemmatisatingFilter}</li>
 * </ul>
 * NOTE(review): earlier documentation stated that lemmatising is applied ONLY
 * on fields DATA and TITLE. This class never inspects the field name, so such
 * a restriction, if still intended, has to be enforced elsewhere - confirm.
 *
 *@Title webjet7
 *@Company Interway s.r.o. (www.interway.sk)
 *@Copyright Interway s.r.o. (c) 2001-2011
 *@author $Author: jeeff thaber $
 *@version $Revision: 1.3 $
 *@created Date: 15.4.2011 11:27:45
 *@modified $Date: 2004/08/16 06:26:11 $
 */
public class CustomAnalyzer extends StopwordAnalyzerBase
{
    /** Default maximum allowed token length */
    public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;

    /** Language code that selects the Porter (English) stemming chain. */
    private static final String LANGUAGE_ENGLISH = "en";

    /** Language code that selects the Czech stemming chain. */
    private static final String LANGUAGE_CZECH = "cz";

    /** Maximum token length handed to the tokenizer; see {@link #setMaxTokenLength(int)}. */
    private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

    /** Language code this analyzer was created for; never null. */
    private final String language;

    /**
     * Builds an analyzer with the given stop words and language.
     *
     * @param matchVersion Lucene version to match
     * @param stopWords stop words handed to the base class
     * @param language language code that selects the filter chain
     * @throws NullPointerException if language is null
     */
    private CustomAnalyzer(Version matchVersion, Set<?> stopWords, String language)
    {
        super(matchVersion, stopWords);
        if (language == null)
        {
            // Fail here with a clear message instead of with an unexplained
            // NullPointerException the first time a token stream is requested.
            throw new NullPointerException("language must not be null");
        }
        this.language = language;
    }

    /**
     * Builds an analyzer for the given language, using the stop words returned by
     * {@link FulltextSearch#stopwords(String)} for that language.
     *
     * @param matchVersion Lucene version to match
     * @param language language code ("en", "cz", or any other code handled by
     *        {@link LemmatisatingFilter})
     * @throws NullPointerException if language is null
     */
    public CustomAnalyzer(Version matchVersion, String language)
    {
        this(matchVersion, FulltextSearch.stopwords(language), language);
    }

    /**
     * Set maximum allowed token length. If a token is seen
     * that exceeds this length then it is discarded. This
     * setting only takes effect the next time tokenStream or
     * reusableTokenStream is called.
     *
     * @param length new maximum token length
     */
    public void setMaxTokenLength(int length)
    {
        maxTokenLength = length;
    }

    /**
     * @return the currently configured maximum token length
     * @see #setMaxTokenLength
     */
    public int getMaxTokenLength()
    {
        return maxTokenLength;
    }

    /**
     * Creates the tokenizer and the language specific filter chain.
     *
     * @param fieldName name of the analyzed field; not used, every field gets the same chain
     * @param reader source of the text to tokenize
     * @return tokenizer and filter chain wrapped as reusable components
     */
    @Override
    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader)
    {
        final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
        src.setMaxTokenLength(maxTokenLength);

        TokenStream tok = new StandardFilter(matchVersion, src);
        tok = new LowerCaseFilter(matchVersion, tok);

        /* use algorithmic stemmers for English and Czech language */
        if (LANGUAGE_ENGLISH.equals(language))
        {
            // Stop words have to be removed BEFORE stemming: the Porter stemmer
            // rewrites many of them ("this" -> "thi", "was" -> "wa"), so a stop
            // filter placed after it no longer recognises them, and it wrongly
            // drops ordinary words whose stem happens to equal a stop word
            // ("being" -> "be").
            // NOTE: this changes the emitted tokens; indexes written with the
            // previous order should be rebuilt.
            tok = new StopFilter(matchVersion, tok, stopwords);
            tok = new PorterStemFilter(tok);
        }
        else if (LANGUAGE_CZECH.equals(language))
        {
            // The stemmer runs first, on text that still carries diacritics;
            // the result is then folded to ASCII and stop words are matched
            // against that folded form.
            // NOTE(review): stop words altered by the stemmer will not be
            // matched here - confirm the stop word list accounts for that.
            tok = new CzechStemFilter(tok);
            tok = new ASCIIFoldingFilter(tok);
            tok = new StopFilter(matchVersion, tok, stopwords);
        }
        else
        {
            tok = new ASCIIFoldingFilter(tok);
            tok = new StopFilter(matchVersion, tok, stopwords);
            // NOTE(review): the version is hard-coded to LUCENE_31 instead of
            // matchVersion - confirm this is intentional.
            tok = new LemmatisatingFilter(Version.LUCENE_31, tok, language);
        }

        return new TokenStreamComponents(src, tok)
        {
            @Override
            protected boolean reset(final Reader reader) throws IOException
            {
                // Re-apply the limit on every reuse so that a later call to
                // setMaxTokenLength is picked up by already created components.
                src.setMaxTokenLength(CustomAnalyzer.this.maxTokenLength);
                return super.reset(reader);
            }
        };
    }
}