Lemmas.java

package sk.iway.iwcm.system.fulltext.lucene;

import java.nio.ByteBuffer;
import java.util.Arrays;

import io.github.duckasteroid.cdb.Cdb;

import org.apache.lucene.analysis.cz.CzechStemmer;
import org.apache.lucene.analysis.de.GermanMinimalStemmer;
import org.apache.lucene.analysis.en.EnglishMinimalStemmer;

import sk.iway.iwcm.Cache;
import sk.iway.iwcm.Constants;
import sk.iway.iwcm.Logger;
import sk.iway.iwcm.Tools;
import sk.iway.iwcm.system.fulltext.cdb.CdbCacheListener;
import sk.iway.iwcm.system.fulltext.cdb.CdbFactory;
import sk.iway.iwcm.system.fulltext.cdb.CdbUtils;

/**
 * Lemmas.java
 *
 * Static helper that reduces word forms to a base form (lemma or stem) for the
 * Lucene full-text index. Depending on the language it uses the Slovak
 * algorithmic stemmer, one of the Lucene stemmers (cz, en, de) or a CDB
 * dictionary lookup with the Slovak stemmer as fallback.
 *
 *@Title webjet7
 *@Company Interway s.r.o. (www.interway.sk)
 *@Copyright Interway s.r.o. (c) 2001-2011
 *@author $Author: jeeff thaber $
 *@version $Revision: 1.3 $
 *@created Date: 4.5.2011 15:40:45
 *@modified $Date: 2004/08/16 06:26:11 $
 */
public class Lemmas
{
	protected Lemmas() {
		//utility class
	}

	/**
	 * Builds the lemmas for a whole sentence. The text is split on space, tab
	 * and newline, every word is passed to
	 * {@link #get(String, char[], int, int)} and the results are joined with a
	 * single space.
	 * <p>
	 * Results that are {@code null} or at most one character long are left out,
	 * so e.g. a one-letter preposition does not appear in the output
	 * (illustrative: "Žiadosti o výplatu" -> "ziadost vyplat"; the exact base
	 * forms depend on the dictionary / stemmer in use).
	 *
	 * @param language language code handed over to the per-word lookup
	 * @param text sentence to process
	 * @return the kept lemmas separated by single spaces, without leading or
	 *         trailing separator; empty string when nothing was kept
	 */
	public static String get(String language, String text) {
		String[] words = Tools.getTokens(text, " \t\n");
		StringBuilder response = new StringBuilder();
		for (String word : words) {
			char[] lemma = get(language, word.toCharArray(), 0, word.length());
			//skip missing and single-character results
			if (lemma == null || lemma.length <= 1) continue;
			//separator is added only together with a lemma, so skipped words
			//never leave doubled or trailing spaces behind
			if (response.length() > 0) response.append(' ');
			response.append(lemma);
		}

		return response.toString();
	}

	/**
	 * Get a lemma from the supplied form.
	 * <ul>
	 * <li>"sk" with the constant {@code luceneIndexingSkAlgorithmicStemming}
	 * enabled: {@code SlovakStemmer} is used directly.</li>
	 * <li>"cz", "en", "de": there is no lemma dictionary, only a Lucene stemmer
	 * (better than nothing). The stemmer works on a private copy of the
	 * requested slice, so {@code form} is never modified.</li>
	 * <li>anything else: the form is looked up in the CDB lemma dictionary of
	 * the language; when it is not found, or the lookup throws, the result of
	 * {@code SlovakStemmer} is returned.</li>
	 * </ul>
	 *
	 * @param language language code
	 * @param form buffer holding the word form
	 * @param offset index of the first character of the word in {@code form}
	 * @param length number of characters of the word
	 * @return newly allocated array with the lemma / stem of the word
	 */
	public static char[] get(String language, char[] form,int offset,int length)
	{
		if (Constants.getBoolean("luceneIndexingSkAlgorithmicStemming") && "sk".equals(language)){
			return SlovakStemmer.stem(new String(form,offset,length)).toCharArray();
		}

		//Lucene stemmers rewrite the buffer they are given starting at index 0 and
		//return the new length - hand them a copy of exactly the requested slice
		if ("cz".equals(language)) {
			char[] word = Arrays.copyOfRange(form, offset, offset + length);
			return trimToStem(word, new CzechStemmer().stem(word, word.length));
		}
		if ("en".equals(language)) {
			char[] word = Arrays.copyOfRange(form, offset, offset + length);
			return trimToStem(word, new EnglishMinimalStemmer().stem(word, word.length));
		}
		if ("de".equals(language)) {
			char[] word = Arrays.copyOfRange(form, offset, offset + length);
			return trimToStem(word, new GermanMinimalStemmer().stem(word, word.length));
		}

		try {
			Cache c = Cache.getInstance();
			CdbCacheListener.init();
			//one Cdb instance per language and thread, kept in cache for 5*60 seconds
			String cacheKey = "Lucene.Lemmas."+language+"."+Thread.currentThread().getId();
			Cdb cdb = (Cdb)c.getObject(cacheKey);
			if (cdb == null) {
				cdb = (Cdb)new CdbFactory(language,CdbFactory.Type.LEMMAS).makeObject();
				c.setObjectSeconds(cacheKey, cdb, 5*60, false);
			}

			ByteBuffer bytes = cdb.find( ByteBuffer.wrap(CdbUtils.encode(form, offset, length)) );

			//NOTE(review): array() returns the whole backing array - assumes the buffer
			//returned by Cdb.find is not a slice of a bigger array; confirm
			if (bytes != null && bytes.hasArray())
			{
				return CdbUtils.decode(bytes.array());
			}
		} catch (Exception e) {
			//best effort: log the failure and continue with the stemmer fallback
			Logger.error(Lemmas.class, e);
		}
		return SlovakStemmer.stem(new String(form,offset,length)).toCharArray();
	}

	/**
	 * Cuts an already stemmed word to the length reported by the stemmer.
	 *
	 * @param word buffer the stemmer worked on
	 * @param stemLength length returned by the stemmer
	 * @return copy of the first {@code stemLength} characters, or {@code word}
	 *         itself when the reported length is smaller than 1
	 */
	private static char[] trimToStem(char[] word, int stemLength) {
		if (stemLength < 1) return word;
		return Arrays.copyOf(word, stemLength);
	}
}