SearchSnippet.java

package sk.iway.iwcm.doc;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

import javax.servlet.http.HttpServletRequest;

import sk.iway.iwcm.Constants;
import sk.iway.iwcm.DB;
import sk.iway.iwcm.Tools;
import sk.iway.iwcm.common.EditorToolsForCore;
import sk.iway.iwcm.common.SearchTools;

/**
 * Builds the text excerpt ("snippet") shown for one search result.
 *
 * <p>The constructor does all the work: it cuts short excerpts out of the document body
 * around the searched words, joins them with {@code "..."} separators and wraps every
 * word that contains a search term in {@code <strong>} tags. The finished text is
 * available through {@link #getSnippet()}.
 */
public class SearchSnippet {

    /** Context length used when a configured length reads as -1 (treated as "not configured"). */
    private static final int DEFAULT_CONTEXT_LENGTH = 100;

    /** Search tokens shorter than this are not looked up in the document body. */
    private static final int MIN_TOKEN_LENGTH = 3;

    /** Text placed around and between the individual excerpts. */
    private static final String SEPARATOR = "...";

    /**
     * Matches a complete {@code !INCLUDE(...)!} command. Compiled once instead of on every
     * construction. Without DOTALL the dot does not cross line terminators, so a command that
     * spans several lines is not matched here and is handled by the fallback in
     * {@link #replaceINCLUDE(String)}.
     */
    private static final Pattern INCLUDE_PATTERN = Pattern.compile("!INCLUDE.*?\\)!", Pattern.CASE_INSENSITIVE);

    /** Number of characters taken before the position of a found token. */
    private final int prepend;
    /** Number of characters taken from the position of a found token onwards. */
    private final int append;

    private final SearchDetails doc;
    private final String textToFind;
    private final HttpServletRequest request;

    /** Normalised search text; in quick-snippet mode the original words are appended to it. */
    private String textToFindAscLC;
    /** Text that is searched when a token is not found directly in the display text. */
    private String dataAsc;

    private final List<String> tokens = new ArrayList<>();
    private final List<String> tokensFound = new ArrayList<>();
    private final List<String> snippets = new ArrayList<>();
    private String snippet;

    /**
     * Creates the snippet for the given document and search text.
     *
     * @param doc        document whose original data and title are used as the text source
     * @param textToFind searched text; {@code null} is treated as an empty search
     * @param request    current request, passed on to {@code SearchAction.shouldDoQuickSnippet}
     */
    public SearchSnippet(SearchDetails doc, String textToFind, HttpServletRequest request) {
        this.doc = doc;
        this.textToFind = textToFind == null ? "" : textToFind;
        this.request = request;

        this.prepend = configuredLength("searchSnippetPrepend");
        this.append = configuredLength("searchSnippetAppend");

        renderSnippet();
        renderHighlights();
    }

    /**
     * Returns the rendered snippet.
     *
     * @return excerpt text with {@code <strong>} tags around words containing a search term
     */
    public String getSnippet()
    {
        return snippet;
    }

    /**
     * Reads a context length from the configuration.
     *
     * @param constantName name of the configuration constant
     * @return the configured value, or {@link #DEFAULT_CONTEXT_LENGTH} when it reads as -1
     */
    private static int configuredLength(String constantName) {
        int value = Constants.getInt(constantName);
        return value == -1 ? DEFAULT_CONTEXT_LENGTH : value;
    }

    /**
     * Collects excerpts of the document body around the searched tokens into {@code snippets}.
     * When no token is found, the beginning of the body is used; when that is blank too, the
     * document title is used.
     */
    private void renderSnippet() {
        textToFindAscLC = toAscLc(textToFind);

        String original = doc.getDataOriginal();
        if (original == null) {
            original = "";
        }

        String dataOriginalNoHtml;
        if (SearchAction.shouldDoQuickSnippet(doc, request))
        {
            // quick mode: keep the whole HTML code, removing it would be expensive for large documents
            dataOriginalNoHtml = original;
            dataAsc = dataOriginalNoHtml;
            // the data is not normalised here, so also search for the words as the user typed them
            textToFindAscLC += " " + textToFind;
        }
        else
        {
            // remove HTML code; the helper presumably preserves character offsets, as its name suggests
            dataOriginalNoHtml = EditorToolsForCore.removeHtmlTagsKeepLength(original);
            dataAsc = toAscLc(dataOriginalNoHtml);
        }

        // For scanning efficiency only short parts containing the searched words are processed.
        // Tokens are located in the normalised text, the excerpt itself is cut from the text
        // that still contains diacritics.
        tokens.clear();
        tokens.addAll(Arrays.asList(Tools.getTokens(textToFindAscLC, " ")));
        tokensFound.clear();

        Set<String> alreadySearched = new HashSet<>();
        int length = dataOriginalNoHtml.length();

        for (String token : tokens) {
            // skip short tokens, tokens already covered by an earlier excerpt and repeated tokens
            if (token.length() < MIN_TOKEN_LENGTH || tokensFound.contains(token) || !alreadySearched.add(token)) {
                continue;
            }

            int index = dataOriginalNoHtml.indexOf(token);
            if (index == -1) {
                index = dataAsc.indexOf(token);
            }
            if (index == -1) {
                continue;
            }

            // Clamp both bounds into [0, length] and keep start <= end: the index may come from
            // dataAsc, and lower-casing can change the string length for some characters.
            int start = Math.max(0, Math.min(index - prepend, length));
            int end = Math.max(start, Math.min(index + append, length));

            String part = clear(replaceINCLUDE(dataOriginalNoHtml.substring(start, end)));
            snippets.add(part);

            // Accumulate (do not overwrite) the tokens covered so far, so that a token already
            // visible in an earlier excerpt does not produce another excerpt.
            for (String found : containsAny(part, tokens)) {
                if (!tokensFound.contains(found)) {
                    tokensFound.add(found);
                }
            }
        }

        if (snippets.isEmpty())
        {
            // The keywords may be only in the title while the body holds just an !INCLUDE command,
            // which must not show up in the search results.
            String data = clear(replaceINCLUDE(substring(dataOriginalNoHtml, prepend + append)));
            // a blank body excerpt is skipped so that the title fallback below can take over
            if (data != null && !data.trim().isEmpty()) {
                snippets.add(data);
            }
        }

        if (snippets.isEmpty())
        {
            String title = doc.getTitle();
            snippets.add(title == null ? "" : clear(substring(title, prepend + append)));
        }
    }

    /**
     * Removes {@code !INCLUDE(...)!} commands from the text.
     *
     * @param dataParam text that may contain include commands
     * @return the text with every complete command replaced by a space; when an
     *         {@code !INCLUDE} marker still remains (the command was cut off or not matched),
     *         everything from that marker on is dropped
     */
    private static String replaceINCLUDE(String dataParam)
    {
        String data = INCLUDE_PATTERN.matcher(dataParam).replaceAll(" ");
        int includeIndex = data.indexOf("!INCLUDE");
        if (includeIndex != -1)
        {
            // the regexp did not match this one, discard everything after !INCLUDE
            data = data.substring(0, includeIndex);
        }
        return data;
    }

    /**
     * Joins the collected excerpts and wraps every word that contains one of the search
     * terms in {@code <strong>} tags. The result is stored in {@code snippet}.
     */
    private void renderHighlights() {
        String joined = SEPARATOR + Tools.join(snippets, SEPARATOR + " " + SEPARATOR) + SEPARATOR;

        // the search terms do not change per word, so split them only once
        List<String> terms = new ArrayList<>();
        StringTokenizer termTokenizer = new StringTokenizer(textToFindAscLC);
        while (termTokenizer.hasMoreTokens())
        {
            terms.add(termTokenizer.nextToken());
        }

        StringBuilder snippetData = new StringBuilder();

        // highlight the words
        StringTokenizer wordTokenizer = new StringTokenizer(joined);
        while (wordTokenizer.hasMoreTokens())
        {
            String word = wordTokenizer.nextToken();
            boolean highlight = containsAnyTerm(toAscLc(word), terms);

            if (highlight) {
                snippetData.append("<strong>");
            }
            snippetData.append(word);
            if (highlight) {
                snippetData.append("</strong>");
            }
            snippetData.append(' ');
        }

        snippet = snippetData.toString().trim();
    }

    /**
     * Tells whether the text contains at least one of the terms as a substring.
     *
     * @param text  text to inspect
     * @param terms terms to look for
     * @return {@code true} when any term occurs in the text
     */
    private static boolean containsAnyTerm(String text, List<String> terms) {
        for (String term : terms) {
            if (text.indexOf(term) != -1) {
                return true;
            }
        }
        return false;
    }

    /**
     * Normalises text for matching: passes it through {@code DB.internationalToEnglish}
     * and lower-cases the result.
     */
    private static String toAscLc(String text) {
        return DB.internationalToEnglish(text).toLowerCase();
    }

    /**
     * Returns the candidates that occur in the normalised form of the text.
     *
     * @param text       text to inspect (normalised with {@link #toAscLc(String)} before matching)
     * @param candidates tokens to look for
     * @return new list of the candidates contained in the text, in the order given
     */
    private static List<String> containsAny(String text, List<String> candidates) {
        String textAscLc = toAscLc(text);
        List<String> result = new ArrayList<>();
        for (String candidate : candidates) {
            if (textAscLc.contains(candidate)) {
                result.add(candidate);
            }
        }

        return result;
    }

    /**
     * Returns the beginning of the text, at most {@code end} characters long.
     *
     * @param text source text
     * @param end  requested length; values outside {@code [0, text.length()]} are clamped
     * @return prefix of the text
     */
    private static String substring(String text, int end) {
        int safeEnd = Math.max(0, Math.min(end, text.length()));
        return text.substring(0, safeEnd);
    }

    /**
     * Prepares an excerpt for display: drops a leading fragment of a cut-off tag, converts
     * the rest with {@code SearchTools.htmlToPlain} and replaces line breaks and tabs with spaces.
     *
     * @param text raw excerpt
     * @return cleaned excerpt
     */
    private static String clear(String text) {
        String result = text;

        int openTagIndex = text.indexOf("<");
        int closeTagIndex = text.indexOf(">");

        // a '>' that comes before any '<' means the excerpt starts in the middle of a tag
        if (closeTagIndex > -1 && (openTagIndex == -1 || openTagIndex > closeTagIndex)) {
            result = result.substring(closeTagIndex + 1);
        }

        result = SearchTools.htmlToPlain(result);
        result = Tools.replace(result, "\n", " ");
        result = Tools.replace(result, "\r", " ");
        result = Tools.replace(result, "\t", " ");

        return result;
    }
}