// SearchTools.java
package sk.iway.iwcm.common;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import sk.iway.iwcm.Logger;
import sk.iway.iwcm.Tools;
public class SearchTools {
// ze ak je (String)output prazdny tak sa tam da tolko charov ...
public final static String[] checkInputParams = {"html_head", "html_data", "field_a", "field_b", "field_c", "field_d", "field_e", "field_f", "field_g", "field_h", "field_i", "field_j", "field_k", "field_l", "field_o", "field_n", "publish_start", "publish_end", "title", "publish_start_lt","publish_start_gt", "publish_end_gt", "publish_end_lt","temp_id", "perex_place", "keyword"};
/**
* Odstrani z HTML kodu riadiace bloky typu !INCLUDE(...)!, !PARAM(...)!
* @param html
* @return
*/
public static String removeCommands(String html)
{
if (html != null)
{
if (html.contains(")!"))
{
Pattern replace = Pattern.compile(
"(!INCLUDE.*?\\)!)|(!REMAP_PAGE.*?\\)!)|(!PARAMETER.*?\\)!)|(!REQUEST.*?\\)!)|(!LOGGED_USER.*?\\)!)", Pattern.CASE_INSENSITIVE);
html = replace.matcher(html).replaceAll("");
}
return html;
}
return null;
}
/**
* Vrati true ak posledny riadok html kodu obsahuje zadany text
* @param html
* @param text
* @return
*/
private static boolean lastLineContains(String html, String text)
{
try
{
//najdi posledny riadok
int i = html.lastIndexOf('\n');
if (i>0 && html.substring(i).indexOf(text)!=-1) return true;
} catch (Exception ex) {}
return false;
}
public static String htmlToPlain(String html)
{
if (html == null) return "";
html = removeCommands(html);
html = Tools.replace(html, " ", " ");
int failsafe = 0;
if (html != null)
{
//toto nefungovalo na nekorektny HTML kod (napr. useknuty pre fastSnippet)
//return new Source(html).getTextExtractor().toString();
StringTokenizer sTok = new StringTokenizer(html, "<>", true);
String pom = "";
StringBuilder plain = new StringBuilder();
while (sTok.hasMoreElements() && failsafe++ < 10000)
{
try
{
pom = sTok.nextToken();
if (pom.equals("<") && sTok.hasMoreElements())
{
String tagName = sTok.nextToken().toLowerCase();
if (tagName.startsWith("/p") || tagName.startsWith("/h") || tagName.startsWith("/ul") || tagName.startsWith("/div"))
{
//ten trim odstrani poslednu medzeru na konci riadku
plain.append("\n\n");
}
else if (tagName.startsWith("br") || tagName.startsWith("/tr") || tagName.startsWith("/li") || tagName.startsWith("ul") || tagName.startsWith("h") )
{
//ten trim odstrani poslednu medzeru na konci riadku
plain.append("\n");
}
else if (tagName.startsWith("li")) plain.append("* ");
else if (tagName.startsWith("span class='emailinput-") && plain.toString().trim().endsWith(":")==false && lastLineContains(plain.toString(), ":")==false)
{
//doplnime znak : za posledny text
plain.append(":");
}
if (sTok.hasMoreTokens())
sTok.nextToken();
}
else
{
/*
* if (pom.equals( ))
*/
if (pom!=null && Tools.isNotEmpty(pom.trim()))
{
//ak to nekonci na \n pridaj medzeru
if (plain.length()>0 && plain.toString().endsWith("\n")==false) plain.append(" ");
plain.append(pom.trim());
}
}
}
catch (Exception ex)
{
plain.append("- ");
Logger.error(SearchTools.class,"CHYBA PARSOVANIA HTML: " + html);
sk.iway.iwcm.Logger.error(ex);
}
}
plain = Tools.replace(plain, "\n\n\n", "\n\n");
plain = Tools.replace(plain, " ", " ");
plain = Tools.replace(plain, "&", "&");
plain = Tools.replace(plain, " \n", "\n");
plain = Tools.replace(plain, " \n", "\n");
plain = Tools.replace(plain, " \n", "\n");
String plainString = plain.toString().trim();
plainString = plainString.trim();
plainString = plainString.trim();
return plainString;
}
return null;
}
public static String[] getCheckInputParams()
{
return checkInputParams;
}
}