SeoTools.java

package sk.iway.iwcm.components.seo;

import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;

import sk.iway.Html2Text;

/**
 * Static helpers for simple SEO analysis of a page's HTML: keyword occurrence
 * counts per element type, detection of long sentences and long words, and a
 * Gunning-fog-style readability estimate.
 *
 * <p>The results of {@link #countSentences(String, int, int)} and
 * {@link #countWords(String, int, int)} are kept in static fields that are
 * overwritten on every call and are not synchronized, so concurrent callers
 * will see each other's results.
 */
@SuppressWarnings("java:S1659")
public class SeoTools
{
	private static List<String> middleSentences, maxSentences, middleWords, maxWords;
	private static int sentencesCount, wordsCount, complexWordsCount;

	protected SeoTools() {
		//utility class
	}

	/**
	 * Builds one {@link Density} row per keyword, suitable for showing in a table.
	 * For every keyword the number of occurrences is counted in the text of
	 * h1, h2, h3, strong/b, i/em and a elements, and in the whole (raw) HTML source.
	 * Matching is case-insensitive (both sides are lower-cased) and literal.
	 *
	 * @param html			source code of the page
	 * @param keywords	keywords to look for; {@code null} entries are skipped
	 * @return list with one Density object per non-null keyword, in input order;
	 *         empty list when {@code keywords} is {@code null}
	 */
	public static List<Density> getKeywordDensityTable(String html, String [] keywords){

		List<Density> keywordDensityList = new ArrayList<>();
		if(keywords == null){
			return keywordDensityList;
		}

		Html2Text html2Text = new Html2Text(html);

		List<String> h1Elements = html2Text.getTextByElement("h1");
		List<String> h2Elements = html2Text.getTextByElement("h2");
		List<String> h3Elements = html2Text.getTextByElement("h3");
		List<String> strongElements = html2Text.getTextByElement("strong,b");
		List<String> italicElements = html2Text.getTextByElement("i,em");
		List<String> linkElements = html2Text.getTextByElement("a");

		// lower-case the whole page once instead of once per keyword
		String htmlLowerCase = (html == null) ? "" : html.toLowerCase();

		for(String rawKeyword : keywords){
			if(rawKeyword == null){
				continue;
			}
			String keyword = rawKeyword.toLowerCase();
			Density keywordDensity = new Density(keyword);

			for(String e : h1Elements){
				keywordDensity.incrementH1(countOccurences(e.toLowerCase(), keyword));
			}

			for(String e : h2Elements){
				keywordDensity.incrementH2(countOccurences(e.toLowerCase(), keyword));
			}

			for(String e : h3Elements){
				keywordDensity.incrementH3(countOccurences(e.toLowerCase(), keyword));
			}

			for(String e : strongElements){
				keywordDensity.incrementStrong(countOccurences(e.toLowerCase(), keyword));
			}

			for(String e : italicElements){
				keywordDensity.incrementItalics(countOccurences(e.toLowerCase(), keyword));
			}

			for(String e : linkElements){
				keywordDensity.incrementLink(countOccurences(e.toLowerCase(), keyword));
			}

			// total is counted in the raw HTML, i.e. including tag and attribute text
			keywordDensity.incrementAlltogether(countOccurences(htmlLowerCase, keyword));

			keywordDensityList.add(keywordDensity);
		}

		return keywordDensityList;
	}

	/**
	 * Helper for {@link #getKeywordDensityTable(String, String[])}: counts
	 * non-overlapping occurrences of {@code keyword} in {@code text}.
	 *
	 * <p>The keyword is matched literally. (It used to be passed to
	 * {@code String.replaceFirst}, which interprets it as a regular expression;
	 * keywords such as {@code "c++"} threw an exception and {@code "$"} or an
	 * empty keyword never terminated.)
	 *
	 * @param text 		part of the searched source code
	 * @param keyword		keyword to look for
	 * @return number of occurrences; 0 when either argument is {@code null} or the keyword is empty
	 */
	private static int countOccurences(String text, String keyword){
		if(text == null || keyword == null || keyword.isEmpty()){
			return 0;
		}
		int count = 0;
		int position = text.indexOf(keyword);
		while(position >= 0){
			count++;
			// continue behind the match so occurrences never overlap
			position = text.indexOf(keyword, position + keyword.length());
		}
		return count;
	}

	/**
	 * Removes markup from the page source: everything between {@code <} and
	 * {@code >}, and everything between two exclamation marks (including the
	 * marks themselves).
	 * NOTE(review): the {@code !...!} rule presumably targets template
	 * directives embedded in the page; confirm, since it also removes ordinary
	 * text lying between two exclamation marks.
	 *
	 * @param html	page source, must not be {@code null}
	 * @return text with the markup removed
	 */
	private static String stripMarkup(String html){
		String text = html.replaceAll("\\<[^>]*>","");
		return text.replaceAll("\\![^!]*!","");
	}

	/**
	 * Splits the page text into sentences (on {@code .}, {@code !} and {@code ?})
	 * and collects the long ones. Word count of a sentence is the number of
	 * parts obtained by splitting it on a single space.
	 * Results are available through {@link #getMiddleSentences()} and
	 * {@link #getMaxSentences()}; the sentence total feeds {@link #textReadibility()}.
	 * All stored results are reset on every call; text without any {@code .}
	 * (or {@code null}) yields empty lists and a sentence count of 0.
	 *
	 * @param html		source code of the page, may be {@code null}
	 * @param middle	minimal word count of a "middle" sentence (inclusive)
	 * @param max		minimal word count of a "max" sentence (inclusive)
	 */
	public static void countSentences(String html, int middle, int max){
		middleSentences = new ArrayList<>();
		maxSentences = new ArrayList<>();
		// reset so a previous call cannot leak into textReadibility()
		sentencesCount = 0;
		if(html == null){
			return;
		}

		String text = stripMarkup(html);
		if(!text.contains(".")){
			return;
		}

		String[] sentences = text.split("[.!?]");
		sentencesCount = sentences.length;
		for(String sentence : sentences){
			int sentenceWords = sentence.split(" ").length;
			if(sentenceWords >= max){
				maxSentences.add(sentence+".");
			}
			else if(sentenceWords >= middle){
				middleSentences.add(sentence+".");
			}
		}
	}

	/**
	 * Splits the page text into words (on a single space, after replacing
	 * {@code .}, {@code !} and {@code ?} with spaces) and collects the long ones.
	 * Also counts "complex" words, i.e. words for which the syllable estimate
	 * is greater than 2.
	 * Results are available through {@link #getMiddleWords()} and
	 * {@link #getMaxWords()}; the totals feed {@link #textReadibility()}.
	 * All stored results are reset on every call; text without any space
	 * (or {@code null}) yields empty lists and counts of 0.
	 *
	 * @param html		source code of the page, may be {@code null}
	 * @param middle	minimal character length of a "middle" word (inclusive)
	 * @param max		minimal character length of a "max" word (inclusive)
	 */
	public static void countWords(String html, int middle, int max){
		middleWords = new ArrayList<>();
		maxWords = new ArrayList<>();
		complexWordsCount = 0;
		// reset so a previous call cannot leak into textReadibility()
		wordsCount = 0;
		if(html == null){
			return;
		}

		String text = stripMarkup(html).replaceAll("[.!?]", " ");
		if(!text.contains(" ")){
			return;
		}

		String[] words = text.split(" ");
		wordsCount = words.length;
		for(String word : words){	//visit every word
			if(getSyllableCount(word) > 2){
				complexWordsCount++;
			}
			int length = word.length();
			if(length >= max){
				maxWords.add(word);
			}
			else if(length >= middle){
				middleWords.add(word);
			}
		}
	}

	/**
	 * Estimates the readability of the text analysed by the last calls of
	 * {@link #countSentences(String, int, int)} and {@link #countWords(String, int, int)}.
	 * Uses the Gunning fog index technique, which is designed for English, so
	 * the result may be imprecise; the custom syllable estimate is another
	 * source of imprecision.
	 * The index estimates the years of education needed to understand the text.
	 * Rule of thumb: 6 means excellent readability. 8 to 10 are magazine and
	 * newspaper articles and short stories, easy to read and understand.
	 * 11 to 14 is reached by more specialised articles. Scientific papers
	 * usually score 15 to 20 and demand the reader's full concentration.
	 * An index above 20 belongs to texts whose author ignores the reader entirely.
	 *
	 * @return the computed index, or 0 when no sentences or no words have been
	 *         counted (instead of NaN / Infinity from a division by zero)
	 */
	public static double textReadibility(){
		if(sentencesCount == 0 || wordsCount == 0){
			return 0;
		}
		double wordsPerSentence = wordsCount/(double)sentencesCount;
		double complexWordsRatio = complexWordsCount/(double)wordsCount;
		return (0.4*wordsPerSentence+100*complexWordsRatio);
	}

	/**
	 * @return sentences from the last {@link #countSentences(String, int, int)} call
	 *         with at least {@code middle} but fewer than {@code max} words;
	 *         {@code null} if that method has not been called yet
	 */
	public static List<String> getMiddleSentences(){
		return middleSentences;
	}

	/**
	 * @return sentences from the last {@link #countSentences(String, int, int)} call
	 *         with at least {@code max} words; {@code null} if that method has not been called yet
	 */
	public static List<String> getMaxSentences(){
		return maxSentences;
	}

	/**
	 * @return words from the last {@link #countWords(String, int, int)} call
	 *         with at least {@code middle} but fewer than {@code max} characters;
	 *         {@code null} if that method has not been called yet
	 */
	public static List<String> getMiddleWords(){
		return middleWords;
	}

	/**
	 * @return words from the last {@link #countWords(String, int, int)} call
	 *         with at least {@code max} characters; {@code null} if that method has not been called yet
	 */
	public static List<String> getMaxWords(){
		return maxWords;
	}

	/**
	 * Rough syllable estimate for one word: the number of non-empty chunks
	 * left after cutting the word at runs of (lower-case) vowels, plus one if
	 * the word starts with one of the vowels a, e, i, o, u, á, é, í, ó, ú.
	 * Upper-case vowels are not recognised.
	 *
	 * @param word	word to analyse
	 * @return estimated number of syllables, 0 for an empty word
	 */
	private static int getSyllableCount(String word)
	{
		int syllableCount = 0;
		try(Scanner sc = new Scanner(word))
		{
			sc.useDelimiter("[aeiouyáéíúóýô]+");
			while(sc.hasNext())
			{
				syllableCount++;
				sc.next();
			}
		}

		// a leading vowel is swallowed as a delimiter above, so count it here
		if(!word.isEmpty() && "aeiouáéíóú".indexOf(word.charAt(0)) >= 0)
		{
			syllableCount++;
		}

		return syllableCount;
	}
}