PDF.java

package sk.iway.iwcm.findexer;

import java.util.StringTokenizer;

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.io.RandomAccessReadBuffer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

import sk.iway.iwcm.Logger;
import sk.iway.iwcm.Tools;
import sk.iway.iwcm.io.IwcmInputStream;


/**
 *  Vytiahne z PDF vsetky texty
 *
 *@Title        WebJET
 *@Company      Interway s.r.o. (www.interway.sk)
 *@Copyright    Interway s.r.o. (c) 2001-2002
 *@author       $Author: jeeff $
 *@version      $Revision: 1.2 $
 *@created      Streda, 2004, január 21
 *@modified     $Date: 2004/01/27 17:48:00 $
 */
public class PDF
{

	/**
	 *  Gets the text attribute of the PDF class
	 *
	 *@param  fileName  Description of the Parameter
	 *@return           The text value
	 */
	public static String getText(String fileName)
	{
		StringBuilder sb = new StringBuilder();
		try
		{
			IwcmInputStream is = new IwcmInputStream(fileName);

			PDDocument pdfDocument = Loader.loadPDF(new RandomAccessReadBuffer(is), "");
	        try {
	      	   /*
	            if (pdfDocument.isEncrypted()) {
	                try {
	                    pdfDocument.decrypt("");
	                } catch (Exception e) {
	                    // Ignore
	                }
	            }
	            */

	            PDFTextStripper stripper = new PDFTextStripper();

				   stripper.setSortByPosition(false);
				   stripper.setSuppressDuplicateOverlappingText(true);

				   String text = stripper.getText(pdfDocument);

				   //System.out.println(text);

              StringTokenizer st = new StringTokenizer(text, "\n");
              while (st.hasMoreTokens())
              {
                 String line = fixVerticalText(st.nextToken());
                 if (Tools.isEmpty(line)) continue;
                 sb.append(line);
                 sb.append('\n');
              }

               //sb.append(text);

	        } finally {
	            pdfDocument.close();
	        }

			try { if (is!=null) is.close(); } catch (Exception ex) { sk.iway.iwcm.Logger.error(ex); }

			return(sb.toString());
			//return (sw.getBuffer().toString());
		}
		catch (Exception ex)
		{
			sk.iway.iwcm.Logger.error(ex);
		}

		return (null);
	}

   /**
    * v TB sme narazili na problem parsovania horizontalneho textu, ani aktualna verzia PDFboxu to neriesila
    * vystupom boli bloky textu typu silna sucast:
    * ssiil
    * naa
    * sssuuuu
    * ccaaaaa
    * ssttt
    * @param line
    * @return
    */
	private static String fixVerticalText(String line)
   {
      line = line.trim();
      if (line.length()>25 || line.contains(" ") || line.contains(",") || line.contains(".")) return line;
      if (line.length()<3) return "";

      char lastChar = line.charAt(0);
      StringBuilder sb = new StringBuilder();
      sb.append(lastChar);
      for (int i = 1; i<line.length(); i++)
      {
         char ch = line.charAt(i);
         if (lastChar != ch)
         {
            sb.append(ch);
            lastChar = ch;
         }
      }

      Logger.debug(PDF.class, "fixVerticalText: line="+line+" sb="+sb.toString());

      if (sb.length()<3) return "";

      return sb.toString();
   }
}