Tuesday, 28 October 2014

Extracting text from PDF files using Selenium + PDFBox

In many production environments, PDF files need to be checked before they go to print or are sent to customers, in order to avoid legal issues and costly reprints. These PDF files cannot be read using Selenium alone, so here we use PDFBox, a third-party JAR file that reads data from PDF files. The example below illustrates how to read a PDF file by opening it in the browser. To work with it, add the JAR file below to the Eclipse classpath along with Selenium WebDriver.
pdfbox-app-1.8.3.jar


import java.io.BufferedInputStream;
import java.io.IOException;
import java.net.URL;
import java.util.concurrent.TimeUnit;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.firefox.FirefoxDriver;

public class Sample {

 Static WebDriver driver;

   public Static void main(String args[]) throws IOException{
 try{
    // Proxy has to be set if we working under any firewal 
   System.setProperty("http.proxyHost", "proxyname.com");
System.setProperty("http.proxyPort", "portnumber");
System.setProperty("https.proxyHost", "proxyname.com");
System.setProperty("https.proxyPort", "portnumber");
driver = new FirefoxDriver();
 driver.get("http://keith-wood.name/realPerson.html");
 driver.manage().timeouts().implicitlyWait(10, TimeUnit.SECONDS);
 URL url = new URL(driver.getCurrentUrl()); 
 BufferedInputStream fileToParse=new BufferedInputStream(url.openStream());

 //parse()  --  This will parse the stream and populate the COSDocument object. 
 //COSDocument object --  This is the in-memory representation of the PDF document

 PDFParser parser = new PDFParser(fileToParse);
 parser.parse();

 //getPDDocument() -- This will get the PD document that was parsed. When you are done with this document you must call    close() on it to release resources
 //PDFTextStripper() -- This class will take a pdf document and strip out all of the text and ignore the formatting and such.

 String output=new PDFTextStripper().getText(parser.getPDDocument());
 System.out.println(output);
 parser.getPDDocument().close(); 
 driver.manage().timeouts().implicitlyWait(100, TimeUnit.SECONDS);
 }
 catch(Exception e){
 System.out.println(e.getMessage());
 }
  }

}


No comments:

Post a Comment