// DocumentReader.java
package edu.odu.cs.cs350.acmClassifier;
//All the File handlers
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
//All the Tika Components
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
//Exception Handling
import org.xml.sax.SAXException;
/**
 * Extracts the text body of a PDF file using Apache Tika's {@link PDFParser}.
 *
 * <p>Call {@link #importPDF(String)} to parse a file, then {@link #getDocumentText()}
 * to retrieve the text captured by the content handler during that parse.
 *
 * <p>Instances hold mutable per-document state (handler, metadata) and make no
 * attempt at synchronization.
 */
public class DocumentReader {
    // The fields stay package-private and non-final so that any same-package code
    // that already reads or replaces them keeps working.

    // Collects the text body of the parsed document; -1 disables Tika's write limit.
    BodyContentHandler handler = new BodyContentHandler(-1);
    // Receives the document metadata populated by the parser.
    Metadata metadata = new Metadata();
    ParseContext pcontext = new ParseContext();
    // Stream of the most recently imported file; it is closed once parsing finishes.
    FileInputStream inputstream;
    PDFParser pdfparser = new PDFParser();

    /**
     * Opens the PDF at {@code filepath}, parses it, and prints the extracted text
     * to standard output. The text is afterwards available through
     * {@link #getDocumentText()}.
     *
     * <p>Each successful open starts from a fresh handler and metadata object, so
     * text from a previously imported document is not mixed into the new one.
     * Parse failures ({@link IOException}, {@link SAXException},
     * {@link TikaException}) are reported on standard error and do not propagate;
     * whatever text was captured before the failure remains in the handler.
     *
     * @param filepath path of the PDF file to read
     * @throws FileNotFoundException if the file cannot be opened for reading; the
     *         previously extracted text is left untouched in that case
     */
    public void importPDF(String filepath) throws FileNotFoundException {
        // Opened outside the try-with-resources so that FileNotFoundException (an
        // IOException subtype) still reaches the caller instead of being caught below.
        FileInputStream stream = new FileInputStream(new File(filepath));
        inputstream = stream;

        // Reset per-document state: reusing the old handler would append this
        // document's text to the text of the previous import.
        handler = new BodyContentHandler(-1);
        metadata = new Metadata();

        // The parser does not close the stream it is given, so close it here.
        try (FileInputStream in = stream) {
            pdfparser.parse(in, handler, metadata, pcontext);
            System.out.println(handler.toString());
        } catch (IOException | SAXException | TikaException e) {
            // Best-effort import: report the failure and keep going.
            System.err.println("Failed to parse PDF: " + filepath);
            e.printStackTrace();
        }
    }

    /**
     * Returns the text captured by the content handler during the most recent
     * {@link #importPDF(String)} call.
     *
     * @return the handler's current contents as a string
     */
    public String getDocumentText() {
        return handler.toString();
    }
}