Parser.java
package edu.odu.cs.cs350.acmClassifier;
//import java.util.ArrayList;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
public class Parser {
/**
*
**/
public void Parse(){
//using apache tika read in file and output text
}
public void loadRawText(Document d)
{
DocumentReader d_reader = new DocumentReader();
try {
d_reader.importPDF(d.filePath);
} catch (FileNotFoundException e) {
e.printStackTrace();
}
d.rawText=d_reader.getDocumentText();
}
/**
* @param d a document
*/
public void loadToken(Document d)
{
//ensure raw text has been loaded if null
if(d.rawText ==null)
this.loadRawText(d);
String words = d.rawText.replaceAll("[^a-zA-Z ]", "").toLowerCase();
//Split raw text
d.tokenizedText= new ArrayList<String>(Arrays.asList(words.split(" ")));
}
public void updateWordCounts(Document d)
{
//ensure load token has been called
d.wordFreqencyMap = new HashMap<String,Integer>();
for (String i : d.tokenizedText) {
Integer j = d.wordFreqencyMap.get(i);
if (j==null)
{
d.wordFreqencyMap.put(i,1);
}
else
{
d.wordFreqencyMap.put(i, d.wordFreqencyMap.get(i)+1);
}
}
}
/**
* @param d d, a document
*/
public void loadRawSignature(Document d)
{
//make sure update word counts has been called
if (d.wordFreqencyMap==null)
this.updateWordCounts(d);
//iterate over each key and load value into the word count list
for (HashMap.Entry<String,Integer> word : d.wordFreqencyMap.entrySet())
{
d.wordCounts.add(word.getValue());
}
}
/**
* @param d a document
* @return the document'snraw signature cotaining each word count
*/
public ArrayList<Integer> getRawSignature(Document d)
{
//call all of document's functions to set up raw signature list, wordcounts
this.loadRawSignature(d);
return d.wordCounts;
}
public static void normalize(Document d){
for (int i = 0; i < d.wordCounts.size(); i++){
if (d.wordCounts.get(i) >= 4){
d.normalizedWordCounts.add(1.0);
}
else{
d.normalizedWordCounts.add(0.0);
}
}
}
}