// Document.java
package edu.odu.cs.cs350.acmClassifier;
import java.util.ArrayList;
/**
 * A generic document, containing all the information Parser and Trainer need.
 *
 * <p>All fields are public and mutable. Copies made through
 * {@link #Document(Document)} or {@link #clone()} own their own list
 * instances, so adding to or removing from a copy's lists does not affect
 * the original document.
 */
public class Document implements Cloneable {
    /**
     * The raw text of the document.
     */
    public String rawText;

    /**
     * The tokenized, unpruned text of the document.
     */
    public ArrayList<String> tokenizedText;

    /**
     * An array of how many times each word in the document appears.
     */
    public ArrayList<Integer> wordCounts;

    /**
     * An array of normalized word counts.
     * Normalization has two steps:
     * <ol>
     * <li>If word count is greater than 4, set normalized word count to 1.
     * Otherwise, set to 0.</li>
     * <li>Multiply normalized word count by the inverse document frequency.</li>
     * </ol>
     */
    public ArrayList<Double> normalizedWordCounts;

    /**
     * The classification of the document.
     */
    public ACMClass classification;

    /**
     * Construct an empty, unclassified Document.
     *
     * <p>The raw text is the empty string, every list is empty, and the
     * classification is a default-constructed {@code ACMClass}.
     */
    public Document() {
        rawText = "";
        tokenizedText = new ArrayList<String>();
        wordCounts = new ArrayList<Integer>();
        normalizedWordCounts = new ArrayList<Double>();
        classification = new ACMClass();
    }

    /**
     * Copy constructor.
     *
     * <p>Each list field is copied into a new {@code ArrayList}, so the new
     * document can be modified without touching {@code doc}. A list field
     * that is {@code null} in {@code doc} stays {@code null} in the copy.
     *
     * @param doc the document to copy from
     * @throws NullPointerException if {@code doc} is {@code null}
     */
    public Document(Document doc) {
        if (doc == null) {
            throw new NullPointerException("doc must not be null");
        }
        // String is immutable, so sharing the reference is safe.
        rawText = doc.rawText;
        tokenizedText = copyList(doc.tokenizedText);
        wordCounts = copyList(doc.wordCounts);
        normalizedWordCounts = copyList(doc.normalizedWordCounts);
        // NOTE(review): the ACMClass reference is shared with the source
        // document because no copy mechanism for ACMClass is visible here.
        // If ACMClass is mutable, replace this with a real copy.
        classification = doc.classification;
    }

    /**
     * Create a copy of this document using the copy constructor.
     *
     * @return a new Document built by {@link #Document(Document)} from this one
     */
    @Override
    public Document clone() {
        return new Document(this);
    }

    /**
     * Return a string representation of this document.
     *
     * @return currently a fixed placeholder string
     */
    @Override
    public String toString() {
        // TODO
        return "I am a Document";
    }

    /**
     * Copy a list into a new, independent ArrayList.
     *
     * @param <T> the element type
     * @param source the list to copy, may be {@code null}
     * @return a new list holding the same elements in the same order, or
     *         {@code null} if {@code source} is {@code null}
     */
    private static <T> ArrayList<T> copyList(ArrayList<T> source) {
        if (source == null) {
            return null;
        }
        return new ArrayList<T>(source);
    }
}