Pages

Text Analysis | Document parser

A long time ago, I was involved in a project that dealt with text analysis. It was a pretty interesting project, which I only realized later, once the project was actually completed. Anyway, recently I was surfing oDesk to see if there was something with which I could easily make a few bucks. I found an assignment project where the basic requirement was to be able to parse a text file and generate a report of the number of unique words, their counts, etc. I did not get hired for the job, but I decided to post my work here.

Back then, the problem was actually to group news articles. There are many sources of news on the internet. Our idea was to channel these articles into our application, run some analyzing and scoring algorithms, make clusters of related news, and present the related news in a more suggestive way according to the user's interests.

package testapplications;

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.StringTokenizer;
import java.util.Vector;

/**
 * Holds one piece of text and a frequency table of the words it contains.
 *
 * <p>The text is split into words on whitespace and on a small set of
 * punctuation characters (see {@link #isValid(char)}). Every word is
 * lower-cased before it is stored, so counting is case-insensitive. Distinct
 * words are kept in first-seen order in {@link #words}; the entry at the same
 * index of {@link #count} holds the number of occurrences of that word.
 *
 * <p>Intended for indexing and for computing similarity between documents.
 *
 * @author Rajan Prasad Upadhyay
 */
public class Document {

    /**
     * Data holder pairing a word with the number of times it has been seen.
     * Static because it never needs access to the enclosing document.
     */
    private static class StringCounter {

        public String word = "";
        public int count = 1;

        public StringCounter(String str) {
            word = str;
        }

        public void increaseCount() {
            count++;
        }

    }

    /** Raw text of the document; empty (never null) when nothing was supplied. */
    String text = "";
    /** Distinct, normalised words in the order they were first encountered. */
    public Vector<String> words = new Vector<>();
    /** Occurrence counters; element i belongs to {@code words.elementAt(i)}. */
    public Vector<StringCounter> count = new Vector<>();

    /**
     * Debugging helper: splits {@code speech} on '.' and prints the number of
     * pieces followed by each piece on its own line. Does not modify this
     * document.
     *
     * @param speech text to split into sentences
     */
    public void sentenceSplitter(String speech) {
        StringTokenizer st = new StringTokenizer(speech, ".");
        System.out.println(st.countTokens());
        while (st.hasMoreTokens()) {
            System.out.println(st.nextToken());
        }
    }

    /**
     * Builds a document from the lines of a text and parses it immediately.
     *
     * @param lines lines of the text, without line terminators; a null list
     *              or null elements are treated as empty
     */
    public Document(List<String> lines) {
        StringBuilder joined = new StringBuilder();
        if (lines != null) {
            for (String line : lines) {
                if (line != null) {
                    joined.append(line);
                }
                // Re-insert the line break that the caller stripped; without it
                // the last word of one line fuses with the first word of the next.
                joined.append('\n');
            }
        }
        text = joined.toString();
        parse();
    }

    /**
     * Builds a document from a single string and parses it immediately.
     *
     * @param text the full text; null is treated as the empty string
     */
    public Document(String text) {
        this.text = (text == null) ? "" : text;
        parse();
    }

    /**
     * Creates an empty document. Words can still be added one by one through
     * {@link #insertWord(String)}.
     */
    public Document() {
    }

    /*internal operations methods not needed outside*/

    /**
     * Tells whether {@code c} may be part of a word.
     *
     * @param c character to test
     * @return false for any whitespace character and for the separators
     *         . ' , ? ! : ; true for everything else
     */
    private boolean isValid(char c) {
        if (Character.isWhitespace(c)) {
            // covers ' ', '\t', '\n' and also '\r', so CRLF text splits cleanly
            return false;
        }
        switch (c) {
            case '.':
            case '\'':
            case ',':
            case '?':
            case '!':
            case ':':
                return false;
            default:
                return true;
        }
    }

    /**
     * Splits {@link #text} into words and feeds each one to
     * {@link #insertWord(String)}.
     */
    private void parse() {
        StringBuilder word = new StringBuilder();
        int n = text.length();
        for (int i = 0; i < n; i++) {
            char c = text.charAt(i);
            if (isValid(c)) {
                word.append(c);
            } else if (word.length() > 0) {
                // a separator ends the word collected so far
                insertWord(word.toString());
                word.setLength(0);
            }
        }
        // the text may end without a trailing separator
        if (word.length() > 0) {
            insertWord(word.toString());
        }
    }

    /**
     * Normalises a word the same way for storing and for looking up, so both
     * sides always agree.
     *
     * @param word raw word, not null
     * @return the word trimmed and lower-cased
     */
    private static String normalize(String word) {
        // Locale.ROOT keeps the result independent of the machine's default
        // locale (e.g. Turkish dotless i). Remove the toLowerCase call here
        // to make the whole class case sensitive.
        return word.trim().toLowerCase(java.util.Locale.ROOT);
    }

    /*
     * These methods may be needed outside 
     */

    /**
     * Records one occurrence of {@code word}. A word seen for the first time
     * is appended with count 1; otherwise its existing counter is increased.
     * Null or blank input is ignored.
     *
     * @param word the word to record; it is trimmed and lower-cased first
     */
    public void insertWord(String word) {
        if (word == null) {
            return;
        }
        String key = normalize(word);
        if (key.isEmpty()) {
            return;
        }
        // one scan instead of contains() followed by indexOf()
        int index = words.indexOf(key);
        if (index < 0) {
            words.add(key);
            count.add(new StringCounter(key));
        } else {
            count.elementAt(index).increaseCount();
        }
    }

    /**
     * Debugging helper: prints every distinct word with its count, one per
     * line, in first-seen order.
     */
    public void printWeighedWords() {
        int max = getDistinctWordsCount();
        for (int i = 0; i < max; i++) {
            System.out.println(words.elementAt(i) + " :" + count.elementAt(i).count);
        }
    }

    /**
     * @return the number of distinct words; a word repeated many times is
     *         counted once
     */
    public int getDistinctWordsCount() {
        return words.size();
    }

    /**
     * @return the total number of words; a word repeated many times is
     *         counted once per occurrence
     */
    public int getTotalWordsCount() {
        int total = 0;
        for (StringCounter counter : count) {
            total += counter.count;
        }
        return total;
    }

    /**
     * Looks up how often a term occurs in this document.
     *
     * @param term the term to look up; normalised like stored words, so the
     *             lookup is case-insensitive
     * @return the number of occurrences, or 0 if the term is absent or null
     */
    public int getCountOfTerm(String term) {
        if (term == null) {
            return 0;
        }
        int ind = words.indexOf(normalize(term));
        // index 0 is a valid hit; only -1 means "not found"
        if (ind >= 0) {
            return count.elementAt(ind).count;
        }
        return 0;
    }

    /**
     * @return the frequency table formatted as {@code [word(count), ...]}
     *         in first-seen order, or {@code []} for an empty document
     */
    @Override
    public String toString() {
        StringBuilder doc = new StringBuilder("[");
        int max = getDistinctWordsCount();
        for (int i = 0; i < max; i++) {
            if (i > 0) {
                doc.append(", ");
            }
            doc.append(words.elementAt(i))
               .append('(')
               .append(count.elementAt(i).count)
               .append(')');
        }
        doc.append(']');
        return doc.toString();
    }

    /**
     * Small demo: parses a hard-coded sentence and prints its statistics.
     *
     * @param args unused
     * @throws IOException declared for callers that switch to {@link #parseFile(String)}
     */
    public static void main(String[] args) throws IOException {
        //Document doc = parseFile("assignment1.txt");

        Document doc = new Document("Hello World, My name is Rajan. and Hello world, "
                + "I enjoy doing this kind of works.");

        System.out.println("Total words count: " + doc.getTotalWordsCount());
        System.out.println("Distinct words count: " + doc.getDistinctWordsCount());
        System.out.println("Weighed words: \n");
        System.out.println(doc);

    }

    /**
     * Reads a UTF-8 text file, builds a document from it and prints the
     * distinct word count, the word list and the per-word counts.
     *
     * @param filepath path of the file to read
     * @return the parsed document, or null if the file could not be read
     *         (the error is reported on standard output and standard error)
     */
    public static Document parseFile(String filepath) {
        Charset encoding = StandardCharsets.UTF_8;
        Path path = Paths.get(filepath);

        Document doc = null;
        try {
            doc = new Document(Files.readAllLines(path, encoding));
            int distinct = doc.getDistinctWordsCount();
            System.out.println("Words count:  " + distinct);
            System.out.println("Words:  " + doc.words);
            doc.printWeighedWords();
        } catch (IOException e) {
            System.out.println("error");
            e.printStackTrace();
        }
        return doc;
    }
}