Pages

Text Analysis | Document parser

A while back, I was involved in a project that dealt with text analysis. It was a pretty interesting project, which I only realized later, once the project was actually completed. Anyway, recently I was browsing oDesk to see if there was something I could make a few bucks from easily. I found an assignment where the basic requirement was to parse a text file and generate a report of the number of unique words, their counts, etc. I did not get hired for the job, but I decided to post my work here.

Back then, the problem was actually to group news articles. There are many sources of news on the internet. Our idea was to channel these articles into our application, run some analysis and scoring algorithms, cluster the related news, and present the related news in a more suggestive way according to the user's interests.

package testapplications;

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.StringTokenizer;
import java.util.Vector;

/**
 *
 * @author Rajan Prasad Upadhyay
 */
/*
 * This class holds one text document. It splits the text into words and
 * keeps a count for each word, so that the distinct words can easily be
 * found. It is used for indexing and for calculating the similarity between
 * documents in the collection class.
 */
public class Document {

    /**
     * Mutable (word, occurrence count) pair. One instance is stored in
     * {@link #count} for every distinct word, at the same index the word
     * occupies in {@link #words}.
     */
    private static class StringCounter {

        public String word = "";
        public int count = 1;

        public StringCounter(String str) {
            word = str;
        }

        public void increaseCount() {
            count++;
        }

    }

    /** The raw text this document was built from (empty when none was given). */
    String text = "";

    /** Distinct words, lower-cased, in order of first appearance. */
    public Vector<String> words = new Vector<>();

    /** Occurrence counters; {@code count.elementAt(i)} belongs to {@code words.elementAt(i)}. */
    public Vector<StringCounter> count = new Vector<>();

    /**
     * Debugging helper: splits {@code speech} on '.' and prints the number of
     * pieces followed by each piece to standard output. It does not touch the
     * word statistics of this document.
     *
     * @param speech text to split into sentences
     */
    public void sentenceSplitter(String speech) {
        //String speech = "My name is Rajan Prasad Upadhyay.I am twenty-two years old.";
        StringTokenizer st = new StringTokenizer(speech, ".");
        System.out.println(st.countTokens());
        while (st.hasMoreTokens()) {
            System.out.println(st.nextToken());
        }
    }

    /**
     * Builds a document from a list of lines (for example the result of
     * {@code Files.readAllLines}).
     *
     * @param lines the lines of the document; {@code null} (list or element)
     *              is treated as empty
     */
    public Document(List<String> lines) {
        StringBuilder joined = new StringBuilder();
        if (lines != null) {
            for (String line : lines) {
                if (line != null) {
                    joined.append(line);
                }
                // Lines carry no terminator, so re-insert one; otherwise the
                // last word of a line would fuse with the first word of the next.
                joined.append('\n');
            }
        }
        text = joined.toString();
        parse();
    }

    /**
     * Builds a document from a single string.
     *
     * @param text the document text; {@code null} is treated as empty
     */
    public Document(String text) {
        this.text = (text == null) ? "" : text;
        parse();
    }

    /**
     * Creates an empty document. Words can still be added one at a time with
     * {@link #insertWord(String)}.
     */
    public Document() {
    }

    /*internal operations methods not needed outside*/

    /**
     * Tells whether {@code c} may be part of a word.
     *
     * @param c character to classify
     * @return {@code false} for the punctuation and line/tab characters that
     *         delimit words, {@code true} otherwise (the space character is
     *         checked separately by {@link #parse()})
     */
    private boolean isValid(char c) {
        switch (c) {
            case '.':
            case '\'':
            case ',':
            case '?':
            case '!':
            case ':':
            case '\t':
            case '\n':
            case '\r': // CRLF input: keep the carriage return out of words
                return false;
            default:
                return true;
        }
    }

    /**
     * Splits {@link #text} into words and records each one through
     * {@link #insertWord(String)}.
     */
    private void parse() {
        StringBuilder current = new StringBuilder();
        int n = text.length();
        for (int i = 0; i < n; i++) {
            char c = text.charAt(i);
            if (c != ' ' && isValid(c)) {
                current.append(c);
            } else if (current.length() > 0) {
                // A delimiter closes the word collected so far.
                insertWord(current.toString());
                current.setLength(0);
            }
        }
        // Flush the trailing word when the text does not end with a delimiter.
        if (current.length() > 0) {
            insertWord(current.toString());
        }
    }

    /** Canonical form under which words are stored and looked up. */
    private static String normalize(String word) {
        //if you want case sensitive, drop the toLowerCase call
        return word.trim().toLowerCase(java.util.Locale.ROOT);
    }

    /*
     * These methods may be needed outside 
     */

    /**
     * Records one occurrence of {@code word}. The word is trimmed and
     * lower-cased first; if it is not yet known it is added with count 1,
     * otherwise its counter is incremented.
     *
     * @param word the word to record; {@code null} and blank words are ignored
     */
    public void insertWord(String word) {
        if (word == null) {
            return;
        }
        String normalized = normalize(word);
        if (normalized.isEmpty()) {
            return; // never store an empty "word"
        }
        int index = words.indexOf(normalized);
        if (index < 0) {
            words.add(normalized);
            count.add(new StringCounter(normalized));
        } else {
            count.elementAt(index).increaseCount();
        }
    }

    /**
     * Debugging helper: prints every distinct word with its count, one per
     * line, to standard output.
     */
    public void printWeighedWords() {
        int max = getDistinctWordsCount();
        for (int i = 0; i < max; i++) {
            System.out.println(words.elementAt(i) + " :" + count.elementAt(i).count);
        }
    }

    /**
     * @return the number of distinct words; a word repeated many times is
     *         counted once
     */
    public int getDistinctWordsCount() {
        return words.size();
    }

    /**
     * @return the total number of words; a word repeated several times is
     *         counted once per occurrence
     */
    public int getTotalWordsCount() {
        int total = 0;
        for (StringCounter counter : count) {
            total += counter.count;
        }
        return total;
    }

    /**
     * Looks up how often {@code term} occurs in this document. The term is
     * trimmed and lower-cased the same way stored words are.
     *
     * @param term the word to look up
     * @return the occurrence count, or 0 when the term is absent or {@code null}
     */
    public int getCountOfTerm(String term) {
        if (term == null) {
            return 0;
        }
        int ind = words.indexOf(normalize(term));
        // indexOf returns -1 when absent; index 0 is a valid hit.
        if (ind >= 0) {
            return count.elementAt(ind).count;
        }
        return 0;
    }

    /**
     * @return the distinct words with their counts, formatted as
     *         {@code [word(count), word(count), ...]}
     */
    @Override
    public String toString() {
        StringBuilder doc = new StringBuilder("[");
        int max = getDistinctWordsCount();
        for (int i = 0; i < max; i++) {
            if (i > 0) {
                doc.append(", ");
            }
            doc.append(words.elementAt(i)).append('(').append(count.elementAt(i).count).append(')');
        }
        return doc.append(']').toString();
    }

    /** Small demo: parses a fixed sentence and prints its word statistics. */
    public static void main(String[] args) throws IOException {
       //Document doc = parseFile("assignment1.txt");

        Document doc = new Document("Hello World, My name is Rajan. and Hello world, "
                + "I enjoy doing this kind of works.");

        System.out.println("Total words count: " + doc.getTotalWordsCount());
        System.out.println("Distinct words count: " + doc.getDistinctWordsCount());
        System.out.println("Weighed words: \n");
        System.out.println(doc);

    }

    /**
     * Reads the UTF-8 text file at {@code filepath}, builds a document from
     * it, and prints the distinct word count, the word list and the per-word
     * counts to standard output.
     *
     * @param filepath path of the file to parse
     * @return the parsed document, or {@code null} when the file could not be
     *         read (the error is reported on the console)
     */
    public static Document parseFile(String filepath) {
        // String filepath = "assignment1.txt";
        Charset encoding = StandardCharsets.UTF_8;

        Path path = Paths.get(filepath);

        Document doc = null;
        try {
            doc = new Document(Files.readAllLines(path, encoding));
            int distinct = doc.getDistinctWordsCount();
            System.out.println("Words count:  " + distinct);
            System.out.println("Words:  " + doc.words);
            doc.printWeighedWords();
        } catch (IOException e) {
            System.out.println("error");
            e.printStackTrace();
        }
        return doc;
    }
}

9 comments:

  1. Obviously, these ends can frame the premise of some significant changes that may have been in the pipeline for quite a while. Data Analytics Course

    ReplyDelete
  2. Result – Unlike information mining, DA doesn't rotate around the recognizable proof of unfamiliar examples and concealed connections; rather, it centers around the determination of an end.Data Analytics Course

    ReplyDelete
  3. Cool stuff you have and you keep overhaul every one of us. data science course

    ReplyDelete
  4. There is a term involved in building artificial intelligence. It is called the Turing Test. A Turing test is to test an artificial intelligence to see if we could recognize it as a computer or we couldn't see any difference between that and a human intelligence data science course in india

    ReplyDelete

  5. Great to become visiting your weblog once more, it has been a very long time for me. Pleasantly this article i've been sat tight for such a long time. I will require this post to add up to my task in the school, and it has identical subject along with your review. Much appreciated, great offer. data science course in nagpur


    ReplyDelete
  6. Extremely overall quite fascinating post. I was searching for this sort of data and delighted in perusing this one. Continue posting. A debt of gratitude is in order for sharing.data science course in kolhapur

    ReplyDelete

If you like to say anything (good/bad), Please do not hesitate...