Monday, 20 October 2014

Sentimental Analysis using WordNet Dictionary

Sentimental Analysis using WordNet Dictionary

 This is a sample program that calculates the semantic orientation of each word using the SentiWordNet dictionary.


package com.orienit.hadoop.training;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;
import java.util.Vector;

    public class SentimentalWordNet {
        // Path to the SentiWordNet 3.0 data file.
        // Expected tab-separated columns: POS, ID, PosScore, NegScore, SynsetTerms, ...
        private String pathToSWN = "/home/hadoop/work/SentiWordNet_3.0.0.txt";
        // Maps "word#pos" (e.g. "good#a") to its weighted sentiment score.
        private HashMap<String, Double> _dict;

        /**
         * Loads the SentiWordNet dictionary from {@code pathToSWN} and builds a
         * map from "word#pos" to a single sentiment score per word/POS pair.
         * Per-sense scores are combined with weight 1/rank and normalized by the
         * harmonic sum, so earlier (more common) senses dominate.
         *
         * If the file cannot be read, the dictionary is left empty (every lookup
         * then scores 0.0 / neutral) and the problem is reported to stderr
         * instead of being silently swallowed.
         */
        public SentimentalWordNet(){
            _dict = new HashMap<String, Double>();
            // "word#pos" -> scores indexed by (sense rank - 1).
            HashMap<String, Vector<Double>> temp = new HashMap<String, Vector<Double>>();
            // try-with-resources closes the reader even on failure (the original leaked it).
            try (BufferedReader csv = new BufferedReader(new FileReader(pathToSWN))) {
                String line;
                while ((line = csv.readLine()) != null) {
                    // Skip blank and comment/header lines ('#'-prefixed) so the
                    // raw SentiWordNet download works without manual editing.
                    if (line.isEmpty() || line.charAt(0) == '#')
                        continue;
                    String[] data = line.split("\t");
                    if (data.length < 5)
                        continue; // malformed row
                    // Net orientation of this synset: positive minus negative score.
                    double score = Double.parseDouble(data[2]) - Double.parseDouble(data[3]);
                    // Column 4 holds space-separated terms like "word#senseRank".
                    for (String w : data[4].split(" ")) {
                        String[] parts = w.split("#");
                        if (parts.length < 2)
                            continue; // term without a sense rank
                        String key = parts[0] + "#" + data[0]; // e.g. "good#a"
                        int index = Integer.parseInt(parts[1]) - 1;
                        Vector<Double> v = temp.get(key);
                        if (v == null) {
                            v = new Vector<Double>();
                            temp.put(key, v);
                        }
                        // Pad with neutral 0.0 so the score lands at its sense rank.
                        while (v.size() < index)
                            v.add(0.0);
                        v.add(index, score);
                    }
                }
                // Collapse per-sense scores into one weighted score per key:
                // sense i contributes with weight 1/(i+1), normalized by the
                // harmonic sum 1 + 1/2 + ... + 1/n.
                for (String word : temp.keySet()) {
                    Vector<Double> v = temp.get(word);
                    double score = 0.0;
                    double sum = 0.0;
                    for (int i = 0; i < v.size(); i++)
                        score += v.get(i) / (double) (i + 1);
                    for (int i = 1; i <= v.size(); i++)
                        sum += 1.0 / (double) i;
                    if (sum != 0.0)
                        score /= sum;
                    _dict.put(word, score);
                }
            } catch (Exception e) {
                // Report instead of silently swallowing; callers still work with
                // an empty dictionary (all words score neutral 0.0).
                System.err.println("Could not load SentiWordNet dictionary from "
                        + pathToSWN + ": " + e);
            }
        }

        /**
         * Returns the summed sentiment score of {@code word} across the four
         * SentiWordNet parts of speech (n=noun, a=adjective, r=adverb, v=verb).
         * Unknown words score 0.0; this method never returns null.
         *
         * @param word the word to look up (no POS suffix)
         * @return the total sentiment score, 0.0 if the word is unknown
         */
        public Double extract(String word) {
            double total = 0.0; // primitive accumulator; no deprecated new Double(0)
            for (String pos : new String[] { "#n", "#a", "#r", "#v" }) {
                Double score = _dict.get(word + pos);
                if (score != null)
                    total += score;
            }
            return total; // autoboxed on return
        }

        /**
         * Demo entry point: scores a fixed sentence word by word and prints
         * whether the overall sentiment is neutral, positive, or negative.
         */
        public static void main(String[] args) {
            SentimentalWordNet test = new SentimentalWordNet();
            String sentence = "hey i had a wonderful an experience in barista";
            String[] words = sentence.split("\\s+");
            double totalScore = 0;
            for (String word : words) {
                // Strip everything but letters before the lookup.
                word = word.replaceAll("([^a-zA-Z\\s])", "");
                // extract() never returns null; call it once per word
                // (the original called it twice and dead-checked for null).
                totalScore += test.extract(word);
            }
            if (totalScore == 0) {
                System.out.println("Neutral Statement :" + totalScore);
            } else if (totalScore > 0) {
                System.out.println("Positive Statement :" + totalScore);
            } else {
                System.out.println("Negative Statement :" + totalScore);
            }
        }

    }
 
OUTPUT:-
Postive Statement :0.34798245187641463


Note: Use the link below to download the SentiWordNet dictionary. Before use, remove all comment lines (those beginning with a hash, '#') from the beginning and end of the file.
http://sentiwordnet.isti.cnr.it/download.php

XML parsing using PIG

These are the steps for parsing your XML files with PIG

---------------------------------------------------------------------------------------------------------------

Step 1: Set the classpath for pig bin
export PATH=/home/hadoop/work/pig-0.11.1/bin:$PATH

Step 2: Register the jar file

REGISTER '/home/hadoop/work/pig-0.11.1/contrib/piggybank/java/piggybank.jar'

Step 3: Load the data

xml = load '/home/hadoop/work/hadoop-1.1.2/conf/mapred-site.xml' USING 
org.apache.pig.piggybank.storage.XMLLoader('name') as(doc:chararray);
@ data looks like
<property>
<name>fs.default.name</name>
<value>hdfs://localhost:8020</value>
</property>

Step 4: Parse the file and retrieve the value

value = foreach xml GENERATE FLATTEN(REGEX_EXTRACT_ALL(doc,'<name>(.*)</name>'))  AS name:chararray;

Step 5: show the value

dump value;
Output will be:
fs.default.name

Parse the multiple attribute file
@ data looks like
<property>
 <fname>kalyan</fname>
 <lname>hadoop</lname>
 <landmark>annapurna block</landmark>
 <city>hyderabad</city>
 <state>Telengana</state>
 <contact>1234567890</contact>
 <email>kalyan@gmail.com</email>
 <PAN_Card>0011542</PAN_Card>
 <URL>kalyanhadooptraining.blogspot.com</URL>
</property>

Load the data:
pigdata = load '/home/hadoop/work/input/file.txt' USING 
org.apache.pig.piggybank.storage.XMLLoader('property') as (doc:chararray);

Parse the values:
values = foreach pigdata GENERATE FLATTEN(REGEX_EXTRACT_ALL(doc,'<property>\\s*<fname>(.*)</fname>\\s*<lname>(.*)</lname>\\s*<landmark>(.*)</landmark>\\s*<city>(.*)</city>\\s*<state>(.*)</state>\\s*<contact>(.*)</contact>\\s*<email>(.*)</email>\\s*<PAN_Card>(.*)</PAN_Card>\\s*<URL>(.*)</URL>\\s*</property>')) AS (fname:chararray, lname:chararray, landmark:chararray, city:chararray, state:chararray, contact:int, email:chararray, PAN_Card:long, URL:chararray);


dump values;
Output will be:
(kalyan,hadoop,annapurna block,hyderabad,Telengana,1234567890,kalyan@gmail.com,0011542,kalyanhadooptraining.blogspot.com)
Related Posts Plugin for WordPress, Blogger...