%matplotlib inline
from __future__ import print_function, division
import json
from operator import itemgetter
from collections import defaultdict
import numpy as np
from matplotlib import pyplot as plt
from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_files
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC
## loading movie review data:
## http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz
data = load_files('txt_sentoken')
## First review and first label:
print(data.data[0])
print(data.target[0])
## Building the term document matrix using CountVectorizer
vec = CountVectorizer(min_df=50)
X = vec.fit_transform(data.data)
terms = vec.get_feature_names()
len(terms)
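## A quick sanity check (a sketch; "film" is just an assumed example of a
## word that survives the min_df cutoff): vec.vocabulary_ maps each term
## to its column index in X, so we can look up raw counts directly.
col = vec.vocabulary_['film']
print('film occurs', X[:, col].sum(), 'times across all reviews')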
## METHOD 1: estimate each word's positive score as P(positive|word)
def wordscore_pos():
    total_count = X.sum(axis=0)                  # shape (1, n_terms)
    pos_count = X[data.target == 1].sum(axis=0)  # shape (1, n_terms)
    # make sure they are 1d np.arrays
    total_count = np.asarray(total_count).ravel()
    pos_count = np.asarray(pos_count).ravel()
    prob = pos_count * 1.0 / total_count
    # return a list so the result can be sorted and reused
    return list(zip(terms, prob))
## most "negative" words
negative_movies = sorted(wordscore_pos(), key=itemgetter(1), reverse=False)[:20]
negative_movies
## most "positive" words
positive_movies = sorted(wordscore_pos(), key=itemgetter(1), reverse=True)[:20]
positive_movies
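## The raw ratio above can be noisy for words close to the min_df cutoff;
## a common remedy (a sketch, not part of the original method) is to add
## alpha pseudo-counts per class so no estimate hits exactly 0 or 1:
def wordscore_pos_smoothed(alpha=1.0):
    total_count = np.asarray(X.sum(axis=0)).ravel()
    pos_count = np.asarray(X[data.target == 1].sum(axis=0)).ravel()
    # alpha pseudo-counts for each of the two classes
    prob = (pos_count + alpha) / (total_count + 2 * alpha)
    return list(zip(terms, prob))
sorted(wordscore_pos_smoothed(), key=itemgetter(1))[:20]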
sentiment_classifier = MultinomialNB()
sentiment_classifier.fit(X, data.target)
predicted_classes_train = sentiment_classifier.predict(X)
print("Accuracy on train: {:.2f}%".format(np.mean(predicted_classes_train == data.target) * 100))
# don't forget to check this against the majority baseline:
sum(data.target)/len(data.target)
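## Training accuracy is optimistic; a quick held-out estimate (a sketch,
## assuming a scikit-learn version that ships sklearn.model_selection):
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(MultinomialNB(), X, data.target, cv=5)
print("5-fold CV accuracy: {:.2f}%".format(cv_scores.mean() * 100))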
# METHOD 2: Use class probabilities already calculated by your NB classifier
positive_lexicon = []
negative_lexicon = []
# P(word|negative): load_files sorts folder names alphabetically, so class 0 = 'neg'
negative_probs = sentiment_classifier.feature_log_prob_[0, :]
# P(word|positive): class 1 = 'pos'
positive_probs = sentiment_classifier.feature_log_prob_[1, :]
logodds = positive_probs - negative_probs
# positive
print("\nFeatures that are most indicative of positive sentiment:\n")
for i in np.argsort(-logodds)[:20]:
    print(terms[i])
print("\n\nFeatures that are most indicative of negative sentiment:\n")
# negative
for i in np.argsort(logodds)[:20]:
    print(terms[i])
# put the top/bottom words in the positive/negative lexicons
for i in np.argsort(-logodds)[:500]:
    positive_lexicon.append(terms[i])
# negative
for i in np.argsort(logodds)[:500]:
    negative_lexicon.append(terms[i])
positive_lexicon = set(positive_lexicon)
negative_lexicon = set(negative_lexicon)
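## A minimal sketch of lexicon-based scoring for raw text, using the
## TreebankWordTokenizer imported above (the example sentence is made up):
tokenizer = TreebankWordTokenizer()
def lexicon_score(text):
    tokens = set(tokenizer.tokenize(text.lower()))
    # positive-minus-negative lexicon hits; > 0 leans positive
    return len(tokens & positive_lexicon) - len(tokens & negative_lexicon)
lexicon_score("what a wonderful , moving film !")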
Now we can try to apply what we learned to data without labels.
## Loading the Kardashian data
with open("kardashian-transcripts.json", "rb") as f:
    transcripts = json.load(f)
msgs = [m['text'].lower() for transcript in transcripts
        for m in transcript if m['speaker'] == 'KIM']
## using the same transformation to get a term-document matrix (where terms match the ones in the movie-review data)
X2 = vec.transform(msgs)
labels = sentiment_classifier.predict(X2)
# Looking at the classes assigned by the classifier:
list(zip(labels, msgs))[:20]
# label distribution (everything has an assigned class, even though not everything might be subjective)
plt.hist(labels.tolist())
## We can look at the predicted class probabilities
label_prob = sentiment_classifier.predict_proba(X2)
positive_label_probabilities = label_prob[:, 1]
plt.hist(positive_label_probabilities)
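## Every message receives a label even when the model is unsure; one option
## (a sketch, the 0.9/0.1 cutoffs are arbitrary) is to keep only confident calls:
confident = (positive_label_probabilities > 0.9) | (positive_label_probabilities < 0.1)
print("confident predictions: {:.1f}%".format(confident.mean() * 100))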
## documents that are considered most negative
kard_sentiment = sorted(set(zip(positive_label_probabilities, msgs)),
                        key=itemgetter(0),
                        reverse=False)
for sent_score, m in kard_sentiment[:20]:
    print(sent_score, m)
    print("lexicon items:", ", ".join(list(set(m.split()).intersection(negative_lexicon))))
    print()
## documents that are considered most positive
kard_sentiment = sorted(set(zip(positive_label_probabilities, msgs)),
                        key=itemgetter(0),
                        reverse=False)
for sent_score, m in kard_sentiment[-20:]:
    print(sent_score, m)
    print("lexicon items:", ", ".join(list(set(m.split()).intersection(positive_lexicon))))
    print()
We see that the polarity lexicon does not generalize well from one dataset to another. What can we do?