%matplotlib inline
from __future__ import print_function
import json
from operator import itemgetter
from collections import defaultdict
import numpy as np
from matplotlib import pyplot as plt
from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_files
from sklearn.naive_bayes import MultinomialNB
## loading movie review data:
## http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz
data = load_files('txt_sentoken')
## First review and first label:
print(data.data[0])
print(data.target[0])
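`load_files` assigns integer labels from the sorted subfolder names, so (assuming the standard neg/pos layout of the review_polarity archive) we can confirm the mapping:
## label 0 = 'neg', label 1 = 'pos'
print(data.target_names)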
## Building the term document matrix using CountVectorizer
vec = CountVectorizer(min_df=50)  # keep only terms that appear in at least 50 reviews
X = vec.fit_transform(data.data)
terms = vec.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2
len(terms)
X[data.target == 1].sum(axis=0).shape  # per-term counts summed over the positive reviews: (1, n_terms)
## We estimate the positive_score as P(Positive|W)
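In terms of the counts computed below, for each word $w$ that survives the min_df filter:

$$P(\text{Positive} \mid w) = \frac{\text{count}(w \text{ in positive reviews})}{\text{count}(w \text{ in all reviews})}$$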
def wordscore_pos():
    total_count = X.sum(axis=0)                  # counts over all reviews, shape (1, n_terms)
    pos_count = X[data.target == 1].sum(axis=0)  # counts over positive reviews, shape (1, n_terms)
    # make sure they are 1d np.arrays
    total_count = np.asarray(total_count).ravel()
    pos_count = np.asarray(pos_count).ravel()
    # min_df=50 guarantees every denominator is nonzero
    prob = pos_count * 1.0 / total_count
    return list(zip(terms, prob))  # a list, so the result can be iterated more than once
## most "negative" words (lowest P(Positive|w))
negative_words = sorted(wordscore_pos(), key=itemgetter(1), reverse=False)[:20]
negative_words
## most "positive" words (highest P(Positive|w))
positive_words = sorted(wordscore_pos(), key=itemgetter(1), reverse=True)[:20]
positive_words
## looking up the score of a single word
dict(wordscore_pos())['voice']
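With min_df=50 the counts are fairly stable, but a score can still sit at exactly 0 or 1 when a word happens to occur only in one class. A smoothed variant (an addition to this notebook, using add-one/Laplace smoothing; the function name and alpha parameter are ours) pulls such estimates toward 0.5:

def wordscore_pos_smoothed(alpha=1.0):
    total_count = np.asarray(X.sum(axis=0)).ravel()
    pos_count = np.asarray(X[data.target == 1].sum(axis=0)).ravel()
    # add alpha pseudo-occurrences to each class before taking the ratio
    prob = (pos_count + alpha) / (total_count + 2 * alpha)
    return list(zip(terms, prob))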
Now we can try to apply what we learned to data without labels.
## Loading the Kardashian data (from assignment 2)
with open("kardashian-transcripts.json", "rb") as f:
    transcripts = json.load(f)
msgs = [m['text'].lower() for transcript in transcripts
        for m in transcript if m['speaker'] == 'KIM']
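A quick check (an added one-liner) of how many of Kim's messages we have to classify:

len(msgs)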
classifier = MultinomialNB()
classifier.fit(X, data.target)  # train on the labeled movie reviews
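Before applying the classifier out of domain, a held-out split (an addition to the original notebook, using standard scikit-learn utilities) gives a rough sense of its in-domain accuracy:

## sanity check: accuracy on held-out movie reviews
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, data.target, test_size=0.2, random_state=0)
MultinomialNB().fit(X_train, y_train).score(X_test, y_test)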
## using the same transformation to get a term-document matrix (where terms match the ones in the movie-review data)
X2 = vec.transform(msgs)  # transform (not fit_transform): reuse the movie-review vocabulary
labels = classifier.predict(X2)
# Looking at the classes assigned by the classifier:
list(zip(labels, msgs))[:20]  # list() needed in Python 3, where zip returns an iterator
# label distribution (everything has an assigned class, even though not everything might be subjective)
plt.hist(labels.tolist())
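The histogram has only two bars, so a one-line summary (added here) is simply the fraction of messages assigned the positive class:

labels.mean()  # fraction of Kim's messages labeled 1 (positive)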
## We can look at the predicted class probabilities
label_prob = classifier.predict_proba(X2)
positive_label_probabilities = label_prob[:, 1]  # columns of predict_proba follow classifier.classes_
## checking that column 1 indeed corresponds to label 1 (the positive class)
classifier.classes_
## documents that are considered most negative
sorted(set(zip(positive_label_probabilities, msgs)),
       key=itemgetter(0),
       reverse=False)[:20]
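For symmetry (an addition to the original notebook), the documents the classifier is most confident are positive:

## documents that are considered most positive
sorted(set(zip(positive_label_probabilities, msgs)),
       key=itemgetter(0),
       reverse=True)[:20]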
We see that the polarity lexicon does not generalize well from one dataset to another. In the next lecture we'll see how to address this problem.