Info/CS 4300: Language and Information - in-class demo

Lecture 11

Text Classification

In [30]:
from __future__ import print_function
import json
import numpy as np

from sklearn.cross_validation import ShuffleSplit  # note: renamed to sklearn.model_selection in newer scikit-learn releases
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB


from sklearn.metrics import classification_report, confusion_matrix

Load the data

In [31]:
with open("kardashian-transcripts.json", "rb") as f:
    transcripts = json.load(f)
In [32]:
kris_msgs = [m['text'] for transcript in transcripts for m in transcript
             if m['speaker'] == 'KRIS']


bruce_msgs = [m['text'] for transcript in transcripts for m in transcript
              if m['speaker'] == 'BRUCE']
In [33]:
kris_classes = ['Kris' for _ in kris_msgs]


bruce_classes = ['Bruce' for _ in bruce_msgs]


msgs = kris_msgs + bruce_msgs
msgs = np.array(msgs)

classes = kris_classes + bruce_classes
classes = np.array(classes)

Leave out a test set

In [34]:
nr_messages = len(msgs)

shuffle_split = ShuffleSplit(nr_messages, test_size=0.5, random_state=0)
In [35]:
train_idx, test_idx = next(iter(shuffle_split))  # ShuffleSplit yields (train, test) index arrays

train_idx[:10]
Out[35]:
array([5815, 8522, 8344, 4095, 5408, 5757, 3221, 4114, 8621, 8788])
In [36]:
msgs_train = msgs[train_idx]
msgs_test =  msgs[test_idx]

classes_train = classes[train_idx]
classes_test = classes[test_idx]
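As an aside, newer scikit-learn releases moved these utilities from sklearn.cross_validation to sklearn.model_selection. A rough equivalent of the split above, sketched with train_test_split (not part of the original notebook, and the exact indices it produces will differ):

from sklearn.model_selection import train_test_split

# hold out half of the messages for testing, with a fixed random seed
msgs_train, msgs_test, classes_train, classes_test = train_test_split(
    msgs, classes, test_size=0.5, random_state=0)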

Majority baseline

First, let's see how well we can do without looking at the features at all. We will learn the majority class from the training set and always predict that class.

In [139]:
n_kris = len([1 for c in classes_train if c == "Kris"])
n_bruce = len([1 for c in classes_train if c == "Bruce"])

print("number of messages in Kris class:", n_kris)
print("number of messages in Bruce class:", n_bruce)

if n_kris > n_bruce:
    print("Majority class is Kris")
    majority_class = "Kris"
else:
    print("Majority class is Bruce")
    majority_class = "Bruce"

# always predict the majority class for every test message
majority_classes_test = np.array([majority_class for _ in msgs_test])

print("Majority baseline accuracy: {:.2f}%".format(np.mean(majority_classes_test == classes_test) * 100))
number of messages in Kris class: 2847
number of messages in Bruce class: 2078
Majority class is Kris
Majority baseline accuracy: 56.83%
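For reference, scikit-learn also packages this baseline as DummyClassifier. A minimal sketch (not run in class, so it does not affect the numbers above):

from sklearn.dummy import DummyClassifier

# ignores the features entirely and always predicts the most frequent training class
dummy = DummyClassifier(strategy="most_frequent")
dummy.fit(np.zeros((len(classes_train), 1)), classes_train)  # a feature matrix is required but unused
dummy_predictions = dummy.predict(np.zeros((len(classes_test), 1)))
print("DummyClassifier baseline accuracy: {:.2f}%".format(np.mean(dummy_predictions == classes_test) * 100))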

Getting features (term-doc matrix)

In [158]:
## getting term-doc matrix
vectorizer = CountVectorizer(ngram_range=(1, 2))  # unigrams and bigrams; for unigrams only, use ngram_range=(1, 1)
vectorizer.fit(msgs_train)

terms = vectorizer.get_feature_names()
terms[-10:]
Out[158]:
[u'yourself out',
 u'yourselves',
 u'yourselves single',
 u'youto',
 u'youto make',
 u'yum',
 u'zack',
 u'zack you',
 u'zito',
 u'zito rico']
In [159]:
term_document_matrix_train = vectorizer.transform(msgs_train)
term_document_matrix_train
Out[159]:
<4925x15630 sparse matrix of type '<type 'numpy.int64'>'
	with 57528 stored elements in Compressed Sparse Row format>
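To see concretely what this matrix holds, here is a tiny toy example (made up for illustration, not from the transcripts):

toy_docs = ["bruce plays golf", "kris plans the party", "kris and bruce plan the party"]
toy_vectorizer = CountVectorizer(ngram_range=(1, 2))
toy_matrix = toy_vectorizer.fit_transform(toy_docs)

print(toy_vectorizer.get_feature_names())  # the unigram + bigram vocabulary
print(toy_matrix.toarray())  # one row per document, one column per term, entries are counts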
In [160]:
print(confusion_matrix(classes_test, majority_classes_test))  # confusion matrix for the majority baseline: every message is predicted as Kris
[[   0 2126]
 [   0 2799]]

Building a classifier

In [161]:
# classifier = MultinomialNB()  # alternative: Naive Bayes over the raw term counts
classifier = BernoulliNB()  # Naive Bayes over binary (term present / absent) features

classifier.fit(term_document_matrix_train, classes_train)
Out[161]:
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
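Swapping in the commented-out MultinomialNB above is a one-line change: MultinomialNB models the raw term counts in each message, while BernoulliNB with binarize=0.0 only models whether each term appears at all. A sketch of the alternative (not run in class):

alt_classifier = MultinomialNB()  # uses the counts themselves rather than presence/absence
alt_classifier.fit(term_document_matrix_train, classes_train)
# alt_classifier.predict(...) can then be evaluated exactly like the BernoulliNB model below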

Evaluate

In [162]:
term_document_matrix_test = vectorizer.transform(msgs_test)

predicted_classes_test = classifier.predict(term_document_matrix_test)
In [163]:
predicted_classes_train = classifier.predict(term_document_matrix_train)
print("Training accuracy: {:.2f}%".format(np.mean(predicted_classes_train == classes_train) * 100))
Training accuracy: 89.34%

In [129]:
print("Accuracy: {:.2f}%".format(np.mean(predicted_classes_test == classes_test) * 100))
Accuracy: 70.56%
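Equivalently (not run in class), every scikit-learn classifier exposes mean accuracy through its score method:

print("Test accuracy: {:.2f}%".format(classifier.score(term_document_matrix_test, classes_test) * 100))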

In [123]:
classifier.classes_  # checking the order of the classes (this is the row/column order of the confusion matrix)
Out[123]:
array(['Bruce', 'Kris'], 
      dtype='|S5')
In [108]:
print(confusion_matrix(classes_test, predicted_classes_test))  # rows = true class, columns = predicted class
[[ 987 1139]
 [ 311 2488]]

In [109]:
print(classification_report(classes_test, predicted_classes_test))
             precision    recall  f1-score   support

      Bruce       0.76      0.46      0.58      2126
       Kris       0.69      0.89      0.77      2799

avg / total       0.72      0.71      0.69      4925
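These per-class numbers follow directly from the confusion matrix above (rows and columns ordered as ['Bruce', 'Kris']); a quick sanity check, not part of the original notebook:

cm = confusion_matrix(classes_test, predicted_classes_test)
bruce_precision = cm[0, 0] / float(cm[:, 0].sum())  # 987 / (987 + 311)  ~ 0.76
bruce_recall = cm[0, 0] / float(cm[0, :].sum())  # 987 / (987 + 1139) ~ 0.46
bruce_f1 = 2 * bruce_precision * bruce_recall / (bruce_precision + bruce_recall)  # ~ 0.58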


Features

In [110]:
print(terms[-5:])
classifier.feature_count_[:, -5:]  # rows = classes (Bruce, Kris), columns = the last 5 terms; for BernoulliNB these count documents containing each term
[u'yum', u'zack', u'zack you', u'zito', u'zito rico']

Out[110]:
array([[ 0.,  1.,  1.,  1.,  1.],
       [ 1.,  0.,  0.,  0.,  0.]])
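The log probabilities used in the next cell come from these counts via Laplace smoothing. Assuming scikit-learn's standard BernoulliNB smoothing with alpha = 1, P(term | class) = (documents of that class containing the term + 1) / (documents of that class + 2), which can be checked like this (not run in class):

smoothed = np.log((classifier.feature_count_ + 1.0) /
                  (classifier.class_count_[:, np.newaxis] + 2.0))
print(np.allclose(smoothed, classifier.feature_log_prob_))  # expected to print True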
In [116]:
# log P(feature | Bruce)
bruce_probs = classifier.feature_log_prob_[0, :]

# log P(feature | Kris)
kris_probs = classifier.feature_log_prob_[1, :]

# log-odds: positive values favor Bruce, negative values favor Kris
logodds = bruce_probs - kris_probs

# 10 most negative log-odds: terms much more likely in Kris's messages
print("\nFeatures that are most indicative of Kris:\n")
for i in np.argsort(logodds)[:10]:
    print(terms[i])

# 10 most positive log-odds: terms much more likely in Bruce's messages
print("\n\nFeatures that are most indicative of Bruce\n")
for i in np.argsort(-logodds)[:10]:
    print(terms[i])

Features that are most indicative of Kris:

stop it
feels
okay well
birthday
that enough
every time
feelings
whoo
for her
her to


Features that are most indicative of Bruce

few pounds
pounds
lose few
could lose
kimberly
he been
difference
like what
hand hug
mean this
