Info/CS 4300: Language and Information - in-class demo

Lecture 19

Looking closer at text classification:

Adding conversational features

In [59]:
from __future__ import print_function
import json
import numpy as np

# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18
# and removed in 0.20; the modern equivalent lives in sklearn.model_selection
# (where ShuffleSplit's constructor signature also changed).
from sklearn.cross_validation import ShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer

# Classifiers: Naive Bayes variants and a linear SVM.
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
# Univariate feature selection via chi-squared scores.
from sklearn.feature_selection import SelectKBest, chi2


from sklearn.metrics import classification_report, confusion_matrix

# Sparse matrices: needed to hstack extra columns onto the term-doc matrix.
from scipy import sparse as sp 
In [60]:
EXTRA=False # set to True to activate the extra feature (length in our case, but could be easily changed)

Load the data

In [61]:
# Load the raw transcripts: a list of episodes, each a list of utterance
# dicts (speaker, text, toks, timestamp, transcript_id — see next cell).
# NOTE(review): file is opened in binary mode; json.load accepts this under
# Python 2 (and under Python 3.6+, where the encoding is auto-detected).
with open("kardashian-transcripts.json", "rb") as f:
    transcripts = json.load(f)
In [62]:
transcripts[1][1]
Out[62]:
{u'episode_title': u'Keeping Up With the Kardashians - Shape Up or Ship Out',
 u'speaker': u'KHLOE',
 u'text': u'I mean, I want him to feel like a man.',
 u'timestamp': u'00:00:46',
 u'toks': [u'i',
  u'mean',
  u',',
  u'i',
  u'want',
  u'him',
  u'to',
  u'feel',
  u'like',
  u'a',
  u'man',
  u'.'],
 u'transcript_id': u'kardashians/153950'}
In [63]:
def _speaker_msgs(speaker):
    """All utterance texts spoken by `speaker`, in transcript order."""
    return [m['text'] for transcript in transcripts for m in transcript
            if m['speaker'] == speaker]


def _prev_speaker_tags(speaker):
    """For each utterance by `speaker`, a feature token naming who spoke
    immediately before ("to_<NAME>"), or "_T" when the utterance opens the
    transcript (no previous speaker exists)."""
    return [("to_" + transcript[k - 1]['speaker'] if k > 0 else "_T")
            for transcript in transcripts for k, m in enumerate(transcript)
            if m['speaker'] == speaker]


def _mask_class_labels(tags):
    """Replace references to the two class speakers with the neutral token
    "_M".  This (a) handles consecutive lines by the same person (a data
    representation issue) and (b) keeps the class labels KRIS/BRUCE from
    leaking into the features when they appear as previous speakers."""
    return ["_M" if t in ("to_KRIS", "to_BRUCE") else t for t in tags]


# The KRIS and BRUCE pipelines were previously copy-pasted; the shared
# helpers above keep them in lockstep.
kris_msgs = _speaker_msgs('KRIS')
kris_prev = _mask_class_labels(_prev_speaker_tags('KRIS'))

bruce_msgs = _speaker_msgs('BRUCE')
bruce_prev = _mask_class_labels(_prev_speaker_tags('BRUCE'))
In [64]:
# Prepend the previous-speaker token to each message so that it enters the
# bag-of-words representation as an ordinary "word".
kris_msgs = [prev + " " + text for prev, text in zip(kris_prev, kris_msgs)]
bruce_msgs = [prev + " " + text for prev, text in zip(bruce_prev, bruce_msgs)]

# One class label per message, aligned with the concatenated message list.
kris_classes = ['Kris'] * len(kris_msgs)
bruce_classes = ['Bruce'] * len(bruce_msgs)

# Stack both speakers into numpy arrays for fancy-index splitting later.
msgs = np.array(kris_msgs + bruce_msgs)
classes = np.array(kris_classes + bruce_classes)

kris_msgs[:10]
Out[64]:
[u'to_KIM Like last week.',
 u'to_KIM I have one gray hair, and in order for her make it all blend, she has to do the whole head.',
 u'_M Right, K?',
 u'to_KENDALL Right.',
 u'_M Will you guys look at this new dress I have in my closet?',
 u"to_KHLOE I'm asking you to approve.",
 u'_M I have 24 hours, okay?',
 u"_M It's right over there.",
 u'_M Look at the dress.',
 u'to_KHLOE Yep.']
In [65]:
## This is how zip works: it pairs elements positionally.
## NOTE(review): under Python 2 zip returns a list (as the output below
## shows); under Python 3 it returns a lazy iterator and would need list().
zip([1,2,3],["a","b","c"])
Out[65]:
[(1, 'a'), (2, 'b'), (3, 'c')]

Leave out a test set

In [66]:
# Random shuffled 50/50 train/test split over message indices; the fixed
# random_state makes the split reproducible across runs.
n_messages = len(msgs)

shuffle_split = ShuffleSplit(n_messages, test_size=0.5, random_state=0)
In [67]:
# ShuffleSplit is an iterable of (train_indices, test_indices) pairs;
# take the first (and here only needed) split.
train_idx, test_idx = next(iter(shuffle_split))  #iterator

# Show a few training indices to confirm they are shuffled.
train_idx[:10]
Out[67]:
array([5815, 8522, 8344, 4095, 5408, 5757, 3221, 4114, 8621, 8788])
In [68]:
# Slice messages and labels with the index arrays produced by the split.
msgs_train, msgs_test = msgs[train_idx], msgs[test_idx]
classes_train, classes_test = classes[train_idx], classes[test_idx]

Majority baseline

First, let's see how well we can do without looking at the features at all. We will learn the majority class from the train set and always predict that class.

In [69]:
# Count training examples per class (idiomatic sum over a generator rather
# than materialising a throwaway list just to take its length).
n_kris = sum(1 for c in classes_train if c == "Kris")
n_bruce = sum(1 for c in classes_train if c == "Bruce")

print("number of messages in Kris class:", n_kris)
print("number of messages in Bruce class:", n_bruce)

# The baseline ignores the features entirely: learn which class is more
# frequent on the training set and predict it for every test message.
if n_kris > n_bruce:
    print("Majority class is Kris")
    majority_class = "Kris"
else:
    print("Majority class is Bruce")
    majority_class = "Bruce"

# (removed an unused `n_train = len(classes_train)` local)

## always predict majority class
majority_classes_test = [majority_class for _ in msgs_test]

# Comparing a Python list of labels against a numpy array broadcasts
# elementwise; the mean of the boolean result is the accuracy.
print("Majority baseline accuracy: {:.2f}%".format(np.mean(majority_classes_test == classes_test) * 100))
number of messages in Kris class: 2847
number of messages in Bruce class: 2078
Majority class is Kris
Majority baseline accuracy: 56.83%

Getting features (term-doc matrix)

In [80]:
## getting term-doc matrix: raw counts of unigrams AND bigrams
vectorizer = CountVectorizer(ngram_range=(1, 2))  # for  unigrams only use ngram_range=(1, 1)
# Fit the vocabulary on the TRAINING messages only — no peeking at test.
vectorizer.fit(msgs_train)

# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(); fine for the old version used here.
terms = vectorizer.get_feature_names()
# Last few vocabulary entries (vocabulary is alphabetically ordered).
terms[-10:]
Out[80]:
[u'yourself out',
 u'yourselves',
 u'yourselves single',
 u'youto',
 u'youto make',
 u'yum',
 u'zack',
 u'zack you',
 u'zito',
 u'zito rico']
In [85]:
# Map training messages to a sparse document-term count matrix
# (rows = messages, columns = vocabulary terms).
term_document_matrix_train = vectorizer.transform(msgs_train)
term_document_matrix_train
Out[85]:
<4925x17046 sparse matrix of type '<type 'numpy.int64'>'
	with 67387 stored elements in Compressed Sparse Row format>
In [86]:
# Keep the 1000 features most associated with the class labels according
# to a chi-squared test; fitted on training data only.
fsel=SelectKBest(score_func=chi2,k=1000)
fsel.fit(term_document_matrix_train,classes_train)
Out[86]:
SelectKBest(k=1000, score_func=<function chi2 at 0x10c9c8c80>)
In [87]:
# Reduce the training matrix to the 1000 selected columns.
# NOTE(review): reassigning the same name makes this cell non-idempotent —
# running it twice would try to select from the already-reduced matrix.
# Only safe under a strict top-to-bottom, run-once execution order.
term_document_matrix_train=fsel.transform(term_document_matrix_train)
term_document_matrix_train.shape
Out[87]:
(4925, 1000)

Add extra features, such as length in characters

In [88]:
def extra_feature(d):
    """Extra per-message feature: length of the raw message in characters.

    Swap the body for any other scalar function of the message text to
    experiment with different hand-crafted features.
    """
    return len(d)


if EXTRA:
    # Shape the lengths into a column vector (n_messages x 1) so it can be
    # hstacked onto the sparse term-document matrix in the next cell.
    # (A dead trailing `extra_train` expression was removed: inside an `if`
    # it was not the cell's last top-level node, so it displayed nothing.)
    extra_train = np.atleast_2d([extra_feature(d) for d in msgs_train]).T
In [89]:
if EXTRA:
    ## merging the extra feature vector with the old feature matrix
    # (the length column becomes feature 0; sp.hstack keeps the result sparse)
    term_document_matrix_train =  sp.hstack([extra_train,term_document_matrix_train])

Building a classifier

In [90]:
# Linear SVM on the selected features; swap in the Naive Bayes line to
# compare classifiers — the fit/predict interface is identical.
#classifier=MultinomialNB()
classifier=LinearSVC()

classifier.fit(term_document_matrix_train,classes_train)
Out[90]:
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0)

Evaluate

In [91]:
# Apply the SAME fitted transformations to the test messages, in the same
# order used for training: vectorize, select features, then (optionally)
# append the extra length column as feature 0.
term_document_matrix_test = vectorizer.transform(msgs_test)
term_document_matrix_test = fsel.transform(term_document_matrix_test)

if EXTRA:
    # Build the extra-feature column vector exactly as done for training.
    extra_test = [extra_feature(d) for d in msgs_test]
    extra_test = np.atleast_2d(extra_test)
    extra_test =extra_test.T

    term_document_matrix_test =  sp.hstack([extra_test,term_document_matrix_test])

predicted_classes_test = classifier.predict(term_document_matrix_test)
In [92]:
# Training accuracy — an optimistic upper bound that mainly shows how much
# the model overfits relative to the test accuracy below.
predicted_classes_train = classifier.predict(term_document_matrix_train)
train_accuracy = np.mean(predicted_classes_train == classes_train) * 100
print("Accuracy on train: {:.2f}%".format(train_accuracy))
Accuracy on train: 80.95%
In [93]:
print("Accuracy on test: {:.2f}%".format(np.mean(predicted_classes_test == classes_test) * 100))
Accuracy on test: 68.57%
In [46]:
classifier.classes_  #checking the order of the classes (for the confusion matrix)
Out[46]:
array(['Bruce', 'Kris'], 
      dtype='|S5')
In [47]:
print(confusion_matrix(classes_test, predicted_classes_test))  
[[ 566 1560]
 [ 273 2526]]
In [48]:
print(classification_report(classes_test, predicted_classes_test))
             precision    recall  f1-score   support

      Bruce       0.67      0.27      0.38      2126
       Kris       0.62      0.90      0.73      2799

avg / total       0.64      0.63      0.58      4925

In [ ]: