In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
import json
%matplotlib inline
import matplotlib.pyplot as plt
In [2]:
with open("movie_scripts_data.json") as f:
    data = json.loads(f.readlines()[0])
print("Loaded {} movie transcripts".format(len(data)))
print("Each movie transcript is a dictionary with the following keys...")
print(data[0].keys())

# map each movie_id to its position in the list; this position is the
# movie's row index in the document-term matrix built below
movie_id_to_index = {movie['movie_id']: index for index, movie in enumerate(data)}

print("The index of \"{}\" is {}".format(data[7]['movie_name'], movie_id_to_index[data[7]['movie_id']]))
Loaded 617 movie transcripts
Each movie transcript is a dictionary with the following keys...
dict_keys(['movie_name', 'movie_id', 'categories', 'script'])
The index of "spare me" is 7

We can see that each movie is assigned an "index" (from 0 to 616). These will correspond to the rows of a document-by-term-count matrix.
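For convenience later, the reverse lookup can be built the same way; a minimal sketch (these helper names are our own and aren't used elsewhere in the notebook):

# hypothetical helpers: map a row index back to a movie name, and vice versa
movie_index_to_name = {index: movie['movie_name'] for index, movie in enumerate(data)}
movie_name_to_index = {movie['movie_name']: index for index, movie in enumerate(data)}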

In [3]:
count_vec = CountVectorizer(stop_words='english', max_df=0.8, min_df=10,
                            max_features=1000, binary=True)

term_doc_matrix = count_vec.fit_transform([x['script'] for x in data])
print(term_doc_matrix.shape)

# word index (newer scikit-learn versions use count_vec.get_feature_names_out())
features = count_vec.get_feature_names()
print(features[:100])
(617, 1000)
['able', 'absolutely', 'accept', 'accident', 'account', 'act', 'acting', 'action', 'actually', 'address', 'admit', 'advice', 'afford', 'afraid', 'afternoon', 'age', 'agent', 'ago', 'agree', 'ah', 'ahead', 'ain', 'air', 'alive', 'alright', 'amazing', 'america', 'american', 'angry', 'animal', 'answer', 'answers', 'anybody', 'anymore', 'apart', 'apartment', 'apologize', 'appreciate', 'area', 'aren', 'arm', 'arms', 'army', 'arrest', 'art', 'asked', 'asking', 'asleep', 'ass', 'asshole', 'attack', 'attention', 'awful', 'baby', 'bag', 'ball', 'bank', 'bar', 'bastard', 'bathroom', 'beat', 'beautiful', 'beauty', 'bed', 'beer', 'beg', 'begin', 'beginning', 'belong', 'best', 'bet', 'bigger', 'biggest', 'birthday', 'bit', 'bitch', 'black', 'blame', 'blew', 'blind', 'blood', 'blow', 'blue', 'board', 'boat', 'body', 'book', 'books', 'born', 'boss', 'bother', 'bought', 'bout', 'box', 'boy', 'boys', 'brain', 'brains', 'break', 'breakfast']
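Because binary=True, each entry records only whether a word appears in a script at all, not how many times; max_df=0.8 drops words appearing in more than 80% of scripts, min_df=10 requires a word to appear in at least 10, and max_features=1000 keeps the 1000 most frequent survivors. A quick sanity check (a sketch using the objects defined above):

# in a binary matrix every entry is 0 or 1, so column sums are document
# frequencies: the number of scripts each surviving word appears in
assert term_doc_matrix.max() == 1
doc_freq = np.asarray(term_doc_matrix.sum(axis=0)).ravel()
print(features[int(doc_freq.argmax())], int(doc_freq.max()))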
In [4]:
# densify: the numpy broadcasting below is simpler on a dense array
term_doc_matrix = term_doc_matrix.toarray()
In [5]:
term_doc_matrix[:1].tolist()
Out[5]:
[[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..., 1, 1, 1, 0, 0, 1, 0, 1]]
(output truncated: a single row of 1000 binary presence/absence values)
In [6]:
# since the matrix is binary, (A^T A)[i, j] counts the number of
# documents in which words i and j both appear
cooccurrence_matrix = np.dot(term_doc_matrix.T, term_doc_matrix)
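To see why A.T @ A gives co-occurrence counts for a binary matrix, here is a tiny worked example (toy data, not from the movie corpus):

# two documents over three words: doc 0 contains words 0 and 1,
# doc 1 contains words 1 and 2
A = np.array([[1, 1, 0],
              [0, 1, 1]])
print(A.T @ A)
# [[1 1 0]
#  [1 2 1]
#  [0 1 1]]
# entry (0, 1) = 1: words 0 and 1 co-occur in exactly one document;
# the diagonal holds each word's document frequency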
In [7]:
def find_most_similar_words(word, similarity_matrix, topk=10):
    """Print the topk words most similar to `word` under similarity_matrix."""
    if word not in features:
        print(word, 'is OOV.')
        return None
    idx = features.index(word)
    # sort word indices by similarity to `word`, highest first
    sorted_words = np.argsort(similarity_matrix[idx])[::-1]
    print('Most similar {} words to "{}" are:'.format(topk, word))
    for i in range(topk):
        j = sorted_words[i]
        print(features[j], similarity_matrix[idx, j])
In [8]:
find_most_similar_words('computer', similarity_matrix=cooccurrence_matrix)
Most similar 10 words to "computer" are:
computer 104
hell 93
thanks 91
okay 91
working 89
kill 87
thank 87
haven 87
seen 86
nice 86

Hmm, not great. The issue is that this does not account for words co-occurring by chance alone. E.g., "hell" is quite a popular word in movies, so it co-occurs with almost everything. We can account for the probability of two words co-occurring by chance using (a version of) PMI.
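As a reminder (the standard definition; the notebook itself doesn't spell it out): PMI(a, b) = log( P(a, b) / (P(a) · P(b)) ). With N documents and a binary matrix, P(a) ≈ n_a / N, where n_a is a's document frequency, and P(a, b) ≈ C[a, b] / N, so the log's argument is N · C[a, b] / (n_a · n_b). The cells below compute C[a, b] / (n_a · n_b), dropping the constant N and the log; since the log is monotonic and n_a is fixed for a given query word, the ranking of most-similar words is unchanged.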

In [9]:
# document frequency of each word (column sums of the binary matrix)
pa = np.sum(term_doc_matrix, 0)
In [10]:
pa.shape
Out[10]:
(1000,)
In [11]:
# broadcast: divide column j of the co-occurrence counts by pa[j]
PMI_part = cooccurrence_matrix / pa
In [12]:
# transpose and divide again: PMI[i, j] = C[i, j] / (pa[i] * pa[j]), since C is symmetric
PMI = PMI_part.T / pa
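For completeness, the full PMI (with the log and the constant N) can be computed directly; a sketch of our own, equivalent in ranking to the quantity above:

# full PMI: log( N * C[a, b] / (n_a * n_b) ); the log and the constant N
# do not change the argsort-based rankings used below
N = term_doc_matrix.shape[0]
with np.errstate(divide='ignore'):  # log(0) -> -inf for pairs that never co-occur
    full_PMI = np.log(N * cooccurrence_matrix / np.outer(pa, pa))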
In [13]:
find_most_similar_words('computer', PMI, 10)
Most similar 10 words to "computer" are:
computer 0.009615384615384616
security 0.0034097108565193674
area 0.0031561949500880796
mission 0.002987303958177745
team 0.002968508002065049
using 0.002939735423811857
officer 0.0028702640642939152
contact 0.002837326607818411
main 0.002772002772002772
reading 0.002765208647561589
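One wrinkle in both rankings: the query word always tops its own list, since a word co-occurs with itself in every document it appears in, making the diagonal the largest entry in its row. If that is unwanted, a small tweak (our own, not in the original notebook):

# zero the diagonal on a copy so the query word doesn't rank itself first
PMI_offdiag = PMI.copy()
np.fill_diagonal(PMI_offdiag, 0)
find_most_similar_words('computer', PMI_offdiag, 10)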