In [1]:
from __future__ import print_function
import numpy as np
import json
import warnings
warnings.filterwarnings("ignore") #silence some sklearn deprecation chatter
In [2]:
with open("kickstarter.jsonlist") as f:
    data = [x for x in json.loads(f.readline()) #the whole list sits on the first line
            if len(x['text'].split()) > 50] #keep projects with more than 50 words of text
print("{} projects loaded".format(len(data)))
np.random.shuffle(data) #to prove i'm not cherry-picking the example below...
print(data[0].keys())
45129 projects loaded
[u'raised', u'sub_category', u'text', u'creator_num_backed', u'featured', u'result', u'duration', u'category', u'goal', u'creator_facebook_connect', u'projectId', u'lon', u'has_video', u'comments', u'faqs', u'start_date', u'rewards', u'end_date', u'parent_category', u'updates', u'lat', u'short_text', u'name', u'url', u'backers']

Here's a trick -- you can encode categorical variables as indicator (one-hot) features! We'll use this strategy for the project's starting year.
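As a quick illustration of the idea (with a made-up category set -- the real version for years is in the next cell):

#hypothetical sketch: indicator-encode a toy categorical variable
toy_categories = ['music', 'film', 'games']
toy_index = {c: i for i, c in enumerate(toy_categories)}
vec = np.zeros(len(toy_categories))
vec[toy_index['film']] = 1.
print(vec) #-> [ 0.  1.  0.]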

In [3]:
year_to_index = {y:i for i,y in enumerate(set([x['start_date'][:4]
                                              for x in data]))}
index_to_year = {i:y for y,i in year_to_index.items()}

ordered_years = [index_to_year[i] for i in range(len(index_to_year))]

print(ordered_years)
[u'2009', u'2011', u'2010', u'2012']
In [4]:
def control_features(p_in):
    '''Extracts some basic control features for a non-language baseline'''
    names = []
    vals = []
    
    #Was the project "featured" on kickstarter?
    names.append("was_featured")
    vals.append(p_in["featured"])
    
    #Did the creator have facebook connected to their kickstarter?
    names.append("had_facebook")
    vals.append(p_in["creator_facebook_connect"])
    
    #Did the creator have a video?
    names.append("had_video")
    vals.append(p_in["has_video"])
    
    #How much money were they trying to raise? log-scaled.
    names.append("log_goal")
    vals.append(np.log(p_in["goal"]))

    #How many rewards did they offer?
    names.append('rewards')
    vals.append(len(p_in['rewards']))
    
    #What is the project's year?
    names.extend(['created_' + y for y in ordered_years])
    year_indicator = np.zeros(len(ordered_years))
    year_indicator[year_to_index[p_in['start_date'][:4]]] = 1
    vals.extend(year_indicator)

    return names, np.array(vals, dtype = np.float32)
In [5]:
names, vals = control_features(data[0])
print(data[0]['name'])
for n,v in zip(names, vals):
    print("{}:{:.3f}".format(n,v))
'Cockney: Regeneration'. London's last days (before the Olympics). A documentary
was_featured:0.000
had_facebook:0.000
had_video:0.000
log_goal:8.517
rewards:6.000
created_2009:0.000
created_2011:0.000
created_2010:1.000
created_2012:0.000
In [6]:
Y = np.array([p['result'] for p in data],dtype = np.float32)
Xcontrol = np.vstack([control_features(p)[1] for p in data])
print(Y.shape)
print(Xcontrol.shape)
bl_acc = np.sum(Y==1.)*1./len(Y)
print("Baseline 1: Constant Prediction -- {:.4f}".format(np.max([bl_acc, 1.-bl_acc])))
(45129,)
(45129, 9)
Baseline 1: Constant Prediction -- 0.5187
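Equivalently, sklearn ships a DummyClassifier that computes this constant-prediction baseline for us -- a minimal sketch, not run here:

#sketch: the same majority-class baseline via sklearn
from sklearn.dummy import DummyClassifier
dummy = DummyClassifier(strategy='most_frequent').fit(Xcontrol, Y)
print(dummy.score(Xcontrol, Y)) #should match the 0.5187 above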
In [7]:
from sklearn import cross_validation
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

def get_cv_acc(X_in, Y_in, cv = 10):
    '''Mean accuracy over cv repeated random 80/20 train/test splits.
    Also returns the model fit on the last split, so we can peek at coefficients.'''
    accs = []
    for split in range(cv):
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_in,
                                                                             Y_in,
                                                                             test_size=.2,
                                                                             random_state=split)
        #you should optimize hyperparameters, but that's another story for another day.
        lr = LogisticRegression()
        lr.fit(X_train, y_train)
        accs.append(accuracy_score(y_test, lr.predict(X_test)))
    return np.mean(accs), lr
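Aside: this is repeated random subsampling rather than true k-fold CV; for the k-fold version, the same (era-appropriate) cross_validation module has a one-liner -- a sketch:

#sketch: 10-fold alternative to the loop above
scores = cross_validation.cross_val_score(LogisticRegression(), Xcontrol, Y,
                                          cv=10, scoring='accuracy')
print(scores.mean())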
In [8]:
acc, model_example = get_cv_acc(Xcontrol, Y)
print("Baseline 2: Control features -- {:.4f}".format(acc))
print()
for feat, coef in zip(names + ['bias'], list(model_example.coef_[0,:]) + [model_example.intercept_[0]]):
    print("{}: {:.4f}".format(feat, coef))
Baseline 2: Control features -- 0.6655

was_featured: 2.5285
had_facebook: 0.0256
had_video: 0.8398
log_goal: -0.6350
rewards: 0.1360
created_2009: 0.5919
created_2011: 0.7834
created_2010: 0.7283
created_2012: 0.6719
bias: 2.7755
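Since logistic regression coefficients are log-odds, exponentiating them gives rough all-else-equal odds multipliers: being featured multiplies a project's odds of success by about exp(2.53) ≈ 12.5. A quick sketch:

#sketch: read coefficients as odds ratios
for feat, coef in zip(names, model_example.coef_[0,:]):
    print("{}: odds x{:.2f}".format(feat, np.exp(coef)))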
In [9]:
# Let's see if normalizing feature-wise helps! This makes interpretability a bit trickier...
from sklearn.preprocessing import normalize
Xcontrol_norm = normalize(Xcontrol, axis=0)
acc, model_example = get_cv_acc(Xcontrol_norm, Y)
print("Baseline 3: Normalized Control features -- {:.4f}".format(acc))
print()
for feat, coef in zip(names + ['bias'], list(model_example.coef_[0,:]) + [model_example.intercept_[0]]):
    print("{}: {:.4f}".format(feat, coef))
Baseline 3: Normalized Control features -- 0.5188

was_featured: 9.9851
had_facebook: 0.0632
had_video: 5.0117
log_goal: -3.1953
rewards: 5.7406
created_2009: 0.0227
created_2011: 1.8019
created_2010: -0.3952
created_2012: -1.8250
bias: 0.0308

Well, okay, let's not do that then (for now). A plausible culprit: normalize(axis=0) scales each column to unit L2 norm over all ~45k rows, so individual feature values shrink to roughly 1/sqrt(n), and LogisticRegression's default regularization then keeps the coefficients too small to compensate -- accuracy collapses back to the constant-prediction baseline.
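If we did want scaled features, standardizing each column (zero mean, unit variance) usually plays better with the default penalty -- a minimal sketch, untested here:

#sketch: per-feature standardization instead of unit-L2 column scaling
from sklearn.preprocessing import StandardScaler
Xcontrol_std = StandardScaler().fit_transform(Xcontrol)
acc_std, _ = get_cv_acc(Xcontrol_std, Y)
print("Standardized control features -- {:.4f}".format(acc_std))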

Let's add some topic model features!

In [10]:
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer
import string
exclude = set(string.punctuation)
#lowercase each project's description + title and strip punctuation
docs = [''.join([ch for ch in p['text'].lower() + " " + p['name'].lower()
                if ch not in exclude]) for p in data]
cv = CountVectorizer(stop_words = 'english', max_df=.7, min_df=50,
                     max_features=6000)
counts = cv.fit_transform(docs)
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        #argsort is ascending, so walk it backwards for the n_top_words highest-weight terms
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()
In [11]:
n_topic = 15
print(counts.shape)
model = LDA(n_topics=n_topic, max_iter=10, n_jobs=4, verbose=1)
res = model.fit_transform(counts)
(45129, 6000)
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   10.9s finished
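As a rough sanity check on the fit, sklearn's LDA also exposes a perplexity score (lower is better) -- a sketch, not run here:

#sketch: perplexity of the fitted topic model on the training counts
print(model.perplexity(counts))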
In [12]:
feature_names = cv.get_feature_names()
print_top_words(model, feature_names, 20)
Topic #0:
goal help project kickstarter make pledge money thank support rewards need new raise want time reach dont receive like friends
Topic #1:
design use product iphone app production designs project designed kickstarter case ipad using new available products video like make software
Topic #2:
like make just want im people time need going know really money help way good little dont ive things love
Topic #3:
documentary project people travel american history stories women world video interviews series america trip country culture years time states journey
Topic #4:
tour new city party york help 2011 road house music summer event shows 2012 fans austin money year rock band
Topic #5:
film production short crew films movie feature cast equipment festival director project festivals shoot money budget sound producer shooting make
Topic #6:
life world project help people story time love make hope way support thank like goal want need believe lives family
Topic #7:
game games play cards players new card player level character world characters team time fun set zombie playing development kickstarter
Topic #8:
school students children community people world social kids program public project education schools earth human work learn research youth change
Topic #9:
light black water project white space camera man piece power 3d burning time set used large wood metal ship lights
Topic #10:
art food project local community work create space artists materials gallery help new artist fashion collection exhibition painting business small
Topic #11:
book print books project art printing comic work novel cover edition magazine writing publishing issue printed copies series pages published
Topic #12:
story man life years family young old death girl new love father night john day time dark written home world
Topic #13:
new artists dance festival arts work theatre performance production theater play art york company stage san music musical 2011 support
Topic #14:
music album record songs recording cd band studio new song video musicians release ep project recorded sound time mastering years

These are the per-document topic distributions -- each row is one project's mixture over the 15 topics.

In [13]:
print(res.shape)
np.set_printoptions(precision=3)
res = normalize(res, axis=1, norm = 'l1') #renormalize each row into a proper distribution
print(res[:3,:])
print(np.sum(res[:3,:],axis=1))
(45129, 15)
[[  6.601e-04   6.601e-04   6.601e-04   3.437e-01   6.601e-04   2.936e-02
    1.239e-01   1.662e-02   1.692e-01   6.601e-04   7.678e-02   6.601e-04
    6.601e-04   2.353e-01   6.601e-04]
 [  8.027e-02   3.745e-04   4.275e-02   4.248e-02   5.568e-02   3.745e-04
    7.478e-02   3.745e-04   3.745e-04   5.252e-02   3.745e-04   3.745e-04
    3.745e-04   6.485e-01   3.745e-04]
 [  4.357e-04   4.357e-04   3.303e-01   1.232e-02   4.357e-04   4.357e-04
    9.546e-02   4.969e-02   1.892e-01   4.357e-04   4.357e-04   2.936e-01
    2.592e-02   4.357e-04   4.357e-04]]
[ 1.  1.  1.]
In [14]:
Xall = np.hstack([Xcontrol, res])
print(Xall.shape)
(45129, 24)
In [15]:
acc, model_example = get_cv_acc(res, Y)
print("Baseline 4: Language features -- {:.4f}".format(acc))
print()
for feat, coef in zip(['topic-{}'.format(i) for i in range(n_topic)] + ['bias'], list(model_example.coef_[0,:]) + [model_example.intercept_[0]]):
    print("{}: {:.4f}".format(feat, coef))
Baseline 4: Language features -- 0.6161

topic-0: 2.1993
topic-1: -2.0803
topic-2: 0.1134
topic-3: -0.6508
topic-4: 0.3789
topic-5: 0.0496
topic-6: -0.9671
topic-7: -0.7818
topic-8: -0.6217
topic-9: 0.6491
topic-10: -0.2392
topic-11: -0.5681
topic-12: -0.2334
topic-13: 2.1574
topic-14: 0.5291
bias: -0.0657
In [16]:
acc, model_example = get_cv_acc(Xall, Y)
print("Baseline 5: All features -- {:.4f}".format(acc))
print()
for feat, coef in zip(names + ['topic-{}'.format(i) for i in range(n_topic)] + ['bias'], list(model_example.coef_[0,:]) + [model_example.intercept_[0]]):
    print("{}: {:.4f}".format(feat, coef))
Baseline 5: All features -- 0.6881

was_featured: 2.6396
had_facebook: 0.0627
had_video: 0.8117
log_goal: -0.5913
rewards: 0.1268
created_2009: 0.4540
created_2011: 0.6764
created_2010: 0.5934
created_2012: 0.6037
topic-0: 1.7321
topic-1: -1.1163
topic-2: -0.1198
topic-3: -0.0102
topic-4: -0.0161
topic-5: 0.2327
topic-6: -0.4915
topic-7: -0.7442
topic-8: 0.2606
topic-9: 0.6220
topic-10: 0.0240
topic-11: -1.0359
topic-12: 0.2107
topic-13: 2.2725
topic-14: 0.5070
bias: 2.3275
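To turn this into a per-project prediction, predict_proba gives the modeled probability of success -- a sketch using the model from the last split (note the project below may well have been in its training set):

#sketch: modeled success probability for a single project
p_success = model_example.predict_proba(Xall[:1])[0, 1] #column 1 = P(result == 1)
print("{}: {:.2f}".format(data[0]['name'], p_success))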