Building the Model!¶

We're going to start by importing all libraries needed for the creation of the model. Refer to the requrements.txt file in the repository for the complete list and all version info.

from matplotlib import style
import pandas as pd
import numpy as np
%matplotlib inline
style.use("ggplot")
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Now let's read in the training set and take a look. 
df = pd.read_csv('~/rouest/project-submissions/data/train.csv')
df.head()

Now it's time to start building our feature space based on the EDA that we did.¶

# Even though no isFraud cases were to merchants, let's still break them out as a feature.
df['nameDestCat'] = df.nameDest.apply(lambda x: x[0])

# Now we need to select the features (and target) that we are using
dfalt = df.ix[:, [0, 1, 2, 4, 5, 7, 8, 9, 11]]

# And take a look . . .
dfalt.head()

As our EDA indicated, may of our float variables are heavily skewed.¶

Let's take the change to transform them with numpy's log1p to approximate a more normal distrbution (which will prove helpful in our support vector classification later) while retaining the 0 values.

dfalt.amount = np.log1p(dfalt.amount)
dfalt.oldbalanceOrg = np.log1p(dfalt.oldbalanceOrg)
dfalt.newbalanceOrig = np.log1p(dfalt.newbalanceOrig)
dfalt.oldbalanceDest = np.log1p(dfalt.oldbalanceDest)
dfalt.newbalanceDest = np.log1p(dfalt.newbalanceDest)

//anaconda/lib/python3.5/site-packages/pandas/core/generic.py:2701: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value

# Now we can dummy out the type and nameDestCat features 
dfdum = pd.get_dummies(dfalt, prefix='is')

dfdum.head()

# Let's create our features list so that we can create some clusters for our dataframe
features = list(set(dfdum.columns) - {'isFraud'})

kmeans = KMeans(n_clusters=25, random_state=0, n_jobs=-1)
kmeans.fit(dfdum[features])

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=25, n_init=10, n_jobs=-1, precompute_distances='auto',
    random_state=0, tol=0.0001, verbose=0)

# And then apply them to the dataset
predictedLabels = kmeans.predict(dfdum[features])
dfdum['predictedCluster'] = predictedLabels
dfdum['predictedCluster'] = dfdum.predictedCluster.astype('object')

# Let's take a look and then dummy it out again
dfdum.head()

dfclus = pd.get_dummies(dfdum)
dfclus.head()

In preparation for Grid Search, now we can create our final features set¶

features = list(set(dfclus.columns) - {'isFraud'})

#def rmse(estimator, X, y_true, greater_is_better=False):
    
#    preds = estimator.predict(X)
    
#    return mse(y_true, preds)**.5

grid = [{'C': [100, 10, 1, .1, .01, .001, .0001, .00001, .000001, .0000001]}] # Create the list for gridsearch

# Instantiate GridSearch and then . . . wait.
GridSearch = GridSearchCV(LinearSVC(random_state=0), grid, cv=5, n_jobs=-1)

# And fit! and wait! The nice thins is that GridSearch will automatically refit the best performing model 
# to the entire training set
GridSearch.fit(dfclus[features], dfclus.isFraud)

GridSearchCV(cv=5, error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'C': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 1e-05, 1e-06, 1e-07]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

# Let's take a look at our best score:
GridSearch.best_score_

0.99925541591602507

# and parameters . . .
GridSearch.best_params_

{'C': 0.01}

# and estimator as a whole!
GridSearch.best_estimator_

LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0)

Rinse and Repeat¶

Now we'll repeat our feature engineering from above on the test set.

# Read in the dataset remembering that it does not have our target so the shape is slightly different.
dft = pd.read_csv('~/rouest/project-submissions/data/test.csv')

dft['nameDestCat'] = dft.nameDest.apply(lambda x: x[0])
dft.head()

dftalt = dft.ix[:, [0, 1, 2, 4, 5, 7, 8, 10]]

dftalt.amount = np.log1p(dftalt.amount)
dftalt.oldbalanceOrg = np.log1p(dftalt.oldbalanceOrg)
dftalt.newbalanceOrig = np.log1p(dftalt.newbalanceOrig)
dftalt.oldbalanceDest = np.log1p(dftalt.oldbalanceDest)
dftalt.newbalanceDest = np.log1p(dftalt.newbalanceDest)

//anaconda/lib/python3.5/site-packages/pandas/core/generic.py:2701: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value

dftdum = pd.get_dummies(dftalt, prefix='is')

features = list(set(dftdum.columns) - {'isFraud'})

# Now we can fit our original kmeans to the test set.
kmeans.fit(dftdum[features])

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=25, n_init=10, n_jobs=-1, precompute_distances='auto',
    random_state=0, tol=0.0001, verbose=0)

predictedLabels = kmeans.predict(dftdum[features])
dftdum['predictedCluster'] = predictedLabels
dftdum['predictedCluster'] = dftdum.predictedCluster.astype('object')

dftclus = pd.get_dummies(dftdum)
#dftclus.head()

features = list(set(dftclus.columns) - {'isFraud'})

The time has come!¶

To make our predictions on the modified test set.

# We're using the decision_function for roc_auc.
predictionLabel = GridSearch.decision_function(dftclus[features])

array([-0.84425539, -2.04256494, -4.49793334, ..., -2.0997208 ,
       -4.66266477, -3.88519909])

preds = predictionLabel.tolist()
submit = {'Prediction': preds, 'id': dft['id']}
submit = pd.DataFrame(submit)

# Create the csv for submission!
submit.to_csv('rouestsub26.csv', index = False)

	step	type	amount	nameOrig	oldbalanceOrg	newbalanceOrig	nameDest	oldbalanceDest	newbalanceDest	id
0	303	CASH_IN	185164.71	C1499985475	3075480.01	3260644.72	C1771727877	881991.88	696827.18	540576
1	356	CASH_IN	79083.65	C108745493	5489716.32	5568799.97	C1167754301	153219.51	74135.86	120014
2	10	TRANSFER	2336832.78	C975415534	147958.78	0.00	C718985478	5069347.06	7307970.46	623141
3	238	TRANSFER	228517.91	C1968162743	0.00	0.00	C1544755390	18768561.09	18997079.00	547737
4	133	CASH_IN	180179.73	C467196066	21448.00	201627.73	C1386847873	7160295.13	6980115.40	569291

	step	type	amount	oldbalanceOrg	newbalanceOrig	oldbalanceDest	newbalanceDest	nameDestCat
0	303	CASH_IN	185164.71	3075480.01	3260644.72	881991.88	696827.18	C
1	356	CASH_IN	79083.65	5489716.32	5568799.97	153219.51	74135.86	C
2	10	TRANSFER	2336832.78	147958.78	0.00	5069347.06	7307970.46	C
3	238	TRANSFER	228517.91	0.00	0.00	18768561.09	18997079.00	C
4	133	CASH_IN	180179.73	21448.00	201627.73	7160295.13	6980115.40	C

	step	type	amount	nameOrig	oldbalanceOrg	newbalanceOrig	nameDest	oldbalanceDest	newbalanceDest	id	nameDestCat
0	257	CASH_OUT	208758.36	C1628429873	186770.00	0.00	C703878081	98209.59	306967.94	98777	C
1	375	PAYMENT	8078.82	C1642149666	40458.00	32379.18	M1726657457	0.00	0.00	397457	M
2	168	CASH_OUT	60783.19	C346779827	0.00	0.00	C1063782390	747531.29	808314.48	592225	C
3	590	CASH_IN	267290.88	C713342712	4646500.36	4913791.24	C1390373129	299549.13	32258.25	425029	C
4	377	CASH_IN	226575.08	C1831624676	7615712.40	7842287.48	C1453112071	1822519.59	1595944.51	118706	C

	step	amount	oldbalanceOrg	newbalanceOrig	oldbalanceDest	newbalanceDest	is_CASH_IN	is_TRANSFER	is_C
0	303	12.129006	14.938972	14.997436	13.689939	13.454294	1.0	0.0	1.0
1	356	11.278274	15.518387	15.532690	11.939633	11.213668	1.0	0.0	1.0
2	10	14.664307	11.904696	0.000000	15.438723	15.804476	0.0	1.0	1.0
3	238	12.339374	0.000000	0.000000	16.747694	16.759796	0.0	1.0	1.0
4	133	12.101716	9.973433	12.214183	15.784062	15.758576	1.0	0.0	1.0

	step	amount	oldbalanceOrg	newbalanceOrig	oldbalanceDest	newbalanceDest	is_CASH_IN	is_CASH_OUT	is_PAYMENT	...	predictedCluster_22
0	257	12.248937	12.137639	0.000000	11.494869	12.634502	0.0	1.0	0.0	...	1.0
1	375	8.997125	10.608044	10.385302	0.000000	0.000000	0.0	0.0	1.0	...	0.0
2	168	11.015085	0.000000	0.000000	13.524533	13.602708	0.0	1.0	0.0	...	0.0
3	590	12.496097	15.351625	15.407557	12.610037	10.381560	1.0	0.0	0.0	...	0.0
4	377	12.330836	15.845724	15.875041	14.415731	14.282977	1.0	0.0	0.0	...	0.0