We're going to start by importing all of the libraries needed to build the model. Refer to the requirements.txt file in the repository for the complete list and version information.
%matplotlib inline
from matplotlib import style
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.svm import SVC, LinearSVC

style.use("ggplot")
# Now let's read in the training set and take a look.
df = pd.read_csv('~/rouest/project-submissions/data/train.csv')
df.head()
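Fraud data is typically highly imbalanced, so before modeling it's worth a quick glance at the class balance (an optional sanity check, not a pipeline step):
# Fraction of transactions flagged as fraud vs. legitimate.
df.isFraud.value_counts(normalize=True)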
# Even though none of the isFraud cases went to merchants, let's still break the destination type out as a feature.
df['nameDestCat'] = df.nameDest.apply(lambda x: x[0])
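If this data follows the usual PaySim naming convention, the first character of nameDest is 'C' for customer accounts and 'M' for merchants, so the new column should hold just those two values — easy to confirm:
# Count destinations by category code.
df.nameDestCat.value_counts()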
# Now we need to select the features (and target) that we are using
dfalt = df.iloc[:, [0, 1, 2, 4, 5, 7, 8, 9, 11]]
# And take a look . . .
dfalt.head()
Let's take this opportunity to transform the monetary columns with numpy's log1p to approximate a more normal distribution (which will prove helpful in our support vector classification later) while retaining the 0 values.
# Apply the transform to each monetary column in one pass.
for col in ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']:
    dfalt[col] = np.log1p(dfalt[col])
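Note that log1p maps 0 to 0 exactly, which is why it preserves the zero balances where a plain log would blow up — a quick illustration:
# log1p(x) = log(1 + x): zeros stay zero instead of becoming -inf.
np.log1p([0, 9, 99])  # array([0.        , 2.30258509, 4.60517019])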
# Now we can dummy out the type and nameDestCat features
dfdum = pd.get_dummies(dfalt, prefix='is')
dfdum.head()
# Let's build our features list so that we can create some clusters for our dataframe
features = list(set(dfdum.columns) - {'isFraud'})
kmeans = KMeans(n_clusters=25, random_state=0, n_jobs=-1)
kmeans.fit(dfdum[features])
# And then apply them to the dataset
predictedLabels = kmeans.predict(dfdum[features])
dfdum['predictedCluster'] = predictedLabels
dfdum['predictedCluster'] = dfdum.predictedCluster.astype('object')
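It can be worth glancing at how the rows distribute across the 25 clusters before dummying them out (an optional check):
# Cluster membership counts, largest first.
dfdum.predictedCluster.value_counts()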
# Let's take a look and then dummy it out again
dfdum.head()
dfclus = pd.get_dummies(dfdum)
dfclus.head()
features = list(set(dfclus.columns) - {'isFraud'})
grid = [{'C': [100, 10, 1, .1, .01, .001, .0001, .00001, .000001, .0000001]}] # Create the list for gridsearch
# Instantiate GridSearch and then . . . wait.
GridSearch = GridSearchCV(LinearSVC(random_state=0), grid, cv=5, n_jobs=-1)
# And fit! And wait! The nice thing is that GridSearchCV will automatically refit the best-performing model
# to the entire training set
GridSearch.fit(dfclus[features], dfclus.isFraud)
# Let's take a look at our best score:
GridSearch.best_score_
# and parameters . . .
GridSearch.best_params_
# and estimator as a whole!
GridSearch.best_estimator_
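If you want more detail than the single best score, GridSearchCV keeps the full cross-validation results, which read nicely as a dataframe:
# Mean CV score for each candidate C, best first.
pd.DataFrame(GridSearch.cv_results_)[['param_C', 'mean_test_score', 'rank_test_score']].sort_values('rank_test_score')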
Now we'll repeat our feature engineering from above on the test set.
# Read in the test set, remembering that it does not have our target, so its shape is slightly different.
dft = pd.read_csv('~/rouest/project-submissions/data/test.csv')
dft['nameDestCat'] = dft.nameDest.apply(lambda x: x[0])
dft.head()
dftalt = dft.iloc[:, [0, 1, 2, 4, 5, 7, 8, 10]]
# Apply the same log1p transform to the test set's monetary columns.
for col in ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']:
    dftalt[col] = np.log1p(dftalt[col])
dftdum = pd.get_dummies(dftalt, prefix='is')
features = list(set(dftdum.columns) - {'isFraud'})
# Apply the kmeans model that was already fit on the training set. We only
# predict here: refitting on the test set would learn a different clustering,
# and the train and test cluster labels would no longer correspond.
predictedLabels = kmeans.predict(dftdum[features])
dftdum['predictedCluster'] = predictedLabels
dftdum['predictedCluster'] = dftdum.predictedCluster.astype('object')
dftclus = pd.get_dummies(dftdum)
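One caveat worth guarding against: pd.get_dummies only creates columns for the categories it actually sees, so if a transaction type or cluster shows up in the training set but not the test set (or vice versa), the two frames won't line up. A defensive alignment step, assuming dfclus from the training section is still in scope:
# Reindex the test dummies to the training layout: categories missing from the
# test set become all-zero columns, and any test-only columns are dropped.
trainFeatures = [col for col in dfclus.columns if col != 'isFraud']
dftclus = dftclus.reindex(columns=trainFeatures, fill_value=0)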
dftclus.head()
features = list(set(dftclus.columns) - {'isFraud'})
Now we can make our predictions on the transformed test set.
# We're using decision_function so the submission contains continuous scores, which is what ROC AUC expects.
predictionLabel = GridSearch.decision_function(dftclus[features])
preds = predictionLabel.tolist()
submit = {'Prediction': preds, 'id': dft['id']}
submit = pd.DataFrame(submit)
# Create the csv for submission!
submit.to_csv('rouestsub26.csv', index=False)
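As a last sanity check, the submission file can be read back to confirm it has the expected columns and one row per test transaction:
# Verify the written file's shape and column names.
check = pd.read_csv('rouestsub26.csv')
print(check.shape, list(check.columns))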