In [79]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import glob

#import prettyplotlib as ppl

sns.set_style('whitegrid')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
In [80]:
# folder storing csv
path2 =r'mpg/2010'
path1 =r'mpg/2009'

frame = pd.DataFrame()

# combining all csv report files
def df_create(path):
    allFiles = glob.glob(path + "/*.csv")
    list_ = []
    for file_ in allFiles:
        df = pd.read_csv(file_,index_col=None, header=0)
        print file_, df.shape
        list_.append(df)
        frame = pd.concat(list_, ignore_index=True)
    return frame
In [81]:
# Create two dataframes for two sets of vehicles data
frame2010 = df_create(path2)
frame2009 = df_create(path1)
mpg/2010/10tstcar.csv (3277, 64)
mpg/2010/11tstcar.csv (3762, 64)
mpg/2010/12tstcar.csv (4144, 64)
mpg/2010/13tstcar.csv (4261, 64)
mpg/2010/14tstcar.csv (4764, 67)
mpg/2010/15tstcar.csv (4832, 67)
mpg/2009/00tstcar.csv (2660, 51)
mpg/2009/01tstcar.csv (2433, 51)
mpg/2009/02tstcar.csv (2363, 51)
mpg/2009/03tstcar.csv (2497, 51)
mpg/2009/04tstcar.csv (2489, 51)
mpg/2009/05tstcar.csv (2666, 51)
mpg/2009/06tstcar.csv (2693, 51)
mpg/2009/07tstcar.csv (2845, 51)
mpg/2009/08tstcar.csv (3015, 51)
mpg/2009/09tstcar.csv (2950, 50)
In [82]:
frame2009.head(2)
Out[82]:
AVRG_CD CH_CD CLS_TYP_CD CL_NM CMYT_CO2_FE_MSR CMYT_CO_FE_MSR CMYT_HC_FE_MSR CMYT_NOX_MSR CMYT_PM_MSR DRV_SYS_CD ... VC_AXLE_RAT_MSR VC_CMPRSN_RAT_MSR VC_CNFG_NUM VC_CYL_CNT VC_DSN_ETW_MSR VC_NV_RAT_MSR VC_RTD_HP_MSR VEH_FL_TYP_CD VI_MFR_CD VI_MFR_NM
0 NaN C C NEON 321.0 0.77 0.093 0.07 NaN F ... 2.98 9.8 0 4 2875 44.2 132 6 20 Chrysler LLC
1 NaN H C NEON 223.0 0.15 0.007 NaN NaN F ... 2.98 9.8 0 4 2875 44.2 132 6 20 Chrysler LLC

2 rows × 51 columns

Columns descriptions for 2000-2009 data

  • 'VC_CYL_CNT' : cylinder counts
  • 'CL_NM' : model
  • 'TRNS' - transmission type
trns transmission code
C4 Manual 4-Speed (Creeper) (M-4)
M3 Manual Three-Speed
M4 Manual Four-Speed (No Creeper)
M5 Manual Five-Speed
SA Semi-Automatic
A3 Automatic 3-Speed (No Lockup)
L3 Lock-Up/Automatic/3-Speed
A4 Automatic 4-Speed (No Lockup)
L4 Lock-Up/Automatic/4-Speed
C5 Manual 5-Speed (Creeper) (M-5)
S2 Semi-Automatic Two Speed
S3 Semi-Automatic Three Speed
S4 Semi-Automatic Four Speed
S5 Semi-Automatic Five Speed
AV Automatic Variable Gear Ratios
M6 Manual Six Speed
A5 Automatic 5-Speed (No Lockup)
L5 Lock-Up/Automatic/5-Speed
C6 Manual 6-Speed (Creeper) (M-6)
A6 Automatic 6-Speed (No Lockup)
S6 Semi-Automatic Six Speed
  • 'VEH_FL_TYP_CD': fuel type
Type fuel type code
06 Unleaded (at EPA 96 RON)
09 Diesel (at EPA #2 Diesel)
22 Special unleaded (91 RON)
23 Carb Phase II Gasoline
33 Methanol(M85)
39 Ethanol
41 CNG
  • 'CH_CD' - city (C) or highway (H)
  • 'VC_CMPRSN_RAT_MSR' - compression ratio
  • 'DRV_SYS_CD' - R:rwd, F:fwd, 4:awd
  • 'MDLYR_DT' - Year manufactured
  • 'TPF_MFR_CSTDN_MSR' - Combined average MPG
  • 'VC_AXLE_RAT_MSR' - Gear ratio
  • 'VC_RTD_HP_MSR' - Rated horsepowers
  • 'VI_MFR_NM' - Maker
  • 'VC_DSN_ETW_MSR' - Weight of vehicle (lbs)
  • 'GBE_CID_MSR' - engine displacement in cu-in

=======

  • 'CLS_TYP_CD' - Car / Truck
In [83]:
labels_1 = ['VI_MFR_NM','CL_NM', 'MDLYR_DT', 'VC_CYL_CNT', 'GBE_CID_MSR', 
          'TRNS', 'VEH_FL_TYP_CD', 'CH_CD', 'VC_CMPRSN_RAT_MSR',
          'VC_AXLE_RAT_MSR','DRV_SYS_CD','TPF_MFR_CSTDN_MSR', 
          'VC_RTD_HP_MSR', 'VC_DSN_ETW_MSR']
In [84]:
labels_2 = ['# of Cylinders and Rotors','# of Gears','Equivalent Test Weight (lbs.)', 
            'Axle Ratio', 'Model Year','RND_ADJ_FE','Rated Horsepower',
            'Represented Test Veh Make','Represented Test Veh Model',
           'Test Veh Displacement (L)','Tested Transmission Type Code',
            'Test Fuel Type Cd','Drive System Code']
In [85]:
veh_2009 = frame2009[labels_1]
veh_2010 = frame2010[labels_2]
In [86]:
# rename column descriptions 2009

columns_1 = ['maker', 'model', 'year', 'cylinder', 'displacement', 'trans',
            'fuel_type', 'env', 'compression_rat', 'gear_rat', 'type_axel',
             'avg_mpg', 'rated_hp', 'wt']

veh_2009.columns = columns_1
In [87]:
# function convert cubic in to Liter engine displacement

def liter_conversion(num):
    lit = num/61
    return round(lit, 1)
In [88]:
veh_2010 = veh_2010[['Represented Test Veh Make', 'Represented Test Veh Model',
                   'Model Year','# of Cylinders and Rotors', 
                   'Test Veh Displacement (L)','# of Gears', 
                   'Test Fuel Type Cd',
                   'Axle Ratio','Tested Transmission Type Code', 
                   'RND_ADJ_FE','Rated Horsepower','Equivalent Test Weight (lbs.)',  
                    'Drive System Code']]
In [89]:
print veh_2009.columns.tolist()
['maker', 'model', 'year', 'cylinder', 'displacement', 'trans', 'fuel_type', 'env', 'compression_rat', 'gear_rat', 'type_axel', 'avg_mpg', 'rated_hp', 'wt']
In [90]:
# rename column descriptions 2009

columns_2 = ['maker', 'model', 'year', 'cylinder', 'displacement', 'number_of_gear',
            'fuel_type','gear_rat', 'trans','avg_mpg', 'rated_hp', 'wt', 'type_axel']

veh_2010.columns = columns_2
In [91]:
###############################
# DROP NaN row                #
# WARNING: only run this ONCE #
###############################

veh_2009 = veh_2009.dropna()
veh_2010 = veh_2010.dropna()
In [92]:
veh_2009.head(2)
Out[92]:
maker model year cylinder displacement trans fuel_type env compression_rat gear_rat type_axel avg_mpg rated_hp wt
0 Chrysler LLC NEON 2000 4 122 A3 6 C 9.8 2.98 F 14.37 132 2875
1 Chrysler LLC NEON 2000 4 122 A3 6 H 9.8 2.98 F 14.37 132 2875
In [93]:
print veh_2009['year'].unique()
[2000 2001 2002 2003 2004 2005 2006 2007 2008 2009]
In [94]:
veh_2010.head(2)
Out[94]:
maker model year cylinder displacement number_of_gear fuel_type gear_rat trans avg_mpg rated_hp wt type_axel
2 Aston Martin DB9 2010 12.0 5.9 6 61 3.15 SA 16.1 470 4500 R
3 Aston Martin DB9 2010 12.0 5.9 6 61 3.15 SA 27.2 470 4500 R
In [95]:
print veh_2009.isnull().sum()
maker              0
model              0
year               0
cylinder           0
displacement       0
trans              0
fuel_type          0
env                0
compression_rat    0
gear_rat           0
type_axel          0
avg_mpg            0
rated_hp           0
wt                 0
dtype: int64
In [ ]:
 
In [96]:
##################
# simple 2D plot #
##################

def plot_num(df,a,b):
    fig = plt.figure(figsize=(12,9))
    ax = fig.gca()
    ax.scatter(df[a], df[b], lw=2.5, alpha=0.4, c='red', label=a+' vs '+b)
    ax.set_xlabel(a, fontsize = 16)
    ax.set_ylabel(b, fontsize = 16)
    lr = np.polyfit(df[a], df[b],1)
    lr_ = np.poly1d(lr)
    ax.plot(df[a], lr_(df[a]), c='steelblue', ls='--', lw=3, alpha=0.4)
    ax.legend(loc='best',fancybox=True, framealpha=0.2, fontsize=16)
    plt.show()
In [97]:
##############################
# Plot 3 variables with cmap #
##############################

def plot_2va(df,a,b,color):
    fig = plt.figure(figsize=(12,9))
    ax = fig.gca()
    ax.scatter(df[a], df[b], lw=2.5, alpha=0.4, c=color, s=40)
    ax.set_xlabel(a, fontsize = 20)
    ax.set_ylabel(b, fontsize = 20)
    
    # linear fit line
    lr = np.polyfit(df[a], df[b], 1)
    lr_ = np.poly1d(lr)
    plt.plot(df[a], lr_(df[a]), c='darkred', lw=3, alpha=0.6)
    
    handles, labels = plt.gca().get_legend_handles_labels()
    print handles, labels
    by_label = {l:h for l, h in zip(labels, handles)}
    plt.legend(by_label.values(), by_label.keys() ,
              loc='best', fontsize=16)

    plt.show()
In [98]:
# veh_2010[veh_2010['trans']=='AMS'].head(3)
veh_2010['type_axel'].unique()
Out[98]:
array(['R', 'F', 'A', '4', 'P'], dtype=object)
In [99]:
#########################################
# convert transmission type to category #
# 1 - automatic                         # 
# 2 - semi-automatic                    #
# 3 - manual                            #
#########################################

def split_str(num):
    splitted_string = list(num)
    try:
        for a, b in zip(splitted_string[0], splitted_string[1]):
            if a in ['A', 'C','L','O']:
                return 1, b
            elif a == 'S':
                return 2, b
            else:
                return 3, b
    except:
        return num

def trans_conv(df, ori):
    trans_type = []
    num_gear = []
    for i in df[ori]:
        try:
            a,b  = split_str(i)
            trans_type.append(a)
            num_gear.append(b)
        except:
            trans_type.append(i)
            num_gear.append(1)
    return pd.Series(trans_type), pd.Series(num_gear)

####################################
# fuel type                        #
# 1 - regular                      #
# 2 - premium                      #
# 3 - other(E85, methanol, desiel) #
####################################

def fuel_conv(num):
    
    if num in [61, 38, 41]:
        return 1
    elif num in [6, 19, 22, 23, 27, 26, 44]:
        return 2
    else:
        return 3

#############
# axel type #
# 1 - FWD   #
# 2 - RWD   #
# 3 - AWD   #
#############

def axel_conv(df, col):    
    series = df[col].map({'F':1, 'R':2, '4':3, 'A':3, 'P':5})
    return series
In [100]:
# Convert and split transmission type
veh_2009['trans_type'], veh_2009['number_of_gear'] = trans_conv(veh_2009, 'trans')

# Covert fuel type
veh_2009['fuel_type'] = veh_2009['fuel_type'].apply(fuel_conv)

# Convert drive-train type
veh_2009['type_axel'] = axel_conv(veh_2009, 'type_axel')

# Displacement to Liters conversion
veh_2009['displacement'] = veh_2009['displacement'].apply(liter_conversion)

# Gear number replcement
veh_2009['number_of_gear'] = veh_2009['number_of_gear'].replace({'V':1,'U':1 })
veh_2009['number_of_gear'] = pd.to_numeric(veh_2009['number_of_gear'])

# Convert fuel type for veh2010 dataset
veh_2010['fuel_type'] = veh_2010['fuel_type'].apply(fuel_conv)
In [101]:
#################
# Convert trans #
# for veh_2010  #
#################

def trans_conv_2(num):
    if num in ['CVT', 'A', 'AM', 'OT']:
        return 1 # For automatic
    elif num in ['M']:
        return 3 # For manual
    else:
        return 2 # For semi-automatic
In [102]:
# Convert transmission type for veh2010 dataset
veh_2010['trans'] = veh_2010['trans'].apply(trans_conv_2)
In [103]:
##################################
# Indicate 1/0 for yes/no        #
# whether the MPG is at least 20 #
##################################

def mpg_20(num):
    if num >= 20:
        return 1
    else:
        return 0

veh_2009['mpg_20'] = veh_2009['avg_mpg'].apply(mpg_20)
veh_2010['mpg_20'] = veh_2010['avg_mpg'].apply(mpg_20)
In [104]:
veh_2009 = veh_2009.dropna()
In [105]:
print veh_2009.shape, veh_2010.shape
(13886, 17) (22707, 14)
In [106]:
color = veh_2009['cylinder'].map({2:'r', 3:'b', 4:'y',
                                       5:'violet', 6:'springgreen',
                                       8:'steelblue', 10:'darkred',
                                       12:'salmon', 16:'peru'})
In [107]:
veh_2009 = veh_2009[['maker', 'model', 'year', 'cylinder', 
                     'displacement','number_of_gear','fuel_type',
                     'gear_rat', 'type_axel','avg_mpg', 'rated_hp',
                     'wt', 'trans_type','mpg_20']]
In [108]:
veh_2010.columns
Out[108]:
Index([u'maker', u'model', u'year', u'cylinder', u'displacement',
       u'number_of_gear', u'fuel_type', u'gear_rat', u'trans', u'avg_mpg',
       u'rated_hp', u'wt', u'type_axel', u'mpg_20'],
      dtype='object')
In [109]:
veh_2009.head(2)
Out[109]:
maker model year cylinder displacement number_of_gear fuel_type gear_rat type_axel avg_mpg rated_hp wt trans_type mpg_20
0 Chrysler LLC NEON 2000 4 2.0 3.0 2 2.98 1 14.37 132 2875 1.0 0
1 Chrysler LLC NEON 2000 4 2.0 3.0 2 2.98 1 14.37 132 2875 1.0 0
In [110]:
plot_num(veh_2009, 'cylinder', 'avg_mpg')
In [111]:
plot_num(veh_2009, 'cylinder', 'rated_hp')
In [112]:
veh_2009 = veh_2009[veh_2009['avg_mpg'] > 0]
In [113]:
veh_2009.head(2)
Out[113]:
maker model year cylinder displacement number_of_gear fuel_type gear_rat type_axel avg_mpg rated_hp wt trans_type mpg_20
0 Chrysler LLC NEON 2000 4 2.0 3.0 2 2.98 1 14.37 132 2875 1.0 0
1 Chrysler LLC NEON 2000 4 2.0 3.0 2 2.98 1 14.37 132 2875 1.0 0
In [114]:
veh_2010 = veh_2010[veh_2010['avg_mpg'] > 0]
In [115]:
veh_2010.head(2)
Out[115]:
maker model year cylinder displacement number_of_gear fuel_type gear_rat trans avg_mpg rated_hp wt type_axel mpg_20
2 Aston Martin DB9 2010 12.0 5.9 6 1 3.15 2 16.1 470 4500 R 0
3 Aston Martin DB9 2010 12.0 5.9 6 1 3.15 2 27.2 470 4500 R 1
In [116]:
#############
# axel type #
#############
# 1 - FWD   #
# 2 - RWD   #
# 3 - AWD   #
#############

# def axel_conv_2(num):
#     if num in ['A', 4, 'P']:
#         return 3
#     elif num == 'F':
#         return 1
#     else:
#         return 2
In [117]:
# MAKE SURE ALL VEHICLES ARE FOSSIL FUEL USERS #

veh_2010 = veh_2010[veh_2010['avg_mpg'] <= 100]

# Convert drivetrain type for veh2010 dataset
veh_2010['type_axel'] = axel_conv(veh_2010, 'type_axel')

# Rename ONE column in veh2010 dataframe
veh_2010.rename(columns={'trans':'trans_type'}, inplace=True)
In [118]:
### Combine all datasets ##
veh_total = pd.concat([veh_2009, veh_2010])
In [119]:
veh_total.head(2)
Out[119]:
avg_mpg cylinder displacement fuel_type gear_rat maker model mpg_20 number_of_gear rated_hp trans_type type_axel wt year
0 14.37 4.0 2.0 2 2.98 Chrysler LLC NEON 0 3.0 132 1.0 1 2875 2000
1 14.37 4.0 2.0 2 2.98 Chrysler LLC NEON 0 3.0 132 1.0 1 2875 2000
In [163]:
## round the average mpg ##
veh_total['avg_mpg'] = veh_total['avg_mpg'].map(lambda x:round(x))
In [164]:
from ipywidgets import interact, interactive, fixed, DOMWidget, IntSlider
import ipywidgets as widgets
from ipywidgets import *
import ipywidgets as widgets
from IPython.display import display
from IPython import display as idisplay

#############################
# convert year to dt series #
#############################

def date_year(num):
    num_ = str(num)+"-1-1"
#     num_ = pd.to_datetime(num_)
    return num_

###################################
# Plot average mpg on time series #
# ** Specify the engine size      #
##################################

def plot_avg(engine_size, df, time , mpg, title):

    df_ = df[df['cylinder'] == engine_size]
    df_ = df_[[time, mpg]]
    df_ = df_.sort_values(by=time, ascending=True)
    min_mpg = []
    max_mpg = []
    
    for year in df_[time].unique().tolist():
        min_val = np.min(df_[df_[time]==year][mpg])
        max_val = np.max(df_[df_[time]==year][mpg])
        min_mpg.append(min_val)
        max_mpg.append(max_val)
    
    df_[time] = df_[time].map(date_year)
    df_[time] = pd.to_datetime(df_[time])

    df_ = df_.groupby(time).mean().reset_index()
    df_['Min_mpg'] = min_mpg
    df_['Max_mpg'] = max_mpg
    
    df_ = df_.set_index(time).resample('12M').mean()
    fig = plt.figure(figsize=(12,5))
    ax = fig.gca()
    
    plt.plot(df_.iloc[:,0], c='b', marker = '8',label='avg_mpg for %d cylinders' % engine_size)
    plt.plot(df_.iloc[:,1], c='r',marker = '8',ls='--', label='Min_mpg for %d cylinders'%engine_size)
    plt.plot(df_.iloc[:,2], c='green',marker = '8', ls='--',label='Max_mpg for %d cylinders'%engine_size)

    plt.ylim([5,80])
    plt.xlabel('Year', fontsize=18)
    plt.ylabel('MPG', fontsize=18)
    plt.title(title, fontsize=18)
    plt.legend(fontsize=14, bbox_to_anchor=(1.4,1), )
    plt.show()
    
def ridge_plot_runner1(cylinder=4):
    plot_avg(cylinder, veh_total, 'year', 'avg_mpg', 'Vehicles (2000-2016)')

interact(ridge_plot_runner1, cylinder=(4, 16, 2))
In [165]:
from patsy import dmatrices

y_lr, X_lr = dmatrices('mpg_20 ~ year + cylinder + displacement + \
                           number_of_gear + gear_rat + rated_hp + wt +\
                           C(type_axel) + C(fuel_type) + C(trans_type) - 1',
                           veh_total, return_type="dataframe")

y_lr = np.ravel(y_lr)
In [166]:
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
In [167]:
# Predict whether vehicle can reach at least 20 MPG
# REQUIRED INFOMATION:
# year, # of cylinder, engine displacement
# number of gear, gear ratio, rated horsepower
# curb weight, drivetrain, fuel type, transmission type

def logistic_regression_EDA(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
    
    # logistic regression model, and fit with X_train and y_train
    model = LogisticRegression()
    model = model.fit(X_train, y_train)
    
    # Cross_val check for overfitting
    cv_scores = cross_val_score(model ,X, y, cv=10)
    
    print "====Logistic Regression on guessing 20 MPG ===="
    print "Prediction accuracy: ",round(model.score(X_test, y_test), 3)
    print "Target mean: ", round(np.mean(y_test), 3)
    print "Target prediction rate: ", 1-round(np.mean(y_test), 3)
    print "Cross_validated score: %0.3f" %(np.mean(cv_scores))
In [168]:
logistic_regression_EDA(X_lr, y_lr)
====Logistic Regression on guessing 20 MPG ====
Prediction accuracy:  0.788
Target mean:  0.549
Target prediction rate:  0.451
Cross_validated score: 0.751

Logistic Regression outcome

  • Since the target prediction rate yield 78.8% chance of guessing any car which makes at least 20 MPG
  • Our prediction yielded 32% higher in accuracy compared to baseline guess (~5.5%)
In [459]:
##################
# ROC curve plot #
##################
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import train_test_split
import matplotlib.mlab as mlab
import math
from sklearn.metrics import confusion_matrix
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from collections import Counter


def ROC_plot(clf ,X, Y):
    clf.fit(X, Y)
    preds = clf.predict_proba(X)[:,1]
    preds_ = clf.predict(X)
    fpr, tpr, _ = roc_curve(Y, preds)
    roc_auc = auc(fpr, tpr)
    
    # Plot ROC curve
    fig = plt.figure(figsize=(10,8))
    ax = fig.gca()
    ax.plot(fpr, tpr, label = 'ROC curve, AUC = %0.2f' % roc_auc)
    ax.plot([0,1], [0,1], ls='--', c='black')
    ax.legend(loc='best', fontsize=18)
    ax.set_ylabel('True Positive Rate', fontsize=16)
    ax.set_xlabel('False Positive Rate', fontsize=16)
    ax.set_ylim([0,1.02])
    ax.set_xlim([0,1.02])
    
    fig2 = plt.figure(figsize=(10,3))
    ax2 = fig2.gca()
        
    cf_array = np.array(confusion_matrix(Y, preds_, labels=[1,0]))

    cf_df = pd.DataFrame(cf_array, index=['Positive', 'Negative'],
                         columns=['predicted_Positive','predicted_Negative'])
    print cf_df
     

    
    a = np.linspace(0, 1, len(preds_))
    b = np.linspace(0, 1, len(fpr))
    
    tp_ = (tpr-np.mean(tpr))/(np.std(tpr))
    fp_ = (fpr-np.mean(fpr))/(np.std(fpr))
     
    sigma_a = np.std(tp_)
    sigma_b = np.std(fp_)
    
    mu_a = np.mean(fpr)
    mu_b = np.mean(tpr)
    
    ax2.plot(b,mlab.normpdf(tpr, mu_a, sigma_b))
    ax2.plot(b,mlab.normpdf(fpr, mu_b, sigma_b))

    ax2.axvline(np.mean(mlab.normpdf(tpr, mu_a, sigma_b)), ls='--', c='black')
    ax2.axvline(np.mean(mlab.normpdf(fpr, mu_b, sigma_b)), ls='--', c='blue')
    
    ax2.set_xlim([0,1])
    plt.show()
In [ ]:
 
In [460]:
lr_model = LogisticRegression()

ROC_plot(lr_model, X_lr, y_lr)
          predicted_Positive  predicted_Negative
Positive               16027                3235
Negative                4410               11462
In [461]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

#############################################
# Plot the coefficients of regularization   #
# using Ridge                               #
#############################################


def ridge_regression_plot(df, target, max_arg):
    
    columns = df.columns.tolist()
    y = df[target]
    X = df[[i for i in columns if i != target]]

    colors = ['blue', 'green', 'red', 'cyan',
             'magenta', 'yellow', 'black',
                'indigo', 'lightgreen', 'lightblue',
                 'gray', 'orange']
    weights, params = [],[]
    
    for c in np.logspace(-4, 2, 200):
        clf = Ridge(alpha=10**c, normalize=False)
        clf.fit(X,y)
        weights.append(clf.coef_)
        params.append(10**c)
    
    weights = pd.DataFrame(weights)
    fig = plt.figure(figsize=(15,11))
         
    for i in range(10):
        plt.plot(params[:int(max_arg)], weights.iloc[:int(max_arg),i],c=colors[i], label = X.columns[i], lw=2)
                     
    plt.axvline(params[int(max_arg)], lw=1.,c='black',ls='--')
    plt.axhline(0, lw=3.5,c='black',ls='--')
    plt.xlim([np.min(params), 10**8])
    plt.ylim([-2.5, 2.5])
    plt.xscale('log')
    plt.legend(loc='best', fontsize=16)
    plt.xlabel('alpha')
    plt.ylabel('weights')
    plt.title('Ridge coefficients as a function of the regularization', fontsize=17)
    plt.show()
In [172]:
df_ridge = veh_total[['year', 'cylinder', 
                     'displacement', 'number_of_gear', 
                     'fuel_type', 'gear_rat', 'trans_type', 
                     'avg_mpg', 'rated_hp', 'wt', 'type_axel']]
In [173]:
def ridge_plot(max_arg):
    ridge_regression_plot(df_ridge, 'avg_mpg', max_arg)
    
    
w = widgets.interactive(ridge_plot, 
                        max_arg = widgets.FloatSlider(value=200, min=100, max=199., 
                                                      step=5, description='alpha log10: ')
                        )

display(w)
In [174]:
from sklearn.ensemble import GradientBoostingRegressor, partial_dependence
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.grid_search import GridSearchCV
In [175]:
def gdbr_features(df,target,predictors):
    target_df = df[target]
    predictors_df = df[predictors]
    X_train, X_test, y_train, y_test = train_test_split(predictors_df, target_df, test_size=0.25)

    model = GradientBoostingRegressor()
    grid = {'n_estimators':[100, 300, 500, 1000],
            'max_depth':[3,5,7]}
    gs = GridSearchCV(model,grid, n_jobs=-1)
    gs.fit(predictors_df, target_df)
    
    feat = {'feature':predictors_df.columns, 'gdbr_feature_importances': gs.best_estimator_.feature_importances_}
    features = pd.DataFrame({key:val for key,val in feat.iteritems()}).sort_values(by='gdbr_feature_importances', 
                                                                                  ascending=True)
    return features, gs, y_test
In [176]:
# Declare features to be used for predictions
feat = ['year', 'cylinder', 'displacement', 'rated_hp', 'wt']
features_df, gdbr_, gdbr_y_test = gdbr_features(df_ridge, 'avg_mpg', feat) 

X = df_ridge[feat]
y = df_ridge['avg_mpg']
In [177]:
print 'GradientDescentBoostedRegressor GS score: %0.3f' %gdbr_.best_score_
print features_df
X.hist(figsize=(9,7))
plt.show()
GradientDescentBoostedRegressor GS score: 0.005
        feature  gdbr_feature_importances
1      cylinder                  0.022195
4            wt                  0.145726
0          year                  0.193489
2  displacement                  0.270870
3      rated_hp                  0.367720