#!/usr/bin/python

""" 
    This is the code to accompany the Lesson 2 (SVM) mini-project.

    Use a SVM to identify emails from the Enron corpus by their authors:    
    Sara has label 0
    Chris has label 1
"""
    
import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess


### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()


#########################################################
### your code goes here ###


print ("This is my code.\n")


features_train = features_train[:len(features_train)/100] 
labels_train = labels_train[:len(labels_train)/100] 

from sklearn import svm

## clf = svm.SVC(kernel="linear")
clf = svm.SVC(kernel="rbf", C=10000.0)

print ("C = ", getattr(clf, 'C'))


t0 = time()
clf.fit(features_train, labels_train)
print "training time:", round(time()-t0, 3), "s"

t1 = time()
print (clf.score(features_test, labels_test))
print "scoring time:", round(time()-t1, 3), "s"

pred = clf.predict(features_test)
pred.sum()  ## number of Chris's emails


'''
linear kernel:
with entire data set:

training time: 163.169 s
0.984072810011
scoring time: 17.188 s


with smaller data set:

training time: 0.089 s
0.884527872582
scoring time: 0.961 s

rbf kernel:

training time: 0.101 s
0.616040955631
scoring time: 1.101 s

C=10.0

training time: 0.102 s
0.616040955631
scoring time: 1.101 s

C=100.0

training time: 0.101 s
0.616040955631
scoring time: 1.101 s

C=1000.0


training time: 0.097 s
0.821387940842
scoring time: 1.052 s

C=10000.0

training time: 0.096 s
0.892491467577
scoring time: 0.886 s

('C = ', 10000.0)
training time: 0.099 s
0.892491467577
scoring time: 0.885 s

for x in dir(clf):
  print (x, getattr(clf, x))

complete dataset:

('C = ', 10000.0)
training time: 109.979 s
0.990898748578
scoring time: 11.163 s

'''


#########################################################