本文整理汇总了Python中tester.dump_classifier_and_data函数的典型用法代码示例。如果您正苦于以下问题：Python dump_classifier_and_data函数的具体用法？Python dump_classifier_and_data怎么用？Python dump_classifier_and_data使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了dump_classifier_and_data函数的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: finish
def finish():
    """Print the optimization report and save the best model found so far.

    Reads the module-level names ``best_solution_so_far``,
    ``optimization_path`` and ``my_dataset``.  When a best solution exists,
    every step of the optimization path and the final solution are printed
    and the classifier is saved through ``dump_classifier_and_data``;
    otherwise "No solution found" is printed.

    Exceptions are deliberately not propagated (this is an end-of-run
    report): the message is printed as before, and the full traceback is
    additionally written to stderr so the failure can be diagnosed.
    """
    import traceback

    rule = "============================="
    try:
        if not best_solution_so_far:
            print("")
            print("No solution found")
            return

        print("")
        print(rule)
        print("Optimization Path:")
        print(rule)
        for solution in optimization_path:
            # Each step is indexable: element 0 goes to print_result_item,
            # element 1 is printed as-is.
            print_result_item(solution[0])
            print(solution[1])

        print("")
        print(rule)
        print("Final Solution:")
        print(rule)
        print(best_solution_so_far)

        # NOTE(review): the classifier is read from position [0][3] and the
        # feature list from position [1] of the best solution -- layout is
        # defined by whoever fills best_solution_so_far; confirm there.
        clf = best_solution_so_far[0][3]
        features_list = best_solution_so_far[1]
        dump_classifier_and_data(clf, my_dataset, features_list)
        print("Model saved with success.")
    except Exception as e:
        # Best-effort reporting: keep the original one-line message on
        # stdout, but do not lose the traceback.
        print(e)
        traceback.print_exc()
开发者ID:luiscruz,项目名称:udacity_data_analyst,代码行数:25,代码来源:poi_id.py
示例2: setup_and_test
def setup_and_test(my_dataset, features_list, classifier):
    """Dump the classifier with its data, reload the dump, and test it.

    The classifier is evaluated from what ``load_classifier_and_data``
    returns rather than from the in-memory arguments, so the test runs on
    exactly what was dumped.
    """
    # Persist classifier, dataset and feature list first.
    dump_classifier_and_data(classifier, my_dataset, features_list)

    # Read the three dumped objects back.
    reloaded = load_classifier_and_data()
    clf, dataset, feature_list = reloaded

    # Hand the reloaded objects to the testing script.
    test_classifier(clf, dataset, feature_list)
开发者ID:joashxu,项目名称:enron,代码行数:10,代码来源:utils.py
示例3: detect_poi
def detect_poi():
    """Select a feature list for POI detection and dump a GaussianNB model.

    Steps, as performed below:
      1. load ``final_project_dataset.pkl`` and drop the 'TOTAL' entry;
      2. call the four ratio helpers on the dataset;
      3. score a GaussianNB with ``ptest`` on every candidate feature list
         returned by ``fList_set`` and keep the highest-scoring list;
      4. evaluate a few DecisionTreeClassifier settings on that list;
      5. dump a fresh GaussianNB together with the dataset and that list.
    """
    ### Load the dictionary containing the dataset
    # The context manager closes the file handle, which the original
    # ``pickle.load(open(...))`` form left open.
    # NOTE(review): unpickling can execute arbitrary code; only load a
    # trusted copy of this file.
    with open("final_project_dataset.pkl", "r") as data_file:
        data_dict = pickle.load(data_file)

    ### Task 1: Remove outliers
    data_dict.pop('TOTAL', 0)

    ### Task 2: Select what features
    ### 'stk_pay_ratio','to_poi_ratio', 'from_poi_ratio','bonus_salary_ratio'
    ### features_list is a list of strings, each of which is a feature name.
    ### The first feature must be "poi".
    my_dataset = data_dict
    # The helpers' return values are ignored, so they presumably add their
    # ratio feature to my_dataset in place -- TODO confirm.
    stk_pay_ratio(my_dataset)
    from_poi_ratio(my_dataset)
    to_poi_ratio(my_dataset)
    bonus_salary_ratio(my_dataset)

    ### Task 3: Feature Selection
    ### Generate a set of 15 feature lists from these 4 features
    ### This way, all possible combinations of these features are tested
    all_features_list = fList_set()

    ### Because of the small size of the dataset, the script uses stratified
    ### shuffle split cross validation in tester.py
    clf = GaussianNB()
    ### ptest uses Stratified shuffle split cross validation and calculates
    ### the precision. Find the precision for every list.
    # Iterate over the candidates directly instead of a hard-coded
    # range(0, 15), so the loop always matches what fList_set() returned.
    metrics = [ptest(clf, my_dataset, candidate)
               for candidate in all_features_list]

    ### Go for the feature list that produces the best precision.
    ### For this dataset only, it is harder to get a high precision.
    best = np.array(metrics).argmax()

    ### Run test_classifier to print evaluation metrics to console
    test_classifier(clf, my_dataset, all_features_list[best])

    ### Now use the same feature list to run the decision tree classifier
    features_list = all_features_list[best]

    ### Task 4: Try a variety of classifiers
    samples_split_values = [2, 4]
    samples_leaf_values = [1, 2]
    for split in samples_split_values:
        for leaf in samples_leaf_values:
            clf = tree.DecisionTreeClassifier(min_samples_split=split,
                                              min_samples_leaf=leaf)
            test_classifier(clf, my_dataset, features_list)
            print_feature_importances(features_list, clf)

    ### Choose best classifier and feature set
    # The decision trees above are only evaluated; a new GaussianNB is what
    # gets dumped.
    clf = GaussianNB()

    ### Dump classifier, dataset, and features_list
    dump_classifier_and_data(clf, my_dataset, features_list)
开发者ID:RaphaelTam,项目名称:Enron_Bad_Guys,代码行数:54,代码来源:poi.id.py
示例4: main
def main():
# Loads the pickled dataset, adds engineered features, builds the feature
# list by removing the excluded names, then dumps and tests a classifier.
# NOTE(review): the file object passed to pickle.load is never closed
# explicitly; unpickling also executes arbitrary code, so the file must be
# trusted.
data_dict = pickle.load(open("final_project_dataset.pkl", "r"))
my_dataset = data_dict
my_dataset = AddFeatures(my_dataset)
# Exclude using Discretion.
Exc1 = ["email_address"]
# Replaced by creating better versions of the features
Exc2 = ["to_messages", "from_messages", "from_this_person_to_poi", "from_poi_to_this_person"]
# Exclude because Highly Correlated with stronger features
Exc3 = [
"deferral_payments",
"expenses",
"deferred_income",
"restricted_stock_deferred",
"director_fees",
"long_term_incentive",
"bonus",
"total_payments",
"salary",
"total_stock_value",
"restricted_stock",
"exercised_stock_options",
"other",
]
exclude = Exc1 + Exc2 + Exc3
# QueryDataSet(my_dataset)
# ShowCorrel(my_dataset)
# Start from the keys of one record of the dataset (Python 2 API:
# itervalues(), and keys() returning a list that supports remove/insert).
features_list = next(my_dataset.itervalues()).keys()
# list.remove raises ValueError if an excluded name is not among the keys.
for i in exclude:
features_list.remove(i)
# Move "poi" to the front of the list.
features_list.insert(0, features_list.pop(features_list.index("poi")))
data = featureFormat(my_dataset, features_list, sort_keys=True)
### Extract features and labels from dataset for local testing
labels, features = targetFeatureSplit(data)
# NOTE(review): the four split results are not used anywhere in the
# visible code below.
features_train, features_test, labels_train, labels_test = train_test_split(
features, labels, test_size=0.1, random_state=42, stratify=labels
)
# NOTE(review): every assignment to clf below is commented out, so clf is
# undefined when dump_classifier_and_data is called unless it is defined
# elsewhere -- one of these lines has to be re-enabled.
# clf=TuneSVM(features, labels,features_list)
# clf=TuneKNN(features, labels,features_list)
# clf=NoTuneDT(features, labels,features_list)
# clf=TuneDT(features,labels,features_list)
# NOTE(review): "poi" was already moved to index 0 above, so this inserts a
# second "poi" entry unless a tuner removed it from features_list -- confirm.
features_list.insert(0, "poi")
dump_classifier_and_data(clf, my_dataset, features_list)
test_classifier(clf, my_dataset, features_list)
开发者ID:datalord123,项目名称:MachineLearning,代码行数:44,代码来源:poi_id.py
示例5: Pipeline
testing_features_list = [u'poi']
for feature in features_list_score_order:
testing_features_list.append(feature)
pipe = Pipeline([('impute', Imputer(strategy='median')),
('classify', GaussianNB(priors=[(i/2.)*.1, (1 - (i/2.)*.1)]))])
total_predictions, accuracy, precision, recall, f1, f2 = \
test_classifier(pipe, my_dataset, testing_features_list, folds=200)
acc.append(accuracy)
prec.append(precision)
reca.append(recall)
acc_all.append(acc)
prec_all.append(prec)
reca_all.append(reca)
results_dict['prec' + str(i)] = prec
results_dict['reca' + str(i)] = reca
results_dict['acc' + str(i)] = acc
#tuneNB()
test_df = pd.DataFrame(results_dict)
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results
features_list_score_order = [u'poi', u'exercised_stock_options', u'total_stock_value', u'bonus']
pipe = Pipeline([('impute', Imputer(strategy='median')),
('classify', GaussianNB(priors=[.15, .85]))])
total_predictions, accuracy, precision, recall, f1, f2 = \
test_classifier(pipe, my_dataset, features_list_score_order, folds=1000)
dump_classifier_and_data(pipe, my_dataset, features_list_score_order)
开发者ID:eistre91,项目名称:DataAnalystNanodegree,代码行数:30,代码来源:poi_id.py
示例6: SelectKBest
# #print self.X_fit+X
# best_words=self.wt.transform(self.X_fit+X)
# word_pca = self.pca.fit_transform(best_words)
# qqq = np.array(word_pca)[np.arange(len(self.y_fit)),:]
# best_pca_train = self.pt.fit_transform(qqq,self.y_fit)
# self.clf.fit(best_pca_train,self.y_fit)
# #x=remove_low_frequency_words(X)
# best_pca_test = self.pt.transform(np.array(word_pca)[np.arange(len(X))+len(self.X_fit)])
# #word_pca = self.pca.transform(best_words)
# #best_pca = self.pt.transform(word_pca)
# return self.clf.predict(best_pca_test)
## create filtered_gnb classifier
#word_transformer = SelectKBest(f_regression,200)
#pca = PCA(n_components=86)
#pca_transformer = SelectKBest(f_classif,20)
#classifier1 = DecisionTreeClassifier(min_samples_leaf=2)
#classifier2 = GaussianNB()
#classifier3 = KNeighborsClassifier()
#filtered_gnb=FilteredGNB(word_transformer,pca,pca_transformer,classifier1)
#print "FILTERED GNB CLASSIFIER USING ALL WORD FEATURES"
#test_classifier(filtered_gnb, my_dataset, ["poi"]+ words.tolist(),folds=5)
print "Gaussian NB with Word PCA Features:"
test_classifier(GaussianNB(), my_dataset, ["poi"]+ best_word_pca_features)
### Dump your classifier, dataset, and features_list so
### anyone can run/check your results.
dump_classifier_and_data(GaussianNB(), my_dataset, ["poi"]+best_word_pca_features)
开发者ID:avisochek,项目名称:enron_fraud_detection,代码行数:30,代码来源:poi_id.py
示例7: build_model
# Compare the original features alone (None) against the original features
# plus each engineered feature in turn. The feature list is built inside the
# loop so each concatenation happens immediately before its build_model call.
for _extra in (None, 'grand_total', 'from_poi_ratio', 'to_poi_ratio'):
    if _extra is None:
        _candidate = original_features
    else:
        _candidate = original_features + [_extra]
    _ = build_model(_candidate, estimator, {}, use_kbest=True, k=['all'], use_scaler=True)

# ----------------------------------------------------------
# Final Model
# ----------------------------------------------------------
# Same call as the 'grand_total' comparison above, except that k is left at
# build_model's default.
final_model, final_features = build_model(
    original_features + ['grand_total'],
    estimator,
    {},
    use_kbest=True,
    use_scaler=True,
)
test_classifier(final_model, data_dict, final_features, folds=1000)

# ----------------------------------------------------------
# Dump Classifier and Data
# ----------------------------------------------------------
dump_classifier_and_data(final_model, data_dict, final_features)
开发者ID:nhtruong,项目名称:ud120-projects,代码行数:29,代码来源:poi_id.py
示例8: evaluate_clasifier
def evaluate_clasifier(df, extras, algo, dump=False):
    """Evaluate and possibly store classifier and data.

    Runs a grid search over the pipeline built by ``create_pipeline``,
    prints the search results and the feature scores of the best estimator,
    and evaluates that estimator with ``test_classifier``.

    Args:
        df: input DataFrame; handed to ``create_features`` and then
            ``features_split_df``.
        extras: unpacked into ``create_features`` and also passed to
            ``init_logfile`` and ``create_pipeline``.
        algo: passed to ``init_logfile`` and ``create_pipeline``.
        dump: when True, the best estimator, dataset and feature list are
            saved with ``dump_classifier_and_data``.  When False, a logfile
            is opened through ``init_logfile`` for the duration of the
            search and closed again with ``close_logfile``.
    """
    if not dump:
        # Only redirect output for the search
        orig_stdout, logfile = init_logfile(extras, algo)

    # try/finally guarantees close_logfile runs even when the search or the
    # evaluation raises; previously the logfile stayed open on any error.
    try:
        ### Task 3: Create new feature(s)
        df = create_features(df, *extras)

        ### Extract features and labels from dataset for local testing
        dfx, dfy = features_split_df(df)

        ### Task 4 / Task 5: build the pipeline and tune it.
        ### Because of the small size of the dataset, stratified shuffle
        ### split cross validation is used. For more info:
        ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
        split_indices = StratifiedShuffleSplit(dfy, n_iter=1000, test_size=0.1)
        features_list = ['poi'] + dfx.columns.values.tolist()

        pipeline, params = create_pipeline(
            algo,
            extras,
            is_search=(not dump),
            max_features=len(dfx.columns))

        grid_searcher = GridSearchCV(
            pipeline,
            param_grid=params,
            cv=split_indices,
            n_jobs=-1,
            scoring=create_scorer(),
            verbose=0)

        t0 = time()
        with warnings.catch_warnings():
            # UserWarnings are silenced only while fitting.
            warnings.simplefilter('ignore', UserWarning)
            grid_searcher.fit(dfx, y=dfy)

        print('\nTime to fit: {:0>8}\n'.format(dt.timedelta(seconds=(time() - t0))))
        print("Best parameters set:")
        print(grid_searcher.best_params_)
        print('')
        print('Grid score:')
        # Distinct loop names: the original reused ``params`` here and
        # shadowed the parameter grid returned by create_pipeline.
        for grid_params, mean_score, _ in grid_searcher.grid_scores_:
            print("%0.3f for %r" % (mean_score, grid_params))
        print('')

        # Table of feature name / selector score / selected flag, taken from
        # the 'selection' step of the best pipeline.
        selector = grid_searcher.best_estimator_.named_steps['selection']
        scored = pd.DataFrame(zip(
            dfx.columns.tolist(),
            selector.scores_,
            selector.get_support()))
        scored.columns = ['Feature', 'Score', 'Selected']
        scored = scored.sort_values(by=['Score'], ascending=False)
        scored.index = range(1, len(scored) + 1)
        n_selected = len(scored[scored.Selected])
        print('Scored features: {} selected'.format(n_selected))
        print(scored)
        print('')

        # n_pca_components = grid_searcher.best_estimator_.named_steps[
        #     'reducer'].n_components_
        # print "Reduced to {0} PCA components".format(n_pca_components)

        ### Task 6: Dump your classifier, dataset, and features_list so
        ### anyone can check your results.
        clf = grid_searcher.best_estimator_

        ### Store to my_dataset for easy export below.
        df = features_combine_df(dfx, dfy)
        my_dataset = df.to_dict(orient='index')

        test_classifier(clf, my_dataset, features_list)

        if dump:
            dump_classifier_and_data(clf, my_dataset, features_list)
    finally:
        if not dump:
            close_logfile(orig_stdout, logfile)
开发者ID:j-bennet,项目名称:udacity-nano-da,代码行数:96,代码来源:poi_id.py
示例9: zip
param_grid=tree_param_grid,
scoring="recall")
### Show results of parameter tuning
grid_search.fit(features_train, labels_train)
print "\nbest estimator: \n", (grid_search.best_estimator_),\
"\n best score:\n",grid_search.best_score_ ,\
"\n best params:\n",grid_search.best_params_
clf = grid_search.best_estimator_
features_selected_bool = clf.named_steps['skb'].get_support()
features_selected_list = [x for x, y in zip(features_selected_list[1:],
features_selected_bool ) if y]
print "\nselected features: ", features_selected_list
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf, my_dataset, ["poi"]+features_selected_list)
开发者ID:olix20,项目名称:Enron-fraud-analysis,代码行数:30,代码来源:poi_id.py
示例10: Pipeline
# Grid-search a two-step pipeline (SelectKBest with f_classif, then
# GaussianNB), scored on precision, with `cv` as the cross-validation
# splitter.
pipe = Pipeline(steps=[('skbest', SelectKBest(score_func=f_classif)), ('clf', GaussianNB())])
cv = StratifiedShuffleSplit(labels,n_iter = 60,random_state = 42)
b_grid_search = grid_search.GridSearchCV(pipe, param_grid = clf_params,cv = cv,scoring = 'precision')
b_grid_search.fit(features_saved,labels_saved)
# t0 is set before this excerpt; it is reset right after the elapsed time
# is reported.
print 'Time:',round(time()-t0,3) ,'s\n'
t0 = time()
# pick a winner
best_clf_nb = b_grid_search.best_estimator_
print best_clf_nb
# Pull the two steps out of the winning pipeline by position:
# steps[0] is the 'skbest' selector, steps[1] the 'clf' GaussianNB.
found_skb_nb=best_clf_nb.steps[0][1]
found_clf_nb=best_clf_nb.steps[1][1]
# Refit the selector on the saved features; the returned array `features`
# is not used again in the visible lines.
features=found_skb_nb.fit_transform(features_saved,labels_saved)
# Use the selector's support mask to pick the names of the kept features.
features_list_to_use_nb=np.asarray(all_features_list_saved)[found_skb_nb.get_support()].tolist()
print "\nFeatures used:"
print features_list_to_use_nb
test_classifier(found_clf_nb, dataset_to_export, ['poi']+features_list_to_use_nb)
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
# NOTE(review): this dumps svm_clf with features_list, not the
# found_clf_nb / features_list_to_use_nb pair evaluated just above --
# confirm that the SVM defined earlier in the script is the intended model.
dump_classifier_and_data(svm_clf, dataset_to_export, ['poi']+features_list)
开发者ID:buinyi,项目名称:Intro-To-Machine-Learning,代码行数:29,代码来源:poi_id.py
示例11: main
def main():
print "=========="
import sys
#import os
import pickle
from time import time
## evaluation
from sklearn.metrics import precision_score, recall_score
import matplotlib.pyplot as plt
import pandas as pd
#from ggplot import *
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
#from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.cross_validation import train_test_split
###############################################################################
###############################################################################
###############################################################################
## current file running
print "Running:", sys.argv[0].split("/")[-1]
t_start_all = time()
### import helper functions
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
## make sure 'tester' in same dir
from tester import dump_classifier_and_data
## moving loading dict code to be consistent with 'validate.py' ex from prev.
## lesson.
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
data_dict = pickle.load(data_file)
'''
#Example structure of data_dict:
>>> data_dict
{'METTS MARK': {'salary': 365788,
'to_messages': 807,
'deferral_payments': 'NaN',
'total_payments': 1061827,
'exercised_stock_options': 'NaN',
'bonus': 600000,
'restricted_stock': 585062,
'shared_receipt_with_poi': 702,
'restricted_stock_deferred': 'NaN',
'total_stock_value': 585062,
'expenses': 94299,
'loan_advances': 'NaN',
'from_messages': 29,
'other': 1740,
'from_this_person_to_poi': 1,
'poi': False,
'director_fees': 'NaN',
'deferred_income': 'NaN',
'long_term_incentive': 'NaN',
'email_address': '[email protected]',
'from_poi_to_this_person': 38
},
'BAXTER JOHN C': {'salary': 267102,
'to_messages': 'NaN',
'deferral_payments': 1295738,
'total_payments': 5634343,
'exercised_stock_options': 6680544,
'bonus': 1200000,
'restricted_stock': 3942714,
'shared_receipt_with_poi': 'NaN',
'restricted_stock_deferred': 'NaN',
'total_stock_value': 10623258,
'expenses': 11200,
'loan_advances': 'NaN',
'from_messages': 'NaN',
'other': 2660303,
'from_this_person_to_poi': 'NaN',
'poi': False,
'director_fees': 'NaN',
'deferred_income': -1386055,
'long_term_incentive': 1586055,
'email_address': 'NaN',
'from_poi_to_this_person': 'NaN'
},
...
'''
###############################################################################
###############################################################################
###############################################################################
print "----------"
'''
##### Task 0. Data Exploration
#.........这里部分代码省略.........
开发者ID:ximiki,项目名称:udacity_P5,代码行数:101,代码来源:poi_id_B1_1605131351.py
示例12: zip
feats.append(y)
for x,y in zip(full_features_list[1:], clf_best.named_steps['skb'].scores_):
list_scores.append({'feature_list' : x, "scores" : y})
print feats
print pd.DataFrame(list_scores)
print "---------------------------------------------------------------"
for param_name in sorted(grid_search.param_grid.keys()):
print("\t%s: %r" % (param_name, best_parameters[param_name]))
print "GridSearch time:"
time1 = round(time()-t0,2)
print time1
print "test_classifier time:"
t1 = time()
test_classifier(clf_best, my_dataset, full_features_list)
time2 = round(time()-t1, 2)
print time2
print "total time:", time2+time1
print "-----------------------------------------------------------------------"
###############################################################################
## Tune classifier
### Generates the necessary .pkl files for validating results.
if full_report:
for clf in [ dtc, gnc, knn, abc, rfc ]:
test_classifier(clf, my_dataset, features_list)
dump_classifier_and_data(clf_best, my_dataset, full_features_list)
开发者ID:ondramie,项目名称:machine-learning-python-enron-employees,代码行数:30,代码来源:poi_id.py
示例13: make_pipeline
# (tail of a commented-out estimator definition that starts before this
# excerpt)
# decision_function_shape='ovo', degree=3, gamma='auto',
# kernel='linear', max_iter=-1, probability=False,
# random_state=20160308, shrinking=False, tol=0.001,
# verbose=False))
# Active pipeline: median imputation of 'NaN' values followed by an
# ExtraTreesClassifier with balanced class weights and a fixed random_state.
pipe = make_pipeline(
Imputer(axis=0, copy=True, missing_values='NaN',
strategy='median', verbose=0),
ExtraTreesClassifier(bootstrap=False, class_weight='balanced',
criterion='gini', max_depth=None,
max_features='sqrt', max_leaf_nodes=None,
min_samples_leaf=3, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=30,
n_jobs=-1, oob_score=False,
random_state=20160308, verbose=0,
warm_start=False))
# Disabled alternative: the same pipeline with a SelectFpr(f_classif)
# feature-selection step between the imputer and the classifier.
#pipe = make_pipeline(
# Imputer(axis=0, copy=True, missing_values='NaN',
# strategy='median', verbose=0),
# SelectFpr(alpha=0.05, score_func=f_classif),
# ExtraTreesClassifier(bootstrap=False, class_weight='balanced',
# criterion='gini', max_depth=None,
# max_features='sqrt', max_leaf_nodes=None,
# min_samples_leaf=3, min_samples_split=2,
# min_weight_fraction_leaf=0.0, n_estimators=30,
# n_jobs=-1, oob_score=False,
# random_state=20160308, verbose=0,
# warm_start=False))
# Task 6: Dump your classifier, dataset, and features_list
# The DataFrame is converted to a dict keyed by row index for the dump, and
# 'poi' is placed first in the feature list.
dump_classifier_and_data(pipe, df.to_dict(orient='index'), ['poi'] + F_ALL_NEW)
开发者ID:pqmagicwu,项目名称:uda-da-p5-enron-fraud-detection,代码行数:30,代码来源:poi_id.py
示例14: Pipeline
# Time the evaluation of a scaler + 3-nearest-neighbours pipeline.
t= time.time()
# `scaler` is defined before this excerpt. p=1 with the minkowski metric
# selects Manhattan distance.
pipeline = Pipeline([('normalization', scaler),
('classifier', KNeighborsClassifier(n_neighbors=3, weights='uniform', algorithm='auto',
leaf_size=30, p=1, metric='minkowski'))])
test_classifier(pipeline, enron_data, features_select(4))
# Elapsed seconds since t was taken (covers pipeline construction and
# test_classifier).
print time.time()-t
# ###Data dump
# In[45]:
### Dump your classifier, dataset, and features_list so
### anyone can run/check your results.
# NOTE(review): features_select(4) is called a second time here; this
# assumes it returns the same list as in the test_classifier call -- confirm.
dump_classifier_and_data(pipeline, enron_data, features_select(4))
# ###Additional methods to explore include:
#
# * using k-fold cross-validation to improve model validation
# In[ ]:
开发者ID:BlaneG,项目名称:Udacity_Intro_machine_learning,代码行数:26,代码来源:poi_id.py
示例15: time
# Grid-search a DecisionTreeClassifier; `parameters`, `scoring`, `cv` and
# `t0` are defined before this excerpt.
dtc_clf = sklearn.tree.DecisionTreeClassifier()
dtcclf = grid_search.GridSearchCV(dtc_clf, parameters, scoring = scoring, cv = cv)
dtcclf.fit(features, labels)
print dtcclf.best_estimator_
print dtcclf.best_score_
print 'Processing time:',round(time()-t0,3) ,'s'
#Classifier validation
##DecisionTreeClassifier Validation 1 (StratifiedShuffleSplit, folds = 1000)
t0 = time()
dtc_best_clf = dtcclf.best_estimator_
test_classifier(dtc_best_clf, enron_data, eng_feature_list)
print 'Processing time:',round(time()-t0,3) ,'s'
##DecisionTreeClassifier Validation 2 (Randomized, partitioned trials, n=1,000)
t0 = time()
dtc_best_clf = dtcclf.best_estimator_
# NOTE(review): the return value of evaluate_clf is discarded, and `scores`
# on the next line is not assigned anywhere in the visible lines -- unless
# it is defined earlier in the script, the Accuracy print raises NameError
# or reports values unrelated to this run.
evaluate.evaluate_clf(dtc_best_clf, features, labels, num_iters=1000, test_size=0.3)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print 'Processing time:',round(time()-t0,3) ,'s'
#Dump my classifier
dump_classifier_and_data(dtc_best_clf, enron_data, eng_feature_list)
开发者ID:myaqoob67,项目名称:Udacity_ML,代码行数:29,代码来源:poi_id.py
示例16: train_test_split
#print "accuracy score is ",accuracy
#print "recall score is ",recall
#print "precision score is ",precision
# Example starting point. Try investigating other evaluation techniques!
#from sklearn.cross_validation import train_test_split
#features_train, features_test, labels_train, labels_test = \
# train_test_split(features, labels, test_size=0.3, random_state=42)
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf, my_dataset, testfeatureswithpoi)
##MYCODE :convert my_dataset dictionary to list
#import csv
#dictlist=[]
#temp=[]
#fieldnames=['name']
#for name,detail in my_dataset.iteritems():
# temp.append(name)
# for key,value in detail.iteritems():
# temp.append(value)
开发者ID:Maggiebj,项目名称:P5_enron_project,代码行数:31,代码来源:poi_id.py
示例17: main
#.........这里部分代码省略.........
best_clf_config_list = clf_collection.sort_values(['precision','recall',
'accuracy','number of features'],
ascending=[False,False,False,True])
clf_collection.sort_values(['precision', 'recall',
'accuracy', 'number of features'],
ascending=[False, False, False, True])
# dump the results of all the tested classifiers and related configurtion
# and train/test setup
clf_collection.to_csv("training_data.csv", sep=',', encoding='utf-8')
# iterating through all the classifiers chosen
print "Validating list of best classifiers: "
for index, best_clf_config in best_clf_config_list.iterrows():
# go for the best, instantiate it and dump the data
best_clf_class_id = best_clf_config["class_id"]
best_clf_params = best_clf_config["best parameters"]
for id, clf_class, clf_kwargs, feat_scaling in gl_clf_list:
if id == int(best_clf_class_id):
try:
# instantiate classifier
best_clf = clf_class(**best_clf_params)
best_clf_org = clf_class(**best_clf_params)
if best_clf_config["features_scaled"]:
# train the algorithm
best_clf.fit(my_features_train_scaled, my_labels_train)
best_clf_org.fit(orig_features_train_scaled, orig_labels_train)
else:
# train the algorithm
best_clf_org.fit(orig_features_train, orig_labels_train)
best_clf.fit(my_features_train, my_labels_train)
print "start original data set"
# test with original data set
#v_o_total_predictions, v_o_accuracy, v_o_precision, v_o_recall, v_o_f1, v_o_f2 =\
# test_classifier(best_clf_org, orig_dataset, orig_features_list)
#clf_best_collection.loc[1000 + index] = (best_clf_config["class_id"],
# best_clf_config["clf"],
# best_clf_config['features_scaled'],
# len(orig_features_list),
# str(my_features_list),
# v_o_accuracy, v_o_precision,
# v_o_recall, best_clf_params,
# best_clf_config["best estimator"],
# True,
# create_new_message_features,
# create_new_finance_features,
# PCA_info)
# dump final information
dump_classifier_and_data(best_clf, my_dataset, my_features_list)
print "start original data set with new features"
#test with newly created features on top of the original data set
v_total_predictions, v_accuracy, v_precision, v_recall, v_f1, v_f2 =\
test_classifier(best_clf, my_dataset, my_features_list, do_perform_PCA,
pca_components, best_clf_config['features_scaled'])
clf_best_collection.loc[index] = (best_clf_config["class_id"],
best_clf_config["clf"],
best_clf_config['features_scaled'],
len(my_features_list),
str(my_features_list),
v_accuracy, v_precision, v_recall,
best_clf_params,
best_clf_config["best estimator"],
False,
create_new_message_features,
create_new_finance_features,
PCA_info
)
except TypeError:
clf_best_collection.loc[index] = (best_clf_config["class_id"],
best_clf_config["clf"],
best_clf_config['features_scaled'],
len(my_features_list),
str(my_features_list),
"Error", "Error", "Error",
best_clf_params,
best_clf_config["best estimator"],
False,
create_new_message_features,
create_new_finance_features,
PCA_info
)
cbc = clf_best_collection.sort_values(['precision', 'recall', 'accuracy'],
ascending=[False, False,
False])
# write classifier validation result to file
cbc.to_csv(output_file_results, sep=',', encoding='utf-8')
print "###################################################################"
print "###################################################################"
开发者ID:cellarstar,项目名称:udacity,代码行数:101,代码来源:poi_id.py
示例18: accuracy_score
print "For optimum",score,":"
for name in classifier_names:
print " ",name,": ",best_performance[score][name]
# acc = accuracy_score(pred, labels_test)
# print ""
# print "Accuracy:",acc," (Good predictions / All predictions)"
# pre = precision_score(pred, labels_test)
# print "Precision:",pre," (Real POIs / Predicted POIs)"
# rec = recall_score(pred, labels_test)
# print "Recall:",rec," (Identified POIs / All POIs)"
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
# Example starting point. Try investigating other evaluation techniques!
# features_train, features_test, labels_train, labels_test = \
# train_test_split(features, labels, test_size=0.3, random_state=42)
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
# dump_classifier_and_data(clf, my_dataset, features_list)
dump_classifier_and_data(clf, data_dict, features_list)
开发者ID:DaniGate,项目名称:ud120-projects,代码行数:30,代码来源:poi_id.py
示例19: SelectKBest
('select_features', SelectKBest(f_classif, k=opt_features)),
('reduce_dim', PCA()),
('naive', GaussianNB())])
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print("")
print("Efficiency of selected algorithm:")
print 'F1 score:\t', '{0:.2f}'.format(f1_score(labels_test, pred))
print 'Accuracy:\t', '{0:.2f}'.format(accuracy_score(labels_test, pred))
print 'Precision:\t', '{0:.2f}'.format(precision_score(labels_test, pred))
print 'Recall:\t', '{0:.2f}'.format(recall_score(labels_test, pred))
scores = clf.named_steps['select_features'].scores_
features_selected_bool = clf.named_steps['select_features'].get_support(indices=True)
features_selected = [features_list[i+1] for i in features_selected_bool]
features_scores = [scores[i] for i in features_selected_bool]
print("")
print('Feature scores:')
for i in range(len(features_scores)):
print features_selected[i], '{0:.2f}'.format(features_scores[i])
features_selected.insert(0, 'poi')
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf, my_dataset, features_selected)
开发者ID:nikita-barsukov,项目名称:intro-to-ml,代码行数:30,代码来源:poi_id.py
示例20: test_clf
# Evaluate the grid search configured before this excerpt.
test_clf(grid_search, labels, features, parameters)
# Second candidate: AdaBoost, searched over the number of estimators, the
# boosting algorithm and the learning rate (GridSearchCV defaults otherwise).
clf = AdaBoostClassifier()
parameters = {'n_estimators': [10, 20, 30, 40, 50],
'algorithm': ['SAMME', 'SAMME.R'],
'learning_rate': [.5,.8, 1, 1.2, 1.5]}
grid_search = GridSearchCV(clf, parameters)
print '\nAdaBoost:'
test_clf(grid_search, labels, features, parameters)
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
# Example starting point. Try investigating other evaluation techniques!
from sklearn.cross_validation import train_test_split
# NOTE(review): the split results are not used in the visible lines below.
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.3, random_state=42)
from sklearn.naive_bayes import GaussianNB
# clf is rebound here to a new, unfitted GaussianNB, replacing the AdaBoost
# instance assigned above.
clf = GaussianNB()
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, b
|
请发表评论