# Import libraries necessary for this project
import numpy as np
import pandas as pd
from time import time
from IPython.display import display  # Allows the use of display() for DataFrames

# Import train_test_split
from sklearn.model_selection import train_test_split

%matplotlib inline
# Load the preprocessed data and separate the target from the features
data = pd.read_csv("processed_data1.csv")
target = data['target']
data = data.drop('Unnamed: 0', axis=1)  # drop the leftover index column written by to_csv
features = data.drop('target', axis=1)
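As an aside, the 'Unnamed: 0' column is just the old DataFrame index that to_csv wrote out; passing index_col=0 to read_csv avoids creating it in the first place. A minimal alternative (illustrative, not the original code):

# Illustrative: read the CSV without materializing the old index as a column
data = pd.read_csv("processed_data1.csv", index_col=0)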
Split into training and testing sets
# Split the 'features' and 'target' data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target,
                                                    test_size=0.2,
                                                    random_state=0)
# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))
Training set has 32950 samples.
Testing set has 8238 samples.
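The split is not stratified, so the class proportions in the two halves can drift slightly. A quick sanity check (a hypothetical follow-up, not in the original notebook); if the proportions are skewed, train_test_split accepts stratify=target to preserve them exactly:

# Illustrative: compare class proportions across the split
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))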
from sklearn.metrics import fbeta_score
from sklearn.metrics import accuracy_score
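For reference, fbeta_score with beta = 0.5 weights precision more heavily than recall. A minimal manual version, equivalent under the standard definition (illustrative, not part of the original notebook; fbeta_manual is a hypothetical helper):

# F-beta combines precision (P) and recall (R):
#   F_beta = (1 + beta^2) * P * R / (beta^2 * P + R)
# With beta < 1, precision dominates the score.
from sklearn.metrics import precision_score, recall_score

def fbeta_manual(y_true, y_pred, beta=0.5):
    p = precision_score(y_true, y_pred)
    r = recall_score(y_true, y_pred)
    return (1 + beta**2) * p * r / (beta**2 * p + r)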
def train_predict(learner, sample_size, X_train, y_train, X_test, y_test):
    '''
    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train: features training set
       - y_train: target training set
       - X_test: features testing set
       - y_test: target testing set
    '''
    results = {}

    # Fit the learner to the training data, slicing with 'sample_size'
    start = time()  # Get start time
    learner = learner.fit(X_train[:sample_size], y_train[:sample_size].values.ravel())
    end = time()  # Get end time

    # Calculate the training time
    results['train_time'] = end - start

    # Get predictions on the test set (X_test),
    # then on the first 300 training samples (X_train) using .predict()
    start = time()  # Get start time
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train[:300])
    end = time()  # Get end time

    # Calculate the total prediction time
    results['pred_time'] = end - start

    # Compute accuracy on the first 300 training samples
    results['acc_train'] = accuracy_score(y_train[:300], predictions_train)

    # Compute accuracy on the test set
    results['acc_test'] = accuracy_score(y_test, predictions_test)

    # Compute F-score on the first 300 training samples
    results['f_train'] = fbeta_score(y_train[:300], predictions_train, beta=0.5)

    # Compute F-score on the test set
    results['f_test'] = fbeta_score(y_test, predictions_test, beta=0.5)

    # Success
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))

    # Return the results
    return results
# Import the three supervised learning models from sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

import random
random.seed(2020)

# Initialize the three models (defaults set explicitly to silence version warnings)
clf_A = LogisticRegression(solver='liblinear')
clf_B = RandomForestClassifier(n_estimators=100)
clf_C = KNeighborsClassifier()
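Note that random.seed only seeds Python's built-in random module; scikit-learn estimators draw their randomness from NumPy, so reproducibility is controlled by the random_state parameter. A minimal sketch of the reproducible alternative (illustrative):

# Illustrative: pass random_state instead of seeding the `random` module
clf_B_reproducible = RandomForestClassifier(n_estimators=100, random_state=2020)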
# Calculate the number of samples for 1%, 10%, and 100% of the training data
samples_100 = len(y_train)
samples_10 = int(len(y_train) * 0.1)
samples_1 = int(len(y_train) * 0.01)
# Collect results on the learners
results = {}
for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        results[clf_name][i] = \
            train_predict(clf, samples, X_train, y_train, X_test, y_test)
LogisticRegression trained on 329 samples.
LogisticRegression trained on 3295 samples.
LogisticRegression trained on 32950 samples.
RandomForestClassifier trained on 329 samples.
RandomForestClassifier trained on 3295 samples.
RandomForestClassifier trained on 32950 samples.
KNeighborsClassifier trained on 329 samples.
KNeighborsClassifier trained on 3295 samples.
KNeighborsClassifier trained on 32950 samples.
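results now maps each classifier name to a dictionary keyed by sample-size index (0 = 1%, 1 = 10%, 2 = 100%). One way to view the collected metrics side by side (not in the original notebook):

# Illustrative: flatten the nested results into a single DataFrame
summary = pd.concat({name: pd.DataFrame(runs) for name, runs in results.items()}, axis=1)
display(summary)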
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

random.seed(42)

# Initialize the classifier
clf = KNeighborsClassifier()
# Create the dictionary of parameters to tune
k_range = list(range(1, 10))
parameters = {'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
              'weights': ['uniform', 'distance'],
              'n_neighbors': k_range}
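This grid contains 4 × 2 × 9 = 72 candidate settings, so with cv = 10 below the model is fit 720 times, which can be slow for k-NN on roughly 33,000 samples. A quick way to count the candidates (illustrative):

# Illustrative: count the settings GridSearchCV will evaluate
from sklearn.model_selection import ParameterGrid
print(len(ParameterGrid(parameters)))  # 4 * 2 * 9 = 72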
# Make an fbeta_score scoring object using make_scorer()
scorer = make_scorer(fbeta_score, beta=0.5)
# Perform grid search on the classifier using 'scorer' as the scoring method
grid_obj = GridSearchCV(clf, parameters, scoring=scorer, cv=10)
# Fit the grid search object to the training data to find the optimal parameters
grid_fit = grid_obj.fit(X_train, y_train.values.ravel())
# Get the best estimator
best_clf = grid_fit.best_estimator_
# Make predictions using the unoptimized and optimized models
predictions = (clf.fit(X_train, y_train.values.ravel())).predict(X_test)
best_predictions = best_clf.predict(X_test)
# Report the before-and-after scores
print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta=0.5)))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta=0.5)))
print('best params are:', str(grid_obj.best_params_))
Unoptimized model
------
Accuracy score on testing data: 0.8958
F-score on testing data: 0.4799
Optimized Model
------
Final accuracy score on the testing data: 0.8989
Final F-score on the testing data: 0.4920
best params are: {'algorithm': 'brute', 'n_neighbors': 9, 'weights': 'uniform'}
Cutoff value
data = pd.read_csv("processed_data5.csv") target = data['target'] data = data.drop('Unnamed: 0', axis = 1) features = data.drop('target', axis = 1)
# Accuracy and F-score on the test set with cutoff = 0.5
random.seed(42)
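If "cutoff" here refers to the classification decision threshold (one reading of the comment), 0.5 is what .predict() applies implicitly: a sample is labeled positive when its predicted probability for the positive class exceeds 0.5. A sketch of applying an explicit cutoff, assuming a binary 0/1 target (illustrative, not the original code):

# Illustrative: reproduce .predict() by thresholding predicted probabilities
proba = best_clf.predict_proba(X_test)  # column 1 = positive class
cutoff = 0.5                            # hypothetical threshold to vary
cutoff_predictions = (proba[:, 1] >= cutoff).astype(int)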
# Import RandomizedSearchCV to tune a model that exposes 'feature_importances_'
from sklearn.model_selection import RandomizedSearchCV
# Define the search space and tune the random forest on the training set
param_dist = {"max_depth": [3, None],
              "n_estimators": list(range(10, 200)),
              "max_features": list(range(1, X_test.shape[1] + 1)),
              "min_samples_split": list(range(2, 11)),
              "min_samples_leaf": list(range(1, 11)),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

model = RandomizedSearchCV(clf_B, param_distributions=param_dist)
model.fit(X_train, y_train.values.ravel())

# Extract the feature importances using .feature_importances_
importances = model.best_estimator_.feature_importances_
C:\Users\jasonguo\Anaconda3\lib\site-packages\sklearn\model_selection\_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.
warnings.warn(CV_WARNING, FutureWarning)
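Unlike GridSearchCV, RandomizedSearchCV samples a fixed number of settings (n_iter, 10 by default) from param_distributions rather than trying every combination. The FutureWarning above concerns the default number of folds; passing cv explicitly silences it (illustrative):

# Illustrative: make the sampling budget and fold count explicit
model = RandomizedSearchCV(clf_B, param_distributions=param_dist, n_iter=10, cv=3)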
# Plot the five most important features
import matplotlib.pyplot as pl

def feature_plot(importances, X_train, y_train):
    # Identify the five most important features
    indices = np.argsort(importances)[::-1]
    columns = X_train.columns.values[indices[:5]]
    values = importances[indices][:5]

    # Create the plot
    fig = pl.figure(figsize=(9, 5))
    pl.title("Normalized Weights for First Five Most Predictive Features", fontsize=16)
    pl.bar(np.arange(5), values, width=0.6, align="center", color='#00A000',
           label="Feature Weight")
    pl.bar(np.arange(5) - 0.3, np.cumsum(values), width=0.2, align="center", color='#00A0A0',
           label="Cumulative Feature Weight")
    pl.xticks(np.arange(5), columns)
    pl.xlim((-0.5, 4.5))
    pl.ylabel("Weight", fontsize=12)
    pl.xlabel("Feature", fontsize=12)
    pl.legend(loc='upper center')
    pl.tight_layout()
    pl.show()
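The call that actually produces the plot is not shown in the source; presumably it is:

# Presumed invocation (not shown in the original)
feature_plot(importances, X_train, y_train)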
# Reduce the feature space to the five most important features
top5 = X_train.columns.values[np.argsort(importances)[::-1][:5]]
X_train_reduced = X_train[top5]
X_test_reduced = X_test[top5]
# Train on the "best" model found from grid search earlier clf = (clone(best_clf)).fit(X_train_reduced, y_train)
# Make new predictions
reduced_predictions = clf.predict(X_test_reduced)
# Report scores from the final model using both versions of the data
print("Final Model trained on full data\n------")
print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta=0.5)))
print("\nFinal Model trained on reduced data\n------")
print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, reduced_predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, reduced_predictions, beta=0.5)))
Final Model trained on full data
------
Accuracy on testing data: 0.8989
F-score on testing data: 0.4920
Final Model trained on reduced data
------
Accuracy on testing data: 0.8934
F-score on testing data: 0.4420
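Restricting the model to the five most important features costs only about half a percentage point of accuracy but roughly five points of F-score, so the reduced model is attractive mainly when training or prediction time matters more than the F-score.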