Scikit-learn Flashcards
shuffle the data and normalize features
# Shuffle the rows reproducibly (fixed random_state).
df_shuffled = shuffle(df, random_state=123)
# Every column except the last one is passed through scale() and used as features.
X = scale(df_shuffled[df_shuffled.columns[:-1]])
# The "cnt" column supplies the target values.
y = df_shuffled["cnt"]
Train Lasso regressor
from sklearn import cross_validation, datasets, linear_model, metrics
# Build a Lasso model with default settings and fit it in one expression;
# fit() hands back the estimator, so the name is bound to the fitted model.
regression_lasso = linear_model.Lasso().fit(X, y)
Train Ridge regressor
from sklearn import cross_validation, datasets, linear_model, metrics
# Ridge model with a fixed random_state, fitted in the same expression;
# fit() hands back the estimator, so the name is bound to the fitted model.
regression_ridge = linear_model.Ridge(random_state=1).fit(X, y)
Для каждого значения коэффициента из alphas обучите регрессор Lasso и запишите веса в соответствующую строку матрицы coefs_lasso
from sklearn import cross_validation, datasets, linear_model, metrics
# 1) Variant one: explicit loop over the regularization strengths.
# alphas must exist before the weight matrices are sized from it.
alphas = np.arange(1, 500, 50)

# One row of weights per alpha, one column per feature of X.
coefs_lasso = np.zeros((alphas.shape[0], X.shape[1]))
coefs_ridge = np.zeros((alphas.shape[0], X.shape[1]))

for ind, a in enumerate(alphas):
    # Lasso and Ridge are reached through the imported linear_model module.
    lassor = linear_model.Lasso(alpha=a)
    lassor.fit(X, y)
    coefs_lasso[ind] = lassor.coef_

    ridger = linear_model.Ridge(alpha=a)
    ridger.fit(X, y)
    coefs_ridge[ind] = ridger.coef_
# 2) Variant two: a helper that fits an estimator and returns its weights.
def get_trained_coefs(clf):
    """Fit ``clf`` on the module-level ``X`` and ``y`` and return its weights.

    Args:
        clf: An estimator exposing ``fit(X=..., y=...)`` and, once fitted,
            a ``coef_`` attribute.

    Returns:
        The ``coef_`` attribute of the fitted estimator.
    """
    clf.fit(X=X, y=y)
    return clf.coef_
# Fill one row of each weight matrix per regularization strength.
# The matrices have one row per entry of alphas, so iterating alphas directly
# visits the same indices as iterating the rows of coefs_lasso.
for idx, alpha in enumerate(alphas):
    coefs_lasso[idx] = get_trained_coefs(clf=linear_model.Lasso(alpha=alpha))
    coefs_ridge[idx] = get_trained_coefs(clf=linear_model.Ridge(alpha=alpha))
Train a LassoCV regressor over a range of alpha values
from sklearn import cross_validation, datasets, linear_model, metrics
# LassoCV lives in the sklearn.linear_model package, not a top-level module.
from sklearn.linear_model import LassoCV

# Candidate regularization strengths 1, 2, ..., 99.
alphas = np.arange(1, 100, 1)
# Fit LassoCV over the supplied alpha grid and keep the fitted model.
model_LCV = LassoCV(alphas=alphas).fit(X, y)
Find the alpha that corresponds to the minimum MSE in LassoCV
from sklearn import cross_validation, datasets, linear_model, metrics
# LassoCV lives in the sklearn.linear_model package, not a top-level module.
from sklearn.linear_model import LassoCV

# Candidate regularization strengths 1, 2, ..., 99.
alphas = np.arange(1, 100, 1)
# Read alpha_ from the fitted LassoCV; as a bare expression it is only
# displayed in an interactive session, not stored.
LassoCV(alphas=alphas).fit(X, y).alpha_
One-hot encoding transformation of categorical features into binary
from sklearn.linear_model import LogisticRegression as LR
from sklearn.feature_extraction import DictVectorizer as DV
# Ask the DictVectorizer for dense (non-sparse) output.
encoder = DV(sparse=False)
# Transpose, convert to a dict and feed its values to the vectorizer
# (presumably categorial_data is a pandas DataFrame, giving one dict per row - confirm).
encoded_data = encoder.fit_transform(categorial_data.T.to_dict().values())
Scale numerical features in train and test sets
# Fit the scaler on the training set only, then apply the same
# transformation to both the training and the test set.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
Train a classifier with cross-validation, adjusting for class imbalance
# L2-penalized logistic regression with class_weight='balanced'.
estimator_bal_strat = LogisticRegression(penalty='l2', class_weight='balanced')

# FIRST TRAIN ESTIMATOR
estimator_bal_strat.fit(X_zer_train_strat, y_strat_train)
optimizer_bal_strat = GridSearchCV(estimator_bal_strat, param_grid, cv=3)

# SECOND TRAIN OPTIMIZER
zer_bal_strat_optimizer = optimizer_bal_strat.fit(X_zer_train_strat, y_strat_train)
Find ROC AUC score
# ROC AUC computed from the second column of predict_proba of the best estimator.
# NOTE(review): `bal_strat_optimizer` and `X_test_strat` are not defined in the
# visible snippets (the grid search is stored as `zer_bal_strat_optimizer`) - confirm names.
auc_strat_balanced = roc_auc_score(y_strat_test, bal_strat_optimizer.best_estimator_.predict_proba(X_test_strat)[:,1])
Add polynomial features
# Initialize polynomial transformation class
transform = PolynomialFeatures(2)
# Train polynomial transformation on train set, then apply it to test set
data_train_poly = transform.fit_transform(X_train_strat_real)
data_test_poly = transform.transform(X_test_strat_real)
Train model using polynomial, stratified, balanced numerical features
1)Split into Train and Test sets
2)Train polynomial transformation on train set, then apply it to test set
3)Scale numerical features
Combine numerical and categorical features
4)Train a classifier with cross-validation, adjusting for class imbalance
#FIRST TRAIN ESTIMATOR
#SECOND TRAIN OPTIMIZER
Train Logistic regression using polynomial, stratified, balanced numerical features
1)Split into Train and Test sets
from sklearn.preprocessing import PolynomialFeatures
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
def plot_scores(optimizer):
    """Plot the mean grid-search score against ``C`` with a spread band.

    The x axis is logarithmic. The shaded band spans the mean score plus and
    minus the sample standard deviation of the per-fold scores.

    Args:
        optimizer: A fitted grid search exposing ``grid_scores_``. Each entry
            is indexed as (parameter dict containing ``'C'``, mean score,
            array of per-fold scores).
    """
    scores = [[item[0]['C'],
               item[1],
               # Sample standard deviation of the fold scores around the mean.
               (np.sum((item[2] - item[1]) ** 2) / (item[2].size - 1)) ** 0.5]
              for item in optimizer.grid_scores_]
    # Columns: 0 = C, 1 = mean score, 2 = standard deviation.
    scores = np.array(scores)
    plt.semilogx(scores[:, 0], scores[:, 1])
    plt.fill_between(scores[:, 0], scores[:, 1] - scores[:, 2],
                     scores[:, 1] + scores[:, 2], alpha=0.3)
    plt.show()
# Split X_real_zeros and y 70/30, stratified on y, with a fixed random_state.
(X_train_strat_real,
X_test_strat_real,
y_poly_train, y_poly_test) = train_test_split(X_real_zeros, y,
test_size=0.3,
random_state=0,stratify=y)
# Split X_cat_oh with the same test_size, random_state and stratify arguments
# (presumably so its rows stay aligned with the split above - confirm).
(X_train_poly_cat_oh,
X_test_poly_cat_oh) = train_test_split(X_cat_oh,
test_size=0.3,
random_state=0,stratify=y)
# 2) Initialize polynomial transformation class
transform = PolynomialFeatures(2)
# Train polynomial transformation on train set, then apply it to test set
data_train_poly = transform.fit_transform(X_train_strat_real)
data_test_poly = transform.transform(X_test_strat_real)

# 3) Scale numerical features (scaler is fitted on the train set only)
scaler = StandardScaler().fit(data_train_poly)
X_real_train_poly = scaler.transform(data_train_poly)
X_real_test_poly = scaler.transform(data_test_poly)

# Combine scaled real and categorical features column-wise
X_train_poly = np.hstack((X_real_train_poly, X_train_poly_cat_oh))
X_test_poly = np.hstack((X_real_test_poly, X_test_poly_cat_oh))

# FIRST TRAIN ESTIMATOR
estimator_poly = LogisticRegression(penalty='l2', class_weight='balanced', fit_intercept=False)
estimator_poly.fit(X_train_poly, y_poly_train)

# SECOND TRAIN OPTIMIZER
optimizer_poly = GridSearchCV(estimator_poly, param_grid, cv=3)
zer_poly_optimizer = optimizer_poly.fit(X_train_poly, y_poly_train)

# AUC ROC score from the second column of predict_proba on the test set
auc_poly_balanced = roc_auc_score(y_poly_test, zer_poly_optimizer.best_estimator_.predict_proba(X_test_poly)[:,1])

plot_scores(zer_poly_optimizer)
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(zer_poly_optimizer.best_params_)
print(auc_poly_balanced)
Train tree classifier with minimum 3 elements at a leaf
from sklearn import cross_validation, datasets, metrics, tree
# Decision tree requiring at least 3 samples in every leaf, with a fixed seed.
estimator = tree.DecisionTreeClassifier(min_samples_leaf=3, random_state=1)
estimator.fit(train_data, train_labels)
Create Bagging Classifier with n=100 estimators
Find its cross_val_score on 10 folds
from sklearn import cross_validation, datasets, metrics, tree
from sklearn.ensemble import BaggingClassifier
# Base learner: a decision tree with default settings.
tree_clf = tree.DecisionTreeClassifier()
# Bagging ensemble of 100 such trees.
bagging_clf = BaggingClassifier(tree_clf, n_estimators=100)
# cross_val_score is reached through the imported cross_validation module;
# cv=10 requests 10 folds.
scores_bag = cross_validation.cross_val_score(bagging_clf, X, y, cv=10)