Visualization Flashcards
enable inline plotting in the notebook
%pylab inline
%matplotlib inline
label the axes on a plot
xlabel('Iteration number')
ylabel('MSE')
draw histogram
data.plot(y='Height', kind='hist',
          color='red', title='Height (inch.) distribution')
draw pairplot from seaborn library
import seaborn as sns
sns.pairplot(data)
draw boxplot
sns.boxplot(x='weight_category', y='Height', data=data)
draw scatterplot
data.plot(x='Weight', y='Height', kind='scatter', title='Height (inches) / Weight (pounds)')
create a grid of points along the x-axis
x = np.linspace(60, 180, 100)
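(the next card plots y_1 and y_2 against this x; they are not defined on the card itself, so here is a minimal sketch consistent with its legend labels — the formulas are taken from those labels:)
# hypothetical line definitions matching the legend text of the next card
y_1 = 60 + 0.05 * x
y_2 = 50 + 0.16 * x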
draw two lines with a legend in the upper left corner of the plot
line1, = plt.plot(x, y_1, color='magenta', label='height = 60 + 0.05 * weight')
line2, = plt.plot(x, y_2, color='green', label='height = 50 + 0.16 * weight')
plt.legend(handles=[line1, line2], loc='upper left')
plt.show()
draw a plot by applying a function to a range of x values (NumPy array)
w1_range = np.linspace(-0.5, 1, 100)
err = [squared_error(w1, data) for w1 in w1_range]
plt.plot(w1_range, err)
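(squared_error is not defined on this card; a hypothetical helper consistent with the height/weight cards above, assuming a linear model with a fixed intercept w0:)
# hypothetical: sum of squared residuals for height ~ w0 + w1 * weight
def squared_error(w1, data, w0=50.0):
    return np.sum((data['Height'] - (w0 + w1 * data['Weight'])) ** 2)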
create a figure and get a 3D axes
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
create grid
X = np.arange(-5, 5, 0.25)
Y = np.arange(-5, 5, 0.25)
X, Y = np.meshgrid(X, Y)
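(the next card plots a surface over this grid and needs a Z array; any function of X and Y will do — this particular choice is an assumption, not from the original:)
# hypothetical surface values over the grid
Z = np.sin(np.sqrt(X ** 2 + Y ** 2))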
plot a surface for X, Y, Z and label the axes
surf = ax.plot_surface(X, Y, Z)
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')
plt.show()
visualize dependencies between the features and the target variable in a pandas DataFrame df
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(15, 10))
for idx, feature in enumerate(df.columns[:-1]):
    df.plot(x=feature, y='cnt', subplots=True, kind='scatter', ax=axes[idx // 4, idx % 4])
correlation of the features with the target variable 'cnt' in the df dataset
X_f = df.iloc[:, :-1]
X_f.corrwith(df['cnt'])
shuffle the dataset before constructing a regression model and normalize the features
from sklearn.utils import shuffle
from sklearn.preprocessing import scale

df_shuffled = shuffle(df, random_state=123)
X = scale(df_shuffled[df_shuffled.columns[:-1]])
y = df_shuffled['cnt']
plot the model's coefficient weights as a function of the regularization parameter alpha
plt.figure(figsize=(8, 5))
for coef, feature in zip(coefs_lasso.T, df.columns):
plt.plot(alphas, coef, label=feature, color=np.random.rand(3))
plt.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95))
plt.xlabel('alpha')
plt.ylabel('feature weight')
plt.title(“Lasso”)
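(alphas and coefs_lasso are assumed to exist on the card above; a sketch that builds them by fitting one Lasso model per alpha on the X, y from the previous card — the alpha range here is an arbitrary choice:)
# hypothetical: one row of coefficients per regularization strength
from sklearn.linear_model import Lasso

alphas = np.arange(1, 500, 50)
coefs_lasso = np.zeros((alphas.shape[0], X.shape[1]))
for i, alpha in enumerate(alphas):
    lasso = Lasso(alpha=alpha, random_state=123)
    lasso.fit(X, y)
    coefs_lasso[i] = lasso.coef_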
plot a scatterplot of classification_problem [2 features, 3 classes] in three ways: 1) map and lambda functions, 2) direct indexing, 3) list comprehension
pylab.figure(figsize=(8,6))
1)
pylab.scatter(list(map(lambda x: x[0], classification_problem[0])),
              list(map(lambda x: x[1], classification_problem[0])),
              c=classification_problem[1], cmap=colors, s=100)
2)
pylab.scatter(classification_problem[0][:,0], classification_problem[0][:,1],
c=classification_problem[1], cmap=colors, s=100)
3)
pylab.scatter([x[0] for x in classification_problem[0]],[x[1] for x in classification_problem[0]],
c=classification_problem[1], cmap=colors, s=100)
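(classification_problem, colors and light_colors are assumed throughout the classification cards; a sketch that generates compatible data with sklearn's blob generator — the colour choices are arbitrary:)
# hypothetical setup: a (features, labels) tuple and colormaps for 3 classes
from sklearn import datasets
from matplotlib.colors import ListedColormap

classification_problem = datasets.make_blobs(n_samples=100, centers=3,
                                             cluster_std=1.0, random_state=1)
colors = ListedColormap(['red', 'blue', 'yellow'])
light_colors = ListedColormap(['lightcoral', 'lightblue', 'lightyellow'])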
train a tree classifier, plot the decision surface and the data points
from sklearn import datasets, metrics, tree
from sklearn.model_selection import train_test_split
from matplotlib import pyplot

# 1) Split the data into train and test sets
train_data, test_data, train_labels, test_labels = train_test_split(
    classification_problem[0], classification_problem[1],
    test_size=0.3, random_state=1)

# 2) Ancillary function for defining the grid
def get_meshgrid(data, step=.05, border=.5):
    x_min, x_max = data[:, 0].min() - border, data[:, 0].max() + border
    y_min, y_max = data[:, 1].min() - border, data[:, 1].max() + border
    return np.meshgrid(np.arange(x_min, x_max, step), np.arange(y_min, y_max, step))

# 3) Function to plot the surface and the points
def plot_decision_surface(estimator, train_data, train_labels, test_data, test_labels,
                          colors=colors, light_colors=light_colors):
    # fit the model
    estimator.fit(train_data, train_labels)
    # set the figure size
    pyplot.figure(figsize=(16, 6))
    # plot the decision surface on the train data
    pyplot.subplot(1, 2, 1)
    xx, yy = get_meshgrid(train_data)
    mesh_predictions = np.array(estimator.predict(np.c_[xx.ravel(), yy.ravel()])).reshape(xx.shape)
    pyplot.pcolormesh(xx, yy, mesh_predictions, cmap=light_colors)
    pyplot.scatter(train_data[:, 0], train_data[:, 1], c=train_labels, s=100, cmap=colors)
    pyplot.title('Train data, accuracy={:.2f}'.format(
        metrics.accuracy_score(train_labels, estimator.predict(train_data))))
    # plot the decision surface on the test data
    pyplot.subplot(1, 2, 2)
    pyplot.pcolormesh(xx, yy, mesh_predictions, cmap=light_colors)
    pyplot.scatter(test_data[:, 0], test_data[:, 1], c=test_labels, s=100, cmap=colors)
    pyplot.title('Test data, accuracy={:.2f}'.format(
        metrics.accuracy_score(test_labels, estimator.predict(test_data))))

plot_decision_surface(tree.DecisionTreeClassifier(random_state=1, min_samples_leaf=3),
                      train_data, train_labels, test_data, test_labels)
create new figure
plt.figure()
plt.show()
plot histograms of features
data[real_features].hist(bins=100,figsize=(20, 20))
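(real_features is assumed to be a list of the numeric column names; one hypothetical way to build it:)
# hypothetical: select the numeric columns of the dataset
real_features = data.select_dtypes(include='number').columns.tolist()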
plot heatmap of correlations
seaborn.heatmap(data[real_features].corr(), square=True)
plot countplots for the features 'MK1_', 'MK_2', 'MK_3' in the dataset data, split by the target variable 'Response'
fig, axes = plt.subplots(2, 2, figsize=(35, 35), sharey=True)
medical_key = ['MK1_', 'MK_2', 'MK_3']
for i in range(len(medical_key)):
    seaborn.countplot(x=medical_key[i], data=data, hue='Response', ax=axes[i // 2, i % 2])
plot a t-SNE representation of the features, colouring the points by class (values of the target variable data.Response)
from sklearn.manifold import TSNE
import matplotlib.cm as cm
model = TSNE(random_state=321)
tsne_representation = model.fit_transform(data_subset)
colors = cm.rainbow(np.linspace(0, 1, len(set(response_subset))))
for y, c in zip(set(data.Response), colors):
plt.scatter(tsne_representation[response_subset.values==y, 0],
tsne_representation[response_subset.values==y, 1], c=c, alpha=0.5, label=str(y))
plt.legend()
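(data_subset and response_subset are assumed on the card above; a hypothetical subsampling step — t-SNE is slow, so fitting it on a random sample of the feature columns is typical:)
# hypothetical: sample 1000 rows of the features and the matching labels
data_subset = data.drop('Response', axis=1).sample(n=1000, random_state=321)
response_subset = data.loc[data_subset.index, 'Response']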
plot density (pdf) histograms for the features 'F_1', 'F_2', 'F_3' of the dataset data, with NA values dropped
fig, axes = plt.subplots(1, 3, figsize=(14,6))
k = 0
for i in ['F_1', 'F_2', 'F_3']:
    seaborn.histplot(data[i].dropna(), bins=50, kde=True, stat='density', ax=axes[k])
    k += 1