```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Sample dataset
data = {
    'age': [22, 25, 47, 35, 46, 23, 36, 59, 50, 29],
    'salary': [1500, 1800, 3000, 2400, 4000, 1200, 3200, 5000, 3500, 2200],
    'gender': ['male', 'female', 'female', 'male', 'male', 'female', 'male', 'female', 'male', 'female'],
    'purchased': [0, 0, 1, 1, 1, 0, 1, 1, 1, 0]  # 0 = No, 1 = Yes
}
df = pd.DataFrame(data)

# Basic cleaning: remove duplicates and forward-fill any missing values
df = df.drop_duplicates()
df = df.ffill()  # fillna(method='ffill') is deprecated in recent pandas

# One-hot encode the categorical column
df = pd.get_dummies(df, columns=['gender'], drop_first=True)

# Features and target
X = df.drop('purchased', axis=1)
y = df['purchased']

# Exploratory visualizations
sns.pairplot(df, hue='purchased')
plt.suptitle('Data Distribution by Classes', y=1.02)  # suptitle applies to the whole pairplot figure
plt.show()

plt.figure(figsize=(8, 6))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Between Variables')
plt.show()

# Train / validation / test split (70% / 15% / 15%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Train the model; random_state is set for reproducibility
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate on the validation set
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))

conf_matrix = confusion_matrix(y_val, y_pred)
plt.figure(figsize=(6, 5))  # adjust figure size for readability
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()

# Persist the trained model
import pickle
with open('rf_model.pkl', 'wb') as file:
    pickle.dump(model, file)
```
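As a quick sanity check of the saved artifact, the model can be loaded back and used for a prediction. This is a minimal sketch, assuming `rf_model.pkl` was written by the script above and is in the working directory; the new customer record is a hypothetical example, not data from the tutorial.

```python
import pickle

import pandas as pd

# Load the model saved above (assumes rf_model.pkl is in the working directory)
with open('rf_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Hypothetical new customer; column names and order must match the training features
new_customer = pd.DataFrame([{'age': 30, 'salary': 2800, 'gender_male': 1}])
print(loaded_model.predict(new_customer))  # 1 = purchased, 0 = not purchased
```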
- Clarity and Precision: The tutorial is well-structured, with each step separated and clearly identified, making it easy to understand.
- Correct Code: The code appears correct and functional. Setting a random_state in the Random Forest model makes the results reproducible, which is a recommended practice.
- Instructions Followed: The instructions are logical and easy to follow, with explanations provided for each step.
- Visualizations: The visualizations are well chosen and give a clearer picture of the data.
- Additional Comment: Titles were included for the visualizations, aiding in the presentation and interpretation of the graphs.
- Mentioning hyperparameter tuning is a good point, even though it is marked as optional; a sketch of how it could be done with the already-imported GridSearchCV is shown below.
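Since GridSearchCV is imported in the script but never used, the optional tuning step might look like the sketch below. It continues from the variables defined in the tutorial (X_train, y_train, X_test, y_test); the parameter grid values and the choice of cv=2 are illustrative assumptions for this tiny toy dataset, not recommendations from the original tutorial.

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

# Illustrative parameter grid; the values are assumptions, not part of the original tutorial
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 3, 5],
    'min_samples_split': [2, 4],
}

# cv=2 is used only because the toy dataset has very few samples;
# with real data a larger cv (e.g. 5) would be the usual choice.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=2,
)
grid_search.fit(X_train, y_train)

print('Best parameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)

# Final check of the tuned model on the held-out test set
best_model = grid_search.best_estimator_
print(classification_report(y_test, best_model.predict(X_test)))
```

Keeping the final evaluation on the untouched test set, after tuning on the training folds, preserves the purpose of the three-way split used earlier in the tutorial.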