-
Notifications
You must be signed in to change notification settings - Fork 1
/
wine_quality_analysis.py
87 lines (70 loc) · 3.08 KB
/
wine_quality_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
# Define a function to concatenate the red and white wine datasets
def join(temp1, temp2):
    """Stack the red and white wine tables into one colour-labelled frame.

    Args:
        temp1: DataFrame of red wines; its rows are tagged with color 'r'.
        temp2: DataFrame of white wines; its rows are tagged with color 'w'.

    Returns:
        A new DataFrame with the rows of ``temp1`` followed by the rows of
        ``temp2`` and a ``color`` column. The original index labels of both
        inputs are kept, so labels may repeat in the result.
    """
    # assign() builds new frames, so the caller's DataFrames are not
    # modified as a side effect of labelling.
    red_tagged = temp1.assign(color='r')
    white_tagged = temp2.assign(color='w')
    return pd.concat([red_tagged, white_tagged])
# Load the red and white wine datasets (semicolon-delimited CSV files)
red = pd.read_csv("winequality-red.csv", sep=';')
white = pd.read_csv("winequality-white.csv", sep=';')
# Remove duplicate rows in both datasets
red = red.drop_duplicates()
white = white.drop_duplicates()
# Combine red and white wine datasets. join() is handed copies so that
# `red` and `white` keep exactly the columns they were loaded with.
# Each input carries its own 0-based index, so the stacked frame would
# contain repeated index labels; resetting gives every row a unique label,
# which label-based alignment (pandas and the plotting code) relies on.
wine = join(red.copy(), white.copy()).reset_index(drop=True)
# Display summary statistics for the red, white and combined datasets.
# sep="" stops print() from inserting a space between the heading's trailing
# newline and the table, which would shift the table's header row one
# column to the right of its data rows.
red_des = red.describe()
print("Red Wine Description:\n", red_des, sep="")
white_des = white.describe()
print("\nWhite Wine Description:\n", white_des, sep="")
wine_des = wine.describe()
print("\nCombined Wine Description:\n", wine_des, sep="")
# Create pair plots to visualize relationships between features and wine
# quality for the red, white and combined datasets (one figure each).
for pair_data, pair_title in (
    (red, 'Pair Plot for Red Wine'),
    (white, 'Pair Plot for White Wine'),
    (wine, 'Pair Plot for Combined Wine Dataset'),
):
    sns.pairplot(pair_data, hue='quality')
    # A pair plot is a grid of subplots: plt.title() would only label the
    # most recently created subplot, so title the whole figure instead and
    # leave headroom so the title does not overlap the top row.
    plt.suptitle(pair_title)
    plt.subplots_adjust(top=0.95)
    plt.show()
# Create a histogram to visualize the distribution of wine quality scores.
# Quality is an integer score, so use discrete bins: each bar is centred on
# its score (edges at 0.5, 1.5, ..., 10.5) instead of starting at it, and no
# two scores share a bin. binrange keeps the 1-10 span on the x axis.
plt.figure(figsize=(8, 5))
sns.histplot(wine['quality'], discrete=True, binrange=(1, 10), kde=True)
plt.xlabel('Wine Quality')
plt.ylabel('Frequency')
plt.title('Distribution of Wine Quality Scores')
plt.show()
# Correlate every numeric column with every other one; the text 'color'
# label is dropped first because it cannot be correlated.
numeric_wine = wine.drop(columns=['color'])
correlation_matrix = numeric_wine.corr()
# Only the 'quality' column of the matrix is drawn: one annotated cell per
# feature showing its correlation with quality.
quality_correlation = correlation_matrix[['quality']]
plt.figure(figsize=(10, 8))
sns.heatmap(
    quality_correlation,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
)
plt.title('Correlation Between Features and Wine Quality')
plt.show()
# Rank the features by how much a Random Forest Regressor relies on them
# when predicting wine quality.
# Features (X) are every column except the colour label and the target;
# the target (y) is the quality score.
X = wine.drop(columns=['color', 'quality'])
y = wine['quality']
# Fit the forest on the full dataset; the fixed seed makes the run repeatable
model = RandomForestRegressor(random_state=42)
model.fit(X, y)
# Importance scores come back in the same order as the columns of X
feature_importances = model.feature_importances_
# Pair each feature name with its score, most important first
imp_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
# Horizontal bar chart of the ranked importances
plt.figure(figsize=(10, 6))
sns.barplot(data=imp_df, x='Importance', y='Feature', palette='viridis')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance for Wine Quality Prediction')
plt.show()