红酒质量预测

  1. Random Forest Classifier
  2. SVM classifier
  3. Neural Network
1
2
3
4
5
6
7
8
9
10
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
1
2
# Load the red-wine quality dataset. The file uses ';' as its field delimiter.
# NOTE: the path is an absolute, machine-specific location; adjust it before running elsewhere.
wine = pd.read_csv("C:\\Users\\董润泽\\Desktop\\winequality-red.csv",sep=";")
# Preview the first five rows.
wine.head()

fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
1 7.8 0.88 0.00 2.6 0.098 25.0 67.0 0.9968 3.20 0.68 9.8 5
2 7.8 0.76 0.04 2.3 0.092 15.0 54.0 0.9970 3.26 0.65 9.8 5
3 11.2 0.28 0.56 1.9 0.075 17.0 60.0 0.9980 3.16 0.58 9.8 6
4 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
1
# Print column dtypes and non-null counts for the loaded frame.
wine.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
fixed acidity           1599 non-null float64
volatile acidity        1599 non-null float64
citric acid             1599 non-null float64
residual sugar          1599 non-null float64
chlorides               1599 non-null float64
free sulfur dioxide     1599 non-null float64
total sulfur dioxide    1599 non-null float64
density                 1599 non-null float64
pH                      1599 non-null float64
sulphates               1599 non-null float64
alcohol                 1599 non-null float64
quality                 1599 non-null int64
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
1
# Count missing values in each column.
wine.isnull().sum()
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64
1
2
3
4
# Binarize the numeric quality score: values in (2, 6.5] are labelled 'bad' and
# values in (6.5, 8] are labelled 'good' (pd.cut intervals are right-closed by default).
# NOTE: run this cell once, on the raw numeric column. The TypeError in the traceback
# below is consistent with the cell being re-run after 'quality' already held labels.
bins = (2, 6.5, 8)
group_names=['bad', 'good']
wine['quality'] = pd.cut(wine['quality'], bins = bins, labels = group_names)
# List the distinct labels now present in the column.
wine['quality'].unique()
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-13-99255379e85c> in <module>
      1 bins = (2, 6.5, 8)
      2 group_names=['bad', 'good']
----> 3 wine['quality'] = pd.cut(wine['quality'], bins = bins, labels = group_names)
      4 wine['quality'].unique()


C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\reshape\tile.py in cut(x, bins, right, labels, retbins, precision, include_lowest, duplicates)
    239                               include_lowest=include_lowest,
    240                               dtype=dtype,
--> 241                               duplicates=duplicates)
    242 
    243     return _postprocess_for_cut(fac, bins, retbins, x_is_series,


C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\reshape\tile.py in _bins_to_cuts(x, bins, right, labels, precision, include_lowest, dtype, duplicates)
    342 
    343     side = 'left' if right else 'right'
--> 344     ids = ensure_int64(bins.searchsorted(x, side=side))
    345 
    346     if include_lowest:


TypeError: '<' not supported between instances of 'float' and 'str'
1
# Preview the frame again; 'quality' now holds the bad/good labels.
wine.head()

fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 bad
1 7.8 0.88 0.00 2.6 0.098 25.0 67.0 0.9968 3.20 0.68 9.8 bad
2 7.8 0.76 0.04 2.3 0.092 15.0 54.0 0.9970 3.26 0.65 9.8 bad
3 11.2 0.28 0.56 1.9 0.075 17.0 60.0 0.9980 3.16 0.58 9.8 bad
4 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 bad
1
# Encoder used to convert the quality labels into integer class ids.
label_quality = LabelEncoder()
1
# Replace the string labels with integer codes (in the output shown: bad -> 0, good -> 1).
wine['quality'] = label_quality.fit_transform(wine['quality'])
1
# Inspect the first ten rows to confirm the encoding.
wine.head(10)

fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 0
1 7.8 0.88 0.00 2.6 0.098 25.0 67.0 0.9968 3.20 0.68 9.8 0
2 7.8 0.76 0.04 2.3 0.092 15.0 54.0 0.9970 3.26 0.65 9.8 0
3 11.2 0.28 0.56 1.9 0.075 17.0 60.0 0.9980 3.16 0.58 9.8 0
4 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 0
5 7.4 0.66 0.00 1.8 0.075 13.0 40.0 0.9978 3.51 0.56 9.4 0
6 7.9 0.60 0.06 1.6 0.069 15.0 59.0 0.9964 3.30 0.46 9.4 0
7 7.3 0.65 0.00 1.2 0.065 15.0 21.0 0.9946 3.39 0.47 10.0 1
8 7.8 0.58 0.02 2.0 0.073 9.0 18.0 0.9968 3.36 0.57 9.5 1
9 7.5 0.50 0.36 6.1 0.071 17.0 102.0 0.9978 3.35 0.80 10.5 0
1
# Class balance: number of samples in each encoded quality class.
wine['quality'].value_counts()
0    1382
1     217
Name: quality, dtype: int64
1
# Bar chart of the number of samples per quality class.
# The column is passed through the `x` keyword: seaborn 0.11 deprecates passing it
# positionally, and from 0.12 the first positional argument is interpreted as `data`.
sns.countplot(x=wine['quality'])
<matplotlib.axes._subplots.AxesSubplot at 0x2076c534cc0>

png

1
2
3
# Separate the dataset into the response variable (y) and the feature matrix (X).
y = wine['quality']
X = wine.drop(columns=['quality'])
1
2
# Hold out 20% of the samples as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
1
2
3
# Standardize the features. The scaler is fitted on the training split only, and the
# same fitted scaler is then applied to both the training and the test split.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

Random Forest Classifier

1
2
3
# Random forest with 600 trees. The seed is fixed (same value as the train/test split)
# so that re-running this cell yields the same model and the same predictions.
rfc = RandomForestClassifier(n_estimators=600, random_state=42)
rfc.fit(X_train, y_train)
# Predicted class for every test sample.
pred_rfc = rfc.predict(X_test)
1
2
3
4
# Per-class precision/recall/F1, followed by the confusion matrix
# (rows are the true classes, columns are the predicted classes).
print(classification_report(y_test, pred_rfc))
print(confusion_matrix(y_test,pred_rfc))
# In the run shown below, bad wine (class 0): 264 correct, 9 wrong.
# Good wine (class 1): 24 correct, 23 wrong.
              precision    recall  f1-score   support

           0       0.92      0.97      0.94       273
           1       0.73      0.51      0.60        47

    accuracy                           0.90       320
   macro avg       0.82      0.74      0.77       320
weighted avg       0.89      0.90      0.89       320

[[264   9]
 [ 23  24]]

SVM classifier

1
2
3
# Support-vector classifier created with no arguments (library defaults).
clf = SVC()
# fit() returns the estimator itself, so training and prediction can be chained.
pred_clf = clf.fit(X_train, y_train).predict(X_test)
1
2
# Per-class metrics and confusion matrix for the SVM predictions.
print(classification_report(y_test, pred_clf))
print(confusion_matrix(y_test,pred_clf))
              precision    recall  f1-score   support

           0       0.88      0.98      0.93       273
           1       0.71      0.26      0.37        47

    accuracy                           0.88       320
   macro avg       0.80      0.62      0.65       320
weighted avg       0.86      0.88      0.85       320

[[268   5]
 [ 35  12]]

Neural Network

1
2
3
# Multi-layer perceptron with three hidden layers of 11 units each.
# The seed is fixed so that weight initialisation, and therefore the result, is repeatable.
# NOTE: the output below shows a ConvergenceWarning at 500 iterations; consider
# raising max_iter if the warning persists.
mlpc = MLPClassifier(hidden_layer_sizes=(11, 11, 11), max_iter=500, random_state=42)
mlpc.fit(X_train, y_train)
# Predicted class for every test sample.
pred_mlpc = mlpc.predict(X_test)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\neural_network\multilayer_perceptron.py:566: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (500) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)
1
2
# Per-class metrics and confusion matrix for the neural-network predictions.
# The cell originally passed pred_clf here, which only re-printed the SVM results.
print(classification_report(y_test, pred_mlpc))
print(confusion_matrix(y_test, pred_mlpc))
              precision    recall  f1-score   support

           0       0.88      0.98      0.93       273
           1       0.71      0.26      0.37        47

    accuracy                           0.88       320
   macro avg       0.80      0.62      0.65       320
weighted avg       0.86      0.88      0.85       320

[[268   5]
 [ 35  12]]
1
2
3
4
# Overall test-set accuracy of each model. The values are printed unlabeled, in this order:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,pred_rfc))  # random forest
print(accuracy_score(y_test,pred_mlpc))  # neural network (MLP)
print(accuracy_score(y_test,pred_clf))  # SVM
0.875
0.884375
0.875

转载请注明来源,欢迎对文章中的引用来源进行考证,欢迎指出任何有错误或不够清晰的表达。可以在下面评论区评论,也可以邮件至 2470290795@qq.com

文章标题:红酒质量预测

文章字数:1.3k

本文作者:runze

发布时间:2020-01-28, 18:58:38

最后更新:2020-02-04, 21:21:40

原始链接:http://yoursite.com/2020/01/28/%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98/%E7%BA%A2%E9%85%92%E8%B4%A8%E9%87%8F%E9%A2%84%E6%B5%8B/%E7%BA%A2%E9%85%92%E6%95%B0%E6%8D%AE%E9%9B%86/

版权声明: "署名-非商用-相同方式共享 4.0" 转载请保留原文链接及作者。

目录
×

喜欢就点赞,疼爱就打赏