from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
iris = datasets.load_iris()
X = iris['data'] # iris.data
y = iris['target'] # iris.target
features = iris['feature_names'] # iris.feature_names
print(features)
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
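The integer labels in y (0, 1, 2) correspond to the species names stored in iris['target_names']; printing them up front (an extra line, not in the original listing) makes the class codes easier to read later.
print(iris['target_names'])  # iris.target_names
['setosa' 'versicolor' 'virginica']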
iris_df = pd.DataFrame(X,
columns=['sepal_length', 'sepal_width',
'petal_length', 'petal_width'])
# Add the target as a new column (variable/feature) of the DataFrame
iris_df['species'] = y
print(iris_df.iloc[:5, :])   # first five rows
print(iris_df.describe())    # summary statistics for each column
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 0
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0
3 4.6 3.1 1.5 0.2 0
4 5.0 3.6 1.4 0.2 0
sepal_length sepal_width petal_length petal_width species
count 150.000000 150.000000 150.000000 150.000000 150.000000
mean 5.843333 3.057333 3.758000 1.199333 1.000000
std 0.828066 0.435866 1.765298 0.762238 0.819232
min 4.300000 2.000000 1.000000 0.100000 0.000000
25% 5.100000 2.800000 1.600000 0.300000 0.000000
50% 5.800000 3.000000 4.350000 1.300000 1.000000
75% 6.400000 3.300000 5.100000 1.800000 2.000000
max 7.900000 4.400000 6.900000 2.500000 2.000000
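As an extra sanity check (not part of the original listing), value_counts() on the species column confirms the dataset is balanced across the three classes.
print(iris_df['species'].value_counts())  # expect 50 samples for each of the three species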
Getting an overview of the data's general characteristics
sns.pairplot(iris_df, hue='species', vars=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'])
plt.show()
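Alongside the pair plot, a per-class summary can make the separation between species more concrete; the line below is an optional extra step, not part of the original workflow.
print(iris_df.groupby('species').mean())  # per-class feature means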
# Split the data (X) and target (y) into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1217)
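The split above is purely random, so the per-class counts in the test set can drift from 10/10/10 (the run below ends up with 9/12/9). If equal class proportions are wanted, train_test_split accepts a stratify argument; the variant below is only a sketch and was not used for the results shown here.
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, stratify=y, random_state=1217)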
# Choose Logistic Regression from the available classification algorithms
log_reg = LogisticRegression(max_iter=1000)  # raised max_iter as a precaution; the default (100) can trigger a convergence warning with the lbfgs solver
# Fit (train) the model on the training set
log_reg.fit(X_train, y_train)
# Predict on the test set
predictions = log_reg.predict(X_test)
print('y true:', y_test)
print('y pred:', predictions)
y true: [1 2 2 0 1 2 1 2 0 0 1 1 2 1 1 0 2 0 1 0 2 0 0 1 1 1 1 0 2 2]
y pred: [1 2 2 0 1 2 1 2 0 0 1 1 2 1 1 0 2 0 1 0 2 0 0 2 1 1 1 0 2 1]
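Before the detailed report, a quick overall accuracy check (an extra line, not in the original) can be done with the estimator's score method; with the predictions above it works out to 28/30 ≈ 0.93.
print('accuracy:', log_reg.score(X_test, y_test))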
# Measure performance
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
[[ 9 0 0]
[ 0 11 1]
[ 0 1 8]]
precision recall f1-score support
0 1.00 1.00 1.00 9
1 0.92 0.92 0.92 12
2 0.89 0.89 0.89 9
accuracy 0.93 30
macro avg 0.94 0.94 0.94 30
weighted avg 0.93 0.93 0.93 30
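To see where the report's numbers come from: each class's precision is its diagonal count in the confusion matrix divided by that column's sum, and recall is the diagonal count divided by that row's sum; for class 1 above both are 11/12 ≈ 0.92. The short sketch below recomputes these from the matrix (an extra illustration, not in the original listing).
import numpy as np
cm = confusion_matrix(y_test, predictions)
precision = np.diag(cm) / cm.sum(axis=0)   # correct predictions / predicted counts per class
recall = np.diag(cm) / cm.sum(axis=1)      # correct predictions / true counts per class
print('precision per class:', precision)   # [1.00 0.92 0.89]
print('recall per class:   ', recall)      # [1.00 0.92 0.89]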