import math
import numpy as np
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
iris = load_iris()
print(iris.keys())
print(iris.DESCR)
dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])
...
Check only the key information from the description:
:Attribute Information:
    - sepal length in cm
    - sepal width in cm
    - petal length in cm
    - petal width in cm
    - class:
        - Iris-Setosa
        - Iris-Versicolour
        - Iris-Virginica
...
X = iris.data
y = iris.target
features = iris.feature_names # iris['feature_names']
for i in range(len(features)):
    plt.scatter(X[:, i], y, label=features[i])
plt.legend()
plt.show()
# petal length and petal width look strongly correlated with the class (species).
X = X[:, 2:4]  # keep only petal length and petal width
print(X[:5])
[[1.4 0.2]
 [1.4 0.2]
 [1.3 0.2]
 [1.5 0.2]
 [1.4 0.2]]
# Sample 5 setosa rows and 5 rows from a non-setosa species
indices = [x for x in range(0, 100, 10)]
sample_data = np.c_[X[indices, :], y[indices]]
print(sample_data)
[[1.4 0.2 0. ]
 [1.5 0.2 0. ]
 [1.7 0.2 0. ]
 [1.6 0.2 0. ]
 [1.3 0.3 0. ]
 [4.7 1.4 1. ]
 [3.5 1.  1. ]
 [4.8 1.8 1. ]
 [3.8 1.1 1. ]
 [4.4 1.2 1. ]]
def logistic(x):
    """Logistic sigmoid function."""
    return 1 / (1 + math.exp(-x))
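One caveat worth noting: math.exp(-x) overflows for very negative inputs (Python raises OverflowError once x drops below roughly -710). A minimal, numerically safer sketch (a hypothetical logistic_stable helper, not used in the rest of this walkthrough) branches on the sign of x:
def logistic_stable(x):
    """Numerically safer logistic sigmoid (hypothetical helper)."""
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    # For negative x, use e^x / (1 + e^x) so exp() never sees a large argument.
    z = math.exp(x)
    return z / (1 + z)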
def predict(row, betas):
    """Build the regression equation y = b0 + b1 * x1 + b2 * x2 from the
    x1, x2 values in row and the b0, b1, b2 values in betas, then pass the
    result through the logistic function to get the prediction (y_hat)."""
    # y_hat = betas[0] + betas[1] * row[0] + betas[2] * row[1]
    y_hat = betas[0]
    for i in range(len(betas) - 1):
        y_hat += betas[i + 1] * row[i]
    return logistic(y_hat)
Let's create arbitrary values for b0, b1, b2:
np.random.seed(1218)
betas = np.random.random(3)
print('betas: ', betas)
betas: [0.70502709 0.70065136 0.26294298]
First, running predict on each sample with these arbitrary betas (coefficients) produces the results below.
The goal of the learning step is to adjust the betas so that the predictions fit the data.
for sample in sample_data:
    prediction = predict(sample, betas)
    # error = actual value - predicted value
    error = sample[-1] - prediction
    print(f'True: {sample[-1]}, Prediction: {prediction}, Error: {error}')
True: 0.0, Prediction: 0.8504999461891724, Error: -0.8504999461891724
True: 0.0, Prediction: 0.8591917071831203, Error: -0.8591917071831203
True: 0.0, Prediction: 0.875307331698573, Error: -0.875307331698573
True: 0.0, Prediction: 0.8674568903800843, Error: -0.8674568903800843
True: 0.0, Prediction: 0.8448486959523984, Error: -0.8448486959523984
True: 1.0, Prediction: 0.9874599531039632, Error: 0.012540046896036827
True: 1.0, Prediction: 0.9683314340420521, Error: 0.03166856595794787
True: 1.0, Prediction: 0.989454462878024, Error: 0.010545537121976034
True: 1.0, Prediction: 0.9748331713767564, Error: 0.025166828623243598
True: 1.0, Prediction: 0.983752330133631, Error: 0.016247669866369052
def coefficient_sgd(dataset, learning_rate, epochs):
    # coefficient_sgd: coefficients via Stochastic Gradient Descent
    """Estimate the coefficients (b0, b1, b2) of the regression equation
    y = b0 + b1 * x1 + b2 * x2 by stochastic gradient descent.
    For a minimum, move against the gradient; for a maximum, move with it."""
    # Start every beta at 0 (one per column of the dataset).
    betas = [0 for _ in range(len(dataset[0]))]
    for epoch in range(epochs):  # repeat for the given number of epochs
        # sse: sum of squared errors
        sse = 0
        for sample in dataset:  # iterate over every row in the dataset
            y_hat = predict(sample, betas)
            error = sample[-1] - y_hat  # error = actual value - predicted value
            sse += error ** 2
            # Update the coefficients (b0, b1, b2) as follows:
            # b_new = b + learning_rate * error * y_hat * (1 - y_hat) * x
            #         (the last three factors come from differentiating the error)
            betas[0] = betas[0] + learning_rate * error * y_hat * (1 - y_hat)
            for i in range(len(sample) - 1):
                betas[i + 1] = betas[i + 1] + learning_rate * error * y_hat * \
                               (1 - y_hat) * sample[i]
        print(f'>>> epoch={epoch}, learning_rate={learning_rate}, '
              f'sum_of_squared_errors={sse}')
    return betas
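For reference, the update rule in the loop follows from the chain rule applied to the squared error. With E = (1/2) * (y - y_hat)^2, y_hat = logistic(z), and z = b0 + b1 * x1 + b2 * x2, the logistic derivative is y_hat * (1 - y_hat), so
dE/db_i = -(y - y_hat) * y_hat * (1 - y_hat) * x_i    (with x_0 = 1 for the intercept b0)
and stepping against the gradient gives b_i <- b_i + learning_rate * error * y_hat * (1 - y_hat) * x_i, exactly the update above.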
learning_rate = 0.3
epochs = 100
betas = coefficient_sgd(sample_data, learning_rate, epochs)
print('beta = ', betas)
>>> epoch=0, learning_rate=0.3, sum_of_squared_errors=2.2365202020173363
>>> epoch=1, learning_rate=0.3, sum_of_squared_errors=2.0155712568783812
>>> epoch=2, learning_rate=0.3, sum_of_squared_errors=1.9208590592662582
>>> epoch=3, learning_rate=0.3, sum_of_squared_errors=1.7974599812276866
>>> epoch=4, learning_rate=0.3, sum_of_squared_errors=1.6779336672262013
>>> epoch=5, learning_rate=0.3, sum_of_squared_errors=1.56684920144075
>>> epoch=6, learning_rate=0.3, sum_of_squared_errors=1.4645573717082794
>>> epoch=7, learning_rate=0.3, sum_of_squared_errors=1.3707617246019508
>>> epoch=8, learning_rate=0.3, sum_of_squared_errors=1.2849788636208537
>>> epoch=9, learning_rate=0.3, sum_of_squared_errors=1.2066392414351124
>>> epoch=10, learning_rate=0.3, sum_of_squared_errors=1.1351377870128383
>>> epoch=11, learning_rate=0.3, sum_of_squared_errors=1.069867932063553
>>> epoch=12, learning_rate=0.3, sum_of_squared_errors=1.0102442840033752
>>> epoch=13, learning_rate=0.3, sum_of_squared_errors=0.9557165583507462
>>> epoch=14, learning_rate=0.3, sum_of_squared_errors=0.9057770197274222
>>> epoch=15, learning_rate=0.3, sum_of_squared_errors=0.859963357344933
>>> epoch=16, learning_rate=0.3, sum_of_squared_errors=0.8178585349974232
>>> epoch=17, learning_rate=0.3, sum_of_squared_errors=0.7790887735979094
>>> epoch=18, learning_rate=0.3, sum_of_squared_errors=0.7433204920721501
>>> epoch=19, learning_rate=0.3, sum_of_squared_errors=0.7102567674155491
>>> epoch=20, learning_rate=0.3, sum_of_squared_errors=0.6796336762057731
>>> epoch=21, learning_rate=0.3, sum_of_squared_errors=0.6512167383288808
>>> epoch=22, learning_rate=0.3, sum_of_squared_errors=0.6247975869139859
>>> epoch=23, learning_rate=0.3, sum_of_squared_errors=0.6001909248575226
>>> epoch=24, learning_rate=0.3, sum_of_squared_errors=0.5772317881762699
>>> epoch=25, learning_rate=0.3, sum_of_squared_errors=0.5557731123165441
>>> epoch=26, learning_rate=0.3, sum_of_squared_errors=0.5356835840394737
>>> epoch=27, learning_rate=0.3, sum_of_squared_errors=0.5168457547947859
>>> epoch=28, learning_rate=0.3, sum_of_squared_errors=0.49915438900173803
>>> epoch=29, learning_rate=0.3, sum_of_squared_errors=0.48251502066292057
>>> epoch=30, learning_rate=0.3, sum_of_squared_errors=0.46684269313789645
>>> epoch=31, learning_rate=0.3, sum_of_squared_errors=0.45206085900175996
>>> epoch=32, learning_rate=0.3, sum_of_squared_errors=0.4381004192823343
>>> epoch=33, learning_rate=0.3, sum_of_squared_errors=0.42489888375811524
>>> epoch=34, learning_rate=0.3, sum_of_squared_errors=0.41239963626822534
>>> epoch=35, learning_rate=0.3, sum_of_squared_errors=0.40055129106597237
>>> epoch=36, learning_rate=0.3, sum_of_squared_errors=0.3893071281117396
>>> epoch=37, learning_rate=0.3, sum_of_squared_errors=0.37862459684599353
>>> epoch=38, learning_rate=0.3, sum_of_squared_errors=0.3684648794199088
>>> epoch=39, learning_rate=0.3, sum_of_squared_errors=0.35879250560694986
>>> epoch=40, learning_rate=0.3, sum_of_squared_errors=0.3495750126938729
>>> epoch=41, learning_rate=0.3, sum_of_squared_errors=0.3407826445744211
>>> epoch=42, learning_rate=0.3, sum_of_squared_errors=0.3323880850629534
>>> epoch=43, learning_rate=0.3, sum_of_squared_errors=0.3243662211261037
>>> epoch=44, learning_rate=0.3, sum_of_squared_errors=0.3166939323141852
>>> epoch=45, learning_rate=0.3, sum_of_squared_errors=0.30934990317436684
>>> epoch=46, learning_rate=0.3, sum_of_squared_errors=0.3023144558567413
>>> epoch=47, learning_rate=0.3, sum_of_squared_errors=0.29556940049268615
>>> epoch=48, learning_rate=0.3, sum_of_squared_errors=0.28909790124133994
>>> epoch=49, learning_rate=0.3, sum_of_squared_errors=0.2828843561721689
>>> epoch=50, learning_rate=0.3, sum_of_squared_errors=0.2769142893859906
>>> epoch=51, learning_rate=0.3, sum_of_squared_errors=0.271174253978975
>>> epoch=52, learning_rate=0.3, sum_of_squared_errors=0.2656517446287191
>>> epoch=53, learning_rate=0.3, sum_of_squared_errors=0.2603351187325213
>>> epoch=54, learning_rate=0.3, sum_of_squared_errors=0.2552135251587856
>>> epoch=55, learning_rate=0.3, sum_of_squared_errors=0.25027683978600884
>>> epoch=56, learning_rate=0.3, sum_of_squared_errors=0.2455156071024281
>>> epoch=57, learning_rate=0.3, sum_of_squared_errors=0.2409209872252654
>>> epoch=58, learning_rate=0.3, sum_of_squared_errors=0.2364847077733389
>>> epoch=59, learning_rate=0.3, sum_of_squared_errors=0.2321990200921551
>>> epoch=60, learning_rate=0.3, sum_of_squared_errors=0.22805665938772812
>>> epoch=61, learning_rate=0.3, sum_of_squared_errors=0.22405080837541616
>>> epoch=62, learning_rate=0.3, sum_of_squared_errors=0.2201750640939553
>>> epoch=63, learning_rate=0.3, sum_of_squared_errors=0.21642340757342288
>>> epoch=64, learning_rate=0.3, sum_of_squared_errors=0.21279017607978418
>>> epoch=65, learning_rate=0.3, sum_of_squared_errors=0.20927003768855687
>>> epoch=66, learning_rate=0.3, sum_of_squared_errors=0.20585796796649447
>>> epoch=67, learning_rate=0.3, sum_of_squared_errors=0.2025492285634839
>>> epoch=68, learning_rate=0.3, sum_of_squared_errors=0.19933934753746116
>>> epoch=69, learning_rate=0.3, sum_of_squared_errors=0.19622410125341277
>>> epoch=70, learning_rate=0.3, sum_of_squared_errors=0.19319949771372985
>>> epoch=71, learning_rate=0.3, sum_of_squared_errors=0.19026176119156954
>>> epoch=72, learning_rate=0.3, sum_of_squared_errors=0.18740731805168892
>>> epoch=73, learning_rate=0.3, sum_of_squared_errors=0.18463278365460747
>>> epoch=74, learning_rate=0.3, sum_of_squared_errors=0.18193495025013248
>>> epoch=75, learning_rate=0.3, sum_of_squared_errors=0.17931077577535176
>>> epoch=76, learning_rate=0.3, sum_of_squared_errors=0.17675737348032025
>>> epoch=77, learning_rate=0.3, sum_of_squared_errors=0.17427200231192752
>>> epoch=78, learning_rate=0.3, sum_of_squared_errors=0.17185205799294412
>>> epoch=79, learning_rate=0.3, sum_of_squared_errors=0.16949506473908427
>>> epoch=80, learning_rate=0.3, sum_of_squared_errors=0.16719866756216364
>>> epoch=81, learning_rate=0.3, sum_of_squared_errors=0.16496062511215465
>>> epoch=82, learning_rate=0.3, sum_of_squared_errors=0.16277880301517395
>>> epoch=83, learning_rate=0.3, sum_of_squared_errors=0.16065116766827378
>>> epoch=84, learning_rate=0.3, sum_of_squared_errors=0.1585757804553511
>>> epoch=85, learning_rate=0.3, sum_of_squared_errors=0.1565507923516049
>>> epoch=86, learning_rate=0.3, sum_of_squared_errors=0.15457443888678876
>>> epoch=87, learning_rate=0.3, sum_of_squared_errors=0.1526450354400551
>>> epoch=88, learning_rate=0.3, sum_of_squared_errors=0.15076097284148837
>>> epoch=89, learning_rate=0.3, sum_of_squared_errors=0.148920713257524
>>> epoch=90, learning_rate=0.3, sum_of_squared_errors=0.14712278633934256
>>> epoch=91, learning_rate=0.3, sum_of_squared_errors=0.1453657856150533
>>> epoch=92, learning_rate=0.3, sum_of_squared_errors=0.14364836510805035
>>> epoch=93, learning_rate=0.3, sum_of_squared_errors=0.14196923616534673
>>> epoch=94, learning_rate=0.3, sum_of_squared_errors=0.14032716448099533
>>> epoch=95, learning_rate=0.3, sum_of_squared_errors=0.13872096730088423
>>> epoch=96, learning_rate=0.3, sum_of_squared_errors=0.13714951079627696
>>> epoch=97, learning_rate=0.3, sum_of_squared_errors=0.135611707594452
>>> epoch=98, learning_rate=0.3, sum_of_squared_errors=0.13410651445569968
>>> epoch=99, learning_rate=0.3, sum_of_squared_errors=0.1326329300867541
beta = [-3.8633388352931486, 1.0769097880558474, 1.8377049776535148]
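As a sanity check (a sketch, not part of the original walkthrough), scikit-learn's LogisticRegression can be fit on the same 10 samples. Its coefficients won't match exactly, since it minimizes regularized log-loss rather than squared error with per-sample SGD, but the resulting decision boundary should be similar:
from sklearn.linear_model import LogisticRegression

# Fit on the same 10 sampled rows; a large C weakens the default L2 penalty.
clf = LogisticRegression(C=1e6)
clf.fit(sample_data[:, :2], sample_data[:, 2])
print('intercept:', clf.intercept_, 'coefficients:', clf.coef_)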
# Measure the model's performance
test_sample1 = np.r_[X[1, :], y[1]]
prediction = predict(test_sample1, betas)
print(f'True: {test_sample1[-1]}, Predict: {prediction}')
True: 0.0, Predict: 0.12045546114162796
test_sample2 = np.r_[X[51, :], y[51]]
print(test_sample2)
prediction = predict(test_sample2, betas)
print(f'True: {test_sample2[-1]}, Predict: {prediction}')
[4.5 1.5 1. ]
True: 1.0, Predict: 0.9767814788835434
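The two spot checks above look reasonable. A slightly fuller check (a sketch, assuming X, y, betas, and predict are still in scope) is to threshold each prediction at 0.5 and measure accuracy over all 100 rows of the two classes used here:
# Classify each of the first 100 rows (setosa=0, versicolor=1) by
# thresholding the predicted probability at 0.5, then count hits.
correct = 0
for xi, yi in zip(X[:100], y[:100]):
    label = 1 if predict(xi, betas) >= 0.5 else 0
    correct += int(label == yi)
print(f'accuracy: {correct / 100:.2%}')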