
Python 76_ Applying a Logistic Function to the iris Dataset

Codezoy 2020. 3. 9. 18:56

import math
import numpy as np
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt


iris = load_iris()
print(iris.keys())
print(iris.DESCR)


dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

...
Checking only the key information:

:Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica

...

X = iris.data
y = iris.target
features = iris.feature_names # iris['feature_names']
for i in range(len(features)):
    plt.scatter(X[:, i], y, label=features[i])
plt.legend()
plt.show()



# Petal length and petal width appear to be strongly related to class (species),
# so they look like good features for classification.
X = X[:, 2:4]  # keep only petal length (pl) and petal width (pw)
print(X[:5])

[[1.4 0.2]
 [1.4 0.2]
 [1.3 0.2]
 [1.5 0.2]
 [1.4 0.2]]

# Sample 5 setosa rows and 5 non-setosa (versicolor) rows
indices = [x for x in range(0, 100, 10)]
sample_data = np.c_[X[indices, :], y[indices]]  # column-stack features and target
print(sample_data)

[[1.4 0.2 0. ]
 [1.5 0.2 0. ]
 [1.7 0.2 0. ]
 [1.6 0.2 0. ]
 [1.3 0.3 0. ]
 [4.7 1.4 1. ]
 [3.5 1.  1. ]
 [4.8 1.8 1. ]
 [3.8 1.1 1. ]
 [4.4 1.2 1. ]]

def logistic(x):
    """Logistic sigmoid function."""
    return 1 / (1 + math.exp(-x))


def predict(row, betas):
    """Build the regression equation y = b0 + b1 * x1 + b2 * x2 from
    row's x1, x2 values and betas' b0, b1, b2, then pass the result
    through the logistic function to get the prediction (y_hat)."""
    # y_hat = betas[0] + betas[1] * row[0] + betas[2] * row[1]
    y_hat = betas[0]
    for i in range(len(betas) - 1):
        y_hat += betas[i + 1] * row[i]
    return logistic(y_hat)
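
One caveat: math.exp(-x) overflows for very large arguments, so logistic(x) raises an OverflowError when x is a large negative number (roughly x < -709). The z values in this example are small, so the simple form is fine here, but a minimal sketch of a numerically stable variant, assuming we branch on the sign of x (stable_logistic is a hypothetical helper, not used below):

def stable_logistic(x):
    """Numerically stable logistic sigmoid (illustrative sketch).
    For x >= 0, exp(-x) <= 1, so the usual form is safe.
    For x < 0, rewrite as exp(x) / (1 + exp(x)) to avoid overflow."""
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    z = math.exp(x)
    return z / (1 + z)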


Let's create random b0, b1, b2 coefficients:
np.random.seed(1218)
betas = np.random.random(3)
print('betas: ', betas)

betas:  [0.70502709 0.70065136 0.26294298]

First, run predict on each sample, plugging its first two columns (x1, x2) into the regression equation with these arbitrary betas (coefficients). The results are shown below. The goal of the learning step is to adjust the betas so that the predictions fit the true labels.
for sample in sample_data:
    prediction = predict(sample, betas)
    # error = actual value - predicted value
    error = sample[-1] - prediction
    print(f'True: {sample[-1]}, Prediction: {prediction}, Error: {error}')

True: 0.0, Prediction: 0.8504999461891724, Error: -0.8504999461891724
True: 0.0, Prediction: 0.8591917071831203, Error: -0.8591917071831203
True: 0.0, Prediction: 0.875307331698573, Error: -0.875307331698573
True: 0.0, Prediction: 0.8674568903800843, Error: -0.8674568903800843
True: 0.0, Prediction: 0.8448486959523984, Error: -0.8448486959523984
True: 1.0, Prediction: 0.9874599531039632, Error: 0.012540046896036827
True: 1.0, Prediction: 0.9683314340420521, Error: 0.03166856595794787
True: 1.0, Prediction: 0.989454462878024, Error: 0.010545537121976034
True: 1.0, Prediction: 0.9748331713767564, Error: 0.025166828623243598
True: 1.0, Prediction: 0.983752330133631, Error: 0.016247669866369052
    
def coefficient_sgd(dataset, learning_rate, epochs):
    """Estimate the coefficients (b0, b1, b2) of the regression equation
    y = b0 + b1 * x1 + b2 * x2 by stochastic gradient descent (SGD).
    To find a minimum, step against the gradient; for a maximum, step with it."""
    # Start all betas at 0 (one coefficient per column)
    betas = [0 for _ in range(len(dataset[0]))]
    for epoch in range(epochs):  # repeat for the requested number of epochs
        # sse: sum of squared errors
        sse = 0
        for sample in dataset:  # iterate over every row of the dataset
            y_hat = predict(sample, betas)
            error = sample[-1] - y_hat  # error = actual - predicted
            sse += error**2
            # Update the coefficients (b0, b1, b2):
            # b_new = b + learning_rate * error * y_hat * (1 - y_hat) * x
            # error * y_hat * (1 - y_hat) comes from differentiating the
            # squared error (see the derivation sketch after this function)
            betas[0] = betas[0] + learning_rate * error * y_hat * (1 - y_hat)
            for i in range(len(sample) - 1):
                betas[i + 1] = (betas[i + 1] + learning_rate * error
                                * y_hat * (1 - y_hat) * sample[i])
        print(f'epoch={epoch}, learning_rate={learning_rate}, '
              f'sum_of_squared_errors={sse}')
    return betas
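
The update rule in the comment comes from the chain rule. A sketch of the derivation for a single sample, using the squared error $E$ and the sigmoid derivative $\sigma'(z) = \sigma(z)(1 - \sigma(z))$, with the constant factor 2 absorbed into the learning rate and $x_0 = 1$ for the intercept $b_0$:

$$E = (y - \hat{y})^2, \qquad \hat{y} = \sigma(z), \qquad z = b_0 + b_1 x_1 + b_2 x_2$$

$$\frac{\partial E}{\partial b_i} = -2\,(y - \hat{y})\,\hat{y}\,(1 - \hat{y})\,x_i$$

$$b_i \leftarrow b_i - \text{learning\_rate} \cdot \frac{\partial E}{\partial b_i} = b_i + \text{learning\_rate} \cdot \text{error} \cdot \hat{y}\,(1 - \hat{y}) \cdot x_i$$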



learning_rate = 0.3
epochs = 100
betas = coefficient_sgd(sample_data, learning_rate, epochs)
print('beta = ', betas)
>>>  epoch=0, learning_rate=0.3, sum_of_squared_errors=2.2365202020173363
>>>  epoch=1, learning_rate=0.3, sum_of_squared_errors=2.0155712568783812
>>>  epoch=2, learning_rate=0.3, sum_of_squared_errors=1.9208590592662582
>>>  epoch=3, learning_rate=0.3, sum_of_squared_errors=1.7974599812276866
>>>  epoch=4, learning_rate=0.3, sum_of_squared_errors=1.6779336672262013
>>>  epoch=5, learning_rate=0.3, sum_of_squared_errors=1.56684920144075
>>>  epoch=6, learning_rate=0.3, sum_of_squared_errors=1.4645573717082794
>>>  epoch=7, learning_rate=0.3, sum_of_squared_errors=1.3707617246019508
>>>  epoch=8, learning_rate=0.3, sum_of_squared_errors=1.2849788636208537
>>>  epoch=9, learning_rate=0.3, sum_of_squared_errors=1.2066392414351124
>>>  epoch=10, learning_rate=0.3, sum_of_squared_errors=1.1351377870128383
>>>  epoch=11, learning_rate=0.3, sum_of_squared_errors=1.069867932063553
>>>  epoch=12, learning_rate=0.3, sum_of_squared_errors=1.0102442840033752
>>>  epoch=13, learning_rate=0.3, sum_of_squared_errors=0.9557165583507462
>>>  epoch=14, learning_rate=0.3, sum_of_squared_errors=0.9057770197274222
>>>  epoch=15, learning_rate=0.3, sum_of_squared_errors=0.859963357344933
>>>  epoch=16, learning_rate=0.3, sum_of_squared_errors=0.8178585349974232
>>>  epoch=17, learning_rate=0.3, sum_of_squared_errors=0.7790887735979094
>>>  epoch=18, learning_rate=0.3, sum_of_squared_errors=0.7433204920721501
>>>  epoch=19, learning_rate=0.3, sum_of_squared_errors=0.7102567674155491
>>>  epoch=20, learning_rate=0.3, sum_of_squared_errors=0.6796336762057731
>>>  epoch=21, learning_rate=0.3, sum_of_squared_errors=0.6512167383288808
>>>  epoch=22, learning_rate=0.3, sum_of_squared_errors=0.6247975869139859
>>>  epoch=23, learning_rate=0.3, sum_of_squared_errors=0.6001909248575226
>>>  epoch=24, learning_rate=0.3, sum_of_squared_errors=0.5772317881762699
>>>  epoch=25, learning_rate=0.3, sum_of_squared_errors=0.5557731123165441
>>>  epoch=26, learning_rate=0.3, sum_of_squared_errors=0.5356835840394737
>>>  epoch=27, learning_rate=0.3, sum_of_squared_errors=0.5168457547947859
>>>  epoch=28, learning_rate=0.3, sum_of_squared_errors=0.49915438900173803
>>>  epoch=29, learning_rate=0.3, sum_of_squared_errors=0.48251502066292057
>>>  epoch=30, learning_rate=0.3, sum_of_squared_errors=0.46684269313789645
>>>  epoch=31, learning_rate=0.3, sum_of_squared_errors=0.45206085900175996
>>>  epoch=32, learning_rate=0.3, sum_of_squared_errors=0.4381004192823343
>>>  epoch=33, learning_rate=0.3, sum_of_squared_errors=0.42489888375811524
>>>  epoch=34, learning_rate=0.3, sum_of_squared_errors=0.41239963626822534
>>>  epoch=35, learning_rate=0.3, sum_of_squared_errors=0.40055129106597237
>>>  epoch=36, learning_rate=0.3, sum_of_squared_errors=0.3893071281117396
>>>  epoch=37, learning_rate=0.3, sum_of_squared_errors=0.37862459684599353
>>>  epoch=38, learning_rate=0.3, sum_of_squared_errors=0.3684648794199088
>>>  epoch=39, learning_rate=0.3, sum_of_squared_errors=0.35879250560694986
>>>  epoch=40, learning_rate=0.3, sum_of_squared_errors=0.3495750126938729
>>>  epoch=41, learning_rate=0.3, sum_of_squared_errors=0.3407826445744211
>>>  epoch=42, learning_rate=0.3, sum_of_squared_errors=0.3323880850629534
>>>  epoch=43, learning_rate=0.3, sum_of_squared_errors=0.3243662211261037
>>>  epoch=44, learning_rate=0.3, sum_of_squared_errors=0.3166939323141852
>>>  epoch=45, learning_rate=0.3, sum_of_squared_errors=0.30934990317436684
>>>  epoch=46, learning_rate=0.3, sum_of_squared_errors=0.3023144558567413
>>>  epoch=47, learning_rate=0.3, sum_of_squared_errors=0.29556940049268615
>>>  epoch=48, learning_rate=0.3, sum_of_squared_errors=0.28909790124133994
>>>  epoch=49, learning_rate=0.3, sum_of_squared_errors=0.2828843561721689
>>>  epoch=50, learning_rate=0.3, sum_of_squared_errors=0.2769142893859906
>>>  epoch=51, learning_rate=0.3, sum_of_squared_errors=0.271174253978975
>>>  epoch=52, learning_rate=0.3, sum_of_squared_errors=0.2656517446287191
>>>  epoch=53, learning_rate=0.3, sum_of_squared_errors=0.2603351187325213
>>>  epoch=54, learning_rate=0.3, sum_of_squared_errors=0.2552135251587856
>>>  epoch=55, learning_rate=0.3, sum_of_squared_errors=0.25027683978600884
>>>  epoch=56, learning_rate=0.3, sum_of_squared_errors=0.2455156071024281
>>>  epoch=57, learning_rate=0.3, sum_of_squared_errors=0.2409209872252654
>>>  epoch=58, learning_rate=0.3, sum_of_squared_errors=0.2364847077733389
>>>  epoch=59, learning_rate=0.3, sum_of_squared_errors=0.2321990200921551
>>>  epoch=60, learning_rate=0.3, sum_of_squared_errors=0.22805665938772812
>>>  epoch=61, learning_rate=0.3, sum_of_squared_errors=0.22405080837541616
>>>  epoch=62, learning_rate=0.3, sum_of_squared_errors=0.2201750640939553
>>>  epoch=63, learning_rate=0.3, sum_of_squared_errors=0.21642340757342288
>>>  epoch=64, learning_rate=0.3, sum_of_squared_errors=0.21279017607978418
>>>  epoch=65, learning_rate=0.3, sum_of_squared_errors=0.20927003768855687
>>>  epoch=66, learning_rate=0.3, sum_of_squared_errors=0.20585796796649447
>>>  epoch=67, learning_rate=0.3, sum_of_squared_errors=0.2025492285634839
>>>  epoch=68, learning_rate=0.3, sum_of_squared_errors=0.19933934753746116
>>>  epoch=69, learning_rate=0.3, sum_of_squared_errors=0.19622410125341277
>>>  epoch=70, learning_rate=0.3, sum_of_squared_errors=0.19319949771372985
>>>  epoch=71, learning_rate=0.3, sum_of_squared_errors=0.19026176119156954
>>>  epoch=72, learning_rate=0.3, sum_of_squared_errors=0.18740731805168892
>>>  epoch=73, learning_rate=0.3, sum_of_squared_errors=0.18463278365460747
>>>  epoch=74, learning_rate=0.3, sum_of_squared_errors=0.18193495025013248
>>>  epoch=75, learning_rate=0.3, sum_of_squared_errors=0.17931077577535176
>>>  epoch=76, learning_rate=0.3, sum_of_squared_errors=0.17675737348032025
>>>  epoch=77, learning_rate=0.3, sum_of_squared_errors=0.17427200231192752
>>>  epoch=78, learning_rate=0.3, sum_of_squared_errors=0.17185205799294412
>>>  epoch=79, learning_rate=0.3, sum_of_squared_errors=0.16949506473908427
>>>  epoch=80, learning_rate=0.3, sum_of_squared_errors=0.16719866756216364
>>>  epoch=81, learning_rate=0.3, sum_of_squared_errors=0.16496062511215465
>>>  epoch=82, learning_rate=0.3, sum_of_squared_errors=0.16277880301517395
>>>  epoch=83, learning_rate=0.3, sum_of_squared_errors=0.16065116766827378
>>>  epoch=84, learning_rate=0.3, sum_of_squared_errors=0.1585757804553511
>>>  epoch=85, learning_rate=0.3, sum_of_squared_errors=0.1565507923516049
>>>  epoch=86, learning_rate=0.3, sum_of_squared_errors=0.15457443888678876
>>>  epoch=87, learning_rate=0.3, sum_of_squared_errors=0.1526450354400551
>>>  epoch=88, learning_rate=0.3, sum_of_squared_errors=0.15076097284148837
>>>  epoch=89, learning_rate=0.3, sum_of_squared_errors=0.148920713257524
>>>  epoch=90, learning_rate=0.3, sum_of_squared_errors=0.14712278633934256
>>>  epoch=91, learning_rate=0.3, sum_of_squared_errors=0.1453657856150533
>>>  epoch=92, learning_rate=0.3, sum_of_squared_errors=0.14364836510805035
>>>  epoch=93, learning_rate=0.3, sum_of_squared_errors=0.14196923616534673
>>>  epoch=94, learning_rate=0.3, sum_of_squared_errors=0.14032716448099533
>>>  epoch=95, learning_rate=0.3, sum_of_squared_errors=0.13872096730088423
>>>  epoch=96, learning_rate=0.3, sum_of_squared_errors=0.13714951079627696
>>>  epoch=97, learning_rate=0.3, sum_of_squared_errors=0.135611707594452
>>>  epoch=98, learning_rate=0.3, sum_of_squared_errors=0.13410651445569968
>>>  epoch=99, learning_rate=0.3, sum_of_squared_errors=0.1326329300867541

beta =  [-3.8633388352931486, 1.0769097880558474, 1.8377049776535148]
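
Since the sigmoid crosses 0.5 exactly where its input is zero, the learned model's decision boundary is the line b0 + b1 * pl + b2 * pw = 0. A minimal sketch that overlays this boundary on the sampled points (the pl range is chosen just for display):

b0, b1, b2 = betas
pl = np.linspace(1, 5, 100)   # petal-length range, assumed for plotting
pw = -(b0 + b1 * pl) / b2     # solve b0 + b1*pl + b2*pw = 0 for pw
plt.scatter(sample_data[:, 0], sample_data[:, 1], c=sample_data[:, 2])
plt.plot(pl, pw, 'r--', label='decision boundary (y_hat = 0.5)')
plt.xlabel('petal length (cm)')
plt.ylabel('petal width (cm)')
plt.legend()
plt.show()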


# Measure model performance
test_sample1 = np.r_[X[1, :], y[1]]
prediction = predict(test_sample1, betas)
print(f'True: {test_sample1[-1]}, Predict: {prediction}')
True: 0.0, Predict: 0.12045546114162796


test_sample2 = np.r_[X[51, :], y[51]]
print(test_sample2)
prediction = predict(test_sample2, betas)
print(f'True: {test_sample2[-1]}, Predict: {prediction}')

[4.5 1.5 1. ]
True: 1.0, Predict: 0.9767814788835434
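
Checking two rows is only a spot check. A minimal sketch that scores the model on all 100 setosa/versicolor rows, assuming the conventional 0.5 threshold for converting probabilities into class labels:

data = np.c_[X[:100, :], y[:100]]  # all setosa (0) and versicolor (1) rows
correct = 0
for row in data:
    label = 1 if predict(row, betas) >= 0.5 else 0  # threshold at 0.5
    correct += (label == row[-1])
print(f'Accuracy: {correct / len(data):.3f}')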