[Week 6 - Day 3] ML_basics - E2E


End-to-End Machine Learning Project

 

Assuming the role of a data scientist just hired by a real estate company, we worked through an example project from start to finish (end to end).

The main steps are as follows.

 

Project Workflow

1. Look at the big picture.
2. Get the data.
3. Discover and visualize the data to gain insights.
4. Prepare the data for Machine Learning algorithms.
5. Select a model and train it.
6. Fine-tune your model.
7. Present your solution.
8. Launch, monitor, and maintain your system.

 

Downloading the Data

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

import tarfile
import urllib.request

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download housing.tgz and extract housing.csv into housing_path."""
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Load the extracted CSV into a pandas DataFrame."""
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
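
The dataset has to be fetched once before it can be loaded; a minimal usage of the two helpers above:

fetch_housing_data()  # downloads and extracts datasets/housing/housing.csv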

Taking a Quick Look at the Data Structure

housing = load_housing_data()
housing.head()

housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB

Note that total_bedrooms has only 20,433 non-null values, so 207 districts are missing this feature and will need handling during data preparation.

 

housing["ocean_proximity"].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

housing.describe()

Analyzing the Data with Histograms

housing.hist(bins=50, figsize=(20,15))
save_fig("attribute_histogram_plots")
plt.show()

Creating a Test Set

import numpy as np

np.random.seed(42)  # fix the random seed so the split is reproducible

def split_train_test(data, test_ratio):
    # Shuffle all row indices, then carve off the first test_ratio portion
    # as the test set and keep the rest for training.
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

# Quick look at what np.random.permutation produces:
a = np.random.permutation(10)
a

array([8, 1, 5, 0, 7, 2, 9, 4, 3, 6])

train_set, test_set = split_train_test(housing, 0.2)
len(train_set), len(test_set)

 

(16512, 4128)

from zlib import crc32

def test_set_check(identifier, test_ratio):
    # Hash the id and put the instance in the test set if the hash falls in
    # the lowest test_ratio portion of the 32-bit range; this keeps the split
    # stable even when the dataset is refreshed.
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio))
    return data.loc[~in_test_set], data.loc[in_test_set]
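
The housing data has no identifier column, so one option (the one used in the handson-ml2 notebook) is to use the row index as a stable ID:

housing_with_id = housing.reset_index()  # adds an `index` column
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")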

After this come stratified sampling and a look at feature correlations, and then model training proceeds.
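
The stratified-sampling step is not reproduced here; as a sketch, the handson-ml2 notebook bins median_income into five categories and splits so that both sets preserve the category proportions:

from sklearn.model_selection import StratifiedShuffleSplit

# Bin median_income into 5 income categories to stratify on.
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]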

A large RMSE on the training set indicates underfitting: it occurs when the features do not provide enough information to make good predictions, or when the model itself is not powerful enough.
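
For reference, a minimal sketch of how the training RMSE is computed, assuming housing_prepared and housing_labels come from the (omitted) data-preparation step:

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)  # assumed prepared data

# RMSE on the same data the model was trained on:
# a large value here signals underfitting.
housing_predictions = lin_reg.predict(housing_prepared)
lin_rmse = np.sqrt(mean_squared_error(housing_labels, housing_predictions))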

 

We evaluate decision tree, linear regression, and random forest models, then run the final evaluation on the test set.

 

Evaluating the Decision Tree Model
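
The cross-validation code below assumes a fitted tree_reg; for completeness, a minimal sketch of that step (again assuming housing_prepared and housing_labels from the preparation stage):

from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)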

from sklearn.model_selection import cross_val_score

# cross_val_score expects a utility function (higher is better), so
# Scikit-Learn returns negative MSE; negate it before taking the square root.
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

display_scores(tree_rmse_scores)
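
The random forest mentioned above can be evaluated the same way; a sketch under the same assumptions (it is the slowest of the three models to cross-validate):

from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)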