DACON Python Tutorial Lv1. Predicting 따릉이 (Seoul Bike Sharing) Data with a Decision Tree Regressor


Downloading the data

# Load the data into Colab via the download link.

!wget 'https://bit.ly/3gLj0Q6'

import zipfile
with zipfile.ZipFile('3gLj0Q6', 'r') as existing_zip:
    existing_zip.extractall('data')
--2023-01-11 08:15:16--  https://bit.ly/3gLj0Q6
Resolving bit.ly (bit.ly)... 67.199.248.11, 67.199.248.10
Connecting to bit.ly (bit.ly)|67.199.248.11|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://drive.google.com/uc?export=download&id=1or_QN1ksv81DNog6Tu_kWcZ5jJWf5W9E [following]
--2023-01-11 08:15:16--  https://drive.google.com/uc?export=download&id=1or_QN1ksv81DNog6Tu_kWcZ5jJWf5W9E
Resolving drive.google.com (drive.google.com)... 142.251.2.102, 142.251.2.139, 142.251.2.113, ...
Connecting to drive.google.com (drive.google.com)|142.251.2.102|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-0c-10-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/13sun5uefe1sc3mtgg5sqv9jh7njs8n2/1673424900000/17946651057176172524/*/1or_QN1ksv81DNog6Tu_kWcZ5jJWf5W9E?e=download&uuid=d3203cc3-6d73-4d2e-8ede-c29422b501f4 [following]
Warning: wildcards not supported in HTTP.
--2023-01-11 08:15:17--  https://doc-0c-10-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/13sun5uefe1sc3mtgg5sqv9jh7njs8n2/1673424900000/17946651057176172524/*/1or_QN1ksv81DNog6Tu_kWcZ5jJWf5W9E?e=download&uuid=d3203cc3-6d73-4d2e-8ede-c29422b501f4
Resolving doc-0c-10-docs.googleusercontent.com (doc-0c-10-docs.googleusercontent.com)... 74.125.137.132, 2607:f8b0:4023:c03::84
Connecting to doc-0c-10-docs.googleusercontent.com (doc-0c-10-docs.googleusercontent.com)|74.125.137.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 39208 (38K) [application/zip]
Saving to: ‘3gLj0Q6.1’

3gLj0Q6.1           100%[===================>]  38.29K  --.-KB/s    in 0s      

2023-01-11 08:15:17 (107 MB/s) - ‘3gLj0Q6.1’ saved [39208/39208]
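Note that wget saved the file as ‘3gLj0Q6.1’ in this run because a file named ‘3gLj0Q6’ already existed from an earlier download. To keep the filename predictable, one option (a sketch, assuming a Colab/shell environment; 'data.zip' is an illustrative name) is to pass -O and give the archive an explicit name:

# A sketch: force the output filename so the zipfile path always matches,
# even if the archive was downloaded before.
!wget -O data.zip 'https://bit.ly/3gLj0Q6'

import zipfile
with zipfile.ZipFile('data.zip', 'r') as existing_zip:
    existing_zip.extractall('data')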

Importing libraries

# Import the Pandas and Scikit-learn libraries.
import pandas as pd
from sklearn.tree import DecisionTreeRegressor

Loading the data

# Load train.csv and test.csv as DataFrame objects.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

Inspecting the data

# Use head() to take a first look at the data.
train.head()
test.head()
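In a notebook cell only the last expression is rendered, so the cell above only shows the test preview. A small sketch (display() is available in Colab/Jupyter) that shows both previews and adds summary statistics:

# A sketch: show both previews explicitly, then summary statistics for train.
display(train.head())
display(test.head())
train.describe()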

Checking the number of rows and columns

# Use shape to check the size of the data.
train.shape
(1459, 11)

test.shape
(715, 10)

The test set has one fewer column than the train set because it does not contain the count target.

Checking for missing values

# Use info() to check whether there are missing values.
train.info()
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      1459 non-null   int64  
 1   hour                    1459 non-null   int64  
 2   hour_bef_temperature    1457 non-null   float64
 3   hour_bef_precipitation  1457 non-null   float64
 4   hour_bef_windspeed      1450 non-null   float64
 5   hour_bef_humidity       1457 non-null   float64
 6   hour_bef_visibility     1457 non-null   float64
 7   hour_bef_ozone          1383 non-null   float64
 8   hour_bef_pm10           1369 non-null   float64
 9   hour_bef_pm2.5          1342 non-null   float64
 10  count                   1459 non-null   float64
dtypes: float64(9), int64(2)
memory usage: 125.5 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 715 entries, 0 to 714
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      715 non-null    int64  
 1   hour                    715 non-null    int64  
 2   hour_bef_temperature    714 non-null    float64
 3   hour_bef_precipitation  714 non-null    float64
 4   hour_bef_windspeed      714 non-null    float64
 5   hour_bef_humidity       714 non-null    float64
 6   hour_bef_visibility     714 non-null    float64
 7   hour_bef_ozone          680 non-null    float64
 8   hour_bef_pm10           678 non-null    float64
 9   hour_bef_pm2.5          679 non-null    float64
dtypes: float64(8), int64(2)
memory usage: 56.0 KB

Handling missing values

# Use dropna() to drop rows with missing values from the train data,
# and fillna() to replace missing values in the test data with 0.
# Then print the number of missing values to confirm.
train = train.dropna()
test = test.fillna(0)
print(train.isnull().sum())
id                        0
hour                      0
hour_bef_temperature      0
hour_bef_precipitation    0
hour_bef_windspeed        0
hour_bef_humidity         0
hour_bef_visibility       0
hour_bef_ozone            0
hour_bef_pm10             0
hour_bef_pm2.5            0
count                     0
dtype: int64
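Dropping rows discards training examples, and filling with 0 can distort features such as temperature. As an alternative (a sketch, not part of the original tutorial; the variable names are illustrative), the missing values could be imputed with each column's mean instead of the dropna()/fillna(0) step above:

# A sketch of an alternative imputation: fill missing values with the column mean.
# This would replace the dropna()/fillna(0) step above.
train_filled = train.fillna(train.mean())
test_filled = test.fillna(test.mean())
print(train_filled.isnull().sum().sum(), test_filled.isnull().sum().sum())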

Training the model

# Assign the train data without the count feature to X_train.
# Assign only the count feature of the train data to Y_train.
# Declare a DecisionTreeRegressor and train it with fit().
X_train = train.drop(['count'], axis=1)
Y_train = train['count']
model = DecisionTreeRegressor()
model.fit(X_train, Y_train)
DecisionTreeRegressor()
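The model above is fit on the full train set, so there is no local estimate of how well it predicts. A sketch (names such as X_tr and X_val are illustrative) of holding out a validation split and scoring with RMSE before submitting:

# A sketch: hold out 20% of the train data and check RMSE on it.
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_tr, X_val, y_tr, y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)
val_model = DecisionTreeRegressor(random_state=42)
val_model.fit(X_tr, y_tr)
val_pred = val_model.predict(X_val)
rmse = mean_squared_error(y_val, val_pred) ** 0.5
print(f'Validation RMSE: {rmse:.2f}')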

Predicting on the test data

# Use predict() to generate an array of predictions from the trained model on the test data.
pred = model.predict(test)
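predict() assumes the test columns match the features the model was fit on (including id, which was not dropped here). A quick sanity check, as a sketch:

# A sketch: confirm the test columns line up with the training features.
assert list(test.columns) == list(X_train.columns), 'feature columns do not match'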

Creating the submission file

# Load submission.csv.
# Overwrite the count feature of the submission DataFrame with the predictions.
# Use to_csv() to write the submission DataFrame to a csv file (index=False).
submission = pd.read_csv('data/submission.csv')
submission['count'] = pred
submission.to_csv('sub.csv', index = False)
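As a final check (a sketch, not required by the tutorial), the file can be read back to confirm its shape and contents:

# A sketch: read the submission back to verify it was written as expected.
check = pd.read_csv('sub.csv')
print(check.shape)
check.head()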