COVID-19 data from John Hopkins University

데이터 소개
    - 총 10개의 파일 중 2개의 Raw 데이터를 사용합니다.
    RAW_global_confirmed_cases.csv
    RAW_global_deaths.csv

    - 각 파일의 칼럼은 아래와 같습니다.
    Country/Region: 국가
    Province/State: 지방/주
    Lat: 지역의 위도
    Long: 지역의 경도
    날짜: 각 날짜의 확진자/사망자 수

- 데이터 출처: https://www.kaggle.com/antgoldbloom/covid19-data-from-john-hopkins-university

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os

# Kaggle credentials exported as environment variables before the `kaggle`
# command further down is run.
# NOTE(review): the key here is a masked placeholder; a real key should not be
# committed to source — confirm how credentials are meant to be supplied.
os.environ['KAGGLE_USERNAME'] = 'jhighllight'
os.environ['KAGGLE_KEY'] = 'xxxxxxxxxxxxxxxxxxxx'

# WARNING: this removes every file in the working directory whose name
# contains a dot, not only files from a previous download.
!rm *.*
# Download the dataset archive with the Kaggle CLI, then extract every zip
# file found in the working directory.
!kaggle datasets download -d antgoldbloom/covid19-data-from-john-hopkins-university
!unzip '*.zip'

# Load the two raw tables used in the rest of the notebook: confirmed cases
# and deaths.
df_case = pd.read_csv('RAW_global_confirmed_cases.csv')
df_death = pd.read_csv('RAW_global_deaths.csv')

df_case.head()

df_death.head()

def fix_dataframe(df):
  """Reshape a raw wide table into a date-indexed, per-country table.

  The raw table has one row per region and one column per date. The result
  has one row per date (a ``DatetimeIndex`` named ``'Date'``) and one column
  per ``'Country/Region'`` value, with the regions of each country summed.

  Args:
    df: Raw data frame with a 'Country/Region' column, optional
      'Province/State', 'Lat' and 'Long' columns, and one column per date.

  Returns:
    A new data frame indexed by date with one column per country.
  """
  # 'Province/State' holds text; leaving it in makes the group-wise sum depend
  # on the numeric_only default (the source of the FutureWarning) and would
  # put a non-date label into the index that is parsed below.
  non_date_cols = ['Province/State', 'Lat', 'Long']
  df = df.drop(columns=non_date_cols, errors='ignore')
  df = df.groupby('Country/Region').sum().transpose()
  # After the transpose the index holds the former date column labels.
  df.index = pd.to_datetime(df.index)
  df.index.name = 'Date'
  return df

df_case = fix_dataframe(df_case)
df_death = fix_dataframe(df_death)

<ipython-input-13-8370bf488dd3>:3: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.

df = df.drop(['Lat', 'Long'], axis=1).groupby('Country/Region').sum()

df_case

df_death

# Take the most recent row of the case table and keep the ten countries with
# the highest counts, largest first.
top_ten_cases = df_case.iloc[-1].sort_values(ascending=False).head(10)

# Black bars: confirmed cases per country on the primary y-axis.
sns.barplot(x=top_ten_cases.index, y=top_ten_cases, color='black')
plt.title('Total Confirmed Cases (%s)' % top_ten_cases.name.strftime('%Y-%m-%d'), size=15)
plt.xlabel('')
plt.ylabel('Total Confirmed Cases', size=15)
plt.xticks(rotation=90, size=15)

# Dashed red line: deaths for the same countries, in the same order, drawn on
# a secondary y-axis that shares the x-axis with the bars.
ax = plt.gca()
ax2 = ax.twinx()
top_ten_deaths = df_death.iloc[-1][top_ten_cases.index]
ax2.plot(top_ten_deaths.index, top_ten_deaths, 'r--')
ax2.set_ylabel('Total Deaths', color='red', size=15)
plt.show()

def plot_case_with_death(country):
    """Plot one country's confirmed cases and deaths on a shared date axis.

    Confirmed cases are drawn as a solid blue line on the left y-axis and
    deaths as a dashed red line on a secondary (right) y-axis. The data is
    read from the module-level ``df_case`` and ``df_death`` tables.

    Args:
        country: Column label present in both ``df_case`` and ``df_death``.
    """
    plt.plot(df_case.index, df_case[country], 'b-')
    plt.ylabel('Confirmed Cases', color='blue')
    # Leading space keeps the country name separate from the suffix.
    plt.title(country + ' Cases & Deaths')
    plt.xlabel('Date')
    plt.xlim(right=df_case.index[-1])
    # Leave headroom above the highest value.
    plt.ylim(0, df_case[country].max()*1.1)

    # Second y-axis for deaths, sharing the x-axis with the case curve.
    ax = plt.gca()
    ax2 = ax.twinx()
    ax2.plot(df_death.index, df_death[country], 'r--')
    ax2.set_ylabel('Deaths', color='red')
    ax2.set_ylim(0, df_death[country].max()*1.3)
    plt.show()

plot_case_with_death('US')

for c in df_case.columns:
    print(c)

Afghanistan

Albania

Algeria

Andorra

Angola

Antarctica

Antigua and Barbuda

Argentina

Armenia

Australia

Austria

Azerbaijan

Bahamas

Bahrain

Uganda

Ukraine

United Arab Emirates

United Kingdom

Uruguay

Uzbekistan

Vanuatu

Venezuela

Vietnam

West Bank and Gaza

Winter Olympics 2022

Yemen

Zambia

Zimbabwe

plot_case_with_death('Germany')

plot_case_with_death('China')

plot_case_with_death('Korea, South')

country = 'Korea, South'

# .diff() turns each series into day-over-day changes, so the labels say
# "Daily" to match what is actually drawn.
plt.plot(df_case.index, df_case[country].diff(), 'b-')
plt.ylabel('Daily Confirmed Cases', color='blue')
plt.title(country + ' Daily Cases & Deaths')
plt.xlabel('Date')
plt.xlim(right=df_case.index[-1])
# Negative differences fall below the visible range.
plt.ylim(bottom=0)

# Daily change in deaths on a secondary y-axis sharing the same x-axis.
ax = plt.gca()
ax2 = ax.twinx()
ax2.plot(df_death.index, df_death[country].diff(), 'r--')
ax2.set_ylabel('Daily Deaths', color='red')
ax2.set_ylim(bottom=0)
plt.show()

import plotly.graph_objects as go

# Use the data frame below to convert country names into country codes.
df_code = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2014_world_gdp_with_codes.csv')

# Countries whose names are spelled differently are renamed first and only
# then converted to codes. Keys are the names used in the case table; values
# are the replacement names.
country_renamer = {
    'Bahamas':'Bahamas, The',
    'Congo (Brazzaville)':'Congo, Republic of the',
    'Congo (Kinshasa)':'Congo, Democratic Republic of the',
    'Czechia':'Czech Republic',
    'Eswatini':'Swaziland',
    'Gambia':'Gambia, The',
    'North Macedonia':'Macedonia',
    'Taiwan*':'Taiwan',
    'US':'United States',
    'West Bank and Gaza':'West Bank'
}

df_case.head()

# Lookup series: country name (COUNTRY column) -> country code (CODE column).
country_to_code = df_code.set_index('COUNTRY')['CODE']

# Rename the case table's country columns with country_renamer, then keep only
# the columns whose name appears in the lookup. One vectorised filter replaces
# dropping columns one at a time.
df_c = df_case.rename(columns=country_renamer)
df_c = df_c.loc[:, df_c.columns.isin(country_to_code.index)].copy()

# NOTE: df_code is rebound here. From this point it is the case table with
# country codes as column labels, no longer the lookup table read from CSV,
# so the set_index('COUNTRY') line above cannot be re-run without reloading it.
df_code = df_c.rename(country_to_code, axis=1)
df_code

# Official reference: https://plotly.com/python/choropleth-maps/#world-choropleth-map
def get_choropleth_data(date):
  """Build a world choropleth trace of confirmed cases for one date.

  Reads the module-level ``df_code`` (columns labelled by country code) and
  ``df_c`` (the same columns labelled by country name).

  Args:
    date: Row label of ``df_code`` selecting the date to draw.

  Returns:
    A ``go.Choropleth`` trace with country codes as locations, that date's
    values as ``z`` and the country names as hover text.
  """
  data = go.Choropleth(
    locations=df_code.columns,
    z=df_code.loc[date],
    text=df_c.columns,
    colorscale='Blues',
    autocolorscale=False,
    reversescale=True,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    # No tick prefix: the values are case counts, not currency.
    colorbar_title='Confirmed Cases',
  )
  return data

# Official reference: https://plotly.com/python/choropleth-maps/#world-choropleth-map
def get_choropleth_data(date):
  """Build a world choropleth trace of confirmed cases for one date.

  Reads the module-level ``df_code`` (columns labelled by country code) and
  ``df_c`` (the same columns labelled by country name).

  Args:
    date: Row label of ``df_code`` selecting the date to draw.

  Returns:
    A ``go.Choropleth`` trace with country codes as locations, that date's
    values as ``z`` and the country names as hover text.
  """
  data = go.Choropleth(
    locations=df_code.columns,
    z=df_code.loc[date],
    text=df_c.columns,
    colorscale='Blues',
    autocolorscale=False,
    reversescale=True,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    # No tick prefix: the values are case counts, not currency.
    colorbar_title='Confirmed Cases',
  )
  return data

# Official reference: https://facebook.github.io/prophet/docs/quick_start.html#python-api
# Reshape South Korea's confirmed-case series into a two-column frame for
# Prophet: 'ds' holds the dates and 'y' the values. Building it straight from
# the series keeps each column's dtype; a to_numpy() round trip would turn
# both columns into generic object columns.
df = df_case['Korea, South'].rename('y').rename_axis('ds').reset_index()
df

from math import floor

def train_test_split_df(df, test_size):
  """Split a data frame into a leading train part and a trailing test part.

  Rows are cut by position, so the split does not depend on the index labels
  and the two parts never overlap or skip a row.

  Args:
    df: Data frame to split; row order is preserved.
    test_size: Fraction of rows assigned to the test part, 0 < test_size < 1.

  Returns:
    A ``(train_df, test_df)`` tuple: the first ``floor(n * (1 - test_size))``
    rows and the remaining rows, where ``n`` is the number of rows in ``df``.

  Raises:
    ValueError: If ``test_size`` is not strictly between 0 and 1.
  """
  if not 0 < test_size < 1:
    raise ValueError('test_size must be strictly between 0 and 1')
  div = floor(df.shape[0] * (1 - test_size))
  # iloc slices by position with an exclusive end; label-based .loc includes
  # its end label, which gave train one extra row.
  return df.iloc[:div], df.iloc[div:]

train_df, test_df = train_test_split_df(df, 0.1)

train_df.tail()

test_df.head()

저작자표시 (새창열림)

'Python > Kaggle' 카테고리의 다른 글

Video Game Sales with Ratings (0)	2023.05.18
[패캠] 우리나라의 행복지수는 몇 위? 아니, 행복지수가 도대체 뭔데? (0)	2023.05.17
Part2. Chapter 2 - 뉴욕에서 방이 둘 딸린 집을 에어비엔비에 내놓으려 한다, 이 때 적당한 숙바.. (0)	2023.03.27
Part2. Chapter 1 - 자동으로 모은 데이터는 분석하기 어렵다면서? 자동으로 모은 중고 자동차 데ᄋ.. (0)	2023.03.26
Part1. Chapter 05 - 미국의 대통령은 어떻게 뽑힐까 (0)	2023.03.22