COVID-19 data from Johns Hopkins University

2023. 5. 21. 21:43Python/Kaggle

데이터 소개
    - 총 10개의 파일 중 2개의 Raw 데이터를 사용합니다.
    RAW_global_confirmed_cases.csv
    RAW_global_deaths.csv

    - 각 파일의 칼럼은 아래와 같습니다.
    Country/Region: 국가
    Province/State: 지방/주
    Lat: 지역의 위도
    Long: 지역의 경도
    날짜: 각 날짜의 확진자/사망자 수 
    
- 데이터 출처: https://www.kaggle.com/antgoldbloom/covid19-data-from-john-hopkins-university

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Kaggle credentials are placed in the environment, presumably for the
# `kaggle` CLI invoked below.
# NOTE(review): credentials are hard-coded in the notebook (the key shown is a
# placeholder); prefer supplying them from outside the source file.
os.environ['KAGGLE_USERNAME'] = 'jhighllight'
os.environ['KAGGLE_KEY'] = 'xxxxxxxxxxxxxxxxxxxx'
# The lines starting with `!` are IPython shell escapes, not plain Python.
# NOTE(review): `rm *.*` removes every file in the working directory whose
# name contains a dot, not only earlier downloads.
!rm *.*
!kaggle datasets download -d antgoldbloom/covid19-data-from-john-hopkins-university
!unzip '*.zip'
# Load the two raw tables used in the rest of the notebook.
df_case = pd.read_csv('RAW_global_confirmed_cases.csv')
df_death = pd.read_csv('RAW_global_deaths.csv')
# Preview the first rows of each table.
df_case.head()

df_death.head()

def fix_dataframe(df):
    """Reshape a raw per-region table into a date-indexed, per-country table.

    The input is expected to have a 'Country/Region' column, optional
    'Province/State', 'Lat' and 'Long' columns, and one numeric column per
    date whose label is a parseable date string.

    Args:
        df: Raw data frame as read from the CSV. It is not modified.

    Returns:
        A new data frame indexed by a DatetimeIndex named 'Date', with one
        column per country holding the sum over that country's regions.
    """
    # Region name and coordinates are not counts. Drop them explicitly so
    # they can never be summed or end up as a bogus "date" row.
    counts = df.drop(columns=['Province/State', 'Lat', 'Long'], errors='ignore')
    # numeric_only=True pins the behaviour the notebook relied on and avoids
    # the FutureWarning raised by the implicit default.
    per_country = counts.groupby('Country/Region').sum(numeric_only=True)

    by_date = per_country.transpose()
    # Parse all date labels in one vectorized call.
    by_date.index = pd.to_datetime(by_date.index)
    by_date.index.name = 'Date'
    return by_date
# Reshape both raw tables (cases first, then deaths) with fix_dataframe and
# rebind the original names to the results.
df_case, df_death = map(fix_dataframe, (df_case, df_death))

 

<ipython-input-13-8370bf488dd3>:3: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.

  df = df.drop(['Lat', 'Long'], axis=1).groupby('Country/Region').sum()

<ipython-input-13-8370bf488dd3>:3: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.

  df = df.drop(['Lat', 'Long'], axis=1).groupby('Country/Region').sum()

# Display the reshaped confirmed-case table.
df_case

# Display the reshaped death table.
df_death

# Ten countries with the highest confirmed-case totals on the last date in
# df_case.
latest_cases = df_case.loc[df_case.index[-1]]
top_ten_cases = latest_cases.sort_values(ascending=False).head(10)

# Left axis: confirmed cases as black bars, titled with the snapshot date.
ax = sns.barplot(x=top_ten_cases.index, y=top_ten_cases, color='black')
plt.xticks(rotation=90, size=15)
ax.set_ylabel('Total Confirmed Cases', size=15)
ax.set_xlabel('')
ax.set_title('Total Confirmed Cases (%s)' % top_ten_cases.name.strftime('%Y-%m-%d'), size=15)

# Right axis: deaths for the same ten countries, in the same order, drawn as
# a dashed red line.
ax2 = ax.twinx()
latest_deaths = df_death.loc[df_death.index[-1]]
top_ten_deaths = latest_deaths[top_ten_cases.index]
ax2.plot(top_ten_deaths.index, top_ten_deaths, 'r--')
ax2.set_ylabel('Total Deaths', color='red', size=15)
plt.show()

def plot_case_with_death(country):
    """Plot one country's confirmed cases and deaths on a shared date axis.

    Confirmed cases are drawn as a solid blue line on the left y-axis and
    deaths as a dashed red line on a twin right y-axis. The series are read
    from the module-level frames `df_case` and `df_death`.

    Args:
        country: Column label present in both `df_case` and `df_death`.

    Raises:
        KeyError: If `country` is not a column of either frame.
    """
    plt.plot(df_case.index, df_case[country], 'b-')
    plt.ylabel('Confirmed Cases', color='blue')
    # The leading space keeps the country name from running into the rest of
    # the title (it was rendered as e.g. "USCases & Deaths").
    plt.title(country + ' Cases & Deaths')
    plt.xlabel('Date')
    # End the x-axis exactly at the last available date.
    plt.xlim(right=df_case.index[-1])
    # Leave 10% headroom above the case curve.
    plt.ylim(0, df_case[country].max()*1.1)

    ax = plt.gca()
    ax2 = ax.twinx()
    ax2.plot(df_death.index, df_death[country], 'r--')
    ax2.set_ylabel('Deaths', color='red')
    # Leave 30% headroom above the death curve.
    ax2.set_ylim(0, df_death[country].max()*1.3)
    plt.show()
# Plot the United States, which is labelled 'US' in this data set.
plot_case_with_death('US')

# Print every column label of df_case, one per line, to look up the exact
# spelling of each country name.
for c in df_case.columns:
    print(c)

 

Afghanistan

Albania

Algeria

Andorra

Angola

Antarctica

Antigua and Barbuda

Argentina

Armenia

Australia

Austria

Azerbaijan

Bahamas

Bahrain

:

:

US

Uganda

Ukraine

United Arab Emirates

United Kingdom

Uruguay

Uzbekistan

Vanuatu

Venezuela

Vietnam

West Bank and Gaza

Winter Olympics 2022

Yemen

Zambia

Zimbabwe

# Draw the cases/deaths chart for each selected country, in this order.
for name in ('Germany', 'China', 'Korea, South'):
    plot_case_with_death(name)

country = 'Korea, South'
# Row-to-row differences of the cumulative series.
daily_cases = df_case[country].diff()
daily_deaths = df_death[country].diff()

# Left axis: differenced confirmed cases as a solid blue line.
ax = plt.gca()
ax.plot(df_case.index, daily_cases, 'b-')
ax.set_ylabel('Confirmed Cases', color='blue')
ax.set_title(country + ' Cases & Deaths')
ax.set_xlabel('Date')
ax.set_xlim(right=df_case.index[-1])
ax.set_ylim(bottom=0)

# Right axis: differenced deaths as a dashed red line.
ax2 = ax.twinx()
ax2.plot(df_death.index, daily_deaths, 'r--')
ax2.set_ylabel('Deaths', color='red')
ax2.set_ylim(bottom=0)
plt.show()

import plotly.graph_objects as go
# Use the data frame below to convert country names to country codes.
df_code = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2014_world_gdp_with_codes.csv')

# Countries whose names are spelled differently in the two data sets are
# renamed first (case-data name -> code-table name), then converted to codes.
country_renamer = {
    'Bahamas':'Bahamas, The',
    'Congo (Brazzaville)':'Congo, Republic of the',
    'Congo (Kinshasa)':'Congo, Democratic Republic of the',
    'Czechia':'Czech Republic',
    'Eswatini':'Swaziland',
    'Gambia':'Gambia, The',
    'North Macedonia':'Macedonia',
    'Taiwan*':'Taiwan',
    'US':'United States',
    'West Bank and Gaza':'West Bank'
}
# Preview the case table whose columns will be renamed.
df_case.head()

# Lookup from country name (as spelled in the code table) to country code.
country_to_code = df_code.set_index('COUNTRY')['CODE']

# Align the case-data spellings with the code table. rename() already returns
# a new frame, so no extra copy is needed.
df_c = df_case.rename(country_renamer, axis=1)
# Keep only countries that have a code. One column selection replaces the
# repeated in-place drops done while looping over the frame's own columns.
known_countries = [col for col in df_c.columns if col in country_to_code.index]
df_c = df_c[known_countries]

# NOTE: df_code is rebound here from the code table to the code-labelled case
# table, so this cell cannot be re-run without reloading the code table first.
df_code = df_c.rename(country_to_code, axis=1)
df_code

# See the official reference: https://plotly.com/python/choropleth-maps/#world-choropleth-map
def get_choropleth_data(date):
    """Build a world choropleth trace of confirmed cases for one date.

    Reads the module-level frames `df_code` (columns are country codes) and
    `df_c` (columns are country names). The two are expected to list the
    same countries in the same order, so each code lines up with its
    hover-text name.

    Args:
        date: Index label of the row of `df_code` to plot.

    Returns:
        A `go.Choropleth` trace coloured by the confirmed-case values of
        that row.

    Raises:
        KeyError: If `date` is not in the index of `df_code`.
    """
    data = go.Choropleth(
        locations=df_code.columns,
        z=df_code.loc[date],
        text=df_c.columns,
        colorscale='Blues',
        autocolorscale=False,
        reversescale=True,
        marker_line_color='darkgray',
        marker_line_width=0.5,
        # No '$' tick prefix: the values are case counts, not the currency
        # amounts of the GDP example this was adapted from.
        colorbar_title='Confirmed Cases',
    )
    return data
# See the official reference: https://facebook.github.io/prophet/docs/quick_start.html#python-api
# Rebuild South Korea's confirmed-case series as a two-column frame named
# `ds` (date) and `y` (value) for training Prophet.
# Selecting and relabelling the columns keeps each column's own dtype; the
# earlier round-trip through to_numpy() degraded both columns to object.
df = df_case.reset_index()[['Date', 'Korea, South']]
df.columns = ['ds', 'y']
df

from math import floor
def train_test_split_df(df, test_size):
    """Split a data frame by row position into leading and trailing parts.

    The first floor(n * (1 - test_size)) rows form the training frame and
    the remaining rows form the test frame, where n is the number of rows.
    Rows are selected by position, so the split works for any index type
    and the two parts never overlap.

    Args:
        df: Data frame to split. Row order is preserved.
        test_size: Fraction of rows for the test frame, 0 < test_size < 1.

    Returns:
        A (train, test) tuple of data frames.

    Raises:
        ValueError: If test_size is not strictly between 0 and 1.
    """
    if not 0 < test_size < 1:
        raise ValueError('test_size must satisfy 0 < test_size < 1')
    split_at = floor(df.shape[0] * (1 - test_size))
    # iloc slicing is end-exclusive. Label-based .loc slicing is inclusive,
    # which gave the training part one extra row.
    return df.iloc[:split_at], df.iloc[split_at:]
# Split the Prophet frame with a test fraction of 0.1, then preview the end
# of the training part and the start of the test part.
train_df, test_df = train_test_split_df(df, test_size=0.1)
train_df.tail()

test_df.head()