[12주차] SQL과 데이터분석

2023. 6. 27. 22:00BOOTCAMP/프로그래머스 인공지능 데브코스

주피터 SQL 엔진 설정

%load_ext sql
# ID와 PW를 자신의 환경에 맞게 수정
%sql postgresql://guest:Guest1!*@learnde.cduaw970ssvt.ap-northeast-2.redshift.amazonaws.com:5439/prod

 

'Connected: guest@prod'

다양한 SELECT 실행해 보기

%%sql

SELECT * FROM raw_data.session_timestamp LIMIT 10

%%sql

SELECT DATE(ts) date, sessionID FROM raw_data.session_timestamp LIMIT 10;

%%sql

SELECT DATE(ts) date, sessionID FROM raw_data.session_timestamp ORDER BY ts 
desc LIMIT 10;

일별 방문 유니크한 사용자의 수를 계산하고 싶다면?

# raw_data.user_session_channel과 raw_data.session_timestamp 테이블의 조인이 필요
%%sql

SELECT DATE(st.ts) date, COUNT(usc.userID)
FROM raw_data.session_timestamp st
JOIN raw_data.user_session_channel usc ON st.sessionID = usc.sessionID
GROUP BY 1
ORDER BY 1
LIMIT 10;

판다스와 연동하는 방법

result = %sql SELECT * FROM raw_data.user_session_channel
df = result.DataFrame()
df.head()

df.groupby(["channel"]).size()
channel
Facebook     16791
Google       16982
Instagram    16831
Naver        16921
Organic      16904
Youtube      17091
dtype: int64
df.groupby(["channel"])["sessionid"].count()
channel
Facebook     16791
Google       16982
Instagram    16831
Naver        16921
Organic      16904
Youtube      17091
Name: sessionid, dtype: int64
result = %sql SELECT * FROM raw_data.session_timestamp
df_st = result.DataFrame()
df_st['date'] = df_st['ts'].apply(lambda x: "%d-%02d-%02d" % (x.year, x.month, x.day))
df_st.head()

df_st.groupby(["date"]).size()  # .sort_values(["date"])
date
2019-05-01    147
2019-05-02    161
2019-05-03    150
2019-05-04    142
2019-05-05    144
             ... 
2019-11-26    633
2019-11-27    617
2019-11-28    516
2019-11-29    531
2019-11-30    562
Length: 214, dtype: int64
df_st.groupby(["date"])["sessionid"].count().reset_index(name='count').sort_values("date", ascending=False)

월별 세션수를 계산하는 SQL

%%sql

SELECT
    LEFT(ts, 7) AS mon,
    COUNT(1) AS session_count
FROM raw_data.session_timestamp
GROUP BY 1
ORDER BY 1;

가장 많이 사용된 채널은 무엇인가?

%%sql

SELECT
    channel,
    COUNT(1) AS session_count,
    COUNT(DISTINCT userId) AS user_count
FROM raw_data.user_session_channel
GROUP BY 1                         -- GROUP BY channel
ORDER BY 2 DESC;              -- ORDER BY session_count DESC

%%sql

SELECT
    channel,
    COUNT(1) AS session_count,
    COUNT(DISTINCT userId) AS user_count
FROM raw_data.user_session_channel
GROUP BY 1                         -- GROUP BY channel
ORDER BY 3 DESC;              -- ORDER BY user_count DESC

가장 많은 세션을 만들어낸 사용자 ID는 무엇인가?

%%sql

SELECT
    userId,
    COUNT(1) AS count
FROM raw_data.user_session_channel
GROUP BY 1                         -- GROUP BY userId
ORDER BY 2 DESC              -- ORDER BY count DESC
LIMIT 1;

월별 유니크한 사용자 수

%%sql

SELECT 
  TO_CHAR(A.ts, 'YYYY-MM') AS month,
  COUNT(DISTINCT B.userid) AS mau
FROM raw_data.session_timestamp A
JOIN raw_data.user_session_channel B ON A.sessionid = B.sessionid
GROUP BY 1      
ORDER BY 1 DESC;

%%sql

SELECT 
  TO_CHAR(A.ts, 'YYYY-MM') AS month,
  COUNT(B.userid) AS cnt,
  COUNT(DISTINCT B.userid) AS mau
FROM raw_data.session_timestamp A
JOIN raw_data.user_session_channel B ON A.sessionid = B.sessionid
GROUP BY 1      
ORDER BY 1 DESC;

월별 채널별 유니크한 사용자 수

%%sql

SELECT 
  TO_CHAR(A.ts, 'YYYY-MM') AS month,
  channel,
  COUNT(DISTINCT B.userid) AS mau
FROM raw_data.session_timestamp A
JOIN raw_data.user_session_channel B ON A.sessionid = B.sessionid
GROUP BY 1, 2      
ORDER BY 1 DESC, 2;

CTAS & CTE

CTAS: SELECT를 가지고 테이블 생성

%%sql

DROP TABLE IF EXISTS adhoc.keeyong_session_summary;
CREATE TABLE adhoc.keeyong_session_summary AS
SELECT B.*, A.ts FROM raw_data.session_timestamp A
JOIN raw_data.user_session_channel B ON A.sessionid = B.sessionid;
%%sql

SELECT 
  TO_CHAR(ts, 'YYYY-MM') AS month,
  COUNT(DISTINCT userid) AS mau
FROM adhoc.keeyong_session_summary
GROUP BY 1 
ORDER BY 1 DESC;

중복된 레코드들 체크하기

%%sql

SELECT COUNT(1) FROM adhoc.keeyong_session_summary;

%%sql

SELECT COUNT(1)
FROM (
    SELECT DISTINCT userId, sessionId, ts, channel FROM adhoc.keeyong_session_summary
);

최근 데이터의 존재 여부 체크하기 (freshness)

%%sql

SELECT MIN(ts), MAX(ts)
FROM adhoc.keeyong_session_summary;

Primary key uniqueness가 지켜지는지 체크하기

%%sql

SELECT sessionId, COUNT(1)
FROM adhoc.keeyong_session_summary
GROUP BY 1
ORDER BY 2 DESC
LIMIT 1;

값이 비어있는 칼럼들이 있는지 체크하기

%%sql

SELECT
    COUNT(CASE WHEN sessionId is NULL THEN 1 END) sessionid_null_count,
    COUNT(CASE WHEN userId is NULL THEN 1 END) userid_null_count,
    COUNT(CASE WHEN ts is NULL THEN 1 END) ts_null_count,
    COUNT(CASE WHEN channel is NULL THEN 1 END) channel_null_count
FROM adhoc.keeyong_session_summary;

'BOOTCAMP > 프로그래머스 인공지능 데브코스' 카테고리의 다른 글

[14주차] Spark  (0) 2023.06.28
[13주차] monthly project3  (0) 2023.06.28
[11주차] CNN & RNN  (0) 2023.06.27
[10주차] 신경망 기초  (0) 2023.06.24
[9주차] ML_basics - Linear Regression  (0) 2023.06.23