[12주차] SQL과 데이터분석
2023. 6. 27. 22:00ㆍBOOTCAMP/프로그래머스 인공지능 데브코스
주피터 SQL 엔진 설정
%load_ext sql
# ID와 PW를 자신의 환경에 맞게 수정
%sql postgresql://guest:Guest1!*@learnde.cduaw970ssvt.ap-northeast-2.redshift.amazonaws.com:5439/prod
'Connected: guest@prod'
다양한 SELECT 실행해 보기
%%sql
SELECT * FROM raw_data.session_timestamp LIMIT 10
%%sql
SELECT DATE(ts) date, sessionID FROM raw_data.session_timestamp LIMIT 10;
%%sql
SELECT DATE(ts) date, sessionID FROM raw_data.session_timestamp ORDER BY ts
desc LIMIT 10;
일별 방문 유니크한 사용자의 수를 계산하고 싶다면?
# raw_data.user_session_channel과 raw_data.session_timestamp 테이블의 조인이 필요
%%sql
SELECT DATE(st.ts) date, COUNT(usc.userID)
FROM raw_data.session_timestamp st
JOIN raw_data.user_session_channel usc ON st.sessionID = usc.sessionID
GROUP BY 1
ORDER BY 1
LIMIT 10;
판다스와 연동하는 방법
result = %sql SELECT * FROM raw_data.user_session_channel
df = result.DataFrame()
df.head()
df.groupby(["channel"]).size()
channel
Facebook 16791
Google 16982
Instagram 16831
Naver 16921
Organic 16904
Youtube 17091
dtype: int64
df.groupby(["channel"])["sessionid"].count()
channel
Facebook 16791
Google 16982
Instagram 16831
Naver 16921
Organic 16904
Youtube 17091
Name: sessionid, dtype: int64
result = %sql SELECT * FROM raw_data.session_timestamp
df_st = result.DataFrame()
df_st['date'] = df_st['ts'].apply(lambda x: "%d-%02d-%02d" % (x.year, x.month, x.day))
df_st.head()
df_st.groupby(["date"]).size() # .sort_values(["date"])
date
2019-05-01 147
2019-05-02 161
2019-05-03 150
2019-05-04 142
2019-05-05 144
...
2019-11-26 633
2019-11-27 617
2019-11-28 516
2019-11-29 531
2019-11-30 562
Length: 214, dtype: int64
df_st.groupby(["date"])["sessionid"].count().reset_index(name='count').sort_values("date", ascending=False)
월별 세션수를 계산하는 SQL
%%sql
SELECT
LEFT(ts, 7) AS mon,
COUNT(1) AS session_count
FROM raw_data.session_timestamp
GROUP BY 1
ORDER BY 1;
가장 많이 사용된 채널은 무엇인가?
%%sql
SELECT
channel,
COUNT(1) AS session_count,
COUNT(DISTINCT userId) AS user_count
FROM raw_data.user_session_channel
GROUP BY 1 -- GROUP BY channel
ORDER BY 2 DESC; -- ORDER BY session_count DESC
%%sql
SELECT
channel,
COUNT(1) AS session_count,
COUNT(DISTINCT userId) AS user_count
FROM raw_data.user_session_channel
GROUP BY 1 -- GROUP BY channel
ORDER BY 3 DESC; -- ORDER BY user_count DESC
가장 많은 세션을 만들어낸 사용자 ID는 무엇인가?
%%sql
SELECT
userId,
COUNT(1) AS count
FROM raw_data.user_session_channel
GROUP BY 1 -- GROUP BY userId
ORDER BY 2 DESC -- ORDER BY count DESC
LIMIT 1;
월별 유니크한 사용자 수
%%sql
SELECT
TO_CHAR(A.ts, 'YYYY-MM') AS month,
COUNT(DISTINCT B.userid) AS mau
FROM raw_data.session_timestamp A
JOIN raw_data.user_session_channel B ON A.sessionid = B.sessionid
GROUP BY 1
ORDER BY 1 DESC;
%%sql
SELECT
TO_CHAR(A.ts, 'YYYY-MM') AS month,
COUNT(B.userid) AS cnt,
COUNT(DISTINCT B.userid) AS mau
FROM raw_data.session_timestamp A
JOIN raw_data.user_session_channel B ON A.sessionid = B.sessionid
GROUP BY 1
ORDER BY 1 DESC;
월별 채널별 유니크한 사용자 수
%%sql
SELECT
TO_CHAR(A.ts, 'YYYY-MM') AS month,
channel,
COUNT(DISTINCT B.userid) AS mau
FROM raw_data.session_timestamp A
JOIN raw_data.user_session_channel B ON A.sessionid = B.sessionid
GROUP BY 1, 2
ORDER BY 1 DESC, 2;
CTAS & CTE
CTAS: SELECT를 가지고 테이블 생성
%%sql
DROP TABLE IF EXISTS adhoc.keeyong_session_summary;
CREATE TABLE adhoc.keeyong_session_summary AS
SELECT B.*, A.ts FROM raw_data.session_timestamp A
JOIN raw_data.user_session_channel B ON A.sessionid = B.sessionid;
%%sql
SELECT
TO_CHAR(ts, 'YYYY-MM') AS month,
COUNT(DISTINCT userid) AS mau
FROM adhoc.keeyong_session_summary
GROUP BY 1
ORDER BY 1 DESC;
![](https://blog.kakaocdn.net/dn/Vy909/btslC4OaS6X/zqZSKhXFZwH3puQjkkkCy1/img.png)
중복된 레코드들 체크하기
%%sql
SELECT COUNT(1) FROM adhoc.keeyong_session_summary;
%%sql
SELECT COUNT(1)
FROM (
SELECT DISTINCT userId, sessionId, ts, channel FROM adhoc.keeyong_session_summary
);
최근 데이터의 존재 여부 체크하기 (freshness)
%%sql
SELECT MIN(ts), MAX(ts)
FROM adhoc.keeyong_session_summary;
Primary key uniqueness가 지켜지는지 체크하기
%%sql
SELECT sessionId, COUNT(1)
FROM adhoc.keeyong_session_summary
GROUP BY 1
ORDER BY 2 DESC
LIMIT 1;
값이 비어있는 칼럼들이 있는지 체크하기
%%sql
SELECT
COUNT(CASE WHEN sessionId is NULL THEN 1 END) sessionid_null_count,
COUNT(CASE WHEN userId is NULL THEN 1 END) userid_null_count,
COUNT(CASE WHEN ts is NULL THEN 1 END) ts_null_count,
COUNT(CASE WHEN channel is NULL THEN 1 END) channel_null_count
FROM adhoc.keeyong_session_summary;
'BOOTCAMP > 프로그래머스 인공지능 데브코스' 카테고리의 다른 글
[14주차] Spark (0) | 2023.06.28 |
---|---|
[13주차] monthly project3 (0) | 2023.06.28 |
[11주차] CNN & RNN (0) | 2023.06.27 |
[10주차] 신경망 기초 (0) | 2023.06.24 |
[9주차] ML_basics - Linear Regression (0) | 2023.06.23 |