πŸ† μ•žμœΌλ‘œ 진행할 데이터 뢄석 과정을 μœ„ν•΄ 이전 ν¬μŠ€νŒ…μ—μ„œμ˜ μ „μ²˜λ¦¬λœ 데이터λ₯Ό κ°€μ Έμ˜€μž.

import pandas as pd
# Load the raw births table; 'utf-8-sig' drops a leading BOM if the file has one.
birth = pd.read_csv("Data/births.csv", encoding='utf-8-sig')

# Keep only the first 15067 rows.
# NOTE(review): presumably these are the records up to 1988 (per the variable
# name) — confirm against the CSV.
birth_til_1988 = birth.iloc[:15067]

# Total births per (year, month, day), flattened back into ordinary columns.
birth_date = pd.pivot_table(
    birth_til_1988,
    index=['year', 'month', 'day'],
    values=['births'],
    aggfunc={'births': 'sum'},
).reset_index()
birth_date['day'] = birth_date['day'].astype('int64')

import numpy as np
# Outlier cut: centre on the median and use 0.74 * IQR as the spread, then keep
# only the days whose birth count lies strictly within five spreads of the centre.
quartiles = np.percentile(birth_date['births'], [25, 50, 75])
mu = quartiles[1]
sig = 0.74 * (quartiles[2] - quartiles[0])
birth_date = birth_date[
    (birth_date['births'] > mu - 5*sig) & (birth_date['births'] < mu + 5*sig)
]

# Cast every column to text so year/month/day can be joined into a date string,
# then restore births to integers.
birth_date = birth_date.astype('str')
birth_date['births'] = birth_date['births'].astype('int64')
birth_date['date'] = pd.to_datetime(
    birth_date['year'].str.cat([birth_date['month'], birth_date['day']], sep='-'),
    errors='raise',
)
# Integer day-of-week code (0 = Monday ... 6 = Sunday).
birth_date['weekday'] = birth_date['date'].dt.dayofweek

def weekday_func(row):
    """Swap a row's numeric ``weekday`` code for its three-letter English name.

    Codes 0 through 6 become 'Mon' through 'Sun'; any other value is left
    untouched. ``row`` is modified in place and is also returned, so the
    function can be passed to ``DataFrame.apply(..., axis=1)``.
    """
    labels = ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')
    for code, label in enumerate(labels):
        if row['weekday'] == code:
            row['weekday'] = label
            break

    return row

# Run weekday_func once per row (axis='columns' is the same as axis=1) so the
# numeric weekday codes become 'Mon'..'Sun' labels.
birth_date = birth_date.apply(weekday_func, axis='columns')

# Preview the first rows.
birth_date.head()
>>
    year	month	day	births	date	    weekday
0	1969	1	    1	8486	1969-01-01	Wed
1	1969	1	    2	9002	1969-01-02	Thu
2	1969	1	    3	9542	1969-01-03	Fri
3	1969	1	    4	8960	1969-01-04	Sat
4	1969	1	    5	8390	1969-01-05	Sun

1. 1969년 일별로 출생 추이 파악하기

πŸ† μ•žμ„œ μ‚¬μš©ν•˜μ§€ μ•Šμ€ 1969λ…„λ„μ˜ 데이터λ₯Ό λ°”νƒ•μœΌλ‘œ 일별 μΆœμƒ 좔이λ₯Ό νŒŒμ•…ν•΄λ³΄μž.

# The year column was cast to text earlier (birth_date.astype('str')), so
# comparing it with the integer 1969 matches no rows. Compare as text instead;
# casting first keeps the filter correct whether the column holds strings or
# integers.
birth_1969 = birth_date[birth_date['year'].astype(str) == '1969']
birth_1969
>>
    year	month	day	births	date	    weekday
0	1969	1	    1	8486	1969-01-01	Wed
1	1969	1	    2	9002	1969-01-02	Thu
2	1969	1	    3	9542	1969-01-03	Fri
3	1969	1	    4	8960	1969-01-04	Sat
4	1969	1	    5	8390	1969-01-05	Sun
...	...	    ...	    ...	...	    ...	        ...
378	1969	12	    27	9304	1969-12-27	Sat
379	1969	12	    28	9004	1969-12-28	Sun
380	1969	12	    29	10980	1969-12-29	Mon
381	1969	12	    30	12232	1969-12-30	Tue
382	1969	12	    31	11122	1969-12-31	Wed

365 rows × 6 columns

기가 막히게 딱 365개의 행을 가진다. 이런걸 보면 기분이 참 좋다😉.

이 λ°μ΄ν„°ν”„λ ˆμž„λ„ μœ€λ…„μ„ μ²˜λ¦¬ν•΄μ£Όκ³  datetime ν•¨μˆ˜λ₯Ό μ μš©ν•˜μž.

import numpy as np
# Repeat the outlier cut on the 1969 rows alone: centre on the median, use
# 0.74 * IQR as the spread, and keep counts strictly within five spreads.
quartiles = np.percentile(birth_1969['births'], [25,50,75])
mu = quartiles[1]
sig = 0.74 * (quartiles[2] - quartiles[0])
# .copy() yields an independent frame, so the column assignment below is
# unambiguous and cannot raise pandas' SettingWithCopyWarning.
birth_1969 = birth_1969.query('(births > @mu - 5*@sig) & (births < @mu + 5*@sig)').copy()

# Normalise column types: day as integer first, then every column as text so
# the date parts can be concatenated, and finally births back to integer.
birth_1969['day'] = birth_1969['day'].astype('int64')
birth_1969 = birth_1969.astype('str')
birth_1969['births'] = birth_1969['births'].astype('int64')

# Rebuild the date column from the text parts; errors='raise' stops on any
# combination that is not a real calendar date.
birth_1969['date'] = pd.to_datetime(birth_1969['year'] + '-' + birth_1969['month'] + '-' + birth_1969['day'], errors = 'raise')
birth_1969.head()
>>
	year	month	day	births	date	    weekday
0	1969	1	    1	8486	1969-01-01	Wed
1	1969	1	    2	9002	1969-01-02	Thu
2	1969	1	    3	9542	1969-01-03	Fri
3	1969	1	    4	8960	1969-01-04	Sat
4	1969	1	    5	8390	1969-01-05	Sun

마지막으로 plotly 시각화해주자.

import plotly.graph_objects as go
import plotly.offline as pyo
# Enable inline plotly rendering in the notebook.
pyo.init_notebook_mode()

# Daily births in 1969 drawn as a single dark-green trace.
fig = go.Figure(
    data=[
        go.Scatter(
            x=birth_1969['date'],
            y=birth_1969['births'],
            marker={'color': '#025918'},
        )
    ]
)

# Centred bold title, one x-axis tick per month ('M1'), labelled axes.
fig.update_layout(
    title={'text': '<b>Births in 1969</b>', 'font': {'size': 25}, 'x': 0.5, 'y': 0.92},
    xaxis={
        'showticklabels': True,
        'dtick': 'M1',
        'tickfont': {'size': 15},
        'title': {'text': 'Date', 'font': {'size': 20}},
    },
    yaxis={
        'showticklabels': True,
        'tickfont': {'size': 15},
        'title': {'text': 'Births', 'font': {'size': 20}},
    },
    template='presentation',
)

# Callout for the yearly maximum; the label box sits above the point (ay < 0).
fig.add_annotation(
    x='1969-12-30',
    y=12250,
    text='MAX : 1969-12-30,12232',
    font={'size': 13, 'color': 'white'},
    align='center',
    ax=20,
    ay=-40,
    arrowcolor='#025918',
    arrowhead=2,
    arrowsize=1,
    arrowwidth=2,
    bgcolor='#025918',
    bordercolor='#025918',
    borderpad=4,
    borderwidth=2,
    opacity=0.8,
)

# Callout for the yearly minimum; the label box sits below the point (ay > 0).
fig.add_annotation(
    x='1969-4-20',
    y=7930,
    text='MIN : 1969-4-20, 7928',
    font={'size': 13, 'color': 'white'},
    align='center',
    ax=20,
    ay=50,
    arrowcolor='#025918',
    arrowhead=2,
    arrowsize=1,
    arrowwidth=2,
    bgcolor='#025918',
    bordercolor='#025918',
    borderpad=4,
    borderwidth=2,
    opacity=0.8,
)

fig.show()

4. Heatmap - 분기별 출생 빈도 (1969 ~ 2008)

πŸ† λ§ˆμ§€λ§‰μœΌλ‘œ 전체적인 μΆœμƒ λΉˆλ„λ₯Ό μ•Œμ•„λ³΄κΈ° μœ„ν•΄ νžˆνŠΈλ§΅μ„ 그렀보자.

# Monthly birth totals, flattened and cast to text so that year and month can
# be joined into a 'YYYY-M' string.
birth_quarter = (
    birth.groupby(['year', 'month'])[['births']]
    .sum()
    .reset_index()
    .astype('str')
)
# Parse the joined text into a timestamp; errors='raise' stops on any value
# that cannot be parsed.
birth_quarter['Date'] = pd.to_datetime(
    birth_quarter['year'].str.cat(birth_quarter['month'], sep='-'),
    errors='raise',
)
birth_quarter.head()
>>
year	month	births	Date
0	1969	1	293940	1969-01-01
1	1969	2	270786	1969-02-01
2	1969	3	296550	1969-03-01
3	1969	4	282638	1969-04-01
4	1969	5	289124	1969-05-01

해당 월의 분기를 구하기 위해서 dt.quarter 메서드를 사용하자.

# Quarter number taken from each month's timestamp.
birth_quarter['Quarter'] = birth_quarter['Date'].dt.quarter
# births was cast to text in the previous step; restore integers before summing.
birth_quarter = birth_quarter.astype({'births': 'int64'})
# Total births per (year, quarter); keep the indexed frame and a flat copy for plotting.
birth_quarter = birth_quarter.groupby(['year', 'Quarter'])[['births']].sum()
birth_quarter_plotly = birth_quarter.reset_index()
birth_quarter_plotly.head()
>>
year	Quarter	births
0	1969	1	861276
1	1969	2	863372
2	1969	3	952010
3	1969	4	923548
4	1970	1	891214

κΉ”λ”ν•˜κ²Œ μ •λ¦¬λ˜μ—ˆλ‹€!! μ‹œκ°ν™”ν•˜μžπŸ™Œ.

# Heatmap of total births: one column per year, one row per quarter,
# colour intensity = number of births.
fig = go.Figure()
fig.add_trace(
    go.Heatmap(
        x = birth_quarter_plotly['year'], y = birth_quarter_plotly['Quarter'], z = birth_quarter_plotly['births'],
        colorscale = 'blues'))

# The title and axis labels were stored as mis-decoded (mojibake) text and
# would render as garbage; they are restored to the intended Korean:
# "분기별 출생 정도" (births by quarter), "년도" (year), "분기" (quarter).
fig.update_layout(
    {
        'title':{'text':'<b>분기별 출생 정도</b>', 'font':{'size':25}, 'x':0.5, 'y':0.92},
        'xaxis':{'showticklabels':True, 'title':{'text':'년도', 'font':{'size':20}}},
        # dtick 1 puts a tick on every quarter number.
        'yaxis':{'showticklabels':True, 'dtick':1, 'title':{'text':'분기', 'font':{'size':20}}}
    })

fig.show()

👉 전체적인 년도의 분기를 보면 3분기의 출산 추이가 가장 높은 것을 확인할 수 있다. 또한 시간이 지날수록 전체적인 출산율이 높아지고 있음을 확인하자.


πŸ† μΆ”κ°€λ‘œ μ΄λ ‡κ²Œ λ³΅μž‘ν•œ 과정을 κ±°μΉ˜μ§€ μ•Šκ³  pandas의 stlye κΈ°λŠ₯을 μ‚¬μš©ν•΄μ„œ νžˆνŠΈλ§΅μ„ κ·Έλ¦΄μˆ˜λ„ μžˆλ‹€. κ°„λ‹¨ν•˜κ³  μ§κ΄€μ μœΌλ‘œ 뢄석할 λ•Œ 쒋은 λ°©λ²•μ΄λ―€λ‘œ μ•Œμ•„λ‘λ©΄ 쒋을 것 κ°™λ‹€πŸ˜‰!!

# Take rows 128-159 and pivot them into a year x quarter table.
# NOTE(review): presumably these 32 rows are the last 8 years x 4 quarters —
# confirm that the frame has exactly 160 rows.
birth_quarter_df = (
    birth_quarter_plotly.iloc[128:160]
    .set_index(['year', 'Quarter'])
    .unstack(level='Quarter')
)
# Shade each cell by value using the pandas Styler.
birth_quarter_df.style.background_gradient(cmap='Reds')

👉 최근 8년간의 추세를 보아도 계속해서 출산율이 높아지고 있음을 확인할 수 있다.


πŸ† μš”λ ‡κ²Œ ν•΄μ„œ CDC 데이터 뢄석을 ν•œμ°¨λ‘€ λλƒˆλ‹€. λ””μŠ€ν¬ 곡백기 λ•Œλ¬Έμ— μ˜ˆμ „μ— μ‚¬μš©ν–ˆλ˜ ν•¨μˆ˜λ“€λ„ ν•œλ²ˆμ”© μ°Ύμ•„λ³΄λ©΄μ„œ μ‚¬μš©λ²•μ„ λ‹€μ‹œ μ΅νžˆλŠ” 쀑이라 μžμ—°μŠ€λŸ½κ²Œ 손이 쑰금 κ΅Όλ– μ‘Œλ‹€πŸ€¦β€β™‚οΈ. κΈ°μ΄ˆλΆ€ν„° λ‹€μ‹œ νƒ„νƒ„νžˆ λ‹€μ§„λ‹€λŠ” μƒκ°μœΌλ‘œ μ—΄μ‹¬νžˆ λ‹¬λ €λ΄μ•Όκ² λ‹€πŸƒβ€β™‚οΈπŸƒβ€β™‚οΈπŸƒβ€β™‚οΈ!!

πŸ† 내일이면 κ°œκ°•μ΄λ‹€. 5전곡 3ν”„λ‘œμ νŠΈμ˜ 2022λ…„ 1ν•™κΈ°κ°€ 될 것 κ°™λ‹€. 뭐 ν•˜λ©΄ λ˜κ² μ§€λΌλŠ” 마음으둜 κ·Έλƒ₯ μ—΄μ‹¬νžˆ 살아봐야겠닀. μ•„μžμžμž£!!


💡파이썬 데이터 사이언스 핸드북(위키북스,2020)의 저자 Jake VanderPlas 분의 깃허브에서 데이터를 가져왔음을 밝힙니다.

Leave a comment