import pandas as pd
if __name__ == '__main__':
# csv 파일에서 데이터 프레임을 생성
emp_df = pd.read_csv('emp.csv')
print(emp_df.iloc[0:5])
# 부서별, 직책별 직원 수를 출력
grouped = emp_df.groupby(['DEPTNO', 'JOB'])
emp_by_dept = grouped['EMPNO']
result_df = emp_by_dept.agg('count')
print(result_df)
EMPNO ENAME JOB MGR HIREDATE SAL COMM DEPTNO
0 7369 SMITH CLERK 7902.0 1980-12-17 800.0 NaN 20
1 7499 ALLEN SALESMAN 7698.0 1981-02-20 1600.0 300.0 30
2 7521 WARD SALESMAN 7698.0 1981-02-22 1250.0 500.0 30
3 7566 JONES MANAGER 7839.0 1981-04-02 2975.0 NaN 20
4 7654 MARTIN SALESMAN 7698.0 1981-09-28 1250.0 1400.0 30
DEPTNO JOB
10 CLERK 1
MANAGER 1
PRESIDENT 1
20 ANALYST 2
CLERK 2
MANAGER 1
30 CLERK 1
MANAGER 1
SALESMAN 4
Name: EMPNO, dtype: int64
unstacked=result_df.unstack()
print(unstacked)
print(unstacked.shape)
JOB ANALYST CLERK MANAGER PRESIDENT SALESMAN
DEPTNO
10 NaN 1.0 1.0 1.0 NaN
20 2.0 2.0 1.0 NaN NaN
30 NaN 1.0 1.0 NaN 4.0
(3, 5)
# grouping 기준이 되는 컬럼의 값들이 index(행의 이름)으로 사용되지 않고,
# 컬럼으로 사용하려면, as_index=False 파라미터를 전달하면 됨.
grouped = emp_df.groupby('DEPTNO', as_index=False)
print(grouped['EMPNO'].count())
grouped = emp_df.groupby(['DEPTNO', 'JOB'], as_index=False)
print(grouped['EMPNO'].count())
DEPTNO EMPNO
0 10 3
1 20 5
2 30 6
DEPTNO JOB EMPNO
0 10 CLERK 1
1 10 MANAGER 1
2 10 PRESIDENT 1
3 20 ANALYST 2
4 20 CLERK 2
5 20 MANAGER 1
6 30 CLERK 1
7 30 MANAGER 1
8 30 SALESMAN 4
import pandas as pd
if __name__ == '__main__':
# tips.csv 파일을 읽어서 데이터 프레임을 생성
tips_df = pd.read_csv('tips.csv', encoding='UTF-8')
print(tips.iloc[0:5])
total_bill tip smoker day time size
0 16.99 1.01 No Sun Dinner 2
1 10.34 1.66 No Sun Dinner 3
2 21.01 3.50 No Sun Dinner 3
3 23.68 3.31 No Sun Dinner 2
4 24.59 3.61 No Sun Dinner 4
# DataFrame에 tip_pct 컬럼 추가: 팁금액 / 총금액
tips['tip_pct'] = tips['tip'] / tips['total_bill']
print(tips.iloc[0:5])
total_bill tip smoker day time size tip_pct
0 16.99 1.01 No Sun Dinner 2 0.059447
1 10.34 1.66 No Sun Dinner 3 0.160542
2 21.01 3.50 No Sun Dinner 3 0.166587
3 23.68 3.31 No Sun Dinner 2 0.139780
4 24.59 3.61 No Sun Dinner 4 0.146808
# day, smoker별 그룹을 지어서,
grouped = tips.groupby(['day', 'smoker'])
grouped_tip_pct= grouped['tip_pct']
# tip_pct의 평균을 출력
print(grouped_tip_pct.mean())
print(grouped_tip_pct.agg('mean')) # 같은 결과
day smoker
Fri No 0.151650
Yes 0.174783
Sat No 0.158048
Yes 0.147906
Sun No 0.160113
Yes 0.187250
Thur No 0.160298
Yes 0.163863
Name: tip_pct, dtype: float64
# day, smoker별 그룹의 tip_pct의 평균, 표준편차, 최대/최소 차이를 출력
print(grouped_tip_pct.agg(mean='mean', std='std', max_min=lambda x: x.max() - x.min()))
mean std max_min
day smoker
Fri No 0.151650 0.028123 0.067349
Yes 0.174783 0.051293 0.159925
Sat No 0.158048 0.039767 0.235193
Yes 0.147906 0.061375 0.290095
Sun No 0.160113 0.042347 0.193226
Yes 0.187250 0.154134 0.644685
Thur No 0.160298 0.038774 0.193350
Yes 0.163863 0.039389 0.151240
튜플로 agg 사용하는 방법
# day, smoker별 그룹의 tip_pct, total_bill 컬럼의 평균, 표준편차, 최대/최소 차이
print(grouped_pct_bill.agg([('mean', 'mean'),
('std_dev', 'std'),
('range', lambda x: x.max() - x.min())]))
tip_pct total_bill
mean std_dev range mean std_dev range
day smoker
Fri No 0.151650 0.028123 0.067349 18.420000 5.059282 10.29
Yes 0.174783 0.051293 0.159925 16.813333 9.086388 34.42
Sat No 0.158048 0.039767 0.235193 19.661778 8.939181 41.08
Yes 0.147906 0.061375 0.290095 21.276667 10.069138 47.74
Sun No 0.160113 0.042347 0.193226 20.506667 8.130189 39.40
Yes 0.187250 0.154134 0.644685 24.120000 10.442511 38.10
Thur No 0.160298 0.038774 0.193350 17.113111 7.721728 33.68
Yes 0.163863 0.039389 0.151240 19.190588 8.355149 32.77
# GroupBy 객체의 컬럼들마다 다른 함수를 agg로 적용할 때
# agg({'col_name':[functions], ... })
# 그루핑된 데이터 프레임의 tip 컬럼에는 max() 함수를 aggregate하고,
# size 컬럼에는 sum()함수를 aggregate 하겠다.
result= grouped.agg({'tip': 'max', 'size': 'sum'})
print(result)
tip size
day smoker
Fri No 3.50 9
Yes 4.73 31
Sat No 9.00 115
Yes 10.00 104
Sun No 6.00 167
Yes 6.50 49
Thur No 6.70 112
Yes 5.00 40
functions = ['mean', 'std', lambda x: x.max() - x.min()]
result = grouped.agg({
'tip_pct': functions,
'total_bill': functions
})
print(result)
tip_pct total_bill
mean std <lambda_0> mean std <lambda_0>
day smoker
Fri No 0.151650 0.028123 0.067349 18.420000 5.059282 10.29
Yes 0.174783 0.051293 0.159925 16.813333 9.086388 34.42
Sat No 0.158048 0.039767 0.235193 19.661778 8.939181 41.08
Yes 0.147906 0.061375 0.290095 21.276667 10.069138 47.74
Sun No 0.160113 0.042347 0.193226 20.506667 8.130189 39.40
Yes 0.187250 0.154134 0.644685 24.120000 10.442511 38.10
Thur No 0.160298 0.038774 0.193350 17.113111 7.721728 33.68
Yes 0.163863 0.039389 0.151240 19.190588 8.355149 32.77
functions = ['mean', 'std', ('range', lambda x: x.max() - x.min())]
result = grouped.agg({
'tip_pct': functions,
'total_bill': functions
})
print(result)
tip_pct total_bill
mean std range mean std range
day smoker
Fri No 0.151650 0.028123 0.067349 18.420000 5.059282 10.29
Yes 0.174783 0.051293 0.159925 16.813333 9.086388 34.42
Sat No 0.158048 0.039767 0.235193 19.661778 8.939181 41.08
Yes 0.147906 0.061375 0.290095 21.276667 10.069138 47.74
Sun No 0.160113 0.042347 0.193226 20.506667 8.130189 39.40
Yes 0.187250 0.154134 0.644685 24.120000 10.442511 38.10
Thur No 0.160298 0.038774 0.193350 17.113111 7.721728 33.68
Yes 0.163863 0.039389 0.151240 19.190588 8.355149 32.77
# grouping 컬럼들을 인덱스로 사용하지 않고자 할 때,
grouped = tips.groupby(['day', 'smoker'], as_index=False)
print(grouped['tip'].mean())
<이후>
day smoker tip
0 Fri No 2.812500
1 Fri Yes 2.714000
2 Sat No 3.102889
3 Sat Yes 2.875476
4 Sun No 3.167895
5 Sun Yes 3.516842
6 Thur No 2.673778
7 Thur Yes 3.030000
<이전>
day smoker
Fri No 2.812500
Yes 2.714000
Sat No 3.102889
Yes 2.875476
Sun No 3.167895
Yes 3.516842
Thur No 2.673778
Yes 3.030000
Name: tip, dtype: float64