# gapminder.tsv 파일을 읽어서 데이터 프레임 생성
import pandas as pd
df = pd.read_csv('gapminder.tsv', sep= '\t', encoding='UTF-8')
print(df.iloc[0:5])
country continent year lifeExp pop gdpPercap
0 Afghanistan Asia 1952 28.801 8425333 779.445314
1 Afghanistan Asia 1957 30.332 9240934 820.853030
2 Afghanistan Asia 1962 31.997 10267083 853.100710
3 Afghanistan Asia 1967 34.020 11537966 836.197138
4 Afghanistan Asia 1972 36.088 13079460 739.981106
# boolean indexing:
# 컬럼의 값을 이용해서 특정 레코드(행, row)들을 선택하는 방법
# DataFrame[컬럼의 값을 이용한 조건식]
# SQL: select * from DataFrame where column == '';
df_afg = df[df['country'] == 'Afghanistan']
print(df_afg)
country continent year lifeExp pop gdpPercap
0 Afghanistan Asia 1952 28.801 8425333 779.445314
1 Afghanistan Asia 1957 30.332 9240934 820.853030
2 Afghanistan Asia 1962 31.997 10267083 853.100710
3 Afghanistan Asia 1967 34.020 11537966 836.197138
4 Afghanistan Asia 1972 36.088 13079460 739.981106
5 Afghanistan Asia 1977 38.438 14880372 786.113360
6 Afghanistan Asia 1982 39.854 12881816 978.011439
7 Afghanistan Asia 1987 40.822 13867957 852.395945
8 Afghanistan Asia 1992 41.674 16317921 649.341395
9 Afghanistan Asia 1997 41.763 22227415 635.341351
10 Afghanistan Asia 2002 42.129 25268405 726.734055
11 Afghanistan Asia 2007 43.828 31889923 974.580338
df_korea = df[df['country']=='Korea, Rep.']
print(df_korea)
country continent year lifeExp pop gdpPercap
840 Korea, Rep. Asia 1952 47.453 20947571 1030.592226
841 Korea, Rep. Asia 1957 52.681 22611552 1487.593537
842 Korea, Rep. Asia 1962 55.292 26420307 1536.344387
843 Korea, Rep. Asia 1967 57.716 30131000 2029.228142
844 Korea, Rep. Asia 1972 62.612 33505000 3030.876650
845 Korea, Rep. Asia 1977 64.766 36436000 4657.221020
846 Korea, Rep. Asia 1982 67.123 39326000 5622.942464
847 Korea, Rep. Asia 1987 69.810 41622000 8533.088805
848 Korea, Rep. Asia 1992 72.244 43805450 12104.278720
849 Korea, Rep. Asia 1997 74.647 46173816 15993.527960
850 Korea, Rep. Asia 2002 77.045 47969150 19233.988180
851 Korea, Rep. Asia 2007 78.623 49044790 23348.139730
# 대한민국(Korea, Rep.)의 인구(pop)와 1인당 GDP(gdpPercap)을 출력
df_korea_pop_gdp = df[df['country']=='Korea, Rep.'][['pop', 'gdpPercap']]
print(df_korea_pop_gdp)
pop gdpPercap
840 20947571 1030.592226
841 22611552 1487.593537
842 26420307 1536.344387
843 30131000 2029.228142
844 33505000 3030.876650
845 36436000 4657.221020
846 39326000 5622.942464
847 41622000 8533.088805
848 43805450 12104.278720
849 46173816 15993.527960
850 47969150 19233.988180
851 49044790 23348.139730
>> Mean() : Series vs Dataframe
1차원 구조: Series
2차원 구조: Dataframe
mean_cty_df = mpg[['cty']].mean()
mean_cty_series = mpg['cty'].mean()
print(mean_cty_df)
print(mean_cty_series)
cty 16.858974
dtype: float64
16.858974358974358
# cty 컬럼의 값이 평균보다 큰 레코드들을 출력
print(mpg[mpg['cty']> mean_cty_df])
Traceback (most recent call last):
File "C:/dev/lab-python/scratch09/ex06.py", line 42, in <module>
print(mpg[mpg['cty']> mean_cty_df])
File "C:\dev\Python37\lib\site-packages\pandas\core\ops\__init__.py", line 1142, in wrapper
raise ValueError("Can only compare identically-labeled " "Series objects")
ValueError: Can only compare identically-labeled Series objects
>> 오류 발생
# cty 컬럼의 값이 평균보다 큰 레코드들을 출력
print(mpg[mpg['cty']> mean_cty_series])
manufacturer model displ year cyl ... drv cty hwy fl class
0 audi a4 1.8 1999 4 ... f 18 29 p compact
1 audi a4 1.8 1999 4 ... f 21 29 p compact
2 audi a4 2.0 2008 4 ... f 20 31 p compact
3 audi a4 2.0 2008 4 ... f 21 30 p compact
5 audi a4 2.8 1999 6 ... f 18 26 p compact
.. ... ... ... ... ... ... .. .. ... .. ...
228 volkswagen passat 1.8 1999 4 ... f 18 29 p midsize
229 volkswagen passat 2.0 2008 4 ... f 19 28 p midsize
230 volkswagen passat 2.0 2008 4 ... f 21 29 p midsize
232 volkswagen passat 2.8 1999 6 ... f 18 26 p midsize
233 volkswagen passat 3.6 2008 6 ... f 17 26 p midsize
[118 rows x 11 columns]
# cty 컬럼 값이 평균보다 큰 자동차들의 model, cty, hwy 컬럼 출력
print(mpg[mpg['cty']> mean_cty_series][['model', 'cty', 'hwy']])
model cty hwy
0 a4 18 29
1 a4 21 29
2 a4 20 31
3 a4 21 30
5 a4 18 26
.. ... ... ...
228 passat 18 29
229 passat 19 28
230 passat 21 29
232 passat 18 26
233 passat 17 26
[118 rows x 3 columns]