티스토리 뷰
[2]
import pandas as pd
import numpy as np
[12]
pop = pd.Series([33871648, 37253956,
18976457, 19378102,
20851820, 25145561],index=['a','b','c','d','e','f'])
pop
a 33871648
b 37253956
c 18976457
d 19378102
e 20851820
f 25145561
dtype: int64
[16]
index=['a','b','c','d','e','f']
pop = pd.Series([33871648, 37253956,
18976457, 19378102,
20851820, 25145561],index=index)
pop
pop['a': 'c']
a 33871648
b 37253956
c 18976457
dtype: int64
[17]
index = [('California', 2000), ('California', 2010),
('New York', 2000), ('New York', 2010),
('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
18976457, 19378102,
20851820, 25145561]
pop = pd.Series(populations, index=index)
pop
(California, 2000) 33871648
(California, 2010) 37253956
(New York, 2000) 18976457
(New York, 2010) 19378102
(Texas, 2000) 20851820
(Texas, 2010) 25145561
dtype: int64
[18]
pop[('California', 2010):('Texas', 2000)]
(California, 2010) 37253956
(New York, 2000) 18976457
(New York, 2010) 19378102
(Texas, 2000) 20851820
dtype: int64
[28]
l = []
for i in pop.index:
if i[1] == 2010:
l.append(pop[i])
print(l)
[37253956, 19378102, 25145561]
[29]
l = [pop[i] for i in pop.index if i[1] == 2010]
l
[37253956, 19378102, 25145561]
[33]
pop[[('California', 2010),('New York', 2010),('Texas', 2010)]]
pop[[i for i in pop.index if i[1] == 2010]]
(California, 2010) 37253956
(New York, 2010) 19378102
(Texas, 2010) 25145561
dtype: int64
[36]
index = [('California', 2000), ('California', 2010),
('New York', 2000), ('New York', 2010),
('Texas', 2000), ('Texas', 2010)]
index = pd.MultiIndex.from_tuples(index)
index
MultiIndex([('California', 2000),
('California', 2010),
( 'New York', 2000),
( 'New York', 2010),
( 'Texas', 2000),
( 'Texas', 2010)],
)
[37]
pop
(California, 2000) 33871648
(California, 2010) 37253956
(New York, 2000) 18976457
(New York, 2010) 19378102
(Texas, 2000) 20851820
(Texas, 2010) 25145561
dtype: int64
[38]
pop = pop.reindex(index)
pop
California 2000 33871648
2010 37253956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
dtype: int64
[39]
pop[:, 2010]
California 37253956
New York 19378102
Texas 25145561
dtype: int64
[40]
pop_df = pop.unstack()
pop_df
[41]
pop_sir = pop_df.stack()
pop_sir
California 2000 33871648
2010 37253956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
dtype: int64
[42]
pop
California 2000 33871648
2010 37253956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
dtype: int64
[43]
pop_df = pd.DataFrame({'total': pop,
'under18': [9267089, 9284094,
4687374, 4318033,
5906301, 6879014]})
pop_df
[51]
f_u18 = pop_df['under18']/ pop_df['total']
f_u18.unstack()
[54]
df = pd.DataFrame(np.random.rand(4, 2),
index=[['a', 'a', 'b', 'b'],[1,2,1,2]],
columns=['data1', 'data2'])
df
[56]
pop.index.names = ['state','year' ]
pop
state year
California 2000 33871648
2010 37253956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
dtype: int64
[63]
# 계층적 인덱스와 열
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
names=['subject', 'type'])
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data
[68]
health_data['Guido']
health_data['Guido']['HR']
health_data['Guido','HR']
year visit
2013 1 50.0
2 47.0
2014 1 16.0
2 32.0
Name: (Guido, HR), dtype: float64
[74]
pop
pop['California']
pop['California',2000]
pop
pop['California' : 'New York']
# 2000년도 데이터만 가지고 오시오
pop[: , 2000]
state
California 33871648
New York 18976457
Texas 20851820
dtype: int64
[79]
# 모든 데이터에서 22000000보다 큰 데이터만 출력하시오.
pop > 22000000 # 마스크
pop[pop > 22000000] # 마스킹
# California , Texas데이터를 가지고 오시오.
pop['California'], pop['Texas']
pop[['California','Texas' ]] # 팬시 인덱싱
state year
California 2000 33871648
2010 37253956
Texas 2000 20851820
2010 25145561
dtype: int64
[84]
health_data
health_data.iloc[0:2, 0:2]
health_data
health_data.loc[: , ('Guido', 'Temp')]
year visit
2013 1 36.9
2 36.4
2014 1 36.7
2 36.7
Name: (Guido, Temp), dtype: float64
[87]
idx = pd.IndexSlice
health_data.loc[idx[:, 1] , idx[:, 'HR']]
[105]
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names = ['char', 'int']
data
char int
a 1 0.355351
2 0.274061
c 1 0.474610
2 0.116276
b 1 0.262809
2 0.635166
dtype: float64
[106]
# index 정렬
data = data.sort_index()
data
char int
a 1 0.355351
2 0.274061
b 1 0.262809
2 0.635166
c 1 0.474610
2 0.116276
dtype: float64
[108]
data['a' : 'b']
char int
a 1 0.355351
2 0.274061
b 1 0.262809
2 0.635166
dtype: float64
[113]
pop.unstack()
pop.unstack(level=0)
pop.unstack(level=1)
pop.unstack().stack()
state year
California 2000 33871648
2010 37253956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
dtype: int64
[123]
pop
pop_flat = pop.reset_index(name='population')
pop_flat
#print(type(pop_flat))
pop_ser = pop_flat.set_index(['state' ,'year'])
pop_ser
#print(type(pop_ser))
[131]
health_data
#### 각 열에 대한 년도의 평균을 구하시오.
data_mean = health_data.mean(level = 'year')
data_mean
data_mean.mean(level = 'type', axis=1)
C:\Users\SCOOL\AppData\Local\Temp\ipykernel_720\2173746977.py:3: FutureWarning: Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.median(level=1) should use df.groupby(level=1).median().
data_mean = health_data.mean(level = 'year')
C:\Users\SCOOL\AppData\Local\Temp\ipykernel_720\2173746977.py:5: FutureWarning: Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.median(level=1) should use df.groupby(level=1).median().
data_mean.mean(level = 'type', axis=1)
[130]
health_data
health_data.mean(level = 'type', axis=1)
C:\Users\SCOOL\AppData\Local\Temp\ipykernel_720\1123710997.py:2: FutureWarning: Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.median(level=1) should use df.groupby(level=1).median().
health_data.mean(level = 'type', axis=1)
[134]
# numpy 배열 연결
x = [1, 2, 3]
y = [4, 5, 6]
z = [7, 8, 9]
np.concatenate([x, y, z])
array([1, 2, 3, 4, 5, 6, 7, 8, 9])
[136]
x = [[1, 2],
[3, 4]]
np.concatenate([x, x])
np.concatenate([x, x], axis = 1)
array([[1, 2, 1, 2],
[3, 4, 3, 4]])
[141]
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[1, 2, 3])
ser3 = pd.concat([ser1, ser2])
ser3[1]
ser4 = pd.concat([ser1, ser2], ignore_index = True)
ser4
0 A
1 B
2 C
3 D
4 E
5 F
dtype: object
[191]
df1 = pd.read_csv("https://raw.githubusercontent.com/SoongMoo/soldesk2110/main/data/concat_1.csv")
df2 = pd.read_csv("https://raw.githubusercontent.com/SoongMoo/soldesk2110/main/data/concat_2.csv")
df3 = pd.read_csv("https://raw.githubusercontent.com/SoongMoo/soldesk2110/main/data/concat_3.csv")
df1, df2, df3
( A B C D
0 a0 b0 c0 d0
1 a1 b1 c1 d1
2 a2 b2 c2 d2
3 a3 b3 c3 d3,
A B C D
0 a4 b4 c4 d4
1 a5 b5 c5 d5
2 a6 b6 c6 d6
3 a7 b7 c7 d7,
A B C D
0 a8 b8 c8 d8
1 a9 b9 c9 d9
2 a10 b10 c10 d10
3 a11 b11 c11 d11)
[145]
# x축 기준 : 행으로 연결
row_concat = pd.concat([df1, df2, df3], axis = 0)
row_concat
[144]
col_concat = pd.concat([df1, df2, df3] , axis = 1)
col_concat
[150]
row_concat
row_concat.loc[0]
row_concat.iloc[0]
row_concat.loc[1]
[158]
row_concat
row_concat.loc[1, 'A']
row_concat.iloc[5, 0]
# 암시적인덱스 5행을 가지고 오시오
row_concat.iloc[5,]
row_concat.iloc[5]
row_concat.iloc[5 , : ]
A a5
B b5
C c5
D d5
Name: 1, dtype: object
[167]
new_row_series = pd.Series(['n1', 'n2', 'n3', 'n4'])
new_row_series
# 데이터프레임과 시리즈를 연결
dataframe_series = pd.concat([new_row_series, df1])
dataframe_series
dataframe_series = pd.concat([new_row_series, df1], axis = 1)
dataframe_series
[165]
df4 = pd.DataFrame([['n1','n2','n3','n4']], columns=['A', 'B' , 'C' , 'D'])
df4
pd.concat([df4, df1])
[175]
df5 = pd.concat([df1,df2,df3])
df5
df5.loc[0]
df5.iloc[0]
df5 = pd.concat([df1,df2,df3], ignore_index=True)
df5
df5.loc[0]
A a0
B b0
C c0
D d0
Name: 0, dtype: object
[181]
col_concat = pd.concat([df1,df2,df3], axis = 1 , ignore_index=True)
col_concat
col_concat[0]
0 a0
1 a1
2 a2
3 a3
Name: 0, dtype: object
[188]
print(df1.columns)
print(df2.columns)
print(df3.columns)
df2.columns = ['E','F','G','H']
df2
df3.columns = ['A','C','F','H']
df3
df1, df2, df3
df6 = pd.concat([df1, df3] , join='inner' , ignore_index=True)
df6
Index(['A', 'B', 'C', 'D'], dtype='object')
Index(['E', 'F', 'G', 'H'], dtype='object')
Index(['A', 'C', 'F', 'H'], dtype='object')
[193]
print(df2.index)
print(df3.index)
df2.index = [4, 5, 6, 7] # 행번호를 변경
df3.index = [0, 2, 5, 7]
print(df2.index)
print(df3.index)
RangeIndex(start=0, stop=4, step=1)
RangeIndex(start=0, stop=4, step=1)
Int64Index([4, 5, 6, 7], dtype='int64')
Int64Index([0, 2, 5, 7], dtype='int64')
[194]
df2, df3
( A B C D
4 a4 b4 c4 d4
5 a5 b5 c5 d5
6 a6 b6 c6 d6
7 a7 b7 c7 d7,
A B C D
0 a8 b8 c8 d8
2 a9 b9 c9 d9
5 a10 b10 c10 d10
7 a11 b11 c11 d11)
[198]
print(df1)
print(df3)
### 행번호가 일치하는 것끼리만 연결 : y축기준
col_concat2 = pd.concat([df1,df3], axis = 1, join ='inner')
print(col_concat2)
A B C D
0 a0 b0 c0 d0
1 a1 b1 c1 d1
2 a2 b2 c2 d2
3 a3 b3 c3 d3
A B C D
0 a8 b8 c8 d8
2 a9 b9 c9 d9
5 a10 b10 c10 d10
7 a11 b11 c11 d11
A B C D A B C D
0 a0 b0 c0 d0 a8 b8 c8 d8
2 a2 b2 c2 d2 a9 b9 c9 d9
[207]
df1, df3
row_df1 = df1.append(df4) # x축 기준으로 연결
print(row_df1)
A B C D
0 a0 b0 c0 d0
1 a1 b1 c1 d1
2 a2 b2 c2 d2
3 a3 b3 c3 d3
0 n1 n2 n3 n4
C:\Users\SCOOL\AppData\Local\Temp\ipykernel_720\472189282.py:2: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
row_df1 = df1.append(df4) # x축 기준으로 연결
[221]
df1, df3
#df7 = pd.concat([df1, df3], join_axes=[df1.columns])
#df7
( A B C D
0 a0 b0 c0 d0
1 a1 b1 c1 d1
2 a2 b2 c2 d2
3 a3 b3 c3 d3,
A C F H
0 a8 b8 c8 d8
2 a9 b9 c9 d9
5 a10 b10 c10 d10
7 a11 b11 c11 d11)
[231]
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
'hire_date': [2004, 2008, 2012, 2014]})
df1, df2
df3 = pd.concat([df1, df2], axis=1)
df3
df3 = pd.merge(df1, df2)
df3
( employee group
0 Bob Accounting
1 Jake Engineering
2 Lisa Engineering
3 Sue HR,
employee hire_date
0 Lisa 2004
1 Bob 2008
2 Jake 2012
3 Sue 2014)
[232]
df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],
'supervisor': ['Carly', 'Guido', 'Steve']})
df3, df4
df5 = pd.merge(df3, df4)
df5
( employee group hire_date
0 Bob Accounting 2008
1 Jake Engineering 2012
2 Lisa Engineering 2004
3 Sue HR 2014,
group supervisor
0 Accounting Carly
1 Engineering Guido
2 HR Steve)
[234]
df5 = pd.DataFrame({'group': ['Accounting', 'Accounting',
'Engineering', 'Engineering', 'HR', 'HR'],
'skills': ['math', 'spreadsheets', 'coding', 'linux',
'spreadsheets', 'organization']})
df1, df5
pd.merge(df1, df5)
[241]
df1, df2
pd.merge(df1, df2, on='employee')
( employee group
0 Bob Accounting
1 Jake Engineering
2 Lisa Engineering
3 Sue HR,
employee hire_date
0 Lisa 2004
1 Bob 2008
2 Jake 2012
3 Sue 2014)
[246]
df3 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'salary': [70000, 80000, 120000, 90000]})
pd.merge(df1, df3, left_on="employee", right_on="name")
df1.merge(df3, left_on="employee", right_on="name")
df1.merge(df3, left_on="employee", right_on="name").drop("name", axis=1)
[252]
df1a = df1.set_index('employee')
df2a = df2.set_index('employee')
df1a, df2a
pd.merge(df1a, df2a, left_index=True, right_index=True)
df1a.merge( df2a, left_index=True, right_index=True)
[256]
df1a, df3
pd.merge(df1a, df3, left_index = True, right_on = 'name')
[262]
df8 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'rank': [1, 2, 3, 4]})
df9 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'rank': [3, 1, 4, 2]})
df8, df9
pd.merge(df8, df9, on="name")
pd.merge(df8, df9, on="name", suffixes = ["_L", "_R"])
[268]
df6 = pd.DataFrame({'name': ['Peter', 'Paul', 'Mary'],
'food': ['fish', 'beans', 'bread']},
columns=['name', 'food'])
df7 = pd.DataFrame({'name': ['Mary', 'Joseph'],
'drink': ['wine', 'beer']},
columns=['name', 'drink'])
df6, df7
pd.merge(df6, df7)
pd.merge(df6, df7, how ='inner')
df6, df7
pd.merge(df6, df7, how ='outer')
[273]
df6, df7
pd.merge(df6, df7, how ='left')
df6.merge(df7, how ='left')
df6.merge(df7, how ='right')
[274]
pop = pd.read_csv('https://raw.githubusercontent.com/wikibook/python-ds-handbook/master/notebooks/data/state-population.csv')
areas = pd.read_csv('https://raw.githubusercontent.com/wikibook/python-ds-handbook/master/notebooks/data/state-areas.csv')
abbrevs = pd.read_csv('https://raw.githubusercontent.com/wikibook/python-ds-handbook/master/notebooks/data/state-abbrevs.csv')
[279]
pop.head(), areas.head(), abbrevs.head()
( state/region ages year population
0 AL under18 2012 1117489.0
1 AL total 2012 4817528.0
2 AL under18 2010 1130966.0
3 AL total 2010 4785570.0
4 AL under18 2011 1125763.0,
state area (sq. mi)
0 Alabama 52423
1 Alaska 656425
2 Arizona 114006
3 Arkansas 53182
4 California 163707,
state abbreviation
0 Alabama AL
1 Alaska AK
2 Arizona AZ
3 Arkansas AR
4 California CA)
[285]
merged = pd.merge(pop, abbrevs , how ='inner' ,
left_on = 'state/region', right_on = 'abbreviation')
merged = merged.drop('abbreviation', axis = 1)
merged
[292]
import seaborn as sns
planets = sns.load_dataset('planets')
planets
planets.shape
planets.info()
planets.head()
# null의 갯수 확인
planets.shape[0] - planets.count()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1035 entries, 0 to 1034
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 method 1035 non-null object
1 number 1035 non-null int64
2 orbital_period 992 non-null float64
3 mass 513 non-null float64
4 distance 808 non-null float64
5 year 1035 non-null int64
dtypes: float64(3), int64(2), object(1)
memory usage: 48.6+ KB
method 0
number 0
orbital_period 43
mass 522
distance 227
year 0
dtype: int64
[295]
planets.describe()
planets.dropna().describe()
[298]
# GroupBy: Split, Apply, Combine
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
'data': range(6)}, columns=['key', 'data'])
df
df.groupby("key").sum()
df.groupby("key").describe()
[303]
planets.groupby('method').median()['orbital_period']
planets.groupby('method')['orbital_period'].median()
method
Astrometry 631.180000
Eclipse Timing Variations 4343.500000
Imaging 27500.000000
Microlensing 3300.000000
Orbital Brightness Modulation 0.342887
Pulsar Timing 66.541900
Pulsation Timing Variations 1170.000000
Radial Velocity 360.200000
Transit 5.714932
Transit Timing Variations 57.011000
Name: orbital_period, dtype: float64
[311]
planets.groupby('method')['year'].describe().unstack()
method
count Astrometry 2.0
Eclipse Timing Variations 9.0
Imaging 38.0
Microlensing 23.0
Orbital Brightness Modulation 3.0
...
max Pulsar Timing 2011.0
Pulsation Timing Variations 2007.0
Radial Velocity 2014.0
Transit 2014.0
Transit Timing Variations 2014.0
Length: 80, dtype: float64
[312]
planets
[313]
df = pd.read_csv(
"https://raw.githubusercontent.com/SoongMoo/soldesk2110/main/data/gapminder.tsv",
sep="\t")
df
[320]
df.groupby('year').mean()
df.groupby('year')['lifeExp'].mean()
df.groupby('year').lifeExp.mean()
df['year']
df.year
df.groupby('year').mean()['lifeExp']
df.groupby('year').mean().lifeExp
year
1952 49.057620
1957 51.507401
1962 53.609249
1967 55.678290
1972 57.647386
1977 59.570157
1982 61.533197
1987 63.212613
1992 64.160338
1997 65.014676
2002 65.694923
2007 67.007423
Name: lifeExp, dtype: float64
[324]
df
df.groupby(['year', 'continent']).mean()
df.groupby(['year', 'continent'])['lifeExp'].mean()
df.groupby(['year', 'continent']).lifeExp.mean()
df.groupby(['year', 'continent']).lifeExp.mean().head()
year continent
1952 Africa 39.135500
Americas 53.279840
Asia 46.314394
Europe 64.408500
Oceania 69.255000
Name: lifeExp, dtype: float64
[326]
### 년도별 지역의 gdp에 따른 평균수명에 대한 평균은?
df
df.groupby(['year', 'continent', 'gdpPercap']).lifeExp.mean()
### 년도별 지역의 평균 수명의 평균과 gdp의 평균을 구하시오
df.groupby(['year', 'continent'])['lifeExp','gdpPercap'].mean()
C:\Users\SCOOL\AppData\Local\Temp\ipykernel_720\1383371160.py:5: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
df.groupby(['year', 'continent'])['lifeExp','gdpPercap'].mean()
[327]
## 지역에 대한 나라의 갯수
df.groupby('continent')['country'].nunique()
continent
Africa 52
Americas 25
Asia 33
Europe 30
Oceania 2
Name: country, dtype: int64
[329]
# 년도별 평균 수명의 평균을 구하시오
df.groupby('year')['lifeExp'].mean()
import matplotlib as plt
df.groupby('year')['lifeExp'].mean().plot()
<AxesSubplot:xlabel='year'>
[335]
#Aggregate, filter, transform, apply
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
'data1': range(6),
'data2': rng.randint(0, 10, 6)},
columns = ['key', 'data1', 'data2'])
df
df.groupby('key').min()
df.groupby('key').max()
df.groupby('key').mean()
df.groupby('key').aggregate(['min', max, np.median])
[336]
df.groupby('key').aggregate({"data1":min, "data2": max})
[340]
# Filtering
def filter_func(x):
    """Group-level predicate for ``groupby(...).filter``.

    Keeps a group only when the standard deviation of its 'data2'
    column exceeds 4; groups failing the test are dropped entirely.
    """
    spread = x['data2'].std()
    return spread > 4
df, df.groupby('key').std()
df.groupby('key').filter(filter_func)
[341]
### lambda함수
def add(a, b):
    """Return the sum of *a* and *b* (plain-function counterpart of the lambda demo)."""
    total = a + b
    return total
result = add(3, 4)
print(result)
7
[342]
add = lambda a, b : a + b
result = add(3, 4)
print(result)
7
[343]
def add(a, b=10):
    """Return ``a + b``; *b* defaults to 10 when the caller omits it."""
    # Operand order a + b is preserved deliberately: + is not
    # commutative for sequences (strings, lists).
    result = a + b
    return result
result = add(3)
print(result)
13
add = lambda a, b = 10 : a + b
result = add(3)
print(result)
[345]
a = (1,2,3,4)
result = []
for i in a:
if i % 2 == 0:
result.append(i * 3)
print(result)
result = [i * 3 for i in a if i % 2 == 0 ]
[6, 12]
def add(opt, a, b):
    """Return ``a + b`` when *opt* equals "add"; otherwise return ``a - b``.

    Any value of *opt* other than "add" (including "sub") selects subtraction.
    """
    return a + b if opt == "add" else a - b
[346]
add = lambda opt, a, b : a + b if opt == "add" else a - b
result = add("add", 4 , 3)
print(result)
7
def add(opt, a, b):
    """Dispatch a basic arithmetic operation on *a* and *b*.

    *opt* selects the operation: "add" -> a + b, "sub" -> a - b,
    "mul" -> a * b; any other value falls through to true division
    ``a / b`` (which raises ZeroDivisionError when b == 0).
    """
    # Flattened the original else->if pyramid into a single elif chain
    # (PEP 8 / guard-clause style); the branch semantics are unchanged.
    if opt == "add":
        return a + b
    elif opt == "sub":
        return a - b
    elif opt == "mul":
        return a * b
    else:
        return a / b
[347]
add = lambda opt, a, b : a + b if opt == "add" else (
a - b if opt == "sub" else (
a * b if opt == "mul" else a / b
))
result = add("add", 4 , 3)
print(result)
7
[354]
def f(x):
    """Return *x* demeaned, i.e. ``x - x.mean()``.

    Intended for use with pandas objects (e.g. via groupby transform/apply),
    but works for anything exposing a ``mean()`` method and subtraction.
    """
    centered = x - x.mean()
    return centered
f = lambda x : x - x.mean()
f(df.groupby('key'))
[355]
df.groupby('key').transform(lambda x : x - x.mean())
[356]
df.groupby('key').apply(f)
C:\Users\SCOOL\AppData\Local\Temp\ipykernel_720\3304050746.py:2: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.
return x - x.mean()
[358]
# 피벗 테이블
titanic = sns.load_dataset('titanic')
titanic.head()
[360]
# 성별에 따른 생존률
titanic.groupby('sex').survived.mean()
titanic.groupby('sex')[['survived']].mean()
[364]
# 성별별 좌석등급에 따른 생존률 평균
titanic.groupby(['sex','class' ]).survived.mean()
titanic.groupby(['sex','class' ])[['survived']].mean()
titanic.groupby(['sex','class' ]).survived.mean().unstack()
[365]
titanic.pivot_table('survived',index='sex', columns ='class' )
[366]
age = pd.cut(titanic['age'], [0, 18, 80])
titanic.pivot_table('survived', ['sex', age], 'class')
[369]
fare = pd.cut(titanic['fare'], 2)
titanic.pivot_table('survived', ['sex', age], [fare , 'class'])
[373]
titanic.pivot_table(index='sex', columns ='class' ,
aggfunc={'survived':sum , 'fare' : 'mean'})
[374]
titanic.pivot_table('survived', index='sex', columns='class', margins=True)
[382]
births = pd.read_csv("https://raw.githubusercontent.com/wikibook/python-ds-handbook/master/notebooks/data/births.csv")
births.head()
births['decade'] = (births['year'] // 10 ) *10
births
births.pivot_table('births', index='decade', columns = 'gender' , aggfunc = 'sum')
[386]
import matplotlib.pyplot as plt
births.pivot_table('births', index='year', columns = 'gender' , aggfunc = 'sum')
births.pivot_table('births', index='year', columns = 'gender' , aggfunc = 'sum').plot()
plt.ylabel('total births per year');
[389]
quartiles = np.percentile(births['births'], [25, 50,75])
mu = quartiles[1]
mu
sig = 0.74 * (quartiles[2] - quartiles[0])
[391]
births = births.query('(births > @mu - 5 * @sig) & (births < @mu + 5 * @sig)')
births
births.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 14610 entries, 0 to 15066
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 year 14610 non-null int64
1 month 14610 non-null int64
2 day 14610 non-null float64
3 gender 14610 non-null object
4 births 14610 non-null int64
5 decade 14610 non-null int64
dtypes: float64(1), int64(4), object(1)
memory usage: 799.0+ KB
[393]
# 년(year), 월(month), 일(day)로부터 날짜(datetime) 인덱스를 생성
# 19991212
births.index = pd.to_datetime(births.year * 10000 +
births.month * 100 +
births.day,format='%Y%m%d' )
births['dayofweek'] = births.index.dayofweek
births
[398]
births.pivot_table('births', index='dayofweek',
columns='decade', aggfunc='mean').plot()
plt.gca().set_xticklabels(['','Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'])
plt.ylabel('mean births by day');
C:\Users\SCOOL\AppData\Local\Temp\ipykernel_720\598896414.py:3: UserWarning: FixedFormatter should only be used together with FixedLocator
plt.gca().set_xticklabels(['','Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'])
'파이썬[python]' 카테고리의 다른 글
파이썬 오프라인 설치 (0) | 2022.07.21 |
---|---|
파이썬 강좌 5 (0) | 2022.05.13 |
파이썬 강좌 3 (0) | 2022.05.12 |
파이썬 오프라인 (0) | 2022.05.11 |
파이썬 강좌 2 (0) | 2022.05.11 |
댓글
공지사항
최근에 올라온 글
최근에 달린 댓글
- Total
- Today
- Yesterday
링크
TAG
- C
- 프로씨
- 이클립스
- 라이믹스 모듈
- ocjap
- 오라클
- KG
- 문자열
- 스크래핑
- 인포믹스
- XE
- MySQL
- proc
- esql
- ocajp
- JDBC
- xe addon
- 자바 smtp
- xe애드온
- webix
- 자바
- Python
- XE3
- 파싱
- 플러터
- 파이썬
- php
- 포인터
- EC
- C언어
일 | 월 | 화 | 수 | 목 | 금 | 토 |
---|---|---|---|---|---|---|
1 | ||||||
2 | 3 | 4 | 5 | 6 | 7 | 8 |
9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 | 17 | 18 | 19 | 20 | 21 | 22 |
23 | 24 | 25 | 26 | 27 | 28 |
글 보관함