IT,라이믹스,XE(Xpress Engine),자바,파이썬,마리아DB,php,스크래핑,파싱,크롤링,스프링등 알짜 정보 제공

티스토리 뷰

파이썬[python]
파이썬 강좌 4

xemaker 2022. 5. 13. 09:02
[2]
import pandas as pd
import numpy as np
[12]
pop = pd.Series([33871648, 37253956,
               18976457, 19378102,
               20851820, 25145561],index=['a','b','c','d','e','f'])
pop
a    33871648
b    37253956
c    18976457
d    19378102
e    20851820
f    25145561
dtype: int64
[16]
index=['a','b','c','d','e','f']
pop = pd.Series([33871648, 37253956,
               18976457, 19378102,
               20851820, 25145561],index=index)
pop
pop['a': 'c']
a    33871648
b    37253956
c    18976457
dtype: int64
[17]
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]

populations = [33871648, 37253956,
               18976457, 19378102,
               20851820, 25145561]
pop = pd.Series(populations, index=index)
pop
(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64
[18]
pop[('California', 2010):('Texas', 2000)]
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
dtype: int64
[28]
l = []
for i in pop.index:
    if i[1] == 2010:
        l.append(pop[i])
print(l)
[37253956, 19378102, 25145561]

[29]
l = [pop[i] for i in pop.index if i[1] == 2010]
l
[37253956, 19378102, 25145561]
[33]
pop[[('California', 2010),('New York', 2010),('Texas', 2010)]]
pop[[i for i in pop.index if i[1] == 2010]]
(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64
[36]
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]

index = pd.MultiIndex.from_tuples(index)
index
MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )
[37]
pop
(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64
[38]
pop = pop.reindex(index)
pop
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64
[39]
pop[:, 2010]
California    37253956
New York      19378102
Texas         25145561
dtype: int64
[40]
pop_df = pop.unstack()
pop_df

[41]
pop_sir = pop_df.stack()
pop_sir
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64
[42]
pop
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64
[43]
pop_df = pd.DataFrame({'total': pop,
                       'under18': [9267089, 9284094,
                                   4687374, 4318033,
                                   5906301, 6879014]})
pop_df

[51]
f_u18  = pop_df['under18']/ pop_df['total']
f_u18.unstack()

[54]
df = pd.DataFrame(np.random.rand(4, 2),
                  index=[['a', 'a', 'b', 'b'],[1,2,1,2]],
                  columns=['data1', 'data2'])
df

[56]
pop.index.names = ['state','year' ]
pop
state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64
[63]
# 계층적 인덱스와 열
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                   names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
                                    names=['subject', 'type'])

data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37

health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

[68]
health_data['Guido']
health_data['Guido']['HR']
health_data['Guido','HR']
year  visit
2013  1        50.0
      2        47.0
2014  1        16.0
      2        32.0
Name: (Guido, HR), dtype: float64
[74]
pop
pop['California']
pop['California',2000]
pop
pop['California' : 'New York']
# 2000년도 데이터만 가지고 오시오
pop[: , 2000]
state
California    33871648
New York      18976457
Texas         20851820
dtype: int64
[79]
# 모든 데이터에서 22000000보다 큰 데이터만 출력하시오.
pop > 22000000 # 마스크
pop[pop > 22000000] # 마스킹
# California , Texas데이터를 가지고 오시오.
pop['California'], pop['Texas']
pop[['California','Texas' ]] # 팬시 인덱싱
state       year
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64
[84]
health_data
health_data.iloc[0:2, 0:2]
health_data
health_data.loc[: , ('Guido', 'Temp')]
year  visit
2013  1        36.9
      2        36.4
2014  1        36.7
      2        36.7
Name: (Guido, Temp), dtype: float64
[87]
idx = pd.IndexSlice
health_data.loc[idx[:, 1] , idx[:, 'HR']]

[105]
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names = ['char', 'int']
data
char  int
a     1      0.355351
      2      0.274061
c     1      0.474610
      2      0.116276
b     1      0.262809
      2      0.635166
dtype: float64
[106]
# index 정렬
data = data.sort_index()
data
char  int
a     1      0.355351
      2      0.274061
b     1      0.262809
      2      0.635166
c     1      0.474610
      2      0.116276
dtype: float64
[108]
data['a' : 'b']
char  int
a     1      0.355351
      2      0.274061
b     1      0.262809
      2      0.635166
dtype: float64
[113]
pop.unstack()
pop.unstack(level=0)
pop.unstack(level=1)
pop.unstack().stack()
state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64
[123]
pop
pop_flat = pop.reset_index(name='population')
pop_flat
#print(type(pop_flat))
pop_ser = pop_flat.set_index(['state'   ,'year'])
pop_ser
#print(type(pop_ser))

[131]
health_data
# Compute the per-year mean of every column.
# groupby(level=...) replaces the `level=` keyword of DataFrame.mean(),
# which pandas deprecated in favour of groupby.
data_mean = health_data.groupby(level='year').mean()
data_mean
# Average over the 'type' column level (HR / Temp): group the transposed
# frame by that level, then transpose back so the years stay on the rows.
data_mean.T.groupby(level='type').mean().T
C:\Users\SCOOL\AppData\Local\Temp\ipykernel_720\2173746977.py:3: FutureWarning: Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.median(level=1) should use df.groupby(level=1).median().
  data_mean = health_data.mean(level = 'year')
C:\Users\SCOOL\AppData\Local\Temp\ipykernel_720\2173746977.py:5: FutureWarning: Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.median(level=1) should use df.groupby(level=1).median().
  data_mean.mean(level = 'type', axis=1)


[130]
health_data
# Average over the 'type' column level without the deprecated `level=`
# keyword: group the transposed frame by that level, then transpose back.
health_data.T.groupby(level='type').mean().T
C:\Users\SCOOL\AppData\Local\Temp\ipykernel_720\1123710997.py:2: FutureWarning: Using the level keyword in DataFrame and Series aggregations is deprecated and will be removed in a future version. Use groupby instead. df.median(level=1) should use df.groupby(level=1).median().
  health_data.mean(level = 'type', axis=1)


[134]
# numpy 배열 연결
x = [1, 2, 3]
y = [4, 5, 6]
z = [7, 8, 9]
np.concatenate([x, y, z])
array([1, 2, 3, 4, 5, 6, 7, 8, 9])
[136]
x = [[1, 2],
     [3, 4]]
np.concatenate([x, x])
np.concatenate([x, x], axis = 1)
array([[1, 2, 1, 2],
       [3, 4, 3, 4]])
[141]
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[1, 2, 3])
ser3 = pd.concat([ser1, ser2])
ser3[1]
ser4 = pd.concat([ser1, ser2], ignore_index = True)
ser4
0    A
1    B
2    C
3    D
4    E
5    F
dtype: object
[191]
df1 = pd.read_csv("https://raw.githubusercontent.com/SoongMoo/soldesk2110/main/data/concat_1.csv")
df2 = pd.read_csv("https://raw.githubusercontent.com/SoongMoo/soldesk2110/main/data/concat_2.csv")
df3 = pd.read_csv("https://raw.githubusercontent.com/SoongMoo/soldesk2110/main/data/concat_3.csv")
df1, df2, df3
(    A   B   C   D
 0  a0  b0  c0  d0
 1  a1  b1  c1  d1
 2  a2  b2  c2  d2
 3  a3  b3  c3  d3,
     A   B   C   D
 0  a4  b4  c4  d4
 1  a5  b5  c5  d5
 2  a6  b6  c6  d6
 3  a7  b7  c7  d7,
      A    B    C    D
 0   a8   b8   c8   d8
 1   a9   b9   c9   d9
 2  a10  b10  c10  d10
 3  a11  b11  c11  d11)
[145]
# x축 기준 : 행으로 연결
row_concat = pd.concat([df1, df2, df3], axis = 0)
row_concat

[144]
col_concat =   pd.concat([df1, df2, df3] , axis = 1)
col_concat

[150]
row_concat
row_concat.loc[0]
row_concat.iloc[0]
row_concat.loc[1]

[158]
row_concat
row_concat.loc[1, 'A']
row_concat.iloc[5, 0]
# 암시적인덱스 5행을 가지고 오시오
row_concat.iloc[5,]
row_concat.iloc[5]
row_concat.iloc[5 , : ]
A    a5
B    b5
C    c5
D    d5
Name: 1, dtype: object
[167]
new_row_series = pd.Series(['n1', 'n2', 'n3', 'n4'])
new_row_series
# 데이터프레임과 시리즈를 연결
dataframe_series  = pd.concat([new_row_series, df1])
dataframe_series
dataframe_series  = pd.concat([new_row_series, df1], axis = 1)
dataframe_series

[165]
df4 = pd.DataFrame([['n1','n2','n3','n4']], columns=['A', 'B' ,  'C' ,  'D'])
df4
pd.concat([df4, df1])

[175]
df5 = pd.concat([df1,df2,df3])
df5
df5.loc[0]
df5.iloc[0]
df5 = pd.concat([df1,df2,df3], ignore_index=True)
df5
df5.loc[0]
A    a0
B    b0
C    c0
D    d0
Name: 0, dtype: object
[181]
col_concat  = pd.concat([df1,df2,df3], axis = 1 , ignore_index=True)
col_concat
col_concat[0]
0    a0
1    a1
2    a2
3    a3
Name: 0, dtype: object
[188]
print(df1.columns)
print(df2.columns)
print(df3.columns)
df2.columns = ['E','F','G','H']
df2
df3.columns = ['A','C','F','H']
df3
df1, df2, df3
df6 = pd.concat([df1, df3] , join='inner' , ignore_index=True)
df6
Index(['A', 'B', 'C', 'D'], dtype='object')
Index(['E', 'F', 'G', 'H'], dtype='object')
Index(['A', 'C', 'F', 'H'], dtype='object')


[193]
print(df2.index)
print(df3.index)
df2.index = [4, 5, 6, 7]  # 행번호를 변경
df3.index = [0, 2, 5, 7]
print(df2.index)
print(df3.index)
RangeIndex(start=0, stop=4, step=1)
RangeIndex(start=0, stop=4, step=1)
Int64Index([4, 5, 6, 7], dtype='int64')
Int64Index([0, 2, 5, 7], dtype='int64')

[194]
df2, df3
(    A   B   C   D
 4  a4  b4  c4  d4
 5  a5  b5  c5  d5
 6  a6  b6  c6  d6
 7  a7  b7  c7  d7,
      A    B    C    D
 0   a8   b8   c8   d8
 2   a9   b9   c9   d9
 5  a10  b10  c10  d10
 7  a11  b11  c11  d11)
[198]
print(df1)
print(df3)
### 행번호가 일치하는 것끼리만 연결 : y축기준
col_concat2 = pd.concat([df1,df3], axis = 1, join ='inner') 
print(col_concat2)
    A   B   C   D
0  a0  b0  c0  d0
1  a1  b1  c1  d1
2  a2  b2  c2  d2
3  a3  b3  c3  d3
     A    B    C    D
0   a8   b8   c8   d8
2   a9   b9   c9   d9
5  a10  b10  c10  d10
7  a11  b11  c11  d11
    A   B   C   D   A   B   C   D
0  a0  b0  c0  d0  a8  b8  c8  d8
2  a2  b2  c2  d2  a9  b9  c9  d9

[207]
df1, df3
# Stack df4 under df1 row-wise. pd.concat replaces the deprecated
# DataFrame.append and keeps each frame's original row labels.
row_df1 = pd.concat([df1, df4])
print(row_df1)
    A   B   C   D
0  a0  b0  c0  d0
1  a1  b1  c1  d1
2  a2  b2  c2  d2
3  a3  b3  c3  d3
0  n1  n2  n3  n4

C:\Users\SCOOL\AppData\Local\Temp\ipykernel_720\472189282.py:2: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  row_df1 = df1.append(df4) # x축 기준으로 연결

[221]
df1, df3
#df7 = pd.concat([df1, df3], join_axes=[df1.columns])
#df7
(    A   B   C   D
 0  a0  b0  c0  d0
 1  a1  b1  c1  d1
 2  a2  b2  c2  d2
 3  a3  b3  c3  d3,
      A    C    F    H
 0   a8   b8   c8   d8
 2   a9   b9   c9   d9
 5  a10  b10  c10  d10
 7  a11  b11  c11  d11)
[231]
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
                    'hire_date': [2004, 2008, 2012, 2014]})
df1, df2
df3 = pd.concat([df1, df2], axis=1)
df3
df3 = pd.merge(df1, df2)
df3
(  employee        group
 0      Bob   Accounting
 1     Jake  Engineering
 2     Lisa  Engineering
 3      Sue           HR,
   employee  hire_date
 0     Lisa       2004
 1      Bob       2008
 2     Jake       2012
 3      Sue       2014)
[232]
df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],
                    'supervisor': ['Carly', 'Guido', 'Steve']})
df3, df4
df5 = pd.merge(df3, df4)
df5
(  employee        group  hire_date
 0      Bob   Accounting       2008
 1     Jake  Engineering       2012
 2     Lisa  Engineering       2004
 3      Sue           HR       2014,
          group supervisor
 0   Accounting      Carly
 1  Engineering      Guido
 2           HR      Steve)
[234]
df5 = pd.DataFrame({'group': ['Accounting', 'Accounting',
                              'Engineering', 'Engineering', 'HR', 'HR'],
                    'skills': ['math', 'spreadsheets', 'coding', 'linux',
                               'spreadsheets', 'organization']})
df1, df5
pd.merge(df1, df5)

[241]
df1, df2
pd.merge(df1, df2, on='employee')
(  employee        group
 0      Bob   Accounting
 1     Jake  Engineering
 2     Lisa  Engineering
 3      Sue           HR,
   employee  hire_date
 0     Lisa       2004
 1      Bob       2008
 2     Jake       2012
 3      Sue       2014)
[246]
df3 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'salary': [70000, 80000, 120000, 90000]})
pd.merge(df1, df3,  left_on="employee", right_on="name")
df1.merge(df3,  left_on="employee", right_on="name")
df1.merge(df3,  left_on="employee", right_on="name").drop("name", axis=1)

[252]
df1a = df1.set_index('employee')
df2a = df2.set_index('employee')
df1a, df2a
pd.merge(df1a, df2a, left_index=True, right_index=True)
df1a.merge( df2a, left_index=True, right_index=True)

[256]
df1a, df3
pd.merge(df1a, df3, left_index = True, right_on = 'name')

[262]
df8 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'rank': [1, 2, 3, 4]})
df9 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'rank': [3, 1, 4, 2]})
df8, df9
pd.merge(df8, df9, on="name")
pd.merge(df8, df9, on="name", suffixes = ["_L", "_R"])

[268]
df6 = pd.DataFrame({'name': ['Peter', 'Paul', 'Mary'],
                    'food': ['fish', 'beans', 'bread']},
                   columns=['name', 'food'])
df7 = pd.DataFrame({'name': ['Mary', 'Joseph'],
                    'drink': ['wine', 'beer']},
                   columns=['name', 'drink'])
df6, df7
pd.merge(df6, df7)

pd.merge(df6, df7, how ='inner')
df6, df7
pd.merge(df6, df7, how ='outer')

[273]
df6, df7
pd.merge(df6, df7, how ='left')
df6.merge(df7, how ='left')
df6.merge(df7, how ='right')

[274]
pop = pd.read_csv('https://raw.githubusercontent.com/wikibook/python-ds-handbook/master/notebooks/data/state-population.csv')
areas = pd.read_csv('https://raw.githubusercontent.com/wikibook/python-ds-handbook/master/notebooks/data/state-areas.csv')
abbrevs = pd.read_csv('https://raw.githubusercontent.com/wikibook/python-ds-handbook/master/notebooks/data/state-abbrevs.csv')
[279]
pop.head(), areas.head(), abbrevs.head()
(  state/region     ages  year  population
 0           AL  under18  2012   1117489.0
 1           AL    total  2012   4817528.0
 2           AL  under18  2010   1130966.0
 3           AL    total  2010   4785570.0
 4           AL  under18  2011   1125763.0,
         state  area (sq. mi)
 0     Alabama          52423
 1      Alaska         656425
 2     Arizona         114006
 3    Arkansas          53182
 4  California         163707,
         state abbreviation
 0     Alabama           AL
 1      Alaska           AK
 2     Arizona           AZ
 3    Arkansas           AR
 4  California           CA)
[285]
merged = pd.merge(pop, abbrevs , how ='inner' , 
                 left_on = 'state/region', right_on = 'abbreviation')
merged = merged.drop('abbreviation', axis = 1)
merged

[292]
import seaborn as sns
planets = sns.load_dataset('planets')
planets
planets.shape
planets.info()
planets.head()
# null의 개수 확인
planets.shape[0] - planets.count()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1035 entries, 0 to 1034
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   method          1035 non-null   object 
 1   number          1035 non-null   int64  
 2   orbital_period  992 non-null    float64
 3   mass            513 non-null    float64
 4   distance        808 non-null    float64
 5   year            1035 non-null   int64  
dtypes: float64(3), int64(2), object(1)
memory usage: 48.6+ KB

method              0
number              0
orbital_period     43
mass              522
distance          227
year                0
dtype: int64
[295]
planets.describe()
planets.dropna().describe()

[298]
# GroupBy: Split, Apply, Combine
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data': range(6)}, columns=['key', 'data'])
df 
df.groupby("key").sum()
df.groupby("key").describe()

[303]
planets.groupby('method').median()['orbital_period']
planets.groupby('method')['orbital_period'].median()
method
Astrometry                         631.180000
Eclipse Timing Variations         4343.500000
Imaging                          27500.000000
Microlensing                      3300.000000
Orbital Brightness Modulation        0.342887
Pulsar Timing                       66.541900
Pulsation Timing Variations       1170.000000
Radial Velocity                    360.200000
Transit                              5.714932
Transit Timing Variations           57.011000
Name: orbital_period, dtype: float64
[311]
planets.groupby('method')['year'].describe().unstack()
       method                       
count  Astrometry                          2.0
       Eclipse Timing Variations           9.0
       Imaging                            38.0
       Microlensing                       23.0
       Orbital Brightness Modulation       3.0
                                         ...  
max    Pulsar Timing                    2011.0
       Pulsation Timing Variations      2007.0
       Radial Velocity                  2014.0
       Transit                          2014.0
       Transit Timing Variations        2014.0
Length: 80, dtype: float64
[312]
planets

[313]
df = pd.read_csv(
    "https://raw.githubusercontent.com/SoongMoo/soldesk2110/main/data/gapminder.tsv", 
    sep="\t")
df

[320]
# The frame also holds string columns, which cannot be averaged, so
# whole-frame means are restricted to numeric columns explicitly.
df.groupby('year').mean(numeric_only=True)
# Equivalent ways of getting the per-year mean of lifeExp only.
df.groupby('year')['lifeExp'].mean()
df.groupby('year').lifeExp.mean()
# Bracket and attribute access select the same column.
df['year']
df.year
# Selecting the column after aggregating gives the same series.
df.groupby('year').mean(numeric_only=True)['lifeExp']
df.groupby('year').mean(numeric_only=True).lifeExp
year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64
[324]
df
# Whole-frame mean per (year, continent); restricted to numeric columns
# because the remaining string column cannot be averaged.
df.groupby(['year', 'continent']).mean(numeric_only=True)
# Equivalent spellings for the lifeExp column only.
df.groupby(['year', 'continent'])['lifeExp'].mean()
df.groupby(['year', 'continent']).lifeExp.mean()
df.groupby(['year', 'continent']).lifeExp.mean().head()
year  continent
1952  Africa       39.135500
      Americas     53.279840
      Asia         46.314394
      Europe       64.408500
      Oceania      69.255000
Name: lifeExp, dtype: float64
[326]
### Mean life expectancy per year, continent and GDP per capita
df
df.groupby(['year', 'continent', 'gdpPercap']).lifeExp.mean()
### Mean life expectancy and mean GDP per capita per year and continent
# Several columns must be selected with a list; a bare tuple of keys
# ('lifeExp', 'gdpPercap') is deprecated for groupby selection.
df.groupby(['year', 'continent'])[['lifeExp', 'gdpPercap']].mean()
C:\Users\SCOOL\AppData\Local\Temp\ipykernel_720\1383371160.py:5: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
  df.groupby(['year', 'continent'])['lifeExp','gdpPercap'].mean()


[327]
## 지역에 대한 나라의 개수
df.groupby('continent')['country'].nunique()
continent
Africa      52
Americas    25
Asia        33
Europe      30
Oceania      2
Name: country, dtype: int64
[329]
# Mean life expectancy per year.
df.groupby('year')['lifeExp'].mean()
# `plt` must refer to the pyplot interface; importing the top-level
# matplotlib package under that alias does not provide plt.ylabel() etc.
import matplotlib.pyplot as plt
df.groupby('year')['lifeExp'].mean().plot()
<AxesSubplot:xlabel='year'>

[335]
# Aggregate, filter, transform, apply
# Seeded generator so the random 'data2' column is reproducible.
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)},
                  columns=['key', 'data1', 'data2'])
df
# Single reductions per key.
df.groupby('key').min()
df.groupby('key').max()
df.groupby('key').mean()
# Several reductions at once. Every reduction is named by string so the
# list is uniform (the original mixed 'min' with the builtin max and
# np.median); the resulting column labels are still min / max / median.
df.groupby('key').aggregate(['min', 'max', 'median'])

[336]
# Per-column reductions, named by string rather than passing the Python
# builtins min / max.
df.groupby('key').aggregate({"data1": "min", "data2": "max"})

[340]
# Filtering
def filter_func(x):
    """Return whether the 'data2' column of *x* has a standard deviation above 4."""
    # For one group this is that group's entry of df.groupby('key')['data2'].std()
    spread = x['data2'].std()
    return spread > 4
df, df.groupby('key').std()
df.groupby('key').filter(filter_func)

[341]
### lambda함수
def add(a, b):
    """Return the result of ``a + b``."""
    total = a + b
    return total
result = add(3, 4)
print(result)
7

[342]
add = lambda a, b : a + b
result = add(3, 4)
print(result)
7

[343]
def add(a, b=10):
    """Return ``a + b``; *b* defaults to 10 when omitted."""
    total = a + b
    return total
result = add(3)
print(result)
13

add = lambda a, b = 10 : a + b
result = add(3)
print(result)
[345]
a = (1,2,3,4) 
result = []
for i in a:
    if i % 2 == 0:
        result.append(i * 3)
print(result)

result = [i * 3 for i in a if i % 2 == 0 ] 
[6, 12]

def add(opt, a, b):
    """Return ``a + b`` when *opt* is "add"; for any other *opt* return ``a - b``."""
    if opt != "add":
        return a - b
    return a + b
[346]
add = lambda opt, a, b : a + b if opt == "add" else a - b
result = add("add", 4 , 3)
print(result)
7

def add(opt, a, b):
    """Apply the arithmetic operation selected by *opt* to *a* and *b*.

    "add" gives ``a + b``, "sub" gives ``a - b`` and "mul" gives ``a * b``.
    Any other value of *opt* gives ``a / b``.
    """
    if opt == "add":
        return a + b
    if opt == "sub":
        return a - b
    if opt == "mul":
        return a * b
    # No match above: every remaining opt value means division.
    return a / b
[347]
add = lambda opt, a, b :  a + b if opt == "add" else (
                          a - b if opt == "sub" else (
                          a * b if opt == "mul" else a / b
                          ))
result = add("add", 4 , 3)
print(result)
7

[354]
def f(x):
    """Return *x* with the value of ``x.mean()`` subtracted from it."""
    center = x.mean()
    return x - center
f = lambda  x : x - x.mean()
f(df.groupby('key'))
[355]
df.groupby('key').transform(lambda  x : x - x.mean())

[356]
# f calls x.mean(), which cannot average the string column 'key'; hand f
# only the numeric columns of each group.
df.groupby('key')[['data1', 'data2']].apply(f)
C:\Users\SCOOL\AppData\Local\Temp\ipykernel_720\3304050746.py:2: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
  return x - x.mean()


[358]
# 피벗 테이블
titanic = sns.load_dataset('titanic')
titanic.head()

[360]
# 성별에 따른 생존률
titanic.groupby('sex').survived.mean()
titanic.groupby('sex')[['survived']].mean()

[364]
# 성별별 좌석등급에  따른 생존률 평균
titanic.groupby(['sex','class' ]).survived.mean()
titanic.groupby(['sex','class' ])[['survived']].mean()
titanic.groupby(['sex','class' ]).survived.mean().unstack()

[365]
titanic.pivot_table('survived',index='sex', columns ='class' )

[366]
age = pd.cut(titanic['age'], [0, 18, 80])
titanic.pivot_table('survived', ['sex', age], 'class')

[369]
fare  = pd.cut(titanic['fare'], 2)
titanic.pivot_table('survived', ['sex', age], [fare , 'class'])

[373]
# Different aggregation per value column: total survivors and mean fare.
# Both are named by string, consistent with each other, instead of mixing
# the builtin sum with the string 'mean'.
titanic.pivot_table(index='sex', columns='class',
                    aggfunc={'survived': 'sum', 'fare': 'mean'})

[374]
titanic.pivot_table('survived', index='sex', columns='class', margins=True)

[382]
births = pd.read_csv("https://raw.githubusercontent.com/wikibook/python-ds-handbook/master/notebooks/data/births.csv")
births.head()
births['decade'] = (births['year'] // 10 ) *10
births
births.pivot_table('births', index='decade', columns = 'gender' , aggfunc = 'sum')

[386]
import matplotlib.pyplot as plt
births.pivot_table('births', index='year', columns = 'gender' , aggfunc = 'sum')
births.pivot_table('births', index='year', columns = 'gender' , aggfunc = 'sum').plot()
plt.ylabel('total births per year');

[389]
# 25th, 50th and 75th percentiles of the daily birth counts.
quartiles = np.percentile(births['births'], [25, 50,75])
# The median (50th percentile) is used as the centre estimate.
mu = quartiles[1]
mu
# Spread estimate: 0.74 times the interquartile range.
# NOTE(review): presumably 0.74 ~= 1/1.349, the factor that converts an IQR
# into the standard deviation of a Gaussian - confirm.
sig = 0.74 * (quartiles[2] - quartiles[0])
[391]
births = births.query('(births > @mu - 5 * @sig) & (births < @mu + 5 * @sig)')
births
births.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 14610 entries, 0 to 15066
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   year    14610 non-null  int64  
 1   month   14610 non-null  int64  
 2   day     14610 non-null  float64
 3   gender  14610 non-null  object 
 4   births  14610 non-null  int64  
 5   decade  14610 non-null  int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 799.0+ KB

[393]
# Build a datetime index from the year, month and day columns, encoded as
# a single integer such as 19991212.
# 'day' is stored as float64, so it is cast to int first: the sum is then
# an exact YYYYMMDD integer rather than a float like 19991212.0.
births.index = pd.to_datetime(births.year * 10000 +
                              births.month * 100 +
                              births.day.astype(int), format='%Y%m%d')
births['dayofweek'] = births.index.dayofweek
births

[398]
ax = births.pivot_table('births', index='dayofweek',
                        columns='decade', aggfunc='mean').plot()
# Fix one tick per weekday before labelling (dayofweek runs 0=Monday to
# 6=Sunday). Setting labels without fixed tick positions depends on
# wherever the automatic ticks land and triggers a FixedFormatter warning.
ax.set_xticks(range(7))
ax.set_xticklabels(['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'])
plt.ylabel('mean births by day');
C:\Users\SCOOL\AppData\Local\Temp\ipykernel_720\598896414.py:3: UserWarning: FixedFormatter should only be used together with FixedLocator
  plt.gca().set_xticklabels(['','Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'])
저작자표시
'파이썬[python]' 카테고리의 다른 글

파이썬 오프라인 설치 (0)	2022.07.21
파이썬 강좌 5 (0)	2022.05.13
파이썬 강좌 3 (0)	2022.05.12
파이썬 오프라인 (0)	2022.05.11
파이썬 강좌 2 (0)	2022.05.11
공유하기 링크
페이스북
카카오스토리
트위터
공지사항
최근에 올라온 글
최근에 달린 댓글
Total
Today
Yesterday
링크
TAG more
« 2024/05 »
일	월	화	수	목	금	토
			1	2	3	4
5	6	7	8	9	10	11
12	13	14	15	16	17	18
19	20	21	22	23	24	25
26	27	28	29	30	31
글 보관함
IT의 신

티스토리 뷰

파이썬 강좌 4

'파이썬[python]' 카테고리의 다른 글

티스토리툴바