7. 使用Seaborn网格做多变量分析

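  1. # 读取employee数据集,用入职日期(HIRE_DATE)到2016-12-01的时间差除以一年,创建工龄(YEARS_EXPERIENCE)列;注意:pandas 2.0起Timedelta不再支持unit='Y',可改用pd.Timedelta(365.2425, unit='D')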
  2. In[85]: employee = pd.read_csv('data/employee.csv',
  3. parse_dates=['HIRE_DATE', 'JOB_DATE'])
  4. days_hired = (pd.to_datetime('12-1-2016') - employee['HIRE_DATE'])
  5. one_year = pd.Timedelta(1, unit='Y')
  6. employee['YEARS_EXPERIENCE'] = days_hired / one_year
  7. employee[['HIRE_DATE', 'YEARS_EXPERIENCE']].head()
  8. Out[85]:

7. 使用Seaborn网格做多变量分析 - 图1

  1. # 画一个基本的带有回归线的散点图
  2. In[86]: import seaborn as sns
  3. In[87]: ax = sns.regplot(x='YEARS_EXPERIENCE', y='BASE_SALARY',
  4. data=employee)
  5. ax.figure.set_size_inches(14,4)
  6. Out[87]:

7. 使用Seaborn网格做多变量分析 - 图2

  1. # 用regplot的上层函数lmplot,画出不同性别的回归线
  2. In[88]: grid = sns.lmplot(x='YEARS_EXPERIENCE', y='BASE_SALARY',
  3. hue='GENDER', palette='Greys',
  4. scatter_kws={'s':10}, data=employee)
  5. grid.fig.set_size_inches(14, 4)
  6. type(grid)
  7. Out[88]: seaborn.axisgrid.FacetGrid

7. 使用Seaborn网格做多变量分析 - 图3

  1. # 为每个种族创建子图,同时保留回归线
  2. In[89]: grid = sns.lmplot(x='YEARS_EXPERIENCE', y='BASE_SALARY',
  3. hue='GENDER', col='RACE', col_wrap=3,
  4. palette='Greys', sharex=False,
  5. line_kws = {'linewidth':5},
  6. data=employee)
  7. grid.set(ylim=(20000, 120000))
  8. Out[89]: <seaborn.axisgrid.FacetGrid at 0x11e7ce470>

7. 使用Seaborn网格做多变量分析 - 图4

  1. # 只保留人数最多的两个部门和三个种族,即将部门的层级减小到二,将种族的层级减小到三
  2. In[90]: deps = employee['DEPARTMENT'].value_counts().index[:2]
  3. races = employee['RACE'].value_counts().index[:3]
  4. is_dep = employee['DEPARTMENT'].isin(deps)
  5. is_race = employee['RACE'].isin(races)
  6. emp2 = employee[is_dep & is_race].copy()
  7. emp2['DEPARTMENT'] = emp2.DEPARTMENT.str.extract('(HPD|HFD)', expand=True)
  8. emp2.shape
  9. Out[90]: (968, 11)
  10. In[91]: emp2['DEPARTMENT'].value_counts()
  11. Out[91]: HPD 591
  12. HFD 377
  13. Name: DEPARTMENT, dtype: int64
  14. In[92]: emp2['RACE'].value_counts()
  15. Out[92]: White 478
  16. Hispanic/Latino 250
  17. Black or African American 240
  18. Name: RACE, dtype: int64
  1. # 用Axes层函数,比如violinplot,按性别画出工龄的分布
  2. In[93]: ax = sns.violinplot(x = 'YEARS_EXPERIENCE', y='GENDER', data=emp2)
  3. ax.figure.set_size_inches(10,4)
  4. Out[93]:

7. 使用Seaborn网格做多变量分析 - 图5

  1. # 用factorplot函数,为每个部门和种族的组合画图(注意:seaborn 0.9起factorplot更名为catplot,参数size更名为height)
  2. In[94]: sns.factorplot(x ='YEARS_EXPERIENCE', y='GENDER',
  3. col='RACE', row='DEPARTMENT',
  4. size=3, aspect=2,
  5. data=emp2, kind='violin')
  6. Out[94]: <seaborn.axisgrid.FacetGrid at 0x11e40ec50>

7. 使用Seaborn网格做多变量分析 - 图6