7. 筛选少数民族学生占多数的州

  1. In[51]: college = pd.read_csv('data/college.csv', index_col='INSTNM')
  2. grouped = college.groupby('STABBR')
  3. grouped.ngroups
  4. Out[51]: 59
  1. # 这等于求出不同州的个数,nunique()可以得到同样的结果
  2. In[52]: college['STABBR'].nunique()
  3. Out[52]: 59
  1. # 自定义一个计算少数民族学生总比例的函数,如果该比例大于阈值,则返回True,否则返回False
  2. In[53]: def check_minority(df, threshold):
  3. minority_pct = 1 - df['UGDS_WHITE']
  4. total_minority = (df['UGDS'] * minority_pct).sum()
  5. total_ugds = df['UGDS'].sum()
  6. total_minority_pct = total_minority / total_ugds
  7. return total_minority_pct > threshold
  8. # grouped变量有一个filter方法,可以接收一个自定义函数,决定是否保留一个分组
  9. In[54]: college_filtered = grouped.filter(check_minority, threshold=.5)
  10. college_filtered.head()
  11. Out[54]:

7. 过滤状态 - 图1

  1. # 通过查看形状,可以看到约60%的行被过滤掉了,只有20个州的少数民族学生占多数
  2. In[55]: college.shape
  3. Out[55]: (7535, 26)
  4. In[56]: college_filtered.shape
  5. Out[56]: (3028, 26)
  6. In[57]: college_filtered['STABBR'].nunique()
  7. Out[57]: 20

更多

  1. # 用一些不同的阈值,检查形状和不同州的个数
  2. In[58]: college_filtered_20 = grouped.filter(check_minority, threshold=.2)
  3. college_filtered_20.shape
  4. Out[58]: (7461, 26)
  5. In[59]: college_filtered_20['STABBR'].nunique()
  6. Out[59]: 57
  7. In[60]: college_filtered_70 = grouped.filter(check_minority, threshold=.7)
  8. college_filtered_70.shape
  9. Out[60]: (957, 26)
  10. In[61]: college_filtered_70['STABBR'].nunique()
  11. Out[61]: 10
  12. In[62]: college_filtered_95 = grouped.filter(check_minority, threshold=.95)
  13. college_filtered_95.shape
  14. Out[62]: (156, 26)