7. 过滤出少数民族学生占多数的州
In[51]: college = pd.read_csv('data/college.csv', index_col='INSTNM')
grouped = college.groupby('STABBR')
grouped.ngroups
Out[51]: 59
# 这等于求出不同州的个数,nunique()可以得到同样的结果
In[52]: college['STABBR'].nunique()
Out[52]: 59
# 自定义一个计算少数民族学生总比例的函数,如果比例大于阈值,则返回True,否则返回False
# In[53]:
def check_minority(df, threshold):
    """Tell whether the enrollment-weighted non-white share of ``df`` exceeds ``threshold``.

    Intended as the callable passed to ``GroupBy.filter``: it receives one
    group's rows and returns a single truth value deciding whether the
    whole group is kept.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns ``'UGDS'`` and ``'UGDS_WHITE'``.
        Presumably ``UGDS`` is an undergraduate head count and
        ``UGDS_WHITE`` a fraction in [0, 1] -- TODO confirm against the
        dataset's data dictionary.
    threshold : float
        Share that the weighted non-white proportion must strictly exceed.

    Returns
    -------
    bool-like
        ``True`` when ``sum(UGDS * (1 - UGDS_WHITE)) / sum(UGDS)`` is
        strictly greater than ``threshold``.
    """
    # Per-row share that is not counted in UGDS_WHITE.
    minority_pct = 1 - df['UGDS_WHITE']
    # Weight each row's share by its enrollment so large rows count more.
    # NOTE: Series.sum() skips NaN by default, so a row with a missing
    # UGDS_WHITE adds nothing here while its UGDS still counts below.
    total_minority = (df['UGDS'] * minority_pct).sum()
    total_ugds = df['UGDS'].sum()
    total_minority_pct = total_minority / total_ugds
    return total_minority_pct > threshold
# grouped变量有一个filter方法,可以接收一个自定义函数,决定是否保留一个分组
In[54]: college_filtered = grouped.filter(check_minority, threshold=.5)
college_filtered.head()
Out[54]:
# 通过查看形状,可以看到约60%的行被过滤掉了,只有20个州的少数民族学生占多数
In[55]: college.shape
Out[55]: (7535, 26)
In[56]: college_filtered.shape
Out[56]: (3028, 26)
In[57]: college_filtered['STABBR'].nunique()
Out[57]: 20
更多
# 用一些不同的阈值,检查形状和不同州的个数
In[58]: college_filtered_20 = grouped.filter(check_minority, threshold=.2)
college_filtered_20.shape
Out[58]: (7461, 26)
In[59]: college_filtered_20['STABBR'].nunique()
Out[59]: 57
In[60]: college_filtered_70 = grouped.filter(check_minority, threshold=.7)
college_filtered_70.shape
Out[60]: (957, 26)
In[61]: college_filtered_70['STABBR'].nunique()
Out[61]: 10
In[62]: college_filtered_95 = grouped.filter(check_minority, threshold=.95)
college_filtered_95.shape
Out[62]: (156, 26)