第07章分组聚合、过滤、转换 - 9. 用apply计算每州的加权平均SAT分数 - 《Pandas Cookbook 带注释源码》

9. 用apply计算每州的加权平均SAT分数

#  读取college，'UGDS', 'SATMTMID', 'SATVRMID'三列如果有缺失值则删除行
 In[74]: college = pd.read_csv('data/college.csv')
         subset = ['UGDS', 'SATMTMID', 'SATVRMID']
         college2 = college.dropna(subset=subset)
         college.shape
Out[74]: (7535, 27)
 In[75]: college2.shape
Out[75]: (1184, 27)

#  自定义一个求SAT数学成绩的加权平均值的函数
 In[76]: def weighted_math_average(df):  # df: one group's rows; must contain the 'UGDS' and 'SATMTMID' columns
             weighted_math = df['UGDS'] * df['SATMTMID']  # each row's math score multiplied by its UGDS weight
             return int(weighted_math.sum() / df['UGDS'].sum())  # weighted mean; int() drops the fractional part
#  按州分组，并调用apply方法，传入自定义函数
 In[77]: college2.groupby('STABBR').apply(weighted_math_average).head()
Out[77]: STABBR
         AK    503
         AL    536
         AR    529
         AZ    569
         CA    564
         dtype: int64

#  改用agg也能得到相同的数值，但返回的是DataFrame而不是Series（函数会作用到每个非分组列上）；较新版本的pandas中这一步可能直接抛出KeyError: 'UGDS'
 In[78]: college2.groupby('STABBR').agg(weighted_math_average).head()
Out[78]:

#  如果将列限制到SATMTMID，会报错。这是因为不能访问UGDS。
 In[79]: college2.groupby('STABBR')['SATMTMID'].agg(weighted_math_average)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5126)()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:14010)()
TypeError: an integer is required
During handling of the above exception, another exception occurred:
KeyError                                  Traceback (most recent call last)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in agg_series(self, obj, func)
  2177         try:
-> 2178             return self._aggregate_series_fast(obj, func)
  2179         except Exception:
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_series_fast(self, obj, func)
  2197                                     dummy)
-> 2198         result, counts = grouper.get_result()
  2199         return result, counts
pandas/_libs/src/reduce.pyx in pandas._libs.lib.SeriesGrouper.get_result (pandas/_libs/lib.c:39105)()
pandas/_libs/src/reduce.pyx in pandas._libs.lib.SeriesGrouper.get_result (pandas/_libs/lib.c:38888)()
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in <lambda>(x)
   841         func = self._is_builtin_func(func)
--> 842         f = lambda x: func(x, *args, **kwargs)
   843 
<ipython-input-76-01eb90aa258d> in weighted_math_average(df)
     1 def weighted_math_average(df):
----> 2     weighted_math = df['UGDS'] * df['SATMTMID']
     3     return int(weighted_math.sum() / df['UGDS'].sum())
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
   600         try:
--> 601             result = self.index.get_value(self, key)
   602 
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
  2476             return self._engine.get_value(s, k,
-> 2477                                           tz=getattr(series.dtype, 'tz', None))
  2478         except KeyError as e1:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4404)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4087)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5210)()
KeyError: 'UGDS'
During handling of the above exception, another exception occurred:
TypeError                                 Traceback (most recent call last)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5126)()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:14010)()
TypeError: an integer is required
During handling of the above exception, another exception occurred:
KeyError                                  Traceback (most recent call last)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
  2882             try:
-> 2883                 return self._python_agg_general(func_or_funcs, *args, **kwargs)
  2884             except Exception:
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _python_agg_general(self, func, *args, **kwargs)
   847             try:
--> 848                 result, counts = self.grouper.agg_series(obj, f)
   849                 output[name] = self._try_cast(result, obj, numeric_only=True)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in agg_series(self, obj, func)
  2179         except Exception:
-> 2180             return self._aggregate_series_pure_python(obj, func)
  2181 
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_series_pure_python(self, obj, func)
  2210         for label, group in splitter:
-> 2211             res = func(group)
  2212             if result is None:
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in <lambda>(x)
   841         func = self._is_builtin_func(func)
--> 842         f = lambda x: func(x, *args, **kwargs)
   843 
<ipython-input-76-01eb90aa258d> in weighted_math_average(df)
     1 def weighted_math_average(df):
----> 2     weighted_math = df['UGDS'] * df['SATMTMID']
     3     return int(weighted_math.sum() / df['UGDS'].sum())
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
   600         try:
--> 601             result = self.index.get_value(self, key)
   602 
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
  2476             return self._engine.get_value(s, k,
-> 2477                                           tz=getattr(series.dtype, 'tz', None))
  2478         except KeyError as e1:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4404)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4087)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5210)()
KeyError: 'UGDS'
During handling of the above exception, another exception occurred:
TypeError                                 Traceback (most recent call last)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5126)()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:14010)()
TypeError: an integer is required
During handling of the above exception, another exception occurred:
KeyError                                  Traceback (most recent call last)
<ipython-input-79-1351e4f306c7> in <module>()
----> 1 college2.groupby('STABBR')['SATMTMID'].agg(weighted_math_average)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
  2883                 return self._python_agg_general(func_or_funcs, *args, **kwargs)
  2884             except Exception:
-> 2885                 result = self._aggregate_named(func_or_funcs, *args, **kwargs)
  2886 
  2887             index = Index(sorted(result), name=self.grouper.names[0])
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_named(self, func, *args, **kwargs)
  3013         for name, group in self:
  3014             group.name = name
-> 3015             output = func(group, *args, **kwargs)
  3016             if isinstance(output, (Series, Index, np.ndarray)):
  3017                 raise Exception('Must produce aggregated value')
<ipython-input-76-01eb90aa258d> in weighted_math_average(df)
     1 def weighted_math_average(df):
----> 2     weighted_math = df['UGDS'] * df['SATMTMID']
     3     return int(weighted_math.sum() / df['UGDS'].sum())
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
   599         key = com._apply_if_callable(key, self)
   600         try:
--> 601             result = self.index.get_value(self, key)
   602 
   603             if not is_scalar(result):
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
  2475         try:
  2476             return self._engine.get_value(s, k,
-> 2477                                           tz=getattr(series.dtype, 'tz', None))
  2478         except KeyError as e1:
  2479             if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4404)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4087)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5210)()
KeyError: 'UGDS'

#  apply的一个不错的功能是通过返回Series，创建多个新的列
 In[80]: from collections import OrderedDict
         def weighted_average(df):
             """Summarise one group's SAT scores as a Series of whole numbers.

             ``df`` must provide the 'UGDS', 'SATMTMID' and 'SATVRMID' columns.
             The returned Series holds, in this order: the UGDS-weighted math
             and verbal averages, the plain means of both score columns, and
             the number of rows in the group. All values are truncated to int.
             """
             # The weight total is shared by both weighted averages.
             total_weight = df['UGDS'].sum()
             weight_m = df['UGDS'] * df['SATMTMID']
             weight_v = df['UGDS'] * df['SATVRMID']

             data = OrderedDict()
             data['weighted_math_avg'] = weight_m.sum() / total_weight
             data['weighted_verbal_avg'] = weight_v.sum() / total_weight
             data['math_avg'] = df['SATMTMID'].mean()
             data['verbal_avg'] = df['SATVRMID'].mean()
             data['count'] = len(df)
             # Build the Series first and cast afterwards: recent pandas
             # versions refuse dtype='int' in the constructor when a value has
             # a fractional part, whereas astype(int) simply truncates.
             return pd.Series(data).astype(int)
         college2.groupby('STABBR').apply(weighted_average).head(10)
Out[80]:

#  与In[80]等价的另一种写法：先把两个加权平均值存入中间变量wm_avg和wv_avg，返回的列与上面完全相同
 In[81]: from collections import OrderedDict
         def weighted_average(df):
             """Return weighted and plain SAT averages plus the row count for one group.

             ``df`` must provide the 'UGDS', 'SATMTMID' and 'SATVRMID' columns.
             The result is an integer Series indexed by 'weighted_math_avg',
             'weighted_verbal_avg', 'math_avg', 'verbal_avg' and 'count';
             fractional parts are truncated.
             """
             data = OrderedDict()
             # Compute the common denominator once for both weighted averages.
             ugds_total = df['UGDS'].sum()
             weight_m = df['UGDS'] * df['SATMTMID']
             weight_v = df['UGDS'] * df['SATVRMID']
             wm_avg = weight_m.sum() / ugds_total
             wv_avg = weight_v.sum() / ugds_total
             data['weighted_math_avg'] = wm_avg
             data['weighted_verbal_avg'] = wv_avg
             data['math_avg'] = df['SATMTMID'].mean()
             data['verbal_avg'] = df['SATVRMID'].mean()
             data['count'] = len(df)
             # Cast after construction: recent pandas versions reject
             # dtype='int' in the Series constructor for non-integral floats,
             # while astype(int) performs the intended truncation.
             return pd.Series(data).astype(int)
         college2.groupby('STABBR').apply(weighted_average).head(10)
Out[81]:

#  自定义一个返回DataFrame的函数，使用NumPy的函数average计算加权平均值，使用SciPy的gmean和hmean计算几何和调和平均值
 In[82]: from scipy.stats import gmean, hmean
         def calculate_means(df):
             """Tabulate four kinds of mean for the two SAT score columns.

             Rows of the result are 'Arithmetic', 'Weighted', 'Geometric' and
             'Harmonic'; columns are 'SATMTMID', 'SATVRMID' and 'count' (the
             number of rows in ``df``, repeated on every row). The weighted
             mean uses ``df['UGDS']`` as weights. Every value is cast to int.
             """
             score_cols = ['SATMTMID', 'SATVRMID']
             # One (row label, reducer) pair per kind of mean, in output order.
             reducers = [
                 ('Arithmetic', lambda scores: scores.mean()),
                 ('Weighted', lambda scores: np.average(scores, weights=df['UGDS'])),
                 ('Geometric', gmean),
                 ('Harmonic', hmean),
             ]
             table = pd.DataFrame(
                 [[reduce_fn(df[name]) for name in score_cols] for _, reduce_fn in reducers],
                 index=[label for label, _ in reducers],
                 columns=score_cols,
             )
             table['count'] = len(df)
             return table.astype(int)
         college2.groupby('STABBR').filter(lambda x: len(x) != 1).groupby('STABBR').apply(calculate_means).head(10)
Out[82]:

9. 用apply计算每州的加权平均SAT分数

9. 用apply计算每州的加权平均SAT分数

更多