1. 定义聚合

  1. # 读取flights数据集,并用head()查看前几行
  2. In[2]: flights = pd.read_csv('data/flights.csv')
  3. flights.head()
  4. Out[2]:

1. 定义聚合 - 图1

  1. # 按照AIRLINE分组,使用agg方法,传入要聚合的列和聚合函数
  2. In[3]: flights.groupby('AIRLINE').agg({'ARR_DELAY':'mean'}).head()
  3. Out[3]:
  1. # 或者先用索引运算符选取要聚合的列,再把聚合函数以字符串形式传入agg
  2. In[4]: flights.groupby('AIRLINE')['ARR_DELAY'].agg('mean').head()
  3. Out[4]:
  4. AIRLINE
  5. AA 5.542661
  6. AS -0.833333
  7. B6 8.692593
  8. DL 0.339691
  9. EV 7.034580
  10. Name: ARR_DELAY, dtype: float64

1. 定义聚合 - 图2

  1. # 也可以向agg中传入NumPy的mean函数
  2. In[5]: flights.groupby('AIRLINE')['ARR_DELAY'].agg(np.mean).head()
  3. Out[5]:

1. 定义聚合 - 图3

  1. # 也可以直接使用mean()函数
  2. In[6]: flights.groupby('AIRLINE')['ARR_DELAY'].mean().head()
  3. Out[6]:

1. 定义聚合 - 图4

原理

  1. # groupby方法产生的是一个DataFrameGroupBy对象
  2. In[7]: grouped = flights.groupby('AIRLINE')
  3. type(grouped)
  4. Out[7]: pandas.core.groupby.DataFrameGroupBy

更多

  1. # 如果传给agg的不是聚合函数(即不能把每组数据归约为单个值的函数,如np.sqrt),则会引发异常
  2. In[8]: flights.groupby('AIRLINE')['ARR_DELAY'].agg(np.sqrt)
  3. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py:842: RuntimeWarning: invalid value encountered in sqrt
  4. f = lambda x: func(x, *args, **kwargs)
  5. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py:3015: RuntimeWarning: invalid value encountered in sqrt
  6. output = func(group, *args, **kwargs)
  7. ---------------------------------------------------------------------------
  8. ValueError Traceback (most recent call last)
  9. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in agg_series(self, obj, func)
  10. 2177 try:
  11. -> 2178 return self._aggregate_series_fast(obj, func)
  12. 2179 except Exception:
  13. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_series_fast(self, obj, func)
  14. 2197 dummy)
  15. -> 2198 result, counts = grouper.get_result()
  16. 2199 return result, counts
  17. pandas/_libs/src/reduce.pyx in pandas._libs.lib.SeriesGrouper.get_result (pandas/_libs/lib.c:39105)()
  18. pandas/_libs/src/reduce.pyx in pandas._libs.lib.SeriesGrouper.get_result (pandas/_libs/lib.c:38973)()
  19. pandas/_libs/src/reduce.pyx in pandas._libs.lib._get_result_array (pandas/_libs/lib.c:32039)()
  20. ValueError: function does not reduce
  21. During handling of the above exception, another exception occurred:
  22. ValueError Traceback (most recent call last)
  23. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
  24. 2882 try:
  25. -> 2883 return self._python_agg_general(func_or_funcs, *args, **kwargs)
  26. 2884 except Exception:
  27. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _python_agg_general(self, func, *args, **kwargs)
  28. 847 try:
  29. --> 848 result, counts = self.grouper.agg_series(obj, f)
  30. 849 output[name] = self._try_cast(result, obj, numeric_only=True)
  31. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in agg_series(self, obj, func)
  32. 2179 except Exception:
  33. -> 2180 return self._aggregate_series_pure_python(obj, func)
  34. 2181
  35. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_series_pure_python(self, obj, func)
  36. 2214 isinstance(res, list)):
  37. -> 2215 raise ValueError('Function does not reduce')
  38. 2216 result = np.empty(ngroups, dtype='O')
  39. ValueError: Function does not reduce
  40. During handling of the above exception, another exception occurred:
  41. Exception Traceback (most recent call last)
  42. <ipython-input-8-2bcc9ccfec77> in <module>()
  43. ----> 1 flights.groupby('AIRLINE')['ARR_DELAY'].agg(np.sqrt)
  44. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
  45. 2883 return self._python_agg_general(func_or_funcs, *args, **kwargs)
  46. 2884 except Exception:
  47. -> 2885 result = self._aggregate_named(func_or_funcs, *args, **kwargs)
  48. 2886
  49. 2887 index = Index(sorted(result), name=self.grouper.names[0])
  50. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_named(self, func, *args, **kwargs)
  51. 3015 output = func(group, *args, **kwargs)
  52. 3016 if isinstance(output, (Series, Index, np.ndarray)):
  53. -> 3017 raise Exception('Must produce aggregated value')
  54. 3018 result[name] = self._try_cast(output, group)
  55. 3019
  56. Exception: Must produce aggregated value