9. 用apply计算每州的加权平均SAT分数
# 读取college数据集;'UGDS'、'SATMTMID'、'SATVRMID'三列中只要有一列存在缺失值,就删除该行
In[74]: college = pd.read_csv('data/college.csv')
subset = ['UGDS', 'SATMTMID', 'SATVRMID']
college2 = college.dropna(subset=subset)
college.shape
Out[74]: (7535, 27)
In[75]: college2.shape
Out[75]: (1184, 27)
# 自定义一个求SAT数学成绩的加权平均值的函数
# In[76]:
def weighted_math_average(df):
    """Return the UGDS-weighted mean of the ``SATMTMID`` column as an int.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'UGDS'`` (used as the weights) and
        ``'SATMTMID'`` (the values being averaged).

    Returns
    -------
    int
        ``sum(UGDS * SATMTMID) / sum(UGDS)``, truncated by ``int``.
    """
    weighted_math = df['UGDS'] * df['SATMTMID']
    return int(weighted_math.sum() / df['UGDS'].sum())
# 按州分组,并调用apply方法,传入自定义函数
In[77]: college2.groupby('STABBR').apply(weighted_math_average).head()
Out[77]: STABBR
AK 503
AL 536
AR 529
AZ 569
CA 564
dtype: int64
# 效果同上
In[78]: college2.groupby('STABBR').agg(weighted_math_average).head()
Out[78]:
# 如果将列限制到SATMTMID,会报错。这是因为不能访问UGDS。
In[79]: college2.groupby('STABBR')['SATMTMID'].agg(weighted_math_average)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5126)()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:14010)()
TypeError: an integer is required
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in agg_series(self, obj, func)
2177 try:
-> 2178 return self._aggregate_series_fast(obj, func)
2179 except Exception:
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_series_fast(self, obj, func)
2197 dummy)
-> 2198 result, counts = grouper.get_result()
2199 return result, counts
pandas/_libs/src/reduce.pyx in pandas._libs.lib.SeriesGrouper.get_result (pandas/_libs/lib.c:39105)()
pandas/_libs/src/reduce.pyx in pandas._libs.lib.SeriesGrouper.get_result (pandas/_libs/lib.c:38888)()
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in <lambda>(x)
841 func = self._is_builtin_func(func)
--> 842 f = lambda x: func(x, *args, **kwargs)
843
<ipython-input-76-01eb90aa258d> in weighted_math_average(df)
1 def weighted_math_average(df):
----> 2 weighted_math = df['UGDS'] * df['SATMTMID']
3 return int(weighted_math.sum() / df['UGDS'].sum())
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
600 try:
--> 601 result = self.index.get_value(self, key)
602
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
2476 return self._engine.get_value(s, k,
-> 2477 tz=getattr(series.dtype, 'tz', None))
2478 except KeyError as e1:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4404)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4087)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5210)()
KeyError: 'UGDS'
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5126)()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:14010)()
TypeError: an integer is required
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
2882 try:
-> 2883 return self._python_agg_general(func_or_funcs, *args, **kwargs)
2884 except Exception:
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _python_agg_general(self, func, *args, **kwargs)
847 try:
--> 848 result, counts = self.grouper.agg_series(obj, f)
849 output[name] = self._try_cast(result, obj, numeric_only=True)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in agg_series(self, obj, func)
2179 except Exception:
-> 2180 return self._aggregate_series_pure_python(obj, func)
2181
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_series_pure_python(self, obj, func)
2210 for label, group in splitter:
-> 2211 res = func(group)
2212 if result is None:
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in <lambda>(x)
841 func = self._is_builtin_func(func)
--> 842 f = lambda x: func(x, *args, **kwargs)
843
<ipython-input-76-01eb90aa258d> in weighted_math_average(df)
1 def weighted_math_average(df):
----> 2 weighted_math = df['UGDS'] * df['SATMTMID']
3 return int(weighted_math.sum() / df['UGDS'].sum())
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
600 try:
--> 601 result = self.index.get_value(self, key)
602
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
2476 return self._engine.get_value(s, k,
-> 2477 tz=getattr(series.dtype, 'tz', None))
2478 except KeyError as e1:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4404)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4087)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5210)()
KeyError: 'UGDS'
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5126)()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:14010)()
TypeError: an integer is required
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-79-1351e4f306c7> in <module>()
----> 1 college2.groupby('STABBR')['SATMTMID'].agg(weighted_math_average)
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
2883 return self._python_agg_general(func_or_funcs, *args, **kwargs)
2884 except Exception:
-> 2885 result = self._aggregate_named(func_or_funcs, *args, **kwargs)
2886
2887 index = Index(sorted(result), name=self.grouper.names[0])
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/groupby.py in _aggregate_named(self, func, *args, **kwargs)
3013 for name, group in self:
3014 group.name = name
-> 3015 output = func(group, *args, **kwargs)
3016 if isinstance(output, (Series, Index, np.ndarray)):
3017 raise Exception('Must produce aggregated value')
<ipython-input-76-01eb90aa258d> in weighted_math_average(df)
1 def weighted_math_average(df):
----> 2 weighted_math = df['UGDS'] * df['SATMTMID']
3 return int(weighted_math.sum() / df['UGDS'].sum())
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
599 key = com._apply_if_callable(key, self)
600 try:
--> 601 result = self.index.get_value(self, key)
602
603 if not is_scalar(result):
/Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
2475 try:
2476 return self._engine.get_value(s, k,
-> 2477 tz=getattr(series.dtype, 'tz', None))
2478 except KeyError as e1:
2479 if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4404)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4087)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5210)()
KeyError: 'UGDS'
# apply的一个不错的功能是通过返回Series,创建多个新的列
In[80]: from collections import OrderedDict
def weighted_average(df):
    """Summarise one group as a Series of integer SAT statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'UGDS'`` (weights), ``'SATMTMID'`` and
        ``'SATVRMID'``.

    Returns
    -------
    pandas.Series
        Integer Series with the entries, in order: ``weighted_math_avg``,
        ``weighted_verbal_avg``, ``math_avg``, ``verbal_avg`` and ``count``
        (the number of rows in ``df``). Fractional parts are truncated.
    """
    data = OrderedDict()
    weight_m = df['UGDS'] * df['SATMTMID']
    weight_v = df['UGDS'] * df['SATVRMID']
    data['weighted_math_avg'] = weight_m.sum() / df['UGDS'].sum()
    data['weighted_verbal_avg'] = weight_v.sum() / df['UGDS'].sum()
    data['math_avg'] = df['SATMTMID'].mean()
    data['verbal_avg'] = df['SATVRMID'].mean()
    data['count'] = len(df)
    # Build the Series first and truncate afterwards: passing dtype='int'
    # straight to the constructor raises on non-integer floats in current
    # pandas, whereas astype(int) truncates them.
    return pd.Series(data).astype(int)
college2.groupby('STABBR').apply(weighted_average).head(10)
Out[80]:
# 多创建两个新的列
In[81]: from collections import OrderedDict
def weighted_average(df):
    """Summarise one group as a Series of integer SAT statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'UGDS'`` (weights), ``'SATMTMID'`` and
        ``'SATVRMID'``.

    Returns
    -------
    pandas.Series
        Integer Series with the entries, in order: ``weighted_math_avg``,
        ``weighted_verbal_avg``, ``math_avg``, ``verbal_avg`` and ``count``
        (the number of rows in ``df``). Fractional parts are truncated.
    """
    data = OrderedDict()
    weight_m = df['UGDS'] * df['SATMTMID']
    weight_v = df['UGDS'] * df['SATVRMID']
    wm_avg = weight_m.sum() / df['UGDS'].sum()
    wv_avg = weight_v.sum() / df['UGDS'].sum()
    data['weighted_math_avg'] = wm_avg
    data['weighted_verbal_avg'] = wv_avg
    data['math_avg'] = df['SATMTMID'].mean()
    data['verbal_avg'] = df['SATVRMID'].mean()
    data['count'] = len(df)
    # Convert after construction: pd.Series(..., dtype='int') rejects
    # non-integer floats in current pandas, while astype(int) truncates.
    return pd.Series(data).astype(int)
college2.groupby('STABBR').apply(weighted_average).head(10)
Out[81]:
更多
# 自定义一个返回DataFrame的函数,使用NumPy的函数average计算加权平均值,使用SciPy的gmean和hmean计算几何和调和平均值
In[82]: from scipy.stats import gmean, hmean
def calculate_means(df):
    """Compute four kinds of mean for the two SAT columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'SATMTMID'``, ``'SATVRMID'`` and
        ``'UGDS'`` (the weights for the weighted mean).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'Arithmetic'``, ``'Weighted'``, ``'Geometric'`` and
        ``'Harmonic'``, with one column per SAT column plus a ``'count'``
        column holding ``len(df)`` in every row. All values are cast to
        ``int``.
    """
    df_means = pd.DataFrame(index=['Arithmetic', 'Weighted',
                                   'Geometric', 'Harmonic'])
    cols = ['SATMTMID', 'SATVRMID']
    for col in cols:
        arithmetic = df[col].mean()
        weighted = np.average(df[col], weights=df['UGDS'])
        geometric = gmean(df[col])
        harmonic = hmean(df[col])
        # Order must match the row labels given to df_means above.
        df_means[col] = [arithmetic, weighted, geometric, harmonic]

    # Added once, after the loop: the same group size on every row.
    df_means['count'] = len(df)
    return df_means.astype(int)
college2.groupby('STABBR').filter(lambda x: len(x) != 1).groupby('STABBR').apply(calculate_means).head(10)
Out[82]: