5. 从不同的DataFrame追加列

  1. # 读取employee数据,选取'DEPARTMENT', 'BASE_SALARY'这两列
  2. In[48]: employee = pd.read_csv('data/employee.csv')
  3. dept_sal = employee[['DEPARTMENT', 'BASE_SALARY']]
  4. # 在每个部门内,对BASE_SALARY进行排序
  5. In[49]: dept_sal = dept_sal.sort_values(['DEPARTMENT', 'BASE_SALARY'],
  6. ascending=[True, False])
  7. # 用drop_duplicates方法保留每个部门的第一行
  8. In[50]: max_dept_sal = dept_sal.drop_duplicates(subset='DEPARTMENT')
  9. max_dept_sal.head()
  10. Out[50]:

5. 从不同的DataFrame追加列 - 图1

  1. # 使用DEPARTMENT作为行索引
  2. In[51]: max_dept_sal = max_dept_sal.set_index('DEPARTMENT')
  3. employee = employee.set_index('DEPARTMENT')
  4. # 现在行索引包含匹配值了,可以向employee的DataFrame新增一列
  5. In[52]: employee['MAX_DEPT_SALARY'] = max_dept_sal['BASE_SALARY']
  6. In[53]: pd.options.display.max_columns = 6
  7. Out[54]:

5. 从不同的DataFrame追加列 - 图2

  1. # 现在可以用query查看是否有BASE_SALARY大于MAX_DEPT_SALARY的
  2. In[55]: employee.query('BASE_SALARY > MAX_DEPT_SALARY')
  3. Out[55]:

5. 从不同的DataFrame追加列 - 图3

原理

  1. # 用random从dept_sal随机取10行,不做替换
  2. In[56]: np.random.seed(1234)
  3. random_salary = dept_sal.sample(n=10).set_index('DEPARTMENT')
  4. random_salary
  5. Out[56]:

5. 从不同的DataFrame追加列 - 图4

  1. # random_salary中是有重复索引的,employee DataFrame的标签要对应random_salary中的多个标签
  2. In[57]: employee['RANDOM_SALARY'] = random_salary['BASE_SALARY']
  3. ---------------------------------------------------------------------------
  4. ValueError Traceback (most recent call last)
  5. <ipython-input-57-1cbebe15fa39> in <module>()
  6. ----> 1 employee['RANDOM_SALARY'] = random_salary['BASE_SALARY']
  7. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
  8. 2329 else:
  9. 2330 # set column
  10. -> 2331 self._set_item(key, value)
  11. 2332
  12. 2333 def _setitem_slice(self, key, value):
  13. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in _set_item(self, key, value)
  14. 2395
  15. 2396 self._ensure_valid_index(value)
  16. -> 2397 value = self._sanitize_column(key, value)
  17. 2398 NDFrame._set_item(self, key, value)
  18. 2399
  19. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in _sanitize_column(self, key, value, broadcast)
  20. 2545
  21. 2546 if isinstance(value, Series):
  22. -> 2547 value = reindexer(value)
  23. 2548
  24. 2549 elif isinstance(value, DataFrame):
  25. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in reindexer(value)
  26. 2537 # duplicate axis
  27. 2538 if not value.index.is_unique:
  28. -> 2539 raise e
  29. 2540
  30. 2541 # other
  31. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/frame.py in reindexer(value)
  32. 2532 # GH 4107
  33. 2533 try:
  34. -> 2534 value = value.reindex(self.index)._values
  35. 2535 except Exception as e:
  36. 2536
  37. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/series.py in reindex(self, index, **kwargs)
  38. 2424 @Appender(generic._shared_docs['reindex'] % _shared_doc_kwargs)
  39. 2425 def reindex(self, index=None, **kwargs):
  40. -> 2426 return super(Series, self).reindex(index=index, **kwargs)
  41. 2427
  42. 2428 @Appender(generic._shared_docs['fillna'] % _shared_doc_kwargs)
  43. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
  44. 2513 # perform the reindex on the axes
  45. 2514 return self._reindex_axes(axes, level, limit, tolerance, method,
  46. -> 2515 fill_value, copy).__finalize__(self)
  47. 2516
  48. 2517 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
  49. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/generic.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
  50. 2531 obj = obj._reindex_with_indexers({axis: [new_index, indexer]},
  51. 2532 fill_value=fill_value,
  52. -> 2533 copy=copy, allow_dups=False)
  53. 2534
  54. 2535 return obj
  55. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
  56. 2625 fill_value=fill_value,
  57. 2626 allow_dups=allow_dups,
  58. -> 2627 copy=copy)
  59. 2628
  60. 2629 if copy and new_data is self._data:
  61. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/internals.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
  62. 3884 # some axes don't allow reindexing with dups
  63. 3885 if not allow_dups:
  64. -> 3886 self.axes[axis]._can_reindex(indexer)
  65. 3887
  66. 3888 if axis >= self.ndim:
  67. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexes/base.py in _can_reindex(self, indexer)
  68. 2834 # trying to reindex on an axis with duplicates
  69. 2835 if not self.is_unique and len(indexer):
  70. -> 2836 raise ValueError("cannot reindex from a duplicate axis")
  71. 2837
  72. 2838 def reindex(self, target, method=None, level=None, limit=None,
  73. ValueError: cannot reindex from a duplicate axis

更多

  1. # 选取max_dept_sal['BASE_SALARY']的前三行,赋值给employee['MAX_SALARY2']
  2. In[58]: employee['MAX_SALARY2'] = max_dept_sal['BASE_SALARY'].head(3)
  3. # 对MAX_SALARY2统计
  4. In[59]: employee.MAX_SALARY2.value_counts()
  5. Out[59]: 140416.0 29
  6. 100000.0 11
  7. 64251.0 5
  8. Name: MAX_SALARY2, dtype: int64
  9. # 因为只填充了三个部门的值,所有其它部门在结果中都是缺失值
  10. In[60]: employee.MAX_SALARY2.isnull().mean()
  11. Out[60]: 0.97750000000000004