4. 用不等索引填充数值

  1. # 读取三个baseball数据集,行索引设为playerID
  2. In[32]: baseball_14 = pd.read_csv('data/baseball14.csv', index_col='playerID')
  3. baseball_15 = pd.read_csv('data/baseball15.csv', index_col='playerID')
  4. baseball_16 = pd.read_csv('data/baseball16.csv', index_col='playerID')
  5. baseball_14.head()
  6. Out[32]:

4. 用不等索引填充数值 - 图1

  1. # 用索引方法difference,找到哪些索引标签在baseball_14中,却不在baseball_15、baseball_16中
  2. In[33]: baseball_14.index.difference(baseball_15.index)
  3. Out[33]: Index(['corpoca01', 'dominma01', 'fowlede01', 'grossro01', 'guzmaje01',
  4. 'hoeslj01', 'krausma01', 'preslal01', 'singljo02'],
  5. dtype='object', name='playerID')
  6. In[34]: baseball_14.index.difference(baseball_16.index)
  7. Out[34]: Index(['congeha01', 'correca01', 'gattiev01', 'gomezca01',
  8. 'lowrije01', 'rasmuco01', 'tuckepr01', 'valbulu01'],
  9. dtype='object', name='playerID')
  1. # 找到每名球员在过去三个赛季的击球数,H列包含了这个数据
  2. In[35]: hits_14 = baseball_14['H']
  3. hits_15 = baseball_15['H']
  4. hits_16 = baseball_16['H']
  5. hits_14.head()
  6. Out[35]: Index(['corpoca01', 'dominma01', 'fowlede01', 'grossro01', 'guzmaje01',
  7. 'hoeslj01', 'krausma01', 'preslal01', 'singljo02'],
  8. dtype='object', name='playerID')
  1. # 将hits_14和hits_15两列相加
  2. In[36]: (hits_14 + hits_15).head()
  3. Out[36]: playerID
  4. altuvjo01 425.0
  5. cartech02 193.0
  6. castrja01 174.0
  7. congeha01 NaN
  8. corpoca01 NaN
  9. Name: H, dtype: float64
  1. # congeha01 和 corpoca01 在2015年是有记录的,但是结果缺失了。使用add方法和参数fill_value,避免产生缺失值
  2. In[37]: hits_14.add(hits_15, fill_value=0).head()
  3. Out[37]: playerID
  4. altuvjo01 425.0
  5. cartech02 193.0
  6. castrja01 174.0
  7. congeha01 46.0
  8. corpoca01 40.0
  9. Name: H, dtype: float64
  1. # 再将2016的数据也加上
  2. In[38]: hits_total = hits_14.add(hits_15, fill_value=0).add(hits_16, fill_value=0)
  3. hits_total.head()
  4. Out[38]: playerID
  5. altuvjo01 641.0
  6. bregmal01 53.0
  7. cartech02 193.0
  8. castrja01 243.0
  9. congeha01 46.0
  10. Name: H, dtype: float64
  1. # 检查结果中是否有缺失值
  2. In[39]: hits_total.hasnans
  3. Out[39]: False

原理

  1. # 如果一个元素在两个Series都是缺失值,即便使用了fill_value,相加的结果也仍是缺失值
  2. In[40]: s = pd.Series(index=['a', 'b', 'c', 'd'], data=[np.nan, 3, np.nan, 1])
  3. s
  4. Out[40]: a NaN
  5. b 3.0
  6. c NaN
  7. d 1.0
  8. dtype: float64
  9. In[41]: s1 = pd.Series(index=['a', 'b', 'c'], data=[np.nan, 6, 10])
  10. s1
  11. Out[41]: a NaN
  12. b 6.0
  13. c 10.0
  14. dtype: float64
  15. In[42]: s.add(s1, fill_value=5)
  16. Out[42]: a NaN
  17. b 9.0
  18. c 15.0
  19. d 6.0
  20. dtype: float64
  21. In[43]: s1.add(s, fill_value=5)
  22. Out[43]: a NaN
  23. b 9.0
  24. c 15.0
  25. d 6.0
  26. dtype: float64

更多

  1. # 从baseball_14中选取一些列
  2. In[44]: df_14 = baseball_14[['G','AB', 'R', 'H']]
  3. df_14.head()
  4. Out[44]:

4. 用不等索引填充数值 - 图2

  1. # 再从baseball_15中选取一些列,有相同的、也有不同的
  2. In[45]: df_15 = baseball_15[['AB', 'R', 'H', 'HR']]
  3. df_15.head()
  4. Out[45]:

4. 用不等索引填充数值 - 图3

  1. # 将二者相加的话,只要行或列不能对齐,就会产生缺失值。style属性的highlight_null方法可以高亮缺失值
  2. In[46]: (df_14 + df_15).head(10).style.highlight_null('yellow')
  3. Out[46]:

4. 用不等索引填充数值 - 图4

  1. # 即便使用了fill_value=0,有些值也会是缺失值,这是因为一些行和列的组合根本不存在输入的数据中
  2. In[47]: df_14.add(df_15, fill_value=0).head(10).style.highlight_null('yellow')
  3. Out[47]:

4. 用不等索引填充数值 - 图5