10. 用where方法保留Series

  1. # 读取movie数据集,movie_title作为行索引,actor_1_facebook_likes列删除缺失值
  2. In[68]: movie = pd.read_csv('data/movie.csv', index_col='movie_title')
  3. fb_likes = movie['actor_1_facebook_likes'].dropna()
  4. fb_likes.head()
  5. Out[68]: movie_title
  6. Avatar 1000.0
  7. Pirates of the Caribbean: At World's End 40000.0
  8. Spectre 11000.0
  9. The Dark Knight Rises 27000.0
  10. Star Wars: Episode VII - The Force Awakens 131.0
  11. Name: actor_1_facebook_likes, dtype: float64
  1. # 使用describe获得对数据的认知
  2. In[69]: fb_likes.describe(percentiles=[.1, .25, .5, .75, .9]).astype(int)
  3. Out[69]: count 4909
  4. mean 6494
  5. std 15106
  6. min 0
  7. 10% 240
  8. 25% 607
  9. 50% 982
  10. 75% 11000
  11. 90% 18000
  12. max 640000
  13. Name: actor_1_facebook_likes, dtype: int64
  1. # 与上一步相同,只是没有用astype(int)转换为整数(原书此处代码有重复)
  2. In[70]: fb_likes.describe(percentiles=[.1,.25,.5,.75,.9])
  3. Out[70]: count 4909.000000
  4. mean 6494.488491
  5. std 15106.986884
  6. min 0.000000
  7. 10% 240.000000
  8. 25% 607.000000
  9. 50% 982.000000
  10. 75% 11000.000000
  11. 90% 18000.000000
  12. max 640000.000000
  13. Name: actor_1_facebook_likes, dtype: float64
  1. # 画一张直方图
  2. In[71]: fb_likes.hist()
  3. Out[71]: <matplotlib.axes._subplots.AxesSubplot at 0x10f9fbe80>

10. 用where方法保留Series - 图1

  1. # 计算喜欢数小于20000的影片所占的比例
  2. In[72]: criteria_high = fb_likes < 20000
  3. criteria_high.mean().round(2)
  4. Out[72]: 0.91000000000000003
  1. # where方法返回一个同样大小的Series,条件为False的位置会被替换成缺失值
  2. In[73]: fb_likes.where(criteria_high).head()
  3. Out[73]: movie_title
  4. Avatar 1000.0
  5. Pirates of the Caribbean: At World's End NaN
  6. Spectre 11000.0
  7. The Dark Knight Rises NaN
  8. Star Wars: Episode VII - The Force Awakens 131.0
  9. Name: actor_1_facebook_likes, dtype: float64
  1. # 第二个参数other,可以让你控制替换值
  2. In[74]: fb_likes.where(criteria_high, other=20000).head()
  3. Out[74]: movie_title
  4. Avatar 1000.0
  5. Pirates of the Caribbean: At World's End 20000.0
  6. Spectre 11000.0
  7. The Dark Knight Rises 20000.0
  8. Star Wars: Episode VII - The Force Awakens 131.0
  9. Name: actor_1_facebook_likes, dtype: float64
  1. # 通过where条件,设定上下限的值
  2. In[75]: criteria_low = fb_likes > 300
  3. fb_likes_cap = fb_likes.where(criteria_high, other=20000)\
  4. .where(criteria_low, 300)
  5. fb_likes_cap.head()
  6. Out[75]: movie_title
  7. Avatar 1000.0
  8. Pirates of the Caribbean: At World's End 20000.0
  9. Spectre 11000.0
  10. The Dark Knight Rises 20000.0
  11. Star Wars: Episode VII - The Force Awakens 300.0
  12. Name: actor_1_facebook_likes, dtype: float64
  1. # 原始Series和修改过的Series的长度是一样的
  2. In[76]: len(fb_likes), len(fb_likes_cap)
  3. Out[76]: (4909, 4909)
  1. # 再画一张直方图,效果好多了
  2. In[77]: fb_likes_cap.hist()
  3. Out[77]: <matplotlib.axes._subplots.AxesSubplot at 0x10eeea8d0>

10. 用where方法保留Series - 图2

  1. # 用clip方法可以一步设定上下限,结果与上面链式调用where得到的Series完全相同
  2. In[78]: fb_likes_cap2 = fb_likes.clip(lower=300, upper=20000)
  3. fb_likes_cap2.equals(fb_likes_cap)
  4. Out[78]: True