11. 对DataFrame的行做mask

  1. # 读取movie,根据条件进行筛选
  2. In[79]: movie = pd.read_csv('data/movie.csv', index_col='movie_title')
  3. c1 = movie['title_year'] >= 2010
  4. c2 = movie['title_year'].isnull()
  5. criteria = c1 | c2
  6. # 使用mask方法,将所有满足条件(criteria为True)的行的值替换为缺失值(NaN)
  7. In[80]: movie.mask(criteria).head()
  8. Out[80]:

11. 对DataFrame的行做mask - 图1

  1. # 用dropna(how='all')删除所有值都是缺失值的行
  2. In[81]: movie_mask = movie.mask(criteria).dropna(how='all')
  3. movie_mask.head()
  4. Out[81]:

11. 对DataFrame的行做mask - 图2

  1. # 用布尔索引选取title_year小于2010的电影
  2. In[82]: movie_boolean = movie[movie['title_year'] < 2010]
  3. movie_boolean.head()
  4. Out[82]:

11. 对DataFrame的行做mask - 图3

  1. # 判断这两种方法得到的结果是否相同
  2. In[83]: movie_mask.equals(movie_boolean)
  3. Out[83]: False
  1. # 判断二者的形状是否相同
  2. In[84]: movie_mask.shape == movie_boolean.shape
  3. Out[84]: True
  1. # mask方法产生了许多缺失值,缺失值是float类型,所以之前是整数型的列都变成了浮点型(即下面结果为False的几列)
  2. In[85]: movie_mask.dtypes == movie_boolean.dtypes
  3. Out[85]:
  4. color True
  5. director_name True
  6. num_critic_for_reviews True
  7. duration True
  8. director_facebook_likes True
  9. actor_3_facebook_likes True
  10. actor_2_name True
  11. actor_1_facebook_likes True
  12. gross True
  13. genres True
  14. actor_1_name True
  15. num_voted_users False
  16. cast_total_facebook_likes False
  17. actor_3_name True
  18. facenumber_in_poster True
  19. plot_keywords True
  20. movie_imdb_link True
  21. num_user_for_reviews True
  22. language True
  23. country True
  24. content_rating True
  25. budget True
  26. title_year True
  27. actor_2_facebook_likes True
  28. imdb_score True
  29. aspect_ratio True
  30. movie_facebook_likes False
  31. dtype: bool
  1. # pandas.testing模块有一个assert_frame_equal函数,可以判断两个DataFrame是否相等;设置check_dtype=False后不检查数据类型。若二者相等则没有任何输出,否则抛出AssertionError
  2. In[86]: from pandas.testing import assert_frame_equal
  3. assert_frame_equal(movie_boolean, movie_mask, check_dtype=False)

更多

  1. # 比较mask和布尔索引的速度,布尔索引比mask快了约一个数量级
  2. In[87]: %timeit movie.mask(criteria).dropna(how='all')
  3. 11.1 ms ± 48.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
  4. In[88]: %timeit movie[movie['title_year'] < 2010]
  5. 1.12 ms ± 36.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)