3. 用布尔索引过滤

  1. # 读取 movie 数据集，创建布尔条件
  2. In[15]: movie = pd.read_csv('data/movie.csv', index_col='movie_title')
  3. crit_a1 = movie.imdb_score > 8
  4. crit_a2 = movie.content_rating == 'PG-13'
  5. crit_a3 = (movie.title_year < 2000) | (movie.title_year > 2009)
  6. final_crit_a = crit_a1 & crit_a2 & crit_a3
  7. # 创建第二个布尔条件
  8. In[16]: crit_b1 = movie.imdb_score < 5
  9. crit_b2 = movie.content_rating == 'R'
  10. crit_b3 = (movie.title_year >= 2000) & (movie.title_year <= 2010)
  11. final_crit_b = crit_b1 & crit_b2 & crit_b3
  12. # 将这两个条件用或运算合并起来
  13. In[17]: final_crit_all = final_crit_a | final_crit_b
  14. final_crit_all.head()
  15. Out[17]: movie_title
  16. Avatar                                        False
  17. Pirates of the Caribbean: At World's End      False
  18. Spectre                                       False
  19. The Dark Knight Rises                          True
  20. Star Wars: Episode VII - The Force Awakens    False
  21. dtype: bool
  1. # 用最终的布尔条件过滤数据
  2. In[18]: movie[final_crit_all].head()
  3. Out[18]:

3. 用布尔索引过滤 - 图1

  1. # 使用 loc，对指定的列做过滤操作，可以清楚地看到过滤是否起作用
  2. In[19]: cols = ['imdb_score', 'content_rating', 'title_year']
  3. movie_filtered = movie.loc[final_crit_all, cols]
  4. movie_filtered.head(10)
  5. Out[19]:

3. 用布尔索引过滤 - 图2

更多

  1. # 用一个长布尔表达式代替前面由短表达式生成的布尔条件
  2. In[21]: final_crit_a2 = (movie.imdb_score > 8) & \
  3. (movie.content_rating == 'PG-13') & \
  4. ((movie.title_year < 2000) | (movie.title_year > 2009))
  5. final_crit_a2.equals(final_crit_a)
  6. Out[21]: True

3. 用布尔索引过滤 - 图3