5. 调用Series方法

准备

  1. in[23]: # 查看Series所有不重复的属性和方法
  2. s_attr_methods = set(dir(pd.Series))
  3. # 该集合的大小
  4. len(s_attr_methods)
  5. out[23]: 442
  1. in[24]: # 查看DataFrame所有不重复的属性和方法
  2. df_attr_methods = set(dir(pd.DataFrame))
  3. len(df_attr_methods)
  4. out[24]: 445
  1. in[25]: # 这两个集合中有多少共有的属性和方法
  2. len(s_attr_methods & df_attr_methods)
  3. out[25]: 376

原理

  1. in[26]: # 选取director和actor_1_fb_likes两列
  2. movie = pd.read_csv('data/movie.csv')
  3. director = movie['director_name']
  4. actor_1_fb_likes = movie['actor_1_facebook_likes']
  1. # 查看头部
  2. in[27]: director.head()
  3. out[27]: 0 James Cameron
  4. 1 Gore Verbinski
  5. 2 Sam Mendes
  6. 3 Christopher Nolan
  7. 4 Doug Walker
  8. Name: director_name, dtype: object
  9. in[28]: actor_1_fb_likes.head()
  10. out[28]: 0 1000.0
  11. 1 40000.0
  12. 2 11000.0
  13. 3 27000.0
  14. 4 131.0
  15. Name: actor_1_facebook_likes, dtype: float64
  1. in[29]: # 分别计数
  2. pd.set_option('max_rows', 8)
  3. director.value_counts()
  4. out[29]: Steven Spielberg 26
  5. Woody Allen 22
  6. Clint Eastwood 20
  7. Martin Scorsese 20
  8. ..
  9. James Nunn 1
  10. Gerard Johnstone 1
  11. Ethan Maniquis 1
  12. Antony Hoffman 1
  13. Name: director_name, Length: 2397, dtype: int64
  14. in[30]: actor_1_fb_likes.value_counts()
  15. out[30]: 1000.0 436
  16. 11000.0 206
  17. 2000.0 189
  18. 3000.0 150
  19. ...
  20. 216.0 1
  21. 859.0 1
  22. 225.0 1
  23. 334.0 1
  24. Name: actor_1_facebook_likes, Length: 877, dtype: int64
  1. in[31]: director.size
  2. out[31]: 4916
  3. in[32]: director.shape
  4. out[32]: (4916,)
  5. in[33]: len(director)
  6. out[33]: 4916
  1. in[34]: # director有多少非空值
  2. director.count()
  3. out[34]: 4814 # 说明有102个缺失值
  1. in[35]: # actor_1_fb_likes有多少非空值
  2. actor_1_fb_likes.count()
  3. out[35]: 4909
  1. in[36]: # actor_1_fb_likes的中位数(quantile()默认返回0.5分位数)
  2. actor_1_fb_likes.quantile()
  3. out[36]: 982.0
  1. in[37]: # 求最小值、最大值、平均值、中位数、标准差、总和
  2. actor_1_fb_likes.min(), actor_1_fb_likes.max(), \
  3. actor_1_fb_likes.mean(), actor_1_fb_likes.median(), \
  4. actor_1_fb_likes.std(), actor_1_fb_likes.sum()
  5. out[37]: (0.0, 640000.0, 6494.488490527602, 982.0, 15106.986883848309, 31881444.0)
  1. in[38]: # 打印描述信息
  2. actor_1_fb_likes.describe()
  3. out[38]: count 4909.000000
  4. mean 6494.488491
  5. std 15106.986884
  6. min 0.000000
  7. 25% 607.000000
  8. 50% 982.000000
  9. 75% 11000.000000
  10. max 640000.000000
  11. Name: actor_1_facebook_likes, dtype: float64
  12. in[39]: director.describe()
  13. out[39]: count 4814
  14. unique 2397
  15. top Steven Spielberg
  16. freq 26
  17. Name: director_name, dtype: object
  1. in[40]: actor_1_fb_likes.quantile(.2)
  2. out[40]: 510.0
  1. in[41]: # 各个十分之一分位数
  2. actor_1_fb_likes.quantile([.1, .2, .3, .4, .5, .6, .7, .8, .9])
  3. out[41]: 0.1 240.0
  4. 0.2 510.0
  5. 0.3 694.0
  6. 0.4 854.0
  7. ...
  8. 0.6 1000.0
  9. 0.7 8000.0
  10. 0.8 13000.0
  11. 0.9 18000.0
  12. Name: actor_1_facebook_likes, Length: 9, dtype: float64
  1. # 判断是否是缺失值
  2. In[42]: director.isnull()
  3. Out[42]: 0 False
  4. 1 False
  5. 2 False
  6. 3 False
  7. ...
  8. 4912 True
  9. 4913 False
  10. 4914 False
  11. 4915 False
  12. Name: director_name, Length: 4916, dtype: bool
  1. # 填充缺失值
  2. In[43]: actor_1_fb_likes_filled = actor_1_fb_likes.fillna(0)
  3. actor_1_fb_likes_filled.count()
  4. Out[43]: 4916
  1. # 删除缺失值
  2. In[44]: actor_1_fb_likes_dropped = actor_1_fb_likes.dropna()
  3. actor_1_fb_likes_dropped.size
  4. Out[44]: 4909

更多

  1. # value_counts(normalize=True) 可以返回频率
  2. In[45]: director.value_counts(normalize=True)
  3. Out[45]: Steven Spielberg 0.005401
  4. Woody Allen 0.004570
  5. Clint Eastwood 0.004155
  6. Martin Scorsese 0.004155
  7. ...
  8. James Nunn 0.000208
  9. Gerard Johnstone 0.000208
  10. Ethan Maniquis 0.000208
  11. Antony Hoffman 0.000208
  12. Name: director_name, Length: 2397, dtype: float64
  1. # 判断是否有缺失值
  2. In[46]: director.hasnans
  3. Out[46]: True
  1. # 判断是否是非缺失值
  2. In[47]: director.notnull()
  3. Out[47]: 0 True
  4. 1 True
  5. 2 True
  6. 3 True
  7. ...
  8. 4912 False
  9. 4913 True
  10. 4914 True
  11. 4915 True
  12. Name: director_name, Length: 4916, dtype: bool