第01章 Pandas基础 - 5. 调用Series方法 - 《Pandas Cookbook 带注释源码》

5. 调用Series方法

准备

 in[23]: #  查看Series所有不重复的属性和方法
         s_attr_methods = set(dir(pd.Series))
         #  该集合的大小
         len(s_attr_methods)
out[23]: 442

 in[24]: #  查看DataFrame所有不重复的属性和方法
         df_attr_methods = set(dir(pd.DataFrame))
         len(df_attr_methods)
out[24]: 445

 in[25]: #  这两个集合中有多少共有的属性和方法
         len(s_attr_methods & df_attr_methods)
out[25]: 376

原理

 in[26]: #  选取director_name和actor_1_facebook_likes两列,分别赋值给director和actor_1_fb_likes
         movie = pd.read_csv('data/movie.csv')
         director = movie['director_name']
         actor_1_fb_likes = movie['actor_1_facebook_likes']

#  查看头部
 in[27]: director.head()
out[27]: 0        James Cameron
         1       Gore Verbinski
         2           Sam Mendes
         3    Christopher Nolan
         4          Doug Walker
         Name: director_name, dtype: object
 in[28]: actor_1_fb_likes.head()
out[28]: 0     1000.0
         1    40000.0
         2    11000.0
         3    27000.0
         4      131.0
         Name: actor_1_facebook_likes, dtype: float64

 in[29]: #  分别计数
         pd.set_option('max_rows', 8)
         director.value_counts()
out[29]: Steven Spielberg    26
         Woody Allen         22
         Clint Eastwood      20
         Martin Scorsese     20
                              ..
         James Nunn           1
         Gerard Johnstone     1
         Ethan Maniquis       1
         Antony Hoffman       1
         Name: director_name, Length: 2397, dtype: int64
 in[30]: actor_1_fb_likes.value_counts()
out[30]: 1000.0     436
         11000.0    206
         2000.0     189
         3000.0     150
                  ... 
         216.0        1
         859.0        1
         225.0        1
         334.0        1
         Name: actor_1_facebook_likes, Length: 877, dtype: int64

 in[31]: director.size  
out[31]: 4916
 in[32]: director.shape 
out[32]: (4916,)
 in[33]: len(director)  
out[33]: 4916

 in[34]: #  director有多少非空值
         director.count() 
out[34]: 4814 #  说明有102个缺失值

 in[35]: #  actor_1_fb_likes有多少非空值
         actor_1_fb_likes.count()
out[35]: 4909

 in[36]: #  actor_1_fb_likes的中位数(quantile()默认q=0.5,即0.5分位数)
         actor_1_fb_likes.quantile()
out[36]: 982.0

 in[37]: #  求最小值、最大值、平均值、中位数、标准差、总和
         actor_1_fb_likes.min(), actor_1_fb_likes.max(), \
         actor_1_fb_likes.mean(), actor_1_fb_likes.median(), \
         actor_1_fb_likes.std(), actor_1_fb_likes.sum()
out[37]: (0.0, 640000.0, 6494.488490527602, 982.0, 15106.986883848309, 31881444.0)

 in[38]: #  打印描述信息
         actor_1_fb_likes.describe()
out[38]: count      4909.000000
         mean       6494.488491
         std       15106.986884
         min           0.000000
         25%         607.000000
         50%         982.000000
         75%       11000.000000
         max      640000.000000
         Name: actor_1_facebook_likes, dtype: float64
 in[39]: director.describe()
out[39]: count                 4814
         unique                2397
         top       Steven Spielberg
         freq                    26
         Name: director_name, dtype: object

 in[40]: actor_1_fb_likes.quantile(.2)
out[40]: 510.0

 in[41]: #  各个十分之一分位数
         actor_1_fb_likes.quantile([.1, .2, .3, .4, .5, .6, .7, .8, .9])
out[41]: 0.1      240.0
         0.2      510.0
         0.3      694.0
         0.4      854.0
               ...   
         0.6     1000.0
         0.7     8000.0
         0.8    13000.0
         0.9    18000.0
         Name: actor_1_facebook_likes, Length: 9, dtype: float64

#  判断是否是缺失值
 In[42]: director.isnull()
Out[42]: 0       False
         1       False
         2       False
         3       False
                 ...  
         4912     True
         4913    False
         4914    False
         4915    False
         Name: director_name, Length: 4916, dtype: bool

#  填充缺失值
 In[43]: actor_1_fb_likes_filled = actor_1_fb_likes.fillna(0)
         actor_1_fb_likes_filled.count()
Out[43]: 4916

#  删除缺失值
 In[44]: actor_1_fb_likes_dropped = actor_1_fb_likes.dropna()
         actor_1_fb_likes_dropped.size
Out[44]: 4909

#  value_counts(normalize=True) 可以返回频率
 In[45]: director.value_counts(normalize=True)
Out[45]: Steven Spielberg    0.005401
         Woody Allen         0.004570
         Clint Eastwood      0.004155
         Martin Scorsese     0.004155
                         ...   
         James Nunn          0.000208
         Gerard Johnstone    0.000208
         Ethan Maniquis      0.000208
         Antony Hoffman      0.000208
         Name: director_name, Length: 2397, dtype: float64

#  判断是否有缺失值
 In[46]: director.hasnans
Out[46]: True

#  判断是否是非缺失值
 In[47]: director.notnull()
Out[47]: 0        True
         1        True
         2        True
         3        True
                 ...  
         4912    False
         4913     True
         4914     True
         4915     True
         Name: director_name, Length: 4916, dtype: bool

5. 调用Series方法

5. 调用Series方法

准备

原理

更多