5. 调用Series方法
准备
in[23]: # 查看Series所有不重复的属性和方法
s_attr_methods = set(dir(pd.Series))
# 该集合的大小
len(s_attr_methods)
out[23]: 442
in[24]: # 查看DataFrame所有不重复的属性和方法
df_attr_methods = set(dir(pd.DataFrame))
len(df_attr_methods)
out[24]: 445
in[25]: # 这两个集合中有多少共有的属性和方法
len(s_attr_methods & df_attr_methods)
out[25]: 376
原理
in[26]: # 选取director_name和actor_1_facebook_likes两列,分别赋值给director和actor_1_fb_likes
movie = pd.read_csv('data/movie.csv')
director = movie['director_name']
actor_1_fb_likes = movie['actor_1_facebook_likes']
# 查看头部
in[27]: director.head()
out[27]: 0 James Cameron
1 Gore Verbinski
2 Sam Mendes
3 Christopher Nolan
4 Doug Walker
Name: director_name, dtype: object
in[28]: actor_1_fb_likes.head()
out[28]: 0 1000.0
1 40000.0
2 11000.0
3 27000.0
4 131.0
Name: actor_1_facebook_likes, dtype: float64
in[29]: # 分别计数
pd.set_option('max_rows', 8)
director.value_counts()
out[29]: Steven Spielberg 26
Woody Allen 22
Clint Eastwood 20
Martin Scorsese 20
..
James Nunn 1
Gerard Johnstone 1
Ethan Maniquis 1
Antony Hoffman 1
Name: director_name, Length: 2397, dtype: int64
in[30]: actor_1_fb_likes.value_counts()
out[30]: 1000.0 436
11000.0 206
2000.0 189
3000.0 150
...
216.0 1
859.0 1
225.0 1
334.0 1
Name: actor_1_facebook_likes, Length: 877, dtype: int64
in[31]: director.size
out[31]: 4916
in[32]: director.shape
out[32]: (4916,)
in[33]: len(director)
out[33]: 4916
in[34]: # director有多少非空值
director.count()
out[34]: 4814 # 说明有102个缺失值
in[35]: # actor_1_fb_likes有多少非空值
actor_1_fb_likes.count()
out[35]: 4909
in[36]: # actor_1_fb_likes的中位数(quantile()默认q=0.5,即50%分位数)
actor_1_fb_likes.quantile()
out[36]: 982.0
in[37]: # 求最小值、最大值、平均值、中位数、标准差、总和
actor_1_fb_likes.min(), actor_1_fb_likes.max(), \
actor_1_fb_likes.mean(), actor_1_fb_likes.median(), \
actor_1_fb_likes.std(), actor_1_fb_likes.sum()
out[37]: (0.0, 640000.0, 6494.488490527602, 982.0, 15106.986883848309, 31881444.0)
in[38]: # 打印描述信息
actor_1_fb_likes.describe()
out[38]: count 4909.000000
mean 6494.488491
std 15106.986884
min 0.000000
25% 607.000000
50% 982.000000
75% 11000.000000
max 640000.000000
Name: actor_1_facebook_likes, dtype: float64
in[39]: director.describe()
out[39]: count 4814
unique 2397
top Steven Spielberg
freq 26
Name: director_name, dtype: object
in[40]: actor_1_fb_likes.quantile(.2)
out[40]: 510.0
in[41]: # 各个十分之一分位数
actor_1_fb_likes.quantile([.1, .2, .3, .4, .5, .6, .7, .8, .9])
out[41]: 0.1 240.0
0.2 510.0
0.3 694.0
0.4 854.0
...
0.6 1000.0
0.7 8000.0
0.8 13000.0
0.9 18000.0
Name: actor_1_facebook_likes, Length: 9, dtype: float64
# 判断每个值是否为缺失值
In[42]: director.isnull()
Out[42]: 0 False
1 False
2 False
3 False
...
4912 True
4913 False
4914 False
4915 False
Name: director_name, Length: 4916, dtype: bool
# 填充缺失值
In[43]: actor_1_fb_likes_filled = actor_1_fb_likes.fillna(0)
actor_1_fb_likes_filled.count()
Out[43]: 4916
# 删除缺失值
In[44]: actor_1_fb_likes_dropped = actor_1_fb_likes.dropna()
actor_1_fb_likes_dropped.size
Out[44]: 4909
更多
# value_counts(normalize=True) 可以返回频率
In[45]: director.value_counts(normalize=True)
Out[45]: Steven Spielberg 0.005401
Woody Allen 0.004570
Clint Eastwood 0.004155
Martin Scorsese 0.004155
...
James Nunn 0.000208
Gerard Johnstone 0.000208
Ethan Maniquis 0.000208
Antony Hoffman 0.000208
Name: director_name, Length: 2397, dtype: float64
# 判断是否有缺失值
In[46]: director.hasnans
Out[46]: True
# 判断是否是非缺失值
In[47]: director.notnull()
Out[47]: 0 True
1 True
2 True
3 True
...
4912 False
4913 True
4914 True
4915 True
Name: director_name, Length: 4916, dtype: bool