2. 用matplotlib做数据可视化

  1. # 读取movie数据集,计算每年的预算中位数,再计算五年滚动均值以平滑数据
  2. In[32]: movie = pd.read_csv('data/movie.csv')
  3. med_budget = movie.groupby('title_year')['budget'].median() / 1e6
  4. med_budget_roll = med_budget.rolling(5, min_periods=1).mean()
  5. med_budget_roll.tail()
  6. Out[32]: title_year
  7. 2012.0 20.893
  8. 2013.0 19.893
  9. 2014.0 19.100
  10. 2015.0 17.980
  11. 2016.0 17.780
  12. Name: budget, dtype: float64
  1. # 将数据变为NumPy数组
  2. In[33]: years = med_budget_roll.index.values
  3. years[-5:]
  4. Out[33]: array([ 2012., 2013., 2014., 2015., 2016.])
  5. In[34]: budget = med_budget_roll.values
  6. budget[-5:]
  7. Out[34]: array([ 20.893, 19.893, 19.1 , 17.98 , 17.78 ])
  1. # plot方法可以用来画线图
  2. In[35]: fig, ax = plt.subplots(figsize=(14,4), linewidth=5, edgecolor='.5')
  3. ax.plot(years, budget, linestyle='--', linewidth=3, color='.2', label='All Movies')
  4. text_kwargs=dict(fontsize=20, family='cursive')
  5. ax.set_title('Median Movie Budget', **text_kwargs)
  6. ax.set_ylabel('Millions of Dollars', **text_kwargs)
  7. Out[35]: Text(0,0.5,'Millions of Dollars')

2. 用matplotlib做数据可视化 - 图1

  1. # 每年的电影产量
  2. In[36]: movie_count = movie.groupby('title_year')['budget'].count()
  3. movie_count.tail()
  4. Out[36]: title_year
  5. 2012.0 191
  6. 2013.0 208
  7. 2014.0 221
  8. 2015.0 192
  9. 2016.0 86
  10. Name: budget, dtype: int64
  1. # 在前图的基础上,将每年的电影产量画成一个柱状图;因为大部分电影都是近年的,所以将起始年份设为1970
  2. In[37]: ct = movie_count.values
  3. ct_norm = ct / ct.max() * budget.max()
  4. fifth_year = (years % 5 == 0) & (years >= 1970)
  5. years_5 = years[fifth_year]
  6. ct_5 = ct[fifth_year]
  7. ct_norm_5 = ct_norm[fifth_year]
  8. ax.bar(years_5, ct_norm_5, 3, facecolor='.5', alpha=.3, label='Movies per Year')
  9. ax.set_xlim(1968, 2017)
  10. for x, y, v in zip(years_5, ct_norm_5, ct_5):
  11. ax.text(x, y + .5, str(v), ha='center')
  12. ax.legend()
  13. fig
  14. Out[37]:

2. 用matplotlib做数据可视化 - 图2

  1. # 取每年预算最高的10部电影的预算中位数,再计算其五年滚动均值
  2. In[38]: top10 = movie.sort_values('budget', ascending=False) \
  3. .groupby('title_year')['budget'] \
  4. .apply(lambda x: x.iloc[:10].median() / 1e6)
  5. top10_roll = top10.rolling(5, min_periods=1).mean()
  6. top10_roll.tail()
  7. Out[38]: title_year
  8. 2012.0 192.9
  9. 2013.0 195.9
  10. 2014.0 191.7
  11. 2015.0 186.8
  12. 2016.0 189.1
  13. Name: budget, dtype: float64
  1. # 将上面的数据画到另一张子图中
  2. In[39]: fig2, ax_array = plt.subplots(2, 1, figsize=(14,6), sharex=True)
  3. ax1 = ax_array[0]
  4. ax2 = ax_array[1]
  5. ax1.plot(years, budget, linestyle='--', linewidth=3, color='.2', label='All Movies')
  6. ax1.bar(years_5, ct_norm_5, 3, facecolor='.5', alpha=.3, label='Movies per Year')
  7. ax1.legend(loc='upper left')
  8. ax1.set_xlim(1968, 2017)
  9. plt.setp(ax1.get_xticklines(), visible=False)
  10. for x, y, v in zip(years_5, ct_norm_5, ct_5):
  11. ax1.text(x, y + .5, str(v), ha='center')
  12. ax2.plot(years, top10_roll.values, color='.2', label='Top 10 Movies')
  13. ax2.legend(loc='upper left')
  14. fig2.tight_layout()
  15. fig2.suptitle('Median Movie Budget', y=1.02, **text_kwargs)
  16. fig2.text(0, .6, 'Millions of Dollars', rotation='vertical', ha='center', **text_kwargs)
  17. import os
  18. path = os.path.expanduser('~/Desktop/movie_budget.png')
  19. fig2.savefig(path, bbox_inches='tight')
  20. Out[39]:

2. 用matplotlib做数据可视化 - 图3

原理

  1. In[40]: med_budget_roll.tail()
  2. Out[40]: title_year
  3. 2012.0 20.893
  4. 2013.0 19.893
  5. 2014.0 19.100
  6. 2015.0 17.980
  7. 2016.0 17.780
  8. Name: budget, dtype: float64
  1. # 手动验证rolling方法的结果:对相应五年的预算中位数直接求均值,应与上面的滚动均值一致
  2. In[41]: med_budget.loc[2012:2016].mean()
  3. Out[41]: 17.78
  4. In[42]: med_budget.loc[2011:2015].mean()
  5. Out[42]: 17.98
  6. In[43]: med_budget.loc[2010:2014].mean()
  7. Out[43]: 19.1
  1. # 路径中的~不会被自动展开,必须使用expanduser将其转换为完整路径
  2. In[44]: os.path.expanduser('~/Desktop/movie_budget.png')
  3. Out[44]: '/Users/Ted/Desktop/movie_budget.png'

更多

  1. In[45]: cols = ['budget', 'title_year', 'imdb_score', 'movie_title']
  2. m = movie[cols].dropna()
  3. # m = movie[['budget', 'title_year', 'imdb_score', 'movie_title']].dropna()
  4. m['budget2'] = m['budget'] / 1e6
  5. np.random.seed(0)
  6. movie_samp = m.query('title_year >= 2000').sample(100)
  7. fig, ax = plt.subplots(figsize=(14,6))
  8. ax.scatter(x='title_year', y='imdb_score', s='budget2', data=movie_samp)
  9. idx_min = movie_samp['imdb_score'].idxmin()
  10. idx_max = movie_samp['imdb_score'].idxmax()
  11. for idx, offset in zip([idx_min, idx_max], [.5, -.5]):
  12. year = movie_samp.loc[idx, 'title_year']
  13. score = movie_samp.loc[idx, 'imdb_score']
  14. title = movie_samp.loc[idx, 'movie_title']
  15. ax.annotate(xy=(year, score),
  16. xytext=(year + 1, score + offset),
  17. s=title + ' ({})'.format(score),
  18. ha='center',
  19. size=16,
  20. arrowprops=dict(arrowstyle="fancy"))
  21. ax.set_title('IMDB Score by Year', size=25)
  22. ax.grid(True)
  23. Out[45]:

2. 用matplotlib做数据可视化 - 图4