4. 可视化flights数据集

  1. # 读取flights数据集
  2. In[52]: flights = pd.read_csv('data/flights.csv')
  3. flights.head()
  4. Out[52]:

4. 可视化flights数据集 - 图1

  1. # 创建两列,表示延迟和准时
  2. In[53]: flights['DELAYED'] = flights['ARR_DELAY'].ge(15).astype(int)
  3. cols = ['DIVERTED', 'CANCELLED', 'DELAYED']
  4. flights['ON_TIME'] = 1 - flights[cols].any(axis=1)
  5. cols.append('ON_TIME')
  6. status = flights[cols].sum()
  7. status
  8. Out[53]: DIVERTED 137
  9. CANCELLED 881
  10. DELAYED 11685
  11. ON_TIME 45789
  12. dtype: int64
  1. # 对类别型列和连续型列分别作图
  2. In[54]: fig, ax_array = plt.subplots(2, 3, figsize=(18,8))
  3. (ax1, ax2, ax3), (ax4, ax5, ax6) = ax_array
  4. fig.suptitle('2015 US Flights - Univariate Summary', size=20)
  5. ac = flights['AIRLINE'].value_counts()
  6. ac.plot(kind='barh', ax=ax1, title ='Airline')
  7. oc = flights['ORG_AIR'].value_counts()
  8. oc.plot(kind='bar', ax=ax2, rot=0, title='Origin City')
  9. dc = flights['DEST_AIR'].value_counts().head(10)
  10. dc.plot(kind='bar', ax=ax3, rot=0, title='Destination City')
  11. status.plot(kind='bar', ax=ax4, rot=0, log=True, title='Flight Status')
  12. flights['DIST'].plot(kind='kde', ax=ax5, xlim=(0, 3000),
  13. title='Distance KDE')
  14. flights['ARR_DELAY'].plot(kind='hist', ax=ax6,
  15. title='Arrival Delay', range=(0,200))
  16. Out[54]: <matplotlib.axes._subplots.AxesSubplot at 0x11a67e3c8>

4. 可视化flights数据集 - 图2

  1. # 添加年份列,并从计划起飞时间(SCHED_DEP)中拆出小时和分钟
  2. In[55]: hour = flights['SCHED_DEP'] // 100
  3. minute = flights['SCHED_DEP'] % 100
  4. df_date = flights[['MONTH', 'DAY']].assign(YEAR=2015, HOUR=hour, MINUTE=minute)
  5. df_date.head()
  6. Out[55]:

4. 可视化flights数据集 - 图3

  1. # 用to_datetime函数,将df_date变为Timestamps对象
  2. In[56]: flight_dep = pd.to_datetime(df_date)
  3. flight_dep.head()
  4. Out[56]: 0 2015-01-01 16:25:00
  5. 1 2015-01-01 08:23:00
  6. 2 2015-01-01 13:05:00
  7. 3 2015-01-01 15:55:00
  8. 4 2015-01-01 17:20:00
  9. dtype: datetime64[ns]
  1. # 用flight_dep作为新的行索引,并根据它统计每周的航班数
  2. In[57]: flights.index = flight_dep
  3. fc = flights.resample('W').size()
  4. fc.plot(figsize=(12,3), title='Flights per Week', grid=True)
  5. Out[57]: <matplotlib.axes._subplots.AxesSubplot at 0x109d116d8>

4. 可视化flights数据集 - 图4

  1. # 如果某周的航班数不超过1000,则将其当作缺失值,然后用interpolate方法填补缺失值
  2. In[58]: fc_miss = fc.where(fc > 1000)
  3. fc_intp = fc_miss.interpolate(limit_direction='both')
  4. ax = fc_intp.plot(color='black', figsize=(16,4))
  5. fc_intp[fc < 500].plot(linewidth=10, grid=True,
  6. color='.8', ax=ax)
  7. ax.annotate(xy=(.8, .55), xytext=(.8, .77),
  8. xycoords='axes fraction', s='missing data',
  9. ha='center', size=20, arrowprops=dict())
  10. ax.set_title('Flights per Week (Interpolated Missing Data)')
  11. Out[58]: Text(0.5,1,'Flights per Week (Interpolated Missing Data)')

4. 可视化flights数据集 - 图5

  1. # 找出平均入境航程最长的10个机场(只考虑入境航班数超过100的机场)
  2. In[59]: flights.groupby('DEST_AIR')['DIST'] \
  3. .agg(['mean', 'count']) \
  4. .query('count > 100') \
  5. .sort_values('mean') \
  6. .tail(10) \
  7. .plot(kind='bar', y='mean', legend=False,
  8. rot=0, figsize=(14,4),
  9. title='Average Distance per Destination')
  10. Out[59]: <matplotlib.axes._subplots.AxesSubplot at 0x11a480dd8>

4. 可视化flights数据集 - 图6

  1. # 画出飞行时间和航程的散点图(只保留航程不超过2000的航班)
  2. In[60]: fs = flights.reset_index(drop=True)[['DIST', 'AIR_TIME']].query('DIST <= 2000').dropna()
  3. fs.plot(x='DIST', y='AIR_TIME', kind='scatter', s=1, figsize=(16,4))
  4. Out[60]: <matplotlib.axes._subplots.AxesSubplot at 0x11a49b860>

4. 可视化flights数据集 - 图7

  1. # 用cut函数,将航班距离分成八组
  2. In[61]: fs['DIST_GROUP'] = pd.cut(fs['DIST'], bins=range(0, 2001, 250))
  3. fs['DIST_GROUP'].value_counts().sort_index()
  4. Out[61]: (0, 250] 6529
  5. (250, 500] 12631
  6. (500, 750] 11506
  7. (750, 1000] 8832
  8. (1000, 1250] 5071
  9. (1250, 1500] 3198
  10. (1500, 1750] 3885
  11. (1750, 2000] 1815
  12. Name: DIST_GROUP, dtype: int64
  1. # 在每个距离组内对飞行时间做标准化,得到z分数(即与组内均值相差多少个标准差)
  2. In[62]: normalize = lambda x: (x - x.mean()) / x.std()
  3. fs['TIME_SCORE'] = fs.groupby('DIST_GROUP')['AIR_TIME'] \
  4. .transform(normalize)
  5. fs.head()
  6. Out[62]:

4. 可视化flights数据集 - 图8

  1. # 用boxplot方法画出异常值
  2. In[63]: ax = fs.boxplot(by='DIST_GROUP', column='TIME_SCORE', figsize=(16,4))
  3. ax.set_title('Z-Scores for Distance Groups')
  4. ax.figure.suptitle('')
  5. /Users/Ted/anaconda/lib/python3.6/site-packages/numpy/core/fromnumeric.py:57: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
  6. return getattr(obj, method)(*args, **kwds)
  7. Out[63]: Text(0.5,0.98,'')

4. 可视化flights数据集 - 图9

  1. # 检查比组内均值高出6个标准差以上的点,并用一个DataFrame记录这些异常点。
  2. In[64]: outliers = flights.iloc[fs[fs['TIME_SCORE'] > 6].index]
  3. outliers = outliers[['AIRLINE','ORG_AIR', 'DEST_AIR', 'AIR_TIME',
  4. 'DIST', 'ARR_DELAY', 'DIVERTED']]
  5. outliers['PLOT_NUM'] = range(1, len(outliers) + 1)
  6. outliers
  7. Out[64]:

4. 可视化flights数据集 - 图10

  1. # 可以用这张表的数据来确定异常值。pandas提供了将表格附加于图片底部的方法(plot的table参数)。
  2. In[65]: ax = fs.plot(x='DIST', y='AIR_TIME',
  3. kind='scatter', s=1,
  4. figsize=(16,4), table=outliers)
  5. outliers.plot(x='DIST', y='AIR_TIME',
  6. kind='scatter', s=25, ax=ax, grid=True)
  7. outs = outliers[['AIR_TIME', 'DIST', 'PLOT_NUM']]
  8. for t, d, n in outs.itertuples(index=False):
  9. ax.text(d + 5, t + 5, str(n))
  10. plt.setp(ax.get_xticklabels(), y=.1)
  11. plt.setp(ax.get_xticklines(), visible=False)
  12. ax.set_xlabel('')
  13. ax.set_title('Flight Time vs Distance with Outliers')
  14. Out[65]: Text(0.5,1,'Flight Time vs Distance with Outliers')

4. 可视化flights数据集 - 图11