11. 计算城市之间的航班总数

  1. In[92]: flights = pd.read_csv('data/flights.csv')
  2. flights.head()
  3. Out[92]:

11. 计算城市之间的航班总数 - 图1

  1. # 按(出发地, 目的地)分组,求每个方向上的航班总数
  2. In[93]: flights_ct = flights.groupby(['ORG_AIR', 'DEST_AIR']).size()
  3. flights_ct.head()
  4. Out[93]: ORG_AIR DEST_AIR
  5. ATL ABE 31
  6. ABQ 16
  7. ABY 19
  8. ACY 6
  9. AEX 40
  10. dtype: int64
  1. # 选出休斯顿(IAH)和亚特兰大(ATL)之间双方向的航班总数
  2. In[94]: flights_ct.loc[[('ATL', 'IAH'), ('IAH', 'ATL')]]
  3. Out[94]: ORG_AIR DEST_AIR
  4. ATL IAH 121
  5. IAH ATL 148
  6. dtype: int64
  1. # 对每一行的出发地和目的地按字母顺序排序(注:在pandas 0.23及以后的版本中,需传入result_type='broadcast'才能得到保留原列名的DataFrame,否则返回的是由列表组成的Series)
  2. In[95]: flights_sort = flights[['ORG_AIR', 'DEST_AIR']].apply(sorted, axis=1)
  3. flights_sort.head()
  4. Out[95]:

11. 计算城市之间的航班总数 - 图2

  1. # 因为每行都是独立排序的,ORG_AIR和DEST_AIR这两个列名已不再表示出发地和目的地。对列重命名,然后再计算每两个城市之间(不分方向)的航班数
  2. In[96]: rename_dict = {'ORG_AIR':'AIR1','DEST_AIR':'AIR2'}
  3. flights_sort = flights_sort.rename(columns=rename_dict)
  4. flights_ct2 = flights_sort.groupby(['AIR1', 'AIR2']).size()
  5. flights_ct2.head()
  6. Out[96]: AIR1 AIR2
  7. ABE ATL 31
  8. ORD 24
  9. ABI DFW 74
  10. ABQ ATL 16
  11. DEN 46
  12. dtype: int64
  1. # 找到亚特兰大和休斯顿之间的航班数(两个方向合计:121 + 148 = 269)
  2. In[97]: flights_ct2.loc[('ATL', 'IAH')]
  3. Out[97]: 269
  1. # 如果调换顺序,则会出错:每行已按字母排序,索引中只有('ATL', 'IAH'),没有('IAH', 'ATL')
  2. In[98]: flights_ct2.loc[('IAH', 'ATL')]
  3. ---------------------------------------------------------------------------
  4. IndexingError Traceback (most recent call last)
  5. <ipython-input-98-56147a7d0bb5> in <module>()
  6. ----> 1 flights_ct2.loc[('IAH', 'ATL')]
  7. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py in __getitem__(self, key)
  8. 1323 except (KeyError, IndexError):
  9. 1324 pass
  10. -> 1325 return self._getitem_tuple(key)
  11. 1326 else:
  12. 1327 key = com._apply_if_callable(key, self.obj)
  13. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
  14. 839
  15. 840 # no multi-index, so validate all of the indexers
  16. --> 841 self._has_valid_tuple(tup)
  17. 842
  18. 843 # ugly hack for GH # 836
  19. /Users/Ted/anaconda/lib/python3.6/site-packages/pandas/core/indexing.py in _has_valid_tuple(self, key)
  20. 186 for i, k in enumerate(key):
  21. 187 if i >= self.obj.ndim:
  22. --> 188 raise IndexingError('Too many indexers')
  23. 189 if not self._has_valid_type(k, i):
  24. 190 raise ValueError("Location based indexing can only have [%s] "
  25. IndexingError: Too many indexers

更多

  1. # 用NumPy的sort函数可以大大提高速度(np.sort默认沿最后一个轴排序,即对每一行分别排序)
  2. In[99]: data_sorted = np.sort(flights[['ORG_AIR', 'DEST_AIR']])
  3. data_sorted[:10]
  4. Out[99]: array([['LAX', 'SLC'],
  5. ['DEN', 'IAD'],
  6. ['DFW', 'VPS'],
  7. ['DCA', 'DFW'],
  8. ['LAX', 'MCI'],
  9. ['IAH', 'SAN'],
  10. ['DFW', 'MSY'],
  11. ['PHX', 'SFO'],
  12. ['ORD', 'STL'],
  13. ['IAH', 'SJC']], dtype=object)
  1. # 重新用DataFrame构造器创建一个DataFrame,检测其是否与flights_sort相等
  2. In[100]: flights_sort2 = pd.DataFrame(data_sorted, columns=['AIR1', 'AIR2'])
  3. fs_orig = flights_sort.rename(columns={'ORG_AIR':'AIR1', 'DEST_AIR':'AIR2'})
  4. flights_sort2.equals(fs_orig)
  5. Out[100]: True
  1. # 比较两种方法的速度(apply约7.82 s,np.sort约10.9 ms,快约700倍)
  2. In[101]: %timeit flights_sort = flights[['ORG_AIR', 'DEST_AIR']].apply(sorted, axis=1)
  3. 7.82 s ± 189 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
  4. In[102]: %%timeit
  5. data_sorted = np.sort(flights[['ORG_AIR', 'DEST_AIR']])
  6. flights_sort2 = pd.DataFrame(data_sorted, columns=['AIR1', 'AIR2'])
  7. 10.9 ms ± 325 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)