7. 用链式方法重现idxmax

  1. # 和前面一样,只选出数值列
  2. In[76]: college = pd.read_csv('data/college.csv', index_col='INSTNM')
  3. cols = ['MD_EARN_WNE_P10', 'GRAD_DEBT_MDN_SUPP']
  4. for col in cols:
  5. college[col] = pd.to_numeric(college[col], errors='coerce')
  6. college_n = college.select_dtypes(include=[np.number])
  7. criteria = college_n.nunique() == 2
  8. binary_cols = college_n.columns[criteria].tolist()
  9. college_n = college_n.drop(labels=binary_cols, axis='columns')
  10. In[77]: college_n.max().head()
  11. Out[77]: SATVRMID 765.0
  12. SATMTMID 785.0
  13. UGDS 151558.0
  14. UGDS_WHITE 1.0
  15. UGDS_BLACK 1.0
  16. dtype: float64
  1. # college_n.max()可以选出每列的最大值,用eq方法比较DataFrame的每个值和该列的最大值
  2. In[78]: college_n.eq(college_n.max()).head()
  3. Out[78]:

7. 用链式方法重现idxmax - 图1

  1. # 用any方法,选出至少包含一个True值的行
  2. In[79]: has_row_max = college_n.eq(college_n.max()).any(axis='columns')
  3. has_row_max.head()
  4. Out[79]: INSTNM
  5. Alabama A & M University False
  6. University of Alabama at Birmingham False
  7. Amridge University False
  8. University of Alabama in Huntsville False
  9. Alabama State University False
  10. dtype: bool
  1. # 因为只有18列,has_row_max最多只能有18个True,来看下实际共有多少个
  2. In[80]: college_n.shape
  3. Out[80]: (7535, 18)
  4. In[81]: has_row_max.sum()
  5. Out[81]: 401
  1. # 结果很奇怪,这是因为许多百分比的列的最大值是1。转而使用cumsum()累积求和
  2. In[82]: has_row_max.sum()
  3. In[83]: college_n.eq(college_n.max()).cumsum()
  4. Out[83]:

7. 用链式方法重现idxmax - 图2

  1. # 一些列只有一个最大值,比如SATVRMID和SATMTMID,UGDS_WHITE列却有许多最大值。有109所学校的学生100%是白人。如果再使用一次cunsum,1在每列中就只出现一次,而且会是最大值首次出现的位置:
  2. >>> college_n.eq(college_n.max()).cumsum().cumsum()

7. 用链式方法重现idxmax - 图3

  1. # 现在就可以用eq方法去和1进行比较,然后用any方法,选出所有至少包含一个True值的行
  2. In[84]: has_row_max2 = college_n.eq(college_n.max())\
  3. .cumsum()\
  4. .cumsum()\
  5. .eq(1)\
  6. .any(axis='columns')
  7. has_row_max2.head()
  8. Out[84]: INSTNM
  9. Alabama A & M University False
  10. University of Alabama at Birmingham False
  11. Amridge University False
  12. University of Alabama in Huntsville False
  13. Alabama State University False
  14. dtype: bool
  1. # 查看有多少True值
  2. In[85]: has_row_max2.sum()
  3. Out[85]: 16
  1. # 直接通过布尔索引选出这些学校
  2. In[86]: idxmax_cols = has_row_max2[has_row_max2].index
  3. idxmax_cols
  4. Out[86]: Index(['Thunderbird School of Global Management',
  5. 'Southwest University of Visual Arts-Tucson', 'ABC Beauty College Inc',
  6. 'Velvatex College of Beauty Culture',
  7. 'California Institute of Technology',
  8. 'Le Cordon Bleu College of Culinary Arts-San Francisco',
  9. 'MTI Business College Inc', 'Dongguk University-Los Angeles',
  10. 'Mr Leon's School of Hair Design-Moscow',
  11. 'Haskell Indian Nations University', 'LIU Brentwood',
  12. 'Medical College of Wisconsin', 'Palau Community College',
  13. 'California University of Management and Sciences',
  14. 'Cosmopolitan Beauty and Tech School', 'University of Phoenix-Arizona'],
  15. dtype='object', name='INSTNM')
  1. # 和idxmax方法的结果比较
  2. In[87]: set(college_n.idxmax().unique()) == set(idxmax_cols)
  3. Out[87]: True

更多

  1. # 耗时比较
  2. In[88]: %timeit college_n.idxmax().values
  3. 1.11 ms ± 50.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
  4. Out[89]: %timeit college_n.eq(college_n.max())\
  5. .cumsum()\
  6. .cumsum()\
  7. .eq(1)\
  8. .any(axis='columns')\
  9. [lambda x: x].index
  10. 5.26 ms ± 35.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)