2. 改变数据类型,降低内存消耗

  1. # 选取五列
  2. In[13]: college = pd.read_csv('data/college.csv')
  3. different_cols = ['RELAFFIL', 'SATMTMID', 'CURROPER', 'INSTNM', 'STABBR']
  4. col2 = college.loc[:, different_cols]
  5. col2.head()
  6. Out[13]:

2. 改变数据类型,降低内存消耗 - 图1

  1. # 查看数据类型
  2. In[14]: col2.dtypes
  3. Out[14]: RELAFFIL int64
  4. SATMTMID float64
  5. CURROPER int64
  6. INSTNM object
  7. STABBR object
  8. dtype: object
  1. # 用memory_usage方法查看每列的内存消耗
  2. In[15]: original_mem = col2.memory_usage(deep=True)
  3. original_mem
  4. Out[15]: Index 80
  5. RELAFFIL 60280
  6. SATMTMID 60280
  7. CURROPER 60280
  8. INSTNM 660240
  9. STABBR 444565
  10. dtype: int64
  1. # RELAFFIL这列只包含0或1,因此没必要用64位,使用astype方法将其变为8位(1字节)整数
  2. In[16]: col2['RELAFFIL'] = col2['RELAFFIL'].astype(np.int8)
  3. # 再次查看数据类型
  4. In[17]: col2.dtypes
  5. Out[17]: RELAFFIL int8
  6. SATMTMID float64
  7. CURROPER int64
  8. INSTNM object
  9. STABBR object
  10. dtype: object
  1. # 检查两个对象列的独立值的个数
  2. In[18]: col2.select_dtypes(include=['object']).nunique()
  3. Out[18]: INSTNM 7535
  4. STABBR 59
  5. dtype: int64
  1. # STABBR列的独立值个数不到总行数的1%,适合转换为分类类型(Categorical)
  2. In[19]: col2['STABBR'] = col2['STABBR'].astype('category')
  3. col2.dtypes
  4. Out[19]: RELAFFIL int8
  5. SATMTMID float64
  6. CURROPER int64
  7. INSTNM object
  8. STABBR category
  9. dtype: object
  1. # 再次检查内存的使用
  2. In[20]: new_mem = col2.memory_usage(deep=True)
  3. new_mem
  4. Out[20]: Index 80
  5. RELAFFIL 7535
  6. SATMTMID 60280
  7. CURROPER 60280
  8. INSTNM 660699
  9. STABBR 13576
  10. dtype: int64
  1. # 通过和原始数据比较,RELAFFIL列变为了原来的八分之一,STABBR列只有原始大小的3%
  2. In[21]: new_mem / original_mem
  3. Out[21]: Index 1.000000
  4. RELAFFIL 0.125000
  5. SATMTMID 1.000000
  6. CURROPER 1.000000
  7. INSTNM 1.000695
  8. STABBR 0.030538
  9. dtype: float64

更多

  1. # CURROPER和INSTNM分别是int64和对象类型
  2. In[22]: college = pd.read_csv('data/college.csv')
  3. In[23]: college[['CURROPER', 'INSTNM']].memory_usage(deep=True)
  4. Out[23]: Index 80
  5. CURROPER 60280
  6. INSTNM 660240
  7. dtype: int64
  1. # 将CURROPER列的第一个值改为10000000,内存使用没有变化;而给INSTNM列的第一个值追加一个字母a后,内存消耗增加了105字节
  2. In[24]: college.loc[0, 'CURROPER'] = 10000000
  3. college.loc[0, 'INSTNM'] = college.loc[0, 'INSTNM'] + 'a'
  4. # college.loc[1, 'INSTNM'] = college.loc[1, 'INSTNM'] + 'a'
  5. college[['CURROPER', 'INSTNM']].memory_usage(deep=True)
  6. Out[24]: Index 80
  7. CURROPER 60280
  8. INSTNM 660345
  9. dtype: int64
  1. # 数据字典中的信息显示MENONLY这列只包含0和1,但是由于含有缺失值,它的类型是浮点型
  2. In[25]: college['MENONLY'].dtype
  3. Out[25]: dtype('float64')
  1. # 任何数值类型的列,只要有一个缺失值,就会成为浮点型;这列中的任何整数都会强制成为浮点型
  2. In[26]: college['MENONLY'].astype('int8') # ValueError: Cannot convert non-finite values (NA or inf) to integer
  3. ---------------------------------------------------------------------------
  4. ValueError Traceback (most recent call last)
  5. <ipython-input-26-98afc27c1701> in <module>()
  6. ----> 1 college['MENONLY'].astype('int8') # ValueError: Cannot convert non-finite values (NA or inf) to integer
  7. ~/anaconda3/lib/python3.6/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
  8. 116 else:
  9. 117 kwargs[new_arg_name] = new_arg_value
  10. --> 118 return func(*args, **kwargs)
  11. 119 return wrapper
  12. 120 return _deprecate_kwarg
  13. ~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors, **kwargs)
  14. 4002 # else, only a single dtype is given
  15. 4003 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
  16. -> 4004 **kwargs)
  17. 4005 return self._constructor(new_data).__finalize__(self)
  18. 4006
  19. ~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in astype(self, dtype, **kwargs)
  20. 3455
  21. 3456 def astype(self, dtype, **kwargs):
  22. -> 3457 return self.apply('astype', dtype=dtype, **kwargs)
  23. 3458
  24. 3459 def convert(self, **kwargs):
  25. ~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
  26. 3322
  27. 3323 kwargs['mgr'] = self
  28. -> 3324 applied = getattr(b, f)(**kwargs)
  29. 3325 result_blocks = _extend_blocks(applied, result_blocks)
  30. 3326
  31. ~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in astype(self, dtype, copy, errors, values, **kwargs)
  32. 542 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
  33. 543 return self._astype(dtype, copy=copy, errors=errors, values=values,
  34. --> 544 **kwargs)
  35. 545
  36. 546 def _astype(self, dtype, copy=False, errors='raise', values=None,
  37. ~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in _astype(self, dtype, copy, errors, values, klass, mgr, **kwargs)
  38. 623
  39. 624 # _astype_nansafe works fine with 1-d only
  40. --> 625 values = astype_nansafe(values.ravel(), dtype, copy=True)
  41. 626 values = values.reshape(self.shape)
  42. 627
  43. ~/anaconda3/lib/python3.6/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy)
  44. 685
  45. 686 if not np.isfinite(arr).all():
  46. --> 687 raise ValueError('Cannot convert non-finite values (NA or inf) to '
  47. 688 'integer')
  48. 689
  49. ValueError: Cannot convert non-finite values (NA or inf) to integer
  1. # 指定数据类型时,可以用字符串名代替NumPy类型对象(例如用'int64'代替np.int64):In[27]、In[28]、In[30]、In[31]是等价的
  2. In[27]: college.describe(include=['int64', 'float64']).T
  3. Out[27]:

2. 改变数据类型,降低内存消耗 - 图2

  1. In[28]: college.describe(include=[np.int64, np.float64]).T
  2. Out[28]:

2. 改变数据类型,降低内存消耗 - 图3

  1. In[29]: college['RELAFFIL'] = college['RELAFFIL'].astype(np.int8)
  2. In[30]: college.describe(include=['int', 'float']).T # defaults to 64 bit int/floats
  3. Out[30]:

2. 改变数据类型,降低内存消耗 - 图4

  1. In[31]: college.describe(include=['number']).T # also works as the default int/float are 64 bits
  2. Out[31]:

2. 改变数据类型,降低内存消耗 - 图5

  1. # 转变数据类型时也可以如法炮制
  2. In[32]: college['MENONLY'] = college['MENONLY'].astype('float16')
  3. college['RELAFFIL'] = college['RELAFFIL'].astype('int8')
  4. In[33]: college.index = pd.Int64Index(college.index)
  5. college.index.memory_usage()
  6. Out[33]: 60280