第03章数据分析入门 - 2. 改变数据类型，降低内存消耗 - 《Pandas Cookbook 带注释源码》

2. 改变数据类型，降低内存消耗

#  选取五列
 In[13]: college = pd.read_csv('data/college.csv')
         different_cols = ['RELAFFIL', 'SATMTMID', 'CURROPER', 'INSTNM', 'STABBR']
         col2 = college.loc[:, different_cols]
         col2.head()
Out[13]:

#  查看数据类型
 In[14]: col2.dtypes
Out[14]: RELAFFIL      int64
         SATMTMID    float64
         CURROPER      int64
         INSTNM       object
         STABBR       object
         dtype: object

#  用memory_usage方法查看每列的内存消耗
 In[15]: original_mem = col2.memory_usage(deep=True)
         original_mem
Out[15]: Index           80
         RELAFFIL     60280
         SATMTMID     60280
         CURROPER     60280
         INSTNM      660240
         STABBR      444565
         dtype: int64

#  RELAFFIL这列只包含0或1，因此没必要用64位，使用astype方法将其变为8位（1字节）整数
 In[16]: col2['RELAFFIL'] = col2['RELAFFIL'].astype(np.int8)
#  再次查看数据类型
 In[17]: col2.dtypes
Out[17]: RELAFFIL       int8
         SATMTMID    float64
         CURROPER      int64
         INSTNM       object
         STABBR       object
         dtype: object

#  检查两个对象列的独立值的个数
 In[18]: col2.select_dtypes(include=['object']).nunique()
Out[18]: INSTNM    7535
         STABBR      59
         dtype: int64

#  STABBR列可以转变为“类型”（Categorical），独立值的个数小于总数的1%
 In[19]: col2['STABBR'] = col2['STABBR'].astype('category')
         col2.dtypes
Out[19]: RELAFFIL        int8
         SATMTMID     float64
         CURROPER       int64
         INSTNM        object
         STABBR      category
         dtype: object

#  再次检查内存的使用
 In[20]: new_mem = col2.memory_usage(deep=True)
         new_mem
Out[20]: Index           80
         RELAFFIL      7535
         SATMTMID     60280
         CURROPER     60280
         INSTNM      660699
         STABBR       13576
         dtype: int64

#  通过和原始数据比较，RELAFFIL列变为了原来的八分之一，STABBR列只有原始大小的3%
 In[21]: new_mem / original_mem
Out[21]: Index       1.000000
         RELAFFIL    0.125000
         SATMTMID    1.000000
         CURROPER    1.000000
         INSTNM      1.000695
         STABBR      0.030538
         dtype: float64

#  CURROPER和INSTNM分别是int64和对象类型
 In[22]: college = pd.read_csv('data/college.csv')
 In[23]: college[['CURROPER', 'INSTNM']].memory_usage(deep=True)
Out[23]: Index           80
         CURROPER     60280
         INSTNM      660240
         dtype: int64

#  CURROPER列加上了10000000，但是内存使用没有变化；但是INSTNM列加上了一个a，内存消耗增加了105字节
 In[24]: college.loc[0, 'CURROPER'] = 10000000
         college.loc[0, 'INSTNM'] = college.loc[0, 'INSTNM'] + 'a'
         #  college.loc[1, 'INSTNM'] = college.loc[1, 'INSTNM'] + 'a'
         college[['CURROPER', 'INSTNM']].memory_usage(deep=True)
Out[24]: Index           80
         CURROPER     60280
         INSTNM      660345
         dtype: int64

#  数据字典中的信息显示MENONLY这列只包含0和1，但是由于含有缺失值，它的类型是浮点型
 In[25]: college['MENONLY'].dtype
Out[25]: dtype('float64')

#  任何数值类型的列，只要有一个缺失值，就会成为浮点型；这列中的任何整数都会强制成为浮点型
 In[26]: college['MENONLY'].astype('int8') #  ValueError: Cannot convert non-finite values (NA or inf) to integer
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-26-98afc27c1701> in <module>()
----> 1 college['MENONLY'].astype('int8') #  ValueError: Cannot convert non-finite values (NA or inf) to integer
~/anaconda3/lib/python3.6/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
    116                 else:
    117                     kwargs[new_arg_name] = new_arg_value
--> 118             return func(*args, **kwargs)
    119         return wrapper
    120     return _deprecate_kwarg
~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors, **kwargs)
   4002         #  else, only a single dtype is given
   4003         new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 4004                                      **kwargs)
   4005         return self._constructor(new_data).__finalize__(self)
   4006 
~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in astype(self, dtype, **kwargs)
   3455 
   3456     def astype(self, dtype, **kwargs):
-> 3457         return self.apply('astype', dtype=dtype, **kwargs)
   3458 
   3459     def convert(self, **kwargs):
~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
   3322 
   3323             kwargs['mgr'] = self
-> 3324             applied = getattr(b, f)(**kwargs)
   3325             result_blocks = _extend_blocks(applied, result_blocks)
   3326 
~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in astype(self, dtype, copy, errors, values, **kwargs)
    542     def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
    543         return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 544                             **kwargs)
    545 
    546     def _astype(self, dtype, copy=False, errors='raise', values=None,
~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in _astype(self, dtype, copy, errors, values, klass, mgr, **kwargs)
    623 
    624                 #  _astype_nansafe works fine with 1-d only
--> 625                 values = astype_nansafe(values.ravel(), dtype, copy=True)
    626                 values = values.reshape(self.shape)
    627 
~/anaconda3/lib/python3.6/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy)
    685 
    686         if not np.isfinite(arr).all():
--> 687             raise ValueError('Cannot convert non-finite values (NA or inf) to '
    688                              'integer')
    689 
ValueError: Cannot convert non-finite values (NA or inf) to integer

#  对于数据类型，可以替换字符串名：27、28、30、31是等价的
 In[27]: college.describe(include=['int64', 'float64']).T
Out[27]:

 In[28]: college.describe(include=[np.int64, np.float64]).T
Out[28]:

 In[29]: college['RELAFFIL'] = college['RELAFFIL'].astype(np.int8)
 In[30]: college.describe(include=['int', 'float']).T  #  defaults to 64 bit int/floats
Out[30]:

 In[31]: college.describe(include=['number']).T  #  also works as the default int/float are 64 bits
Out[31]:

#  转变数据类型时也可以如法炮制
 In[32]: college['MENONLY'] = college['MENONLY'].astype('float16')
         college['RELAFFIL'] = college['RELAFFIL'].astype('int8')
 In[33]: college.index = pd.Int64Index(college.index)
         college.index.memory_usage()
Out[33]: 60280

2. 改变数据类型，降低内存消耗

2. 改变数据类型，降低内存消耗

更多