Cross Decomposition

Cross decomposition / Example 1: Compare cross decomposition methods

http://scikit-learn.org/stable/auto_examples/cross_decomposition/plot_compare_cross_decomposition.html

The purpose of this example is to compare several cross decomposition methods. Cross decomposition uses latent variable modeling to find the principal correlated components between two matrices. In contrast to manifest variables, i.e. ordinary observed variables, a latent variable is an unknown factor that may influence what is observed in an experiment.
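As a point of reference (standard PLS notation, not taken from the example itself), the latent variable model behind these methods can be sketched as

```latex
X = T P^{\top} + E, \qquad Y = U Q^{\top} + F
```

where $T$ and $U$ are the latent score matrices, $P$ and $Q$ the loading matrices, and $E$, $F$ residual noise; the methods compared below differ in how they couple $T$ and $U$.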

(1) Importing libraries and generating the dataset

The following libraries are imported (a short sketch of their shared API follows the list):

  1. matplotlib.pyplot: used for plotting
  2. sklearn.cross_decomposition: the cross decomposition module, providing the three estimators below
  3. PLSCanonical: canonical Partial Least Squares
  4. PLSRegression: Partial Least Squares regression
  5. CCA: Canonical Correlation Analysis
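All three estimators share the same paired-sample interface: fit on two data blocks, then project both onto the latent components. A minimal sketch (the random data here is a placeholder, not part of the example itself):

```python
import numpy as np
from sklearn.cross_decomposition import PLSCanonical, PLSRegression, CCA

X = np.random.normal(size=(100, 4))  # placeholder data block X
Y = np.random.normal(size=(100, 4))  # paired data block Y

for est in (PLSCanonical(n_components=2),
            PLSRegression(n_components=2),
            CCA(n_components=2)):
    est.fit(X, Y)                   # fit on the paired samples
    X_r, Y_r = est.transform(X, Y)  # scores for both blocks
    print(type(est).__name__, X_r.shape, Y_r.shape)
```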
```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_decomposition import PLSCanonical, PLSRegression, CCA

# First, generate 500 samples from a latent variable model
n = 500
# There are two latent variables:
l1 = np.random.normal(size=n)
l2 = np.random.normal(size=n)
# np.array([l1, l1, l2, l2]).shape == (4, 500)
# latents is a 500 x 4 matrix after transposing
latents = np.array([l1, l1, l2, l2]).T
# Add independent noise to form the X and Y matrices
X = latents + np.random.normal(size=4 * n).reshape((n, 4))
Y = latents + np.random.normal(size=4 * n).reshape((n, 4))
# Split into train/test halves (integer division for Python 3)
X_train = X[:n // 2]
Y_train = Y[:n // 2]
X_test = X[n // 2:]
Y_test = Y[n // 2:]
# numpy.corrcoef(x, y=None) computes Pearson product-moment correlation coefficients
print("Corr(X)")
print(np.round(np.corrcoef(X.T), 2))
print("Corr(Y)")
print(np.round(np.corrcoef(Y.T), 2))
```
```
Corr(X)
[[ 1.    0.48  0.02  0.  ]
 [ 0.48  1.    0.02 -0.02]
 [ 0.02  0.02  1.    0.51]
 [ 0.   -0.02  0.51  1.  ]]
Corr(Y)
[[ 1.    0.49 -0.01  0.05]
 [ 0.49  1.   -0.06  0.06]
 [-0.01 -0.06  1.    0.53]
 [ 0.05  0.06  0.53  1.  ]]
```
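The block structure above is expected: columns 1 and 2 of X are both l1 plus independent unit-variance noise, so their theoretical correlation is Var(l1) / (Var(l1) + Var(noise)) = 0.5, matching the ≈0.5 entries (the same holds for columns 3 and 4 via l2). A quick numerical check of this claim, reusing l1 and n from above:

```python
# corr(l1 + e1, l1 + e2) = 1 / (1 + 1) = 0.5 for unit-variance l1, e1, e2
e1 = np.random.normal(size=n)
e2 = np.random.normal(size=n)
print(np.round(np.corrcoef(l1 + e1, l1 + e2)[0, 1], 2))  # ≈ 0.5
```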
(2) Canonical (symmetric) PLS

```python
# Canonical (symmetric) PLS
# Transform data
# ~~~~~~~~~~~~~~
plsca = PLSCanonical(n_components=2)
plsca.fit(X_train, Y_train)
X_train_r, Y_train_r = plsca.transform(X_train, Y_train)
X_test_r, Y_test_r = plsca.transform(X_test, Y_test)

# Scatter plot of scores
# ~~~~~~~~~~~~~~~~~~~~~~
# 1) On diagonal, plot X vs Y scores on each component
plt.figure(figsize=(12, 8), dpi=600)
plt.subplot(221)
plt.plot(X_train_r[:, 0], Y_train_r[:, 0], "ob", label="train")
plt.plot(X_test_r[:, 0], Y_test_r[:, 0], "or", label="test")
plt.xlabel("x scores")
plt.ylabel("y scores")
plt.title('Comp. 1: X vs Y (test corr = %.2f)' %
          np.corrcoef(X_test_r[:, 0], Y_test_r[:, 0])[0, 1])
plt.xticks(())
plt.yticks(())
plt.legend(loc="best")

plt.subplot(224)
plt.plot(X_train_r[:, 1], Y_train_r[:, 1], "ob", label="train")
plt.plot(X_test_r[:, 1], Y_test_r[:, 1], "or", label="test")
plt.xlabel("x scores")
plt.ylabel("y scores")
plt.title('Comp. 2: X vs Y (test corr = %.2f)' %
          np.corrcoef(X_test_r[:, 1], Y_test_r[:, 1])[0, 1])
plt.xticks(())
plt.yticks(())
plt.legend(loc="best")

# 2) Off diagonal, plot components 1 vs 2 for X and Y
plt.subplot(222)
plt.plot(X_train_r[:, 0], X_train_r[:, 1], "*b", label="train")
plt.plot(X_test_r[:, 0], X_test_r[:, 1], "*r", label="test")
plt.xlabel("X comp. 1")
plt.ylabel("X comp. 2")
plt.title('X comp. 1 vs X comp. 2 (test corr = %.2f)'
          % np.corrcoef(X_test_r[:, 0], X_test_r[:, 1])[0, 1])
plt.legend(loc="best")
plt.xticks(())
plt.yticks(())

plt.subplot(223)
plt.plot(Y_train_r[:, 0], Y_train_r[:, 1], "*b", label="train")
plt.plot(Y_test_r[:, 0], Y_test_r[:, 1], "*r", label="test")
plt.xlabel("Y comp. 1")
plt.ylabel("Y comp. 2")
plt.title('Y comp. 1 vs Y comp. 2 (test corr = %.2f)'
          % np.corrcoef(Y_test_r[:, 0], Y_test_r[:, 1])[0, 1])
plt.legend(loc="best")
plt.xticks(())
plt.yticks(())
plt.show()
```

(Figure: PLSCanonical scores. Diagonal panels: X vs Y scores for components 1 and 2; off-diagonal panels: component 1 vs component 2 within X and within Y; train in blue, test in red.)
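Numerically, the figure says that matching components of the X and Y scores are strongly correlated while everything else is nearly uncorrelated. This can be verified directly (a small sketch reusing the fitted plsca scores from above):

```python
# 4x4 correlation matrix of [X test scores | Y test scores]:
# the large off-block entries pair component k of X with component k of Y
print(np.round(np.corrcoef(np.hstack([X_test_r, Y_test_r]).T), 2))
```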

(3) PLS regression (PLS2, PLS1) and CCA

```python
###############################################################################
# PLS regression, with multivariate response, a.k.a. PLS2
n = 1000
q = 3
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
B = np.array([[1, 2] + [0] * (p - 2)] * q).T
# each Yj = 1*X1 + 2*X2 + noise
Y = np.dot(X, B) + np.random.normal(size=n * q).reshape((n, q)) + 5

pls2 = PLSRegression(n_components=3)
pls2.fit(X, Y)
print("True B (such that: Y = XB + Err)")
print(B)
# compare pls2.coef_ with B
print("Estimated B")
print(np.round(pls2.coef_, 1))
pls2.predict(X)

###############################################################################
# PLS regression, with univariate response, a.k.a. PLS1
n = 1000
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5

pls1 = PLSRegression(n_components=3)
pls1.fit(X, y)
# note that the number of components exceeds 1 (the dimension of y)
print("Estimated betas")
print(np.round(pls1.coef_, 1))

###############################################################################
# CCA (PLS mode B with symmetric deflation)
cca = CCA(n_components=2)
cca.fit(X_train, Y_train)
X_train_r, Y_train_r = cca.transform(X_train, Y_train)
X_test_r, Y_test_r = cca.transform(X_test, Y_test)
```
```
True B (such that: Y = XB + Err)
[[1 1 1]
 [2 2 2]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]]
Estimated B
[[ 1.   1.   1. ]
 [ 2.   1.9  2. ]
 [ 0.   0.   0. ]
 [ 0.   0.   0. ]
 [ 0.   0.   0. ]
 [ 0.   0.  -0.1]
 [ 0.   0.   0. ]
 [ 0.   0.   0.1]
 [ 0.   0.   0. ]
 [ 0.   0.   0. ]]
Estimated betas
[[ 1. ]
 [ 2. ]
 [ 0. ]
 [ 0. ]
 [ 0. ]
 [ 0. ]
 [ 0. ]
 [-0.1]
 [ 0. ]
 [ 0. ]]
```
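The call pls2.predict(X) above discards its result; a quick way to see what it returns (a sketch that must run right after pls2.fit(X, Y), before X is regenerated for the PLS1 section):

```python
Y_pred = pls2.predict(X)  # shape (n, q); includes the fitted intercept (the +5 offset)
print(Y_pred.shape)
# mean squared error should sit near 1.0, the variance of the added noise
print(np.round(np.mean((Y - Y_pred) ** 2), 2))
```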

(4) Complete source code

Python source code: plot_compare_cross_decomposition.py

http://scikit-learn.org/stable/_downloads/plot_compare_cross_decomposition.py

```python
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_decomposition import PLSCanonical, PLSRegression, CCA

###############################################################################
# Dataset based latent variables model
n = 500
# 2 latent variables:
l1 = np.random.normal(size=n)
l2 = np.random.normal(size=n)

latents = np.array([l1, l1, l2, l2]).T
X = latents + np.random.normal(size=4 * n).reshape((n, 4))
Y = latents + np.random.normal(size=4 * n).reshape((n, 4))

X_train = X[:n // 2]
Y_train = Y[:n // 2]
X_test = X[n // 2:]
Y_test = Y[n // 2:]

print("Corr(X)")
print(np.round(np.corrcoef(X.T), 2))
print("Corr(Y)")
print(np.round(np.corrcoef(Y.T), 2))

###############################################################################
# Canonical (symmetric) PLS

# Transform data
# ~~~~~~~~~~~~~~
plsca = PLSCanonical(n_components=2)
plsca.fit(X_train, Y_train)
X_train_r, Y_train_r = plsca.transform(X_train, Y_train)
X_test_r, Y_test_r = plsca.transform(X_test, Y_test)

# Scatter plot of scores
# ~~~~~~~~~~~~~~~~~~~~~~
# 1) On diagonal, plot X vs Y scores on each component
plt.figure(figsize=(12, 8))
plt.subplot(221)
plt.plot(X_train_r[:, 0], Y_train_r[:, 0], "ob", label="train")
plt.plot(X_test_r[:, 0], Y_test_r[:, 0], "or", label="test")
plt.xlabel("x scores")
plt.ylabel("y scores")
plt.title('Comp. 1: X vs Y (test corr = %.2f)' %
          np.corrcoef(X_test_r[:, 0], Y_test_r[:, 0])[0, 1])
plt.xticks(())
plt.yticks(())
plt.legend(loc="best")

plt.subplot(224)
plt.plot(X_train_r[:, 1], Y_train_r[:, 1], "ob", label="train")
plt.plot(X_test_r[:, 1], Y_test_r[:, 1], "or", label="test")
plt.xlabel("x scores")
plt.ylabel("y scores")
plt.title('Comp. 2: X vs Y (test corr = %.2f)' %
          np.corrcoef(X_test_r[:, 1], Y_test_r[:, 1])[0, 1])
plt.xticks(())
plt.yticks(())
plt.legend(loc="best")

# 2) Off diagonal, plot components 1 vs 2 for X and Y
plt.subplot(222)
plt.plot(X_train_r[:, 0], X_train_r[:, 1], "*b", label="train")
plt.plot(X_test_r[:, 0], X_test_r[:, 1], "*r", label="test")
plt.xlabel("X comp. 1")
plt.ylabel("X comp. 2")
plt.title('X comp. 1 vs X comp. 2 (test corr = %.2f)'
          % np.corrcoef(X_test_r[:, 0], X_test_r[:, 1])[0, 1])
plt.legend(loc="best")
plt.xticks(())
plt.yticks(())

plt.subplot(223)
plt.plot(Y_train_r[:, 0], Y_train_r[:, 1], "*b", label="train")
plt.plot(Y_test_r[:, 0], Y_test_r[:, 1], "*r", label="test")
plt.xlabel("Y comp. 1")
plt.ylabel("Y comp. 2")
plt.title('Y comp. 1 vs Y comp. 2 (test corr = %.2f)'
          % np.corrcoef(Y_test_r[:, 0], Y_test_r[:, 1])[0, 1])
plt.legend(loc="best")
plt.xticks(())
plt.yticks(())
plt.show()

###############################################################################
# PLS regression, with multivariate response, a.k.a. PLS2
n = 1000
q = 3
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
B = np.array([[1, 2] + [0] * (p - 2)] * q).T
# each Yj = 1*X1 + 2*X2 + noise
Y = np.dot(X, B) + np.random.normal(size=n * q).reshape((n, q)) + 5

pls2 = PLSRegression(n_components=3)
pls2.fit(X, Y)
print("True B (such that: Y = XB + Err)")
print(B)
# compare pls2.coef_ with B
print("Estimated B")
print(np.round(pls2.coef_, 1))
pls2.predict(X)

###############################################################################
# PLS regression, with univariate response, a.k.a. PLS1
n = 1000
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5

pls1 = PLSRegression(n_components=3)
pls1.fit(X, y)
# note that the number of components exceeds 1 (the dimension of y)
print("Estimated betas")
print(np.round(pls1.coef_, 1))

###############################################################################
# CCA (PLS mode B with symmetric deflation)
cca = CCA(n_components=2)
cca.fit(X_train, Y_train)
X_train_r, Y_train_r = cca.transform(X_train, Y_train)
X_test_r, Y_test_r = cca.transform(X_test, Y_test)
```
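As a small follow-up, the two symmetric methods can be compared on the per-component test correlations they recover. A sketch that assumes the full script above has just run, so plsca and cca are both fitted on the same X_train, Y_train:

```python
for name, est in [("PLSCanonical", plsca), ("CCA", cca)]:
    Xr, Yr = est.transform(X_test, Y_test)  # test-set scores for each block
    corrs = [np.corrcoef(Xr[:, k], Yr[:, k])[0, 1] for k in range(2)]
    print(name, np.round(corrs, 2))
```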