PCA
September 28, 2021
1 PCA (Principal Component Analysis)
This notebook shows how to plot a PCA with scikit-learn and statsmodels, with or without
normalization.

[1]: %matplotlib inline
[2]: import matplotlib.pyplot as plt
     plt.style.use('ggplot')
[3]: from jyquickhelper import add_notebook_menu
     add_notebook_menu()
More about PCA: Implementing a Principal Component Analysis (PCA) in Python step by step.
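As a refresher before running the libraries: PCA boils down to an eigendecomposition of the covariance matrix of the centered data. Below is a minimal NumPy sketch of that recipe (my own illustration, not part of the original notebook), which makes the eigenvalues and eigenvectors returned later easier to interpret:

[ ]: import numpy

     def pca_by_hand(X, n_components=2):
         # center the data column-wise
         Xc = X - X.mean(axis=0)
         # covariance matrix of the features
         cov = numpy.cov(Xc, rowvar=False)
         # eigendecomposition; eigh suits a symmetric matrix
         evals, evecs = numpy.linalg.eigh(cov)
         # sort eigenvalues and eigenvectors in decreasing order
         order = numpy.argsort(evals)[::-1]
         evals, evecs = evals[order], evecs[:, order]
         # project onto the leading eigenvectors
         # (the sign of each eigenvector is arbitrary)
         return Xc @ evecs[:, :n_components], evals, evecs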
1.1 Download data

[4]: import pyensae.datasource
     pyensae.datasource.download_data(
         "auto-mpg.data",
         url="https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/")
[4]: 'auto-mpg.data'
[5]: import pandas
     df = pandas.read_fwf(
         "auto-mpg.data", encoding="utf-8",
         names="mpg cylinders displacement horsepower weight acceleration "
               "year origin name".split())
     df["name"] = df["name"].apply(lambda s: s.strip(' "'))
     df.head()
[5]:     mpg  cylinders  displacement horsepower  weight  acceleration  year  \
     0  18.0          8         307.0      130.0  3504.0          12.0    70
     1  15.0          8         350.0      165.0  3693.0          11.5    70
     2  18.0          8         318.0      150.0  3436.0          11.0    70
     3  16.0          8         304.0      150.0  3433.0          12.0    70
     4  17.0          8         302.0      140.0  3449.0          10.5    70

        origin                       name
     0       1  chevrolet chevelle malibu
     1       1          buick skylark 320
     2       1         plymouth satellite
     3       1              amc rebel sst
     4       1                ford torino
[6]: df.dtypes
[6]: mpg             float64
     cylinders         int64
     displacement    float64
     horsepower       object
     weight          float64
     acceleration    float64
     year              int64
     origin            int64
     name             object
     dtype: object
The horsepower column was parsed as strings (object) because missing values are encoded
as "?". Let's look at those rows before removing them:

[7]: df[df.horsepower == "?"]
[7]:      mpg  cylinders  displacement horsepower  weight  acceleration  year  \
     32   25.0          4          98.0          ?  2046.0          19.0    71
     126  21.0          6         200.0          ?  2875.0          17.0    74
     330  40.9          4          85.0          ?  1835.0          17.3    80
     336  23.6          4         140.0          ?  2905.0          14.3    80
     354  34.5          4         100.0          ?  2320.0          15.8    81
     374  23.0          4         151.0          ?  3035.0          20.5    82

          origin                  name
     32        1            ford pinto
     126       1         ford maverick
     330       2  renault lecar deluxe
     336       1    ford mustang cobra
     354       2           renault 18i
     374       1        amc concord dl
[8]: final = df[df.horsepower != '?'].copy()
     final["horsepower"] = final["horsepower"].astype(float)
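An equivalent and slightly more idiomatic route (a sketch, not what the notebook actually runs) is to let pandas treat "?" as NaN at parse time and drop the incomplete rows; horsepower then comes out as float64 directly:

[ ]: df2 = pandas.read_fwf(
         "auto-mpg.data", encoding="utf-8", na_values="?",
         names="mpg cylinders displacement horsepower weight acceleration "
               "year origin name".split())
     final2 = df2.dropna()        # drops the six rows with missing horsepower
     assert final2.shape == (392, 9)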
[9]: final.to_csv("auto-mpg.data.csv", sep="\t", index=False, encoding="utf-8")
[10]: final.shape
[10]: (392, 9)
1.2 PCA with scikit-learn

[11]: from sklearn.decomposition import PCA
      X = final[df.columns[1:-1]]
      Y = final["mpg"]
      pca = PCA(n_components=2)
      pca.fit(X)
[11]: PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
          svd_solver='auto', tol=0.0, whiten=False)
[12]: out = pca.transform(X)
      out[:5]
[12]: array([[536.44492922,  50.83312832],
             [730.34140206,  79.13543921],
             [470.9815846 ,  75.4476722 ],
             [466.40143367,  62.53420646],
             [481.66788465,  55.78036021]])
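Since pca was fitted with n_components=2, transform projects the seven features onto a plane. To see what that projection loses, one can map it back with inverse_transform; this quick check is my own addition, not part of the original notebook:

[ ]: import numpy
     X_back = pca.inverse_transform(out)   # map the 2-D projection back to 7 features
     # average absolute reconstruction error per column; small relative to the
     # feature scales because two components capture almost all of the variance
     numpy.abs(X_back - X.values).mean(axis=0)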
[13]: pca.explained_variance_ratio_, pca.noise_variance_
[13]: (array([0.99756151, 0.0020628 ]), 55.14787750463889)
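The first component explains more than 99.7% of the variance, but that is less impressive than it looks: on raw, unscaled data, PCA is dominated by the feature with the largest variance, here weight. A quick way to see it (my own check, not in the original notebook):

[ ]: X.var().sort_values(ascending=False)
     # weight's variance dwarfs every other column's, so the first
     # principal component is essentially the weight axis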
[14]: import matplotlib.pyplot as plt
      plt.plot(out[:,0], out[:,1], ".");
[15]: pca.components_
[15]: array([[ 1.79262233e-03,  1.14341275e-01,  3.89670355e-02,  9.92673415e-01,
              -1.35283460e-03, -1.33684138e-03, -5.51538021e-04],
             [ 1.33244815e-02,  9.45778439e-01,  2.98248416e-01, -1.20752748e-01,
              -3.48258394e-02, -2.38516836e-02, -3.24298106e-03]])
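The loadings are easier to read with the column names attached; the 0.99 entry in the fourth column (weight) confirms the scale effect noted above. A small labelling helper (my own, not part of the notebook):

[ ]: import pandas
     pandas.DataFrame(pca.components_,
                      index=["PC1", "PC2"],
                      columns=X.columns).round(3)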
1.3 PCA with scikit-learn and normalization

[16]: from sklearn.decomposition import PCA
      from sklearn.preprocessing import Normalizer
      from sklearn.pipeline import Pipeline

      normpca = Pipeline([('norm', Normalizer()),
                          ('pca', PCA(n_components=2))])
      normpca.fit(X)
[16]: Pipeline(memory=None,
               steps=[('norm', Normalizer(copy=True, norm='l2')),
                      ('pca', PCA(copy=True, iterated_power='auto', n_components=2,
                                  random_state=None, svd_solver='auto', tol=0.0,
                                  whiten=False))])
[17]: out = normpca.transform(X)
      out[:5]
[17]: array([[0.02731781, 0.00012872],
             [0.03511968, 0.00666259],
             [0.03247168, 0.00632048],
             [0.0287677 , 0.0060517 ],
             [0.02758449, 0.00325874]])
[18]: (normpca.named_steps['pca'].explained_variance_ratio_,
       normpca.named_steps['pca'].noise_variance_)
[18]: (array([0.86819249, 0.08034075]), 4.332607718595102e-06)
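Note that Normalizer rescales each sample (row) to unit L2 norm; it does not standardize the features. When the goal is to remove the scale effect seen in section 1.2, column-wise standardization is the more common choice. A variant of the pipeline under that assumption (a sketch, not what the notebook runs):

[ ]: from sklearn.preprocessing import StandardScaler
     stdpca = Pipeline([('scale', StandardScaler()),   # zero mean, unit variance per column
                        ('pca', PCA(n_components=2))])
     stdpca.fit(X)
     stdpca.named_steps['pca'].explained_variance_ratio_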
[19]: import matplotlib.pyplot as plt
      plt.plot(out[:,0], out[:,1], ".");
[20]: normpca.named_steps['pca'].components_
[20]: array([[ 0.00415209,  0.92648229,  0.11272098, -0.05732771, -0.09162071,
              -0.34198745, -0.01646403],
             [ 0.01671457,  0.0789351 ,  0.85881718, -0.06957932,  0.02998247,
               0.49941847,  0.02763848]])
1.4 PCA with statsmodels

[21]: from statsmodels.sandbox.tools import pca
      xred, fact, eva, eve = pca(X, keepdim=2, normalize=False)
[22]: fact[:5]
[22]: array([[536.44492922, -50.83312832],
             [730.34140206, -79.13543921],
             [470.9815846 , -75.4476722 ],
             [466.40143367, -62.53420646],
             [481.66788465, -55.78036021]])
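The factors match scikit-learn's projection from section 1.2 up to the sign of the second column; the sign of an eigenvector is arbitrary, so both answers are equally valid. A quick consistency check (my own addition; it refits the scikit-learn model since the name pca is now shadowed by the statsmodels function):

[ ]: import numpy
     from sklearn.decomposition import PCA
     out_sk = PCA(n_components=2).fit_transform(X)      # redo the section 1.2 projection
     numpy.allclose(numpy.abs(fact), numpy.abs(out_sk)) # True: same axes, arbitrary signs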
[23]: eva
[23]: array([732151.6743476 , 1513.97202164])
[24]: eve
[24]: array([[ 1.79262233e-03, -1.33244815e-02],
             [ 1.14341275e-01, -9.45778439e-01],
             [ 3.89670355e-02, -2.98248416e-01],
             [ 9.92673415e-01,  1.20752748e-01],
             [-1.35283460e-03,  3.48258394e-02],
             [-1.33684138e-03,  2.38516836e-02],
             [-5.51538021e-04,  3.24298106e-03]])
[25]: plt.plot(fact[:,0], fact[:,1], ".");
1.5 PCA with statsmodels and normalization

[26]: from statsmodels.sandbox.tools import pca
      from sklearn.preprocessing import normalize
      X_norm = normalize(X)
      xred, fact, eva, eve = pca(X_norm, keepdim=2, normalize=True)
[27]: eva
[27]: array([3.65433661e-04, 3.38164814e-05])
[28]: eve
[28]: array([[ -0.21720145,   2.87429329],
             [-48.46551687,  13.57394009],
             [ -5.89658384, 147.68504393],
             [  2.99888854, -11.96508998],
             [  4.79280102,   5.15588534],
             [ 17.88981698,  85.8816515 ],
             [  0.86125514,   4.75280519]])
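These "eigenvectors" are no longer unit vectors: with normalize=True, the statsmodels sandbox pca divides each eigenvector by the square root of its eigenvalue, so that the resulting factors come out with unit variance, at least as I read the sandbox implementation. A quick check of that reading (my own, not in the original notebook):

[ ]: import numpy
     # if the factors were rescaled to unit variance, their covariance
     # matrix should be close to the identity
     numpy.cov(fact, rowvar=False).round(3)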
[29]: plt.plot(fact[:,0], fact[:,1], ".");