
PCA

September 28, 2021

1 PCA (Principal Component Analysis)

This notebook shows how to plot a PCA with scikit-learn and statsmodels, with or without normalization.

[1]: %matplotlib inline

[2]: import matplotlib.pyplot as plt
     plt.style.use('ggplot')

[3]: from jyquickhelper import add_notebook_menu
     add_notebook_menu()

[3]: <IPython.core.display.HTML object>

More about PCA: Implementing a Principal Component Analysis (PCA) in Python step by step.

1.1 Download data

[4]: import pyensae.datasource
     pyensae.datasource.download_data(
         "auto-mpg.data",
         url="https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/")

[4]: 'auto-mpg.data'

[5]: import pandas
     df = pandas.read_fwf(
         "auto-mpg.data", encoding="utf-8",
         names="mpg cylinders displacement horsepower weight acceleration year origin name".split())
     df["name"] = df["name"].apply(lambda s: s.strip(' "'))
     df.head()

[5]:     mpg  cylinders  displacement horsepower  weight  acceleration  year  origin                       name
     0  18.0          8         307.0      130.0  3504.0          12.0    70       1  chevrolet chevelle malibu
     1  15.0          8         350.0      165.0  3693.0          11.5    70       1          buick skylark 320
     2  18.0          8         318.0      150.0  3436.0          11.0    70       1         plymouth satellite
     3  16.0          8         304.0      150.0  3433.0          12.0    70       1              amc rebel sst
     4  17.0          8         302.0      140.0  3449.0          10.5    70       1                ford torino

[6]: df.dtypes

[6]: mpg             float64
     cylinders         int64
     displacement    float64
     horsepower       object
     weight          float64
     acceleration    float64
     year              int64
     origin            int64
     name             object
     dtype: object

The horsepower column was parsed as object because missing values are encoded as "?". We remove those rows:

[7]: df[df.horsepower == "?"]

[7]:       mpg  cylinders  displacement horsepower  weight  acceleration  year  origin                  name
     32   25.0          4          98.0          ?  2046.0          19.0    71       1            ford pinto
     126  21.0          6         200.0          ?  2875.0          17.0    74       1         ford maverick
     330  40.9          4          85.0          ?  1835.0          17.3    80       2  renault lecar deluxe
     336  23.6          4         140.0          ?  2905.0          14.3    80       1    ford mustang cobra
     354  34.5          4         100.0          ?  2320.0          15.8    81       2           renault 18i
     374  23.0          4         151.0          ?  3035.0          20.5    82       1        amc concord dl

[8]: final = df[df.horsepower != '?'].copy()
     final["horsepower"] = final["horsepower"].astype(float)

[9]: final.to_csv("auto-mpg.data.csv", sep="\t", index=False, encoding="utf-8")

[10]: final.shape

[10]: (392, 9)

1.2 PCA with scikit-learn

[11]: from sklearn.decomposition import PCA
      X = final[df.columns[1:-1]]
      Y = final["mpg"]
      pca = PCA(n_components=2)
      pca.fit(X)

[11]: PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
          svd_solver='auto', tol=0.0, whiten=False)

[12]: out = pca.transform(X)
      out[:5]

[12]: array([[536.44492922,  50.83312832],
             [730.34140206,  79.13543921],
             [470.9815846 ,  75.4476722 ],
             [466.40143367,  62.53420646],
             [481.66788465,  55.78036021]])

[13]: pca.explained_variance_ratio_, pca.noise_variance_

[13]: (array([0.99756151, 0.0020628 ]), 55.14787750463889)

[14]: import matplotlib.pyplot as plt
      plt.plot(out[:,0], out[:,1], ".");

[15]: pca.components_

[15]: array([[ 1.79262233e-03,  1.14341275e-01,  3.89670355e-02,  9.92673415e-01,
              -1.35283460e-03, -1.33684138e-03, -5.51538021e-04],
             [ 1.33244815e-02,  9.45778439e-01,  2.98248416e-01, -1.20752748e-01,
              -3.48258394e-02, -2.38516836e-02, -3.24298106e-03]])
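The first axis explains 99.8% of the variance, but that is largely a scale effect: weight varies by thousands of pounds while acceleration varies by a few seconds, so the first component (coefficient 0.99 on weight in cell [15]) essentially reproduces the weight column. A common remedy, not used in this notebook, is to standardize each column before PCA. The sketch below is an illustration with scikit-learn's StandardScaler, reusing the feature matrix X defined above; it mirrors the Normalizer pipeline of the next section:

      from sklearn.decomposition import PCA
      from sklearn.pipeline import Pipeline
      from sklearn.preprocessing import StandardScaler

      # Rescale every column to zero mean and unit variance before
      # projecting, so no single large-scale feature dominates the axes.
      scaled_pca = Pipeline([('scale', StandardScaler()),
                             ('pca', PCA(n_components=2))])
      scaled_pca.fit(X)
      scaled_pca.named_steps['pca'].explained_variance_ratio_

Note that this per-column standardization is a different operation from the Normalizer used next, which rescales each row (each car) to unit norm.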
1.3 PCA with scikit-learn and normalization

[16]: from sklearn.decomposition import PCA
      from sklearn.preprocessing import Normalizer
      from sklearn.pipeline import Pipeline
      normpca = Pipeline([('norm', Normalizer()), ('pca', PCA(n_components=2))])
      normpca.fit(X)

[16]: Pipeline(memory=None,
               steps=[('norm', Normalizer(copy=True, norm='l2')),
                      ('pca', PCA(copy=True, iterated_power='auto', n_components=2,
                                  random_state=None, svd_solver='auto', tol=0.0,
                                  whiten=False))])

[17]: out = normpca.transform(X)
      out[:5]

[17]: array([[0.02731781, 0.00012872],
             [0.03511968, 0.00666259],
             [0.03247168, 0.00632048],
             [0.0287677 , 0.0060517 ],
             [0.02758449, 0.00325874]])

[18]: normpca.named_steps['pca'].explained_variance_ratio_, normpca.named_steps['pca'].noise_variance_

[18]: (array([0.86819249, 0.08034075]), 4.332607718595102e-06)

[19]: import matplotlib.pyplot as plt
      plt.plot(out[:,0], out[:,1], ".");

[20]: normpca.named_steps['pca'].components_

[20]: array([[ 0.00415209,  0.92648229,  0.11272098, -0.05732771, -0.09162071,
              -0.34198745, -0.01646403],
             [ 0.01671457,  0.0789351 ,  0.85881718, -0.06957932,  0.02998247,
               0.49941847,  0.02763848]])

1.4 PCA with statsmodels

[21]: from statsmodels.sandbox.tools import pca
      xred, fact, eva, eve = pca(X, keepdim=2, normalize=False)

[22]: fact[:5]

[22]: array([[536.44492922, -50.83312832],
             [730.34140206, -79.13543921],
             [470.9815846 , -75.4476722 ],
             [466.40143367, -62.53420646],
             [481.66788465, -55.78036021]])

[23]: eva

[23]: array([732151.6743476 ,   1513.97202164])

[24]: eve

[24]: array([[ 1.79262233e-03, -1.33244815e-02],
             [ 1.14341275e-01, -9.45778439e-01],
             [ 3.89670355e-02, -2.98248416e-01],
             [ 9.92673415e-01,  1.20752748e-01],
             [-1.35283460e-03,  3.48258394e-02],
             [-1.33684138e-03,  2.38516836e-02],
             [-5.51538021e-04,  3.24298106e-03]])

[25]: plt.plot(fact[:,0], fact[:,1], ".");

1.5 PCA with statsmodels and normalization

[26]: from statsmodels.sandbox.tools import pca
      from sklearn.preprocessing import normalize
      X_norm = normalize(X)
      xred, fact, eva, eve = pca(X_norm, keepdim=2, normalize=True)

[27]: eva

[27]: array([3.65433661e-04, 3.38164814e-05])

[28]: eve

[28]: array([[ -0.21720145,   2.87429329],
             [-48.46551687,  13.57394009],
             [ -5.89658384, 147.68504393],
             [  2.99888854, -11.96508998],
             [  4.79280102,   5.15588534],
             [ 17.88981698,  85.8816515 ],
             [  0.86125514,   4.75280519]])

[29]: plt.plot(fact[:,0], fact[:,1], ".");
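The statsmodels results match scikit-learn's up to a sign flip on the second axis: fact in cell [22] equals out in cell [12] with the second column negated, and eve in cell [24] is the transpose of pca.components_ in cell [15] with the same flip. Eigenvectors are only defined up to sign, so both answers are equally valid. A quick sanity check, assuming pca from section 1.2 and fact from section 1.4 are still in scope:

      import numpy as np

      # Eigenvectors are defined only up to their sign, so the two
      # projections can only be compared through their absolute values.
      sk_out = pca.transform(X)                  # scikit-learn, section 1.2
      np.allclose(np.abs(sk_out), np.abs(fact))  # True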
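Finally, as the article linked in the introduction explains, unnormalized PCA amounts to an eigendecomposition of the sample covariance matrix. The from-scratch sketch below (an illustration assuming X is still the feature matrix of section 1.2) recovers the variance ratios reported by scikit-learn in cell [13]:

      import numpy as np

      Xc = X.values - X.values.mean(axis=0)  # center each feature
      cov = Xc.T @ Xc / (Xc.shape[0] - 1)    # sample covariance matrix
      eigval, eigvec = np.linalg.eigh(cov)   # eigenvalues in ascending order
      ratios = eigval[::-1] / eigval.sum()   # variance ratios, descending
      ratios[:2]                             # ~ array([0.99756151, 0.0020628 ])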