
PCA

September 28, 2021

1 PCA (Principal Component Analysis)

This notebook shows how to compute and plot a PCA with scikit-learn and statsmodels, with or without normalization.

[1]: %matplotlib inline

[2]: import matplotlib.pyplot as plt
     plt.style.use('ggplot')

[3]: from jyquickhelper import add_notebook_menu
     add_notebook_menu()

[3]:

More about PCA: Implementing a Principal Component Analysis (PCA) in Python step by step.
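As a quick refresher on what the library calls below actually do, here is a minimal sketch of PCA itself, assuming a numeric 2D array X; the function name pca_manual is only for illustration and is not used in the rest of the notebook:

    import numpy

    def pca_manual(X, n_components=2):
        # center the columns, diagonalize the covariance matrix,
        # then project on the eigenvectors with the largest eigenvalues
        Xc = numpy.asarray(X, dtype=float)
        Xc = Xc - Xc.mean(axis=0)
        eigval, eigvec = numpy.linalg.eigh(numpy.cov(Xc, rowvar=False))
        order = numpy.argsort(eigval)[::-1][:n_components]
        return Xc @ eigvec[:, order]

The projected coordinates it returns correspond to what scikit-learn's PCA.transform produces, up to the sign of each component.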

1.1 Download data

[4]: import pyensae.datasource
     pyensae.datasource.download_data("auto-mpg.data",
         url="https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/")

[4]: 'auto-mpg.data'

[5]: import pandas
     df = pandas.read_fwf("auto-mpg.data", encoding="utf-8",
                          names="mpg cylinders displacement horsepower weight acceleration year origin name".split())
     df["name"] = df["name"].apply(lambda s: s.strip(' "'))
     df.head()

[5]:     mpg  cylinders  displacement horsepower  weight  acceleration  year  \
     0  18.0          8         307.0      130.0  3504.0          12.0    70
     1  15.0          8         350.0      165.0  3693.0          11.5    70
     2  18.0          8         318.0      150.0  3436.0          11.0    70
     3  16.0          8         304.0      150.0  3433.0          12.0    70
     4  17.0          8         302.0      140.0  3449.0          10.5    70

        origin                       name
     0       1  chevrolet chevelle malibu
     1       1          buick skylark 320
     2       1         plymouth satellite
     3       1              amc rebel sst
     4       1                ford torino

[6]: df.dtypes

[6]: mpg             float64
     cylinders         int64
     displacement    float64
     horsepower       object
     weight          float64
     acceleration    float64
     year              int64
     origin            int64
     name             object
     dtype: object

The horsepower column was parsed as strings because its missing values are marked with "?". We look at these rows before removing them:

[7]: df[df.horsepower == "?"]

[7]:       mpg  cylinders  displacement horsepower  weight  acceleration  year  \
     32    25.0          4          98.0          ?  2046.0          19.0    71
     126   21.0          6         200.0          ?  2875.0          17.0    74
     330   40.9          4          85.0          ?  1835.0          17.3    80
     336   23.6          4         140.0          ?  2905.0          14.3    80
     354   34.5          4         100.0          ?  2320.0          15.8    81
     374   23.0          4         151.0          ?  3035.0          20.5    82

          origin                  name
     32        1            ford pinto
     126       1         ford maverick
     330       2  renault lecar deluxe
     336       1    ford mustang cobra
     354       2           renault 18i
     374       1        amc concord dl

[8]: final = df[df.horsepower != '?'].copy()
     final["horsepower"] = final["horsepower"].astype(float)

[9]: final.to_csv("auto-mpg.data.csv", sep="\t", index=False, encoding="utf-8")

[10]: final.shape

[10]: (392, 9)
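An equivalent way to handle the missing values (a sketch, not part of the original notebook; the name clean is only for illustration) is pandas.to_numeric with errors="coerce", which turns the "?" entries into NaN so that dropna can remove them:

    clean = df.copy()
    clean["horsepower"] = pandas.to_numeric(clean["horsepower"], errors="coerce")
    clean = clean.dropna(subset=["horsepower"])
    # clean.shape should match final.shape, i.e. (392, 9)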

1.2 PCA with scikit-learn

[11]: from sklearn.decomposition import PCA

      X = final[df.columns[1:-1]]
      Y = final["mpg"]
      pca = PCA(n_components=2)
      pca.fit(X)

[11]: PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
          svd_solver='auto', tol=0.0, whiten=False)

[12]: out = pca.transform(X)
      out[:5]

[12]: array([[536.44492922,  50.83312832],
             [730.34140206,  79.13543921],
             [470.9815846 ,  75.4476722 ],
             [466.40143367,  62.53420646],
             [481.66788465,  55.78036021]])

[13]: pca.explained_variance_ratio_, pca.noise_variance_

[13]: (array([0.99756151, 0.0020628 ]), 55.14787750463889)

[14]: import matplotlib.pyplot as plt
      plt.plot(out[:,0], out[:,1], ".");

[15]: pca.components_

[15]: array([[ 1.79262233e-03,  1.14341275e-01,  3.89670355e-02,  9.92673415e-01,
              -1.35283460e-03, -1.33684138e-03, -5.51538021e-04],
             [ 1.33244815e-02,  9.45778439e-01,  2.98248416e-01, -1.20752748e-01,
              -3.48258394e-02, -2.38516836e-02, -3.24298106e-03]])
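The first component is almost entirely carried by the fourth feature, weight (coefficient ≈ 0.99): without any scaling, the feature with the largest variance dominates the decomposition. A quick way to see this (a sketch, not in the original notebook) is to compare the spread of each column:

    X.std()  # weight has by far the largest standard deviation, hence its dominance in the first component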

1.3 PCA with scikit-learn and normalization

[16]: from sklearn.decomposition import PCA
      from sklearn.preprocessing import Normalizer
      from sklearn.pipeline import Pipeline

      normpca = Pipeline([('norm', Normalizer()), ('pca', PCA(n_components=2))])
      normpca.fit(X)

[16]: Pipeline(memory=None,
               steps=[('norm', Normalizer(copy=True, norm='l2')),
                      ('pca', PCA(copy=True, iterated_power='auto', n_components=2,
                                  random_state=None, svd_solver='auto', tol=0.0,
                                  whiten=False))])

[17]: out = normpca.transform(X)
      out[:5]

[17]: array([[0.02731781, 0.00012872],
             [0.03511968, 0.00666259],
             [0.03247168, 0.00632048],
             [0.0287677 , 0.0060517 ],
             [0.02758449, 0.00325874]])

[18]: normpca.named_steps['pca'].explained_variance_ratio_, normpca.named_steps['pca'].noise_variance_

[18]: (array([0.86819249, 0.08034075]), 4.332607718595102e-06)

[19]: import matplotlib.pyplot as plt
      plt.plot(out[:,0], out[:,1], ".");

[20]: normpca.named_steps['pca'].components_

[20]: array([[ 0.00415209,  0.92648229,  0.11272098, -0.05732771, -0.09162071,
              -0.34198745, -0.01646403],
             [ 0.01671457,  0.0789351 ,  0.85881718, -0.06957932,  0.02998247,
               0.49941847,  0.02763848]])
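Note that Normalizer rescales every row to unit L2 norm; it does not standardize the columns. If the intent is to give each feature a comparable scale, StandardScaler is the usual choice. A possible variant of the pipeline above (a sketch under that assumption, not the notebook's original code; the name stdpca is hypothetical):

    from sklearn.preprocessing import StandardScaler

    stdpca = Pipeline([('scale', StandardScaler()), ('pca', PCA(n_components=2))])
    stdpca.fit(X)
    stdpca.named_steps['pca'].explained_variance_ratio_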

1.4 PCA with statsmodels

[21]: from statsmodels.sandbox.tools import pca

      xred, fact, eva, eve = pca(X, keepdim=2, normalize=False)

[22]: fact[:5]

[22]: array([[536.44492922, -50.83312832],
             [730.34140206, -79.13543921],
             [470.9815846 , -75.4476722 ],
             [466.40143367, -62.53420646],
             [481.66788465, -55.78036021]])

[23]: eva

[23]: array([732151.6743476 , 1513.97202164])

[24]: eve

[24]: array([[ 1.79262233e-03, -1.33244815e-02],
             [ 1.14341275e-01, -9.45778439e-01],
             [ 3.89670355e-02, -2.98248416e-01],
             [ 9.92673415e-01,  1.20752748e-01],
             [-1.35283460e-03,  3.48258394e-02],
             [-1.33684138e-03,  2.38516836e-02],
             [-5.51538021e-04,  3.24298106e-03]])

[25]: plt.plot(fact[:,0], fact[:,1], ".");
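The factors returned by statsmodels in cell [22] match the scikit-learn projection of cell [12] up to the sign of the second component (principal components are only defined up to a sign). A quick check (a sketch, not in the original notebook), recomputing the scikit-learn projection since out was overwritten in cell [17]:

    import numpy

    sk_out = PCA(n_components=2).fit_transform(X)
    numpy.allclose(numpy.abs(fact), numpy.abs(sk_out))  # expected to be True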

1.5 PCA with statsmodels and normalization

[26]: from statsmodels.sandbox.tools import pca
      from sklearn.preprocessing import normalize

      X_norm = normalize(X)
      xred, fact, eva, eve = pca(X_norm, keepdim=2, normalize=True)

[27]: eva

[27]: array([3.65433661e-04, 3.38164814e-05])

[28]: eve

[28]: array([[ -0.21720145,   2.87429329],
             [-48.46551687,  13.57394009],
             [ -5.89658384, 147.68504393],
             [  2.99888854, -11.96508998],
             [  4.79280102,   5.15588534],
             [ 17.88981698,  85.8816515 ],
             [  0.86125514,   4.75280519]])

[29]: plt.plot(fact[:,0], fact[:,1], ".");
