PCA
September 28, 2021
1 PCA (Principal Component Analysis)
This notebook shows how to plot a PCA with scikit-learn and statsmodels, with or without
normalization.

[1]: %matplotlib inline
[2]: import matplotlib.pyplot as plt
     plt.style.use('ggplot')
[3]: from jyquickhelper import add_notebook_menu
     add_notebook_menu()
More about PCA: Implementing a Principal Component Analysis (PCA) in Python step by step.
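As a refresher before running the libraries: PCA boils down to an eigendecomposition of the covariance matrix of the centered data. Below is a minimal NumPy sketch of that recipe (my own illustration, not part of the original notebook), which makes the eigenvalues and eigenvectors returned later easier to interpret:

[ ]: import numpy

     def pca_by_hand(X, n_components=2):
         # center the data column-wise
         Xc = X - X.mean(axis=0)
         # covariance matrix of the features
         cov = numpy.cov(Xc, rowvar=False)
         # eigendecomposition; eigh suits a symmetric matrix
         evals, evecs = numpy.linalg.eigh(cov)
         # sort eigenvalues and eigenvectors in decreasing order
         order = numpy.argsort(evals)[::-1]
         evals, evecs = evals[order], evecs[:, order]
         # project onto the leading eigenvectors
         # (the sign of each eigenvector is arbitrary)
         return Xc @ evecs[:, :n_components], evals, evecs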
1.1 Download data

[4]: import pyensae.datasource
     pyensae.datasource.download_data(
         "auto-mpg.data",
         url="https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/")
[4]: 'auto-mpg.data'
[5]: import pandas
     df = pandas.read_fwf(
         "auto-mpg.data", encoding="utf-8",
         names="mpg cylinders displacement horsepower weight acceleration "
               "year origin name".split())
     df["name"] = df["name"].apply(lambda s: s.strip(' "'))
     df.head()
[5]:     mpg  cylinders  displacement horsepower  weight  acceleration  year  \
     0  18.0          8         307.0      130.0  3504.0          12.0    70
     1  15.0          8         350.0      165.0  3693.0          11.5    70
     2  18.0          8         318.0      150.0  3436.0          11.0    70
     3  16.0          8         304.0      150.0  3433.0          12.0    70
     4  17.0          8         302.0      140.0  3449.0          10.5    70

        origin                       name
     0       1  chevrolet chevelle malibu
     1       1          buick skylark 320
     2       1         plymouth satellite
     3       1              amc rebel sst
     4       1                ford torino
[6]: df.dtypes
[6]: mpg             float64
     cylinders         int64
     displacement    float64
     horsepower       object
     weight          float64
     acceleration    float64
     year              int64
     origin            int64
     name             object
     dtype: object
The horsepower column was parsed as strings (object) because missing values are encoded
as "?". Let's look at those rows before removing them:

[7]: df[df.horsepower == "?"]
[7]:      mpg  cylinders  displacement horsepower  weight  acceleration  year  \
     32   25.0          4          98.0          ?  2046.0          19.0    71
     126  21.0          6         200.0          ?  2875.0          17.0    74
     330  40.9          4          85.0          ?  1835.0          17.3    80
     336  23.6          4         140.0          ?  2905.0          14.3    80
     354  34.5          4         100.0          ?  2320.0          15.8    81
     374  23.0          4         151.0          ?  3035.0          20.5    82

          origin                  name
     32        1            ford pinto
     126       1         ford maverick
     330       2  renault lecar deluxe
     336       1    ford mustang cobra
     354       2           renault 18i
     374       1        amc concord dl
[8]: final = df[df.horsepower != '?'].copy()
     final["horsepower"] = final["horsepower"].astype(float)
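An equivalent and slightly more idiomatic route (a sketch, not what the notebook actually runs) is to let pandas treat "?" as NaN at parse time and drop the incomplete rows; horsepower then comes out as float64 directly:

[ ]: df2 = pandas.read_fwf(
         "auto-mpg.data", encoding="utf-8", na_values="?",
         names="mpg cylinders displacement horsepower weight acceleration "
               "year origin name".split())
     final2 = df2.dropna()        # drops the six rows with missing horsepower
     assert final2.shape == (392, 9)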
[9]: final.to_csv("auto-mpg.data.csv", sep="\t", index=False, encoding="utf-8")
[10]: final.shape
[10]: (392, 9)
1.2 PCA with scikit-learn

[11]: from sklearn.decomposition import PCA
      X = final[df.columns[1:-1]]
      Y = final["mpg"]
      pca = PCA(n_components=2)
      pca.fit(X)
[11]: PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
          svd_solver='auto', tol=0.0, whiten=False)
[12]: out = pca.transform(X)
      out[:5]
[12]: array([[536.44492922,  50.83312832],
             [730.34140206,  79.13543921],
             [470.9815846 ,  75.4476722 ],
             [466.40143367,  62.53420646],
             [481.66788465,  55.78036021]])
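Since pca was fitted with n_components=2, transform projects the seven features onto a plane. To see what that projection loses, one can map it back with inverse_transform; this quick check is my own addition, not part of the original notebook:

[ ]: import numpy
     X_back = pca.inverse_transform(out)   # map the 2-D projection back to 7 features
     # average absolute reconstruction error per column; small relative to the
     # feature scales because two components capture almost all of the variance
     numpy.abs(X_back - X.values).mean(axis=0)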
[13]: pca.explained_variance_ratio_, pca.noise_variance_
[13]: (array([0.99756151, 0.0020628 ]), 55.14787750463889)
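The first component explains more than 99.7% of the variance, but that is less impressive than it looks: on raw, unscaled data, PCA is dominated by the feature with the largest variance, here weight. A quick way to see it (my own check, not in the original notebook):

[ ]: X.var().sort_values(ascending=False)
     # weight's variance dwarfs every other column's, so the first
     # principal component is essentially the weight axis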
[14]: import matplotlib.pyplot as plt
      plt.plot(out[:,0], out[:,1], ".");
[15]: pca.components_
[15]: array([[ 1.79262233e-03,  1.14341275e-01,  3.89670355e-02,  9.92673415e-01,
              -1.35283460e-03, -1.33684138e-03, -5.51538021e-04],
             [ 1.33244815e-02,  9.45778439e-01,  2.98248416e-01, -1.20752748e-01,
              -3.48258394e-02, -2.38516836e-02, -3.24298106e-03]])
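The loadings are easier to read with the column names attached; the 0.99 entry in the fourth column (weight) confirms the scale effect noted above. A small labelling helper (my own, not part of the notebook):

[ ]: import pandas
     pandas.DataFrame(pca.components_,
                      index=["PC1", "PC2"],
                      columns=X.columns).round(3)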
1.3 PCA with scikit-learn and normalization

[16]: from sklearn.decomposition import PCA
      from sklearn.preprocessing import Normalizer
      from sklearn.pipeline import Pipeline

      normpca = Pipeline([('norm', Normalizer()),
                          ('pca', PCA(n_components=2))])
      normpca.fit(X)
[16]: Pipeline(memory=None,
               steps=[('norm', Normalizer(copy=True, norm='l2')),
                      ('pca', PCA(copy=True, iterated_power='auto', n_components=2,
                                  random_state=None, svd_solver='auto', tol=0.0,
                                  whiten=False))])
[17]: out = normpca.transform(X)
      out[:5]
[17]: array([[0.02731781, 0.00012872],
             [0.03511968, 0.00666259],
             [0.03247168, 0.00632048],
             [0.0287677 , 0.0060517 ],
             [0.02758449, 0.00325874]])
[18]: (normpca.named_steps['pca'].explained_variance_ratio_,
       normpca.named_steps['pca'].noise_variance_)
[18]: (array([0.86819249, 0.08034075]), 4.332607718595102e-06)
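Note that Normalizer rescales each sample (row) to unit L2 norm; it does not standardize the features. When the goal is to remove the scale effect seen in section 1.2, column-wise standardization is the more common choice. A variant of the pipeline under that assumption (a sketch, not what the notebook runs):

[ ]: from sklearn.preprocessing import StandardScaler
     stdpca = Pipeline([('scale', StandardScaler()),   # zero mean, unit variance per column
                        ('pca', PCA(n_components=2))])
     stdpca.fit(X)
     stdpca.named_steps['pca'].explained_variance_ratio_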
[19]: import matplotlib.pyplot as plt
      plt.plot(out[:,0], out[:,1], ".");
[20]: normpca.named_steps['pca'].components_
[20]: array([[ 0.00415209,  0.92648229,  0.11272098, -0.05732771, -0.09162071,
              -0.34198745, -0.01646403],
             [ 0.01671457,  0.0789351 ,  0.85881718, -0.06957932,  0.02998247,
               0.49941847,  0.02763848]])
1.4 PCA with statsmodels

[21]: from statsmodels.sandbox.tools import pca
      xred, fact, eva, eve = pca(X, keepdim=2, normalize=False)
[22]: fact[:5]
[22]: array([[536.44492922, -50.83312832],
             [730.34140206, -79.13543921],
             [470.9815846 , -75.4476722 ],
             [466.40143367, -62.53420646],
             [481.66788465, -55.78036021]])
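The factors match scikit-learn's projection from section 1.2 up to the sign of the second column; the sign of an eigenvector is arbitrary, so both answers are equally valid. A quick consistency check (my own addition; it refits the scikit-learn model since the name pca is now shadowed by the statsmodels function):

[ ]: import numpy
     from sklearn.decomposition import PCA
     out_sk = PCA(n_components=2).fit_transform(X)      # redo the section 1.2 projection
     numpy.allclose(numpy.abs(fact), numpy.abs(out_sk)) # True: same axes, arbitrary signs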
[23]: eva
[23]: array([732151.6743476 , 1513.97202164])
[24]: eve
[24]: array([[ 1.79262233e-03, -1.33244815e-02],
             [ 1.14341275e-01, -9.45778439e-01],
             [ 3.89670355e-02, -2.98248416e-01],
             [ 9.92673415e-01,  1.20752748e-01],
             [-1.35283460e-03,  3.48258394e-02],
             [-1.33684138e-03,  2.38516836e-02],
             [-5.51538021e-04,  3.24298106e-03]])
[25]: plt.plot(fact[:,0], fact[:,1], ".");
1.5 PCA with statsmodels and normalization

[26]: from statsmodels.sandbox.tools import pca
      from sklearn.preprocessing import normalize
      X_norm = normalize(X)
      xred, fact, eva, eve = pca(X_norm, keepdim=2, normalize=True)
[27]: eva
[27]: array([3.65433661e-04, 3.38164814e-05])
[28]: eve
[28]: array([[ -0.21720145,   2.87429329],
             [-48.46551687,  13.57394009],
             [ -5.89658384, 147.68504393],
             [  2.99888854, -11.96508998],
             [  4.79280102,   5.15588534],
             [ 17.88981698,  85.8816515 ],
             [  0.86125514,   4.75280519]])
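These "eigenvectors" are no longer unit vectors: with normalize=True, the statsmodels sandbox pca divides each eigenvector by the square root of its eigenvalue, so that the resulting factors come out with unit variance, at least as I read the sandbox implementation. A quick check of that reading (my own, not in the original notebook):

[ ]: import numpy
     # if the factors were rescaled to unit variance, their covariance
     # matrix should be close to the identity
     numpy.cov(fact, rowvar=False).round(3)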
[29]: plt.plot(fact[:,0], fact[:,1], ".");