Dimension Reduction

Posted by neverset on December 29, 2020

Linear Algebra Methods

  • Matrix Factorization Methods
  • Principal Components Analysis

      # evaluate pca with logistic regression algorithm for classification
      from numpy import mean
      from numpy import std
      from sklearn.datasets import make_classification
      from sklearn.model_selection import cross_val_score
      from sklearn.model_selection import RepeatedStratifiedKFold
      from sklearn.pipeline import Pipeline
      from sklearn.decomposition import PCA
      from sklearn.linear_model import LogisticRegression
      # define dataset
      X, y = make_classification(n_samples=1000, n_features=20, n_informative=10, n_redundant=10, random_state=7)
      # define the pipeline
      steps = [('pca', PCA(n_components=10)), ('m', LogisticRegression())]
      model = Pipeline(steps=steps)
      # evaluate model
      cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
      n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
      # report performance
      print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
    
  • Singular Value Decomposition

      # evaluate svd with logistic regression algorithm for classification
      from numpy import mean
      from numpy import std
      from sklearn.datasets import make_classification
      from sklearn.model_selection import cross_val_score
      from sklearn.model_selection import RepeatedStratifiedKFold
      from sklearn.pipeline import Pipeline
      from sklearn.decomposition import TruncatedSVD
      from sklearn.linear_model import LogisticRegression
      # define dataset
      X, y = make_classification(n_samples=1000, n_features=20, n_informative=10, n_redundant=10, random_state=7)
      # define the pipeline
      steps = [('svd', TruncatedSVD(n_components=10)), ('m', LogisticRegression())]
      model = Pipeline(steps=steps)
      # evaluate model
      cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
      n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
      # report performance
      print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
    
  • Non-Negative Matrix Factorization
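
    NMF only works on non-negative data, so this sketch (my addition, not part of the original post) rescales the features with MinMaxScaler before factorizing; the dataset and evaluation scaffold mirror the PCA example.

      # evaluate nmf with logistic regression algorithm for classification
      from numpy import mean
      from numpy import std
      from sklearn.datasets import make_classification
      from sklearn.model_selection import cross_val_score
      from sklearn.model_selection import RepeatedStratifiedKFold
      from sklearn.pipeline import Pipeline
      from sklearn.preprocessing import MinMaxScaler
      from sklearn.decomposition import NMF
      from sklearn.linear_model import LogisticRegression
      # define dataset
      X, y = make_classification(n_samples=1000, n_features=20, n_informative=10, n_redundant=10, random_state=7)
      # define the pipeline (MinMaxScaler makes the features non-negative; max_iter raised to help convergence)
      steps = [('scale', MinMaxScaler()), ('nmf', NMF(n_components=10, max_iter=1000)), ('m', LogisticRegression())]
      model = Pipeline(steps=steps)
      # evaluate model
      cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
      n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
      # report performance
      print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
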
  • Linear Discriminant Analysis

      # evaluate lda with logistic regression algorithm for classification
      from numpy import mean
      from numpy import std
      from sklearn.datasets import make_classification
      from sklearn.model_selection import cross_val_score
      from sklearn.model_selection import RepeatedStratifiedKFold
      from sklearn.pipeline import Pipeline
      from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
      from sklearn.linear_model import LogisticRegression
      # define dataset
      X, y = make_classification(n_samples=1000, n_features=20, n_informative=10, n_redundant=10, random_state=7)
      # define the pipeline
      steps = [('lda', LinearDiscriminantAnalysis(n_components=1)), ('m', LogisticRegression())]
      model = Pipeline(steps=steps)
      # evaluate model
      cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
      n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
      # report performance
      print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
    

Manifold Learning Methods

  • Isomap Embedding

      # define the pipeline (dataset and evaluation scaffold as in the PCA example above)
      from sklearn.manifold import Isomap
      steps = [('iso', Isomap(n_components=10)), ('m', LogisticRegression())]
      model = Pipeline(steps=steps)
    
  • Locally Linear Embedding

      # define the pipeline (dataset and evaluation scaffold as in the PCA example above)
      from sklearn.manifold import LocallyLinearEmbedding
      steps = [('lle', LocallyLinearEmbedding(n_components=10)), ('m', LogisticRegression())]
      model = Pipeline(steps=steps)

  • Multidimensional Scaling
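
    scikit-learn's MDS (like SpectralEmbedding and TSNE below) offers fit_transform but no transform for unseen samples, so it cannot be dropped into the cross-validated pipeline used above. A minimal sketch, with a workflow of my own choosing (embed the full dataset, then score a classifier on the embedding with a train/test split):

      # embed with mds, then classify on the low-dimensional representation
      from sklearn.datasets import make_classification
      from sklearn.manifold import MDS
      from sklearn.model_selection import train_test_split
      from sklearn.linear_model import LogisticRegression
      # define dataset
      X, y = make_classification(n_samples=1000, n_features=20, n_informative=10, n_redundant=10, random_state=7)
      # mds has no transform method, so embed the full dataset up front
      X_embedded = MDS(n_components=10, random_state=7).fit_transform(X)
      # split the embedding and score a classifier on it
      X_train, X_test, y_train, y_test = train_test_split(X_embedded, y, test_size=0.3, random_state=1)
      model = LogisticRegression().fit(X_train, y_train)
      # report performance
      print('Accuracy: %.3f' % model.score(X_test, y_test))
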
  • Spectral Embedding
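
    SpectralEmbedding has the same limitation (fit_transform only). Assuming the dataset and scoring scaffold from the MDS sketch above, only the embedding step changes:

      # spectral embedding also lacks a transform method; embed all samples, then reuse the MDS scoring scaffold
      from sklearn.manifold import SpectralEmbedding
      X_embedded = SpectralEmbedding(n_components=10, random_state=7).fit_transform(X)
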
  • t-distributed Stochastic Neighbor Embedding
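
    t-SNE is mainly a visualization technique: the default barnes_hut solver supports at most 3 output components and, again, there is no transform for new samples. Assuming the same scaffold as the MDS sketch:

      # t-sne embeds into 2 components; reuse the MDS scoring scaffold on X_embedded
      from sklearn.manifold import TSNE
      X_embedded = TSNE(n_components=2, random_state=7).fit_transform(X)
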