C# FA Correlation Example

← All NMath Stats Code Examples

 

using System;
using System.IO;

using CenterSpace.NMath.Core;
using CenterSpace.NMath.Stats;

namespace CenterSpace.NMath.Stats.Examples.CSharp
{
  /// <summary>
  /// Console example that runs a factor analysis on the correlation matrix of the
  /// car sales data set and prints the communalities, the variance explained, the
  /// rotated factor matrix, and factor scores.
  /// </summary>
  class FACorrelationExample
  {
    /// <summary>
    /// Entry point. Loads "car_sales.csv" from the working directory, performs the
    /// factor analysis, and writes the result tables to the console.
    /// </summary>
    /// <param name="args">Command line arguments (not used).</param>
    static void Main( string[] args )
    {
      // NMath Stats provides classes for performing a factor analysis on a set of case data. 
      // Case data should be provided to these classes in matrix form - the variable values
      // in columns and each row representing a case. In this example we look at some data
      // for car sales. For each sale ten variable values are recorded:
      //
      // type - sedan, mini-van, etc, recorded as integers 0, 1,...
      // price - the price in thousands,
      // engine_s - engine size,
      // horsepow - horsepower,
      // wheelbas - wheelbase,
      // width - car width,
      // length - car length,
      // curb_wgt - weight,
      // fuel_cap - fuel capacity,
      // mpg - fuel miles per gallon.
      //
      // We would like to predict car sales from this set of predictors. However, many 
      // of the predictors are correlated, and we fear that this might adversely affect
      // our results. So we use factor analysis to focus on a manageable set of 
      // predictors.

      // First load the data, which is in a comma delimited form.
      DataFrame carSalesData = DataFrame.Load( "car_sales.csv", true, false, ",", true ).CleanRows();

      // NMath Stats provides three classes for
      // performing factor analysis. All will perform analysis on the correlation matrix
      // or the covariance matrix of case data. In addition each of these classes has
      // two class parameters, one specifying the algorithm used to extract the factors,
      // and the other specifying a factor rotation method. Here we use the class
      // FactorAnalysisCorrelation, which analyzes the correlation matrix, with 
      // principal factors extraction and varimax rotation.
      // The other two factor analysis classes are FactorAnalysisCovariance, for analyzing
      // the covariance matrix, and DoubleFactorAnalysis which can be used if you don't
      // have access to the original case data, just the correlation or covariance matrix
      // (DoubleFactorAnalysis is a base class for FactorAnalysisCorrelation and
      // FactorAnalysisCovariance).

      // Construct the factor analysis object we use for our analysis. The simplest
      // constructor takes only the case data as an argument. Other constructors take
      // instances of the class parameter classes Extraction and Rotation, 
      // allowing you to pre-configure the options on these objects. When not specified,
      // instances of the Extraction and Rotation classes will be created with their
      // no-argument constructors for use in the analysis.
      var factorAnalysis = new FactorAnalysisCorrelation<PCFactorExtraction, VarimaxRotation>( carSalesData.ToDoubleMatrix() );

      // First, look at the extracted communalities. 
      // Extracted communalities are estimates of the proportion of variance in each variable
      // accounted for by the factors. 
      DoubleVector extractedCommunalities = factorAnalysis.ExtractedCommunalities;
      Console.WriteLine();
      Console.WriteLine( "Predictor\tExtracted Communality" );
      Console.WriteLine( "-------------------------------------" );
      for ( int i = 0; i < carSalesData.Cols; i++ )
      {
        WriteRowLabel( carSalesData[i].Name );
        Console.WriteLine( extractedCommunalities[i].ToString( "G3" ) );
      }
      // The communalities are all high (close to 1.0) indicating that the extracted 
      // factors represent the variables well.

      Console.WriteLine();

      // Next we look at the variance explained by the initial solution 
      // by printing out a table of these values. 
      // The first column will just be the extracted factor number (starting at 1,
      // matching the column numbering of the rotated factor matrix printed below). 
      //
      // The second 'Total' column gives the eigenvalue, or amount of 
      // variance in the original variables accounted for by each factor.
      // Since by default factors with eigenvalues greater than one will be
      // extracted, the first three factors will be extracted.
      //
      // The % of Variance column gives the ratio, expressed as a percentage, 
      // of the variance accounted for by each factor to the total 
      // variance in all of the variables.
      //
      // The Cumulative % column gives the percentage of variance accounted 
      // for by the first n factors. For example, the cumulative percentage
      // for the second factor is the sum of the percentage of variance
      // for the first and second factors.

      // Fetch each result vector once instead of re-reading the properties on
      // every loop iteration.
      var eigenvalues = factorAnalysis.FactorExtraction.Eigenvalues;
      var varianceProportions = factorAnalysis.VarianceProportions;
      var cumulativeVarianceProportions = factorAnalysis.CumulativeVarianceProportions;

      Console.WriteLine( "Factor\tTotal\tVariance\tCumulative" );
      Console.WriteLine( "------------------------------------------" );
      for ( int i = 0; i < varianceProportions.Length; i++ )
      {
        Console.Write( ( i + 1 ) + "\t" );
        Console.Write( eigenvalues[i].ToString( "G4" ) + '\t' );
        Console.Write( varianceProportions[i].ToString( "P4" ) + '\t' );
        Console.WriteLine( cumulativeVarianceProportions[i].ToString( "P4" ) );
      }
      // We can see from this table that the first three factors account for nearly
      // 88% of the total variance.

      // Next we look at the percentages of variance explained by the
      // extracted rotated factors. Comparing this table with the first
      // three rows of the previous one (three factors are extracted)
      // we see that the cumulative percentage of variation explained by the
      // extracted factors is maintained by the rotated factors, 
      // but that variation is now spread more evenly over the factors.
      // This suggests that the rotated factor matrix will be 
      // easier to interpret than the unrotated matrix.
      double eigenValueSum = NMathFunctions.Sum( eigenvalues );
      var rotatedSumOfSquaredLoadings = factorAnalysis.RotatedSumOfSquaredLoadings;
      DoubleVector rotatedVarianceProportions = rotatedSumOfSquaredLoadings / eigenValueSum;
      Console.WriteLine( "\nRotated Extraction Sums of Squared Loadings" );
      Console.WriteLine();
      Console.WriteLine( "Factor\tTotal\tVariance\tCumulative" );
      Console.WriteLine( "------------------------------------------" );
      double cumulative = 0;

      for ( int i = 0; i < factorAnalysis.NumberOfFactors; i++ )
      {
        cumulative += rotatedVarianceProportions[i];
        Console.Write( i + 1 );
        Console.Write( "\t" );
        Console.Write( rotatedSumOfSquaredLoadings[i].ToString( "G3" ) );
        Console.Write( "\t" );
        Console.Write( rotatedVarianceProportions[i].ToString( "P3" ) );
        Console.Write( "\t" );
        Console.WriteLine( cumulative.ToString( "P3" ) );
      }

      Console.WriteLine();

      // The rotated factor matrix helps you to determine what the factors represent.
      DoubleMatrix rotatedComponentMatrix = factorAnalysis.RotatedFactors;

      // Print one column per factor actually present in the rotated matrix rather
      // than assuming exactly three, so a different extraction result neither
      // drops columns nor indexes past the end of the matrix.
      int factorColumns = rotatedComponentMatrix.Cols;

      Console.WriteLine( "\nRotated Factor Matrix" );
      Console.WriteLine();
      Console.WriteLine( "Predictor\tFactor" );
      Console.Write( "\t" );
      for ( int j = 0; j < factorColumns; j++ )
      {
        Console.Write( "\t" + ( j + 1 ) );
      }
      Console.WriteLine();
      Console.WriteLine( "-------------------------------------" );
      for ( int i = 0; i < carSalesData.Cols; i++ )
      {
        WriteRowLabel( carSalesData[i].Name );
        for ( int j = 0; j < factorColumns; j++ )
        {
          if ( j > 0 )
          {
            Console.Write( '\t' );
          }
          Console.Write( rotatedComponentMatrix[i, j].ToString( "F3" ) );
        }
        Console.WriteLine();
      }
      Console.WriteLine();
      // The first factor is most highly correlated with price (in thousands) 
      // and horsepow (horsepower). Price in thousands is a better representative, 
      // however, because it is less correlated with the other two factors.
      //
      // The second factor is most highly correlated with length.
      //
      // The third factor is most highly correlated with vehicle type.
      // This suggests that you can focus on price, length, 
      // and type in further analyses. To do so, however, would ignore
      // any input the other variables might contribute to the analysis.
      // It is therefore preferable to use the three new factors as
      // our new variables. They are representative of all ten original
      // variables and are not linearly correlated with one another.
      // The case data values for new factor variables are contained in the factor
      // scores matrix. There are different algorithms for producing the factor
      // scores. The FactorScores function can be passed an object implementing
      // the IFactorScores interface, thus specifying the algorithm to be used.
      // If no argument is passed to the FactorScores method, the regression
      // algorithm for computing factor scores will be used. The method is 
      // implemented in the class RegressionFactorScores.

      // Print out the factor scores for the first three cases.
      Console.WriteLine( "Factor scores for the first three cases (normalized)" );
      Console.WriteLine( "----------------------------------------------------" );
      var rowSlice = new Slice( 0, 3 );
      Console.WriteLine( factorAnalysis.FactorScores()[rowSlice, Slice.All].ToTabDelimited( "G3" ) );

      // Factor scores are a linear combination of the ten original variable values. 
      // The coefficients used for the linear combination are found in the 
      // factor score coefficients matrix. This matrix may be obtained from the
      // FactorScoreCoefficients method on the factor analysis class. Like factor
      // scores, the algorithm for their computation may be specified by passing
      // an object implementing the IFactorScores interface to this method. If
      // no method is passed, score coefficients will be computed using the 
      // regression algorithm implemented in the class RegressionFactorScores.
      //
      // Suppose we receive two new cases containing values for the ten car sales
      // predictor variables. We can compute the values, or scores, for our three
      // new factor variables by multiplying by the factor score coefficients:
      DoubleMatrix scoreCoefficients = factorAnalysis.FactorScoreCoefficients();
      var newCaseData = new DoubleMatrix( "2x10 [0.0 38.9 3.8 196.0 115.4 71.9 177.0 3.972 17.5 27.8  " +
                                          "1.0 46.0 2.5 220.0 101.6 73.4 168.6 3.75  19.0 20.0]" );
      DoubleMatrix scores = NMathFunctions.Product( newCaseData, scoreCoefficients );
      Console.WriteLine( "Scores for new case data" );
      Console.WriteLine( "---------------------------------------------" );
      Console.WriteLine( scores.ToTabDelimited( "G3" ) );

      Console.WriteLine();
      Console.WriteLine( "Press Enter Key" );
      // ReadLine consumes the whole line up to Enter, matching the prompt;
      // Read would take a single character and leave the rest buffered.
      Console.ReadLine();
    }

    /// <summary>
    /// Writes a row label to the console followed by the tab padding that lines
    /// the following value up with the table's value column.
    /// </summary>
    /// <param name="label">The text to print at the start of the row.</param>
    private static void WriteRowLabel( string label )
    {
      Console.Write( label + "\t" );

      // Labels shorter than eight characters get a second tab so that their
      // values start in the same column as those of longer labels.
      if ( label.Length < 8 )
      {
        Console.Write( '\t' );
      }
    }
  }
}

← All NMath Stats Code Examples
Top