// ← All NMath Code Examples
using System;
using System.IO;
using CenterSpace.NMath.Core;
namespace CenterSpace.NMath.Examples.CSharp
{
  /// <summary>
  /// Example showing how to perform a factor analysis on the correlation matrix
  /// of a set of case data using FactorAnalysisCorrelation.
  /// </summary>
  class FACorrelationExample
  {
    /// <summary>
    /// Loads the car sales data, runs the factor analysis, and prints the
    /// communalities, explained variance tables, rotated factor matrix and
    /// factor scores to the console.
    /// </summary>
    /// <param name="args">Command line arguments (not used).</param>
    static void Main( string[] args )
    {
      // NMath Stats provide classes for performing a factor analysis on a set of case data.
      // Case data should be provided to these classes in matrix form - the variable values
      // in columns and each row representing a case. In this example we look at some data
      // for car sales. For each sale ten variable values are recorded:
      //
      // type - sedan, mini-van, etc, recorded as integers 0, 1,...
      // price - the price in thousands,
      // engine_s - engine size,
      // horsepow - horsepower,
      // wheelbas - wheelbase,
      // width - car width,
      // length - car length,
      // curb_wgt - weight,
      // fuel_cap - fuel capacity,
      // mpg - fuel miles per gallon.
      //
      // We would like to predict car sales from this set of predictors. However, many
      // of the predictors are correlated, and we fear that this might adversely affect
      // our results. So we use factor analysis to focus on a manageable set of
      // predictors.

      // First load the data, which is in a comma delimited form.
      DataFrame carSalesData = DataFrame.Load( "car_sales.csv", true, false, ",", true ).CleanRows();

      // NMath Stats provides three classes for
      // performing factor analysis. All will perform analysis on the correlation matrix
      // or the covariance matrix of case data. In addition each of these classes has
      // two class parameters, one specifying the algorithm used to extract the factors,
      // and the other specifying a factor rotation method. Here we use the class
      // FactorAnalysisCorrelation, which analyzes the correlation matrix, with
      // principal factors extraction and varimax rotation.
      // The other two factor analysis classes are FactorAnalysisCovariance, for analyzing
      // the covariance matrix, and DoubleFactorAnalysis which can be used if you don't
      // have access to the original case data, just the correlation or covariance matrix
      // (DoubleFactorAnalysis is a base class for FactorAnalysisCorrelation and
      // FactorAnalysisCovariance).

      // Construct the factor analysis object we use for our analysis. The simplest
      // constructor takes only the case data as an argument. Other constructors take
      // instances of the class parameter classes Extraction and Rotation,
      // allowing you to pre-configure the options on these objects. When not specified,
      // instances of the Extraction and Rotation classes will be created with their
      // no-argument constructors for use in the analysis.
      var factorAnalysis = new FactorAnalysisCorrelation<PCFactorExtraction, VarimaxRotation>( carSalesData.ToDoubleMatrix() );

      // First, look at the extracted communalities.
      // Extracted communalities are estimates of the proportion of variance in each variable
      // accounted for by the factors.
      DoubleVector extractedCommunalities = factorAnalysis.ExtractedCommunalities;
      Console.WriteLine();
      Console.WriteLine( "Predictor\tExtracted Communality" );
      Console.WriteLine( "-------------------------------------" );
      for ( int i = 0; i < carSalesData.Cols; i++ )
      {
        WritePredictorName( carSalesData[i].Name );
        Console.WriteLine( extractedCommunalities[i].ToString( "G3" ) );
      }

      // The communalities are all high (close to 1.0) indicating that the extracted
      // factors represent the variables well.
      Console.WriteLine();

      // Next we look at the variance explained by the initial solution
      // by printing out a table of these values.
      // The first column will just be the extracted factor number.
      //
      // The second column, Total, gives the eigenvalue, or amount of
      // variance in the original variables accounted for by each factor.
      // Since by default factors with eigenvalues greater than one will be
      // extracted, the first three factors will be extracted.
      //
      // The % of Variance column gives the ratio, expressed as a percentage,
      // of the variance accounted for by each factor to the total
      // variance in all of the variables.
      //
      // The Cumulative % column gives the percentage of variance accounted
      // for by the first n factors. For example, the cumulative percentage
      // for the second factor is the sum of the percentage of variance
      // for the first and second factors.
      var eigenvalues = factorAnalysis.FactorExtraction.Eigenvalues;
      Console.WriteLine( "Factor\tTotal\tVariance\tCumulative" );
      Console.WriteLine( "------------------------------------------" );
      for ( int i = 0; i < factorAnalysis.VarianceProportions.Length; i++ )
      {
        Console.Write( i + "\t" );
        Console.Write( eigenvalues[i].ToString( "G4" ) + "\t" );
        Console.Write( factorAnalysis.VarianceProportions[i].ToString( "P4" ) + "\t" );
        Console.WriteLine( factorAnalysis.CumulativeVarianceProportions[i].ToString( "P4" ) );
      }

      // We can see from this table that the first three factors account for nearly
      // 88% of the total variance.

      // Next we look at the percentages of variance explained by the
      // extracted rotated factors. Comparing this table with the first
      // three rows of the previous one (three factors are extracted)
      // we see that the cumulative percentage of variation explained by the
      // extracted factors is maintained by the rotated factors,
      // but that variation is now spread more evenly over the factors.
      // This suggests that the rotated factor matrix will be
      // easier to interpret than the unrotated matrix.
      double eigenValueSum = NMathFunctions.Sum( eigenvalues );
      DoubleVector rotatedVarianceProportions = factorAnalysis.RotatedSumOfSquaredLoadings / eigenValueSum;
      Console.WriteLine( "\nRotated Extraction Sums of Squared Loadings" );
      Console.WriteLine();
      Console.WriteLine( "Factor\tTotal\tVariance\tCumulative" );
      Console.WriteLine( "------------------------------------------" );
      double cumulative = 0;
      for ( int i = 0; i < factorAnalysis.NumberOfFactors; i++ )
      {
        cumulative += rotatedVarianceProportions[i];
        Console.Write( i );
        Console.Write( "\t" );
        Console.Write( factorAnalysis.RotatedSumOfSquaredLoadings[i].ToString( "G3" ) );
        Console.Write( "\t" );
        Console.Write( rotatedVarianceProportions[i].ToString( "P3" ) );
        Console.Write( "\t" );
        Console.WriteLine( cumulative.ToString( "P3" ) );
      }
      Console.WriteLine();

      // The rotated factor matrix helps you to determine what the factors represent.
      DoubleMatrix rotatedComponentMatrix = factorAnalysis.RotatedFactors;
      Console.WriteLine( "\nRotated Factor Matrix" );
      Console.WriteLine();
      Console.WriteLine( "Predictor\tFactor" );

      // Build the factor number header (1, 2, 3, ...) from the matrix itself rather
      // than hard-coding three columns, so the table stays correct if a different
      // number of factors is extracted.
      Console.Write( "\t" );
      for ( int factor = 0; factor < rotatedComponentMatrix.Cols; factor++ )
      {
        Console.Write( "\t" + ( factor + 1 ) );
      }
      Console.WriteLine();
      Console.WriteLine( "-------------------------------------" );
      for ( int i = 0; i < carSalesData.Cols; i++ )
      {
        WritePredictorName( carSalesData[i].Name );
        for ( int factor = 0; factor < rotatedComponentMatrix.Cols; factor++ )
        {
          if ( factor > 0 )
          {
            Console.Write( "\t" );
          }
          Console.Write( rotatedComponentMatrix[i, factor].ToString( "F3" ) );
        }
        Console.WriteLine();
      }
      Console.WriteLine();

      // The first factor is most highly correlated with price (in thousands)
      // and horsepow (horsepower). Price in thousands is a better representative,
      // however, because it is less correlated with the other two factors.
      //
      // The second factor is most highly correlated with Length.
      //
      // The third factor is most highly correlated with vehicle type.
      // This suggests that you can focus on price, length,
      // and type in further analyses. To do so, however, would ignore
      // any input the other variables might contribute to the analysis.
      // It is therefore preferable to use the three new factors as
      // our new variables. They are representative of all ten original
      // variables and are not linearly correlated with one another.
      // The case data values for new factor variables are contained in the factor
      // scores matrix. There are different algorithms for producing the factors
      // scores. The FactorScores function can be passed an object implementing
      // the IFactorScores interface, thus specifying the algorithm to be used.
      // If no argument is passed to the FactorScores method, the regression
      // algorithm for computing factor scores will be used. The method is
      // implemented in the class RegressionFactorScores.

      // Print out the factor scores for the first three cases.
      Console.WriteLine( "Factor scores for the first three cases (normalized)" );
      Console.WriteLine( "----------------------------------------------------" );
      var rowSlice = new Slice( 0, 3 );
      Console.WriteLine( factorAnalysis.FactorScores()[rowSlice, Slice.All].ToTabDelimited( "G3" ) );

      // Factor scores are a linear combination of the ten original variable values.
      // The coefficients used for the linear combination are found in the
      // factor score coefficients matrix. This matrix may be obtained from the
      // FactorScoreCoefficients method on the factor analysis class. Like factor
      // scores, the algorithm for their computation may be specified by passing
      // an object implementing the IFactorScores interface to this method. If
      // no method is passed, scores coefficients will be computed using the
      // regression algorithm implemented in the class RegressionFactorScores.
      //
      // Suppose we receive two new cases containing values for the ten car sales
      // predictor variables. We can compute the values, or scores, for our three
      // new factor variables by multiplying by the factor score coefficients:
      DoubleMatrix scoreCoefficients = factorAnalysis.FactorScoreCoefficients();
      var newCaseData = new DoubleMatrix( "2x10 [0.0 38.9 3.8 196.0 115.4 71.9 177.0 3.972 17.5 27.8 " +
                                          "1.0 46.0 2.5 220.0 101.6 73.4 168.6 3.75 19.0 20.0]" );
      DoubleMatrix scores = NMathFunctions.Product( newCaseData, scoreCoefficients );
      Console.WriteLine( "Scores for new case data" );
      Console.WriteLine( "---------------------------------------------" );
      Console.WriteLine( scores.ToTabDelimited( "G3" ) );

      Console.WriteLine();
      Console.WriteLine( "Press Enter Key" );
      Console.Read();
    }

    /// <summary>
    /// Writes a predictor name followed by a tab to the console. Names shorter
    /// than eight characters get a second tab so the values that follow line up
    /// in the same column.
    /// </summary>
    /// <param name="name">The predictor (column) name to print.</param>
    private static void WritePredictorName( string name )
    {
      Console.Write( name + "\t" );
      if ( name.Length < 8 )
      {
        Console.Write( "\t" );
      }
    }
  }
}
// ← All NMath Code Examples