using System;
using System.IO;

using CenterSpace.NMath.Core;

namespace CenterSpace.NMath.Examples.CSharp
{
  /// <summary>
  /// A .NET example in C# showing how to perform a factor analysis on the
  /// correlation matrix of a set of case data, using the
  /// FactorAnalysisCorrelation class with the PCFactorExtraction and
  /// VarimaxRotation class parameters.
  /// </summary>
  class FACorrelationExample
  {
    /// <summary>
    /// Loads the car sales data from "car_sales.csv", runs the factor analysis,
    /// and writes the resulting tables to the console.
    /// </summary>
    /// <param name="args">Command line arguments (not used).</param>
    static void Main( string[] args )
    {
      // NMath Stats provides classes for performing a factor analysis on a set of case data.
      // Case data should be provided to these classes in matrix form - the variable values
      // in columns and each row representing a case. In this example we look at some data
      // for car sales. For each sale ten variable values are recorded:
      //
      // type     - sedan, mini-van, etc, recorded as integers 0, 1,...
      // price    - the price in thousands,
      // engine_s - engine size,
      // horsepow - horsepower,
      // wheelbas - wheelbase,
      // width    - car width,
      // length   - car length,
      // curb_wgt - weight,
      // fuel_cap - fuel capacity,
      // mpg      - fuel miles per gallon.
      //
      // We would like to predict car sales from this set of predictors. However, many
      // of the predictors are correlated, and we fear that this might adversely affect
      // our results. So we use factor analysis to focus on a manageable set of
      // predictors.

      // First load the data, which is in a comma delimited form.
      DataFrame carSalesData = DataFrame.Load( "car_sales.csv", true, false, ",", true ).CleanRows();

      // NMath Stats provides three classes for
      // performing factor analysis. All will perform analysis on the correlation matrix
      // or the covariance matrix of case data. In addition each of these classes has
      // two class parameters, one specifying the algorithm used to extract the factors,
      // and the other specifying a factor rotation method. Here we use the class
      // FactorAnalysisCorrelation, which analyzes the correlation matrix, with
      // principal factors extraction and varimax rotation.
      // The other two factor analysis classes are FactorAnalysisCovariance, for analyzing
      // the covariance matrix, and DoubleFactorAnalysis which can be used if you don't
      // have access to the original case data, just the correlation or covariance matrix
      // (DoubleFactorAnalysis is a base class for FactorAnalysisCorrelation and
      // FactorAnalysisCovariance).

      // Construct the factor analysis object we use for our analysis. The simplest
      // constructor takes only the case data as an argument. Other constructors take
      // instances of the class parameter classes Extraction and Rotation,
      // allowing you to pre-configure the options on these objects. When not specified,
      // instances of the Extraction and Rotation classes will be created with their
      // no-argument constructors for use in the analysis.
      var fa = new FactorAnalysisCorrelation<PCFactorExtraction, VarimaxRotation>( carSalesData.ToDoubleMatrix() );

      // First, look at the extracted communalities.
      // Extracted communalities are estimates of the proportion of variance in each variable
      // accounted for by the factors.
      DoubleVector extractedCommunalities = fa.ExtractedCommunalities;
      Console.WriteLine();
      Console.WriteLine( "Predictor\tExtracted Communality" );
      Console.WriteLine( "-------------------------------------" );
      for ( int i = 0; i < carSalesData.Cols; i++ )
      {
        Console.Write( carSalesData[i].Name + "\t" );

        // Short names get an extra tab so the value column stays aligned.
        if ( carSalesData[i].Name.Length < 8 )
        {
          Console.Write( "\t" );
        }
        Console.WriteLine( extractedCommunalities[i].ToString( "G3" ) );
      }

      // The communalities are all high (close to 1.0) indicating that the extracted
      // factors represent the variables well.
      Console.WriteLine();

      // Next we look at the variance explained by the initial solution
      // by printing out a table of these values.
      // The first column will just be the extracted factor number.
      //
      // The second column, Total, gives the eigenvalue, or amount of
      // variance in the original variables accounted for by each factor.
      // Since by default factors with eigenvalues greater than one will be
      // extracted, the first three factors will be extracted.
      //
      // The % of Variance column gives the ratio, expressed as a percentage,
      // of the variance accounted for by each factor to the total
      // variance in all of the variables.
      //
      // The Cumulative % column gives the percentage of variance accounted
      // for by the first n factors. For example, the cumulative percentage
      // for the second factor is the sum of the percentage of variance
      // for the first and second factors.
      Console.WriteLine( "Factor\tTotal\tVariance\tCumulative" );
      Console.WriteLine( "------------------------------------------" );
      for ( int i = 0; i < fa.VarianceProportions.Length; i++ )
      {
        Console.Write( i + "\t" );
        Console.Write( fa.FactorExtraction.Eigenvalues[i].ToString( "G4" ) + "\t" );
        Console.Write( fa.VarianceProportions[i].ToString( "P4" ) + "\t" );
        Console.WriteLine( fa.CumulativeVarianceProportions[i].ToString( "P4" ) );
      }

      // We can see from this table that the first three factors account for nearly
      // 88% of the total variance.

      // Next we look at the percentages of variance explained by the
      // extracted rotated factors. Comparing this table with the first
      // three rows of the previous one (three factors are extracted)
      // we see that the cumulative percentage of variation explained by the
      // extracted factors is maintained by the rotated factors,
      // but that variation is now spread more evenly over the factors.
      // This suggests that the rotated factor matrix will be
      // easier to interpret than the unrotated matrix.
      double eigenValueSum = NMathFunctions.Sum( fa.FactorExtraction.Eigenvalues );
      DoubleVector rotatedSSLoadingsVarianceProportions = fa.RotatedSumOfSquaredLoadings / eigenValueSum;
      Console.WriteLine( "\nRotated Extraction Sums of Squared Loadings" );
      Console.WriteLine();
      Console.WriteLine( "Factor\tTotal\tVariance\tCumulative" );
      Console.WriteLine( "------------------------------------------" );
      double cumulative = 0;
      for ( int i = 0; i < fa.NumberOfFactors; i++ )
      {
        cumulative += rotatedSSLoadingsVarianceProportions[i];
        Console.Write( i );
        Console.Write( "\t" );
        Console.Write( fa.RotatedSumOfSquaredLoadings[i].ToString( "G3" ) );
        Console.Write( "\t" );
        Console.Write( rotatedSSLoadingsVarianceProportions[i].ToString( "P3" ) );
        Console.Write( "\t" );
        Console.WriteLine( cumulative.ToString( "P3" ) );
      }
      Console.WriteLine();

      // The rotated factor matrix helps you to determine what the factors represent.
      // NOTE: the table below prints exactly three factor columns, matching the
      // three factors extracted for this data set.
      DoubleMatrix rotatedComponentMatrix = fa.RotatedFactors;
      Console.WriteLine( "\nRotated Factor Matrix" );
      Console.WriteLine();
      Console.WriteLine( "Predictor\tFactor" );
      Console.WriteLine( "\t\t1\t2\t3" );
      Console.WriteLine( "-------------------------------------" );
      for ( int i = 0; i < carSalesData.Cols; i++ )
      {
        Console.Write( carSalesData[i].Name + "\t" );

        // Short names get an extra tab so the factor columns stay aligned.
        if ( carSalesData[i].Name.Length < 8 )
        {
          Console.Write( "\t" );
        }
        Console.Write( rotatedComponentMatrix[i, 0].ToString( "F3" ) + "\t" );
        Console.Write( rotatedComponentMatrix[i, 1].ToString( "F3" ) + "\t" );
        Console.WriteLine( rotatedComponentMatrix[i, 2].ToString( "F3" ) );
      }
      Console.WriteLine();

      // The first factor is most highly correlated with price (in thousands)
      // and horsepow (horsepower). Price in thousands is a better representative,
      // however, because it is less correlated with the other two factors.
      //
      // The second factor is most highly correlated with Length.
      //
      // The third factor is most highly correlated with vehicle type.
      // This suggests that you can focus on price, length,
      // and type in further analyses. To do so, however, would ignore
      // any input the other variables might contribute to the analysis.
      // It is therefore preferable to use the three new factors as
      // our new variables. They are representative of all ten original
      // variables and are not linearly correlated with one another.

      // The case data values for new factor variables are contained in the factor
      // scores matrix. There are different algorithms for producing the factors
      // scores. The FactorScores function can be passed an object implementing
      // the IFactorScores interface, thus specifying the algorithm to be used.
      // If no argument is passed to the FactorScores method, the regression
      // algorithm for computing factor scores will be used. The method is
      // implemented in the class RegressionFactorScores.

      // Print out the factor scores for the first three cases.
      Console.WriteLine( "Factor scores for the first three cases (normalized)" );
      Console.WriteLine( "----------------------------------------------------" );
      var rowSlice = new Slice( 0, 3 );
      Console.WriteLine( fa.FactorScores()[rowSlice, Slice.All].ToTabDelimited( "G3" ) );

      // Factor scores are a linear combination of the ten original variable values.
      // The coefficients used for the linear combination are found in the
      // factor score coefficients matrix. This matrix may be obtained from the
      // FactorScoreCoefficients method on the factor analysis class. Like factor
      // scores, the algorithm for their computation may be specified by passing
      // an object implementing the IFactorScores interface to this method. If
      // no method is passed, scores coefficients will be computed using the
      // regression algorithm implemented in the class RegressionFactorScores.
      //
      // Suppose we receive two new cases containing values for the ten car sales
      // predictor variables. We can compute the values, or scores, for our three
      // new factor variables by multiplying by the factor score coefficients:
      DoubleMatrix scoreCoefficients = fa.FactorScoreCoefficients();
      var newCaseData = new DoubleMatrix( "2x10 [0.0 38.9 3.8 196.0 115.4 71.9 177.0 3.972 17.5 27.8 " +
                                          "1.0 46.0 2.5 220.0 101.6 73.4 168.6 3.75 19.0 20.0]" );
      DoubleMatrix scores = NMathFunctions.Product( newCaseData, scoreCoefficients );
      Console.WriteLine( "Scores for new case data" );
      Console.WriteLine( "---------------------------------------------" );
      Console.WriteLine( scores.ToTabDelimited( "G3" ) );

      Console.WriteLine();
      Console.WriteLine( "Press Enter Key" );
      Console.Read();
    }
  }
}