using System; using System.Collections.Generic; using System.IO; using CenterSpace.NMath.Core; namespace LogisticRegressionExample { /// <summary> /// A .NET example in C# showing how to perform logistic regression. /// </summary> class LogisticRegressionExample { static void Main( string[] args ) { Console.WriteLine(); Console.WriteLine( "Coronary Heart Disease Example -----------------" ); Console.WriteLine( Environment.NewLine ); CoronaryHeartDiseaseAge(); Console.WriteLine( Environment.NewLine ); Console.WriteLine( "Low Birth Weight Example -----------------------" ); Console.WriteLine( Environment.NewLine ); LowBirthWeight(); Console.WriteLine( Environment.NewLine ); Console.WriteLine( "Crime Example -----------------------------------" ); Console.WriteLine( Environment.NewLine ); Crime(); Console.WriteLine( Environment.NewLine ); Console.WriteLine( "Press Enter Key" ); Console.Read(); } /// <summary> /// Example relating the presence of coronary heart disease and age. The data consist of subjects /// age and the whether or not the subject displays evidence of coronary heart disease /// (1 for present, 0 for not present). /// </summary> static void CoronaryHeartDiseaseAge() { // The data for this example are stored in a matrix. The first column contains the independent, // or predictor, variable values. The second column contains the observed outcome values (0 or 1), // where 1 indicates the presence of coronary heart disease, and 0 denotes its absence. var chdDataAll = new DoubleMatrix( new StreamReader( new FileStream( "chdage.mat", FileMode.Open ) ) ); // The first column is patient ID. We need last two columns. var chdData = chdDataAll[Slice.All, new Slice( 1, 2 )]; // A logistic regression can be constructed from data in the following format: a matrix whose // rows contain the predictor variable values, and a vector of booleans for the observed values. var obs = new bool[chdData.Rows]; for ( int i = 0; i < chdData.Rows; i++ ) { obs[i] = chdData[i, 1] != 0.0; } DoubleMatrix regMat = chdData[Slice.All, new Slice( 0, 1 )]; // The logistic regression class takes a class parameter indicating the parameter calculation // algorithm to use. Here we use a Newton-Raphson calculator class, essentially an iteratively // reweighted least squares. Since we want our model to have an intercept parameter, we set // the last argument to true. var logReg = new LogisticRegression<NewtonRaphsonParameterCalc>( regMat, obs, true ); // First we check that parameter calculation is successful. If not, we // print out some diagnostic information and exit. if ( !logReg.IsGood ) { Console.WriteLine( "Logistic regression parameter calculation failed:" ); Console.WriteLine( logReg.ParameterCalculationErrorMessage ); var parameterCalc = logReg.ParameterCalculator; Console.WriteLine( "Maximum iterations: " + parameterCalc.MaxIterations ); Console.WriteLine( "Number of iterations: " + parameterCalc.Iterations ); Console.WriteLine( "Newton Raphson converged: " + parameterCalc.Converged ); return; } // Parameter calculation is successful. The fit analysis class is still // under construction and will contain more statistics. For now we look // at the G-statistic. var fitAnalysis = new LogisticRegressionFitAnalysis<NewtonRaphsonParameterCalc>( logReg ); Console.WriteLine( "Log likelihood: " + fitAnalysis.LogLikelihood.ToString( "G3" ) ); Console.WriteLine( "G-statistic: " + fitAnalysis.GStatistic.ToString( "G3" ) ); Console.WriteLine( "G-statistic P-value: " + fitAnalysis.GStatisticPValue.ToString( "G3" ) ); Console.WriteLine(); // Print out the parameter values and related statistics: var parameterEstimates = logReg.ParameterEstimates; Console.WriteLine( "Intercept Parameter:" ); Console.WriteLine( parameterEstimates[0].ToString() ); Console.WriteLine(); Console.WriteLine( "Age Coefficient:" ); Console.WriteLine( parameterEstimates[1].ToString() ); Console.WriteLine(); // Predict the probability of the presence of coronary heart disease for some ages. var ages = new DoubleMatrix( "5x1 [29.0 37.0 48.0 64.0 78.0]" ); var probabilities = logReg.PredictedProbabilities( ages ); for ( int i = 0; i < ages.Rows; i++ ) { Console.WriteLine( "The probability of the presence of coronary heart disease at age {0} is {1}", ages[i, 0], probabilities[i].ToString( "G3" ) ); } } /// <summary> /// Example applying logistic regression to a study of low birth weights. The goal of this study was /// to identify risk factors associated with giving birth to a low birth weight baby. There are four /// variables under consideration: Age, Weight of subject, Race, and Number of physician visits during /// pregnancy. /// </summary> static void LowBirthWeight() { DataFrame data = DataFrame.Load( "lowbwt.dat", true, true, " ", true ); // Logistic regression provides a convenience method for producing design, or dummy, variables // using "reference cell coding". If the categorical variable has k levels, there will be k - 1 // design variables created. Reference cell coding involves setting all the design variable // values to 0 for the reference group, and then setting a single design variable equal to 1 for each of // the other groups. // We first create a data frame containing the design variables and their values // constructed from the Race column of the data. Since the race variable has // 3 levels there will be two design variables. By default they will be named // Race_0 and Race_1. int raceColIndex = data.IndexOfColumn( "Race" ); DataFrame raceDesignVars = LogisticRegression<NewtonRaphsonParameterCalc>.DesignVariables( data[raceColIndex] ); // Next we remove the Race column from our input data and replace it with // the two design variable columns. data.RemoveColumn( raceColIndex ); for ( int c = 0; c < raceDesignVars.Cols; c++ ) { data.InsertColumn( raceColIndex + c, raceDesignVars[c] ); } // Now convert the data frames data to a matrix of floating point values. DoubleMatrix matrixDat = data.ToDoubleMatrix(); // The first column of the data is patient ID and the second column of the data contains the // observed condition of low birth weight. A 1 in the observation column indicates low birth weight // and a 0 indicated normal birth weight. We want to exclude the first column of patient IDs from the // regression data. DoubleMatrix A = matrixDat[Range.All, new Range( 1, Position.End )]; // We now construct the logistic regression. This constructor allows // you to leave the column of observed values in the data matrix. // However you must supply the constructor with the index of the // observation column and a predicate function object for converting // the numerical values to boolean: true if the condition is present // and false if it is not. So in constructing the object we pass in // the matrix containing the independent, or predictor, variable // values and the observed values. Next we pass in a 0 indicating the // matrix column at index 0 contains the observed values. Next we pass // in a lambda expression indicating the nonzero values in the observation // column indicate the presence of low birth weight. Finally we // include an intercept parameter as indicated by the final true // argument. var lr = new LogisticRegression<NewtonRaphsonParameterCalc>( matrixDat, 0, x => x != 0, true ); // Check to see if parameter calculation succeeded. If not print out diagnostics // and exit. Console.WriteLine( "LR good? " + lr.IsGood ); if ( !lr.IsGood ) { Console.WriteLine( "Logistic regression parameter calculation failed:" ); Console.WriteLine( lr.ParameterCalculationErrorMessage ); var parameterCalc = lr.ParameterCalculator; Console.WriteLine( "Maximum iterations: " + parameterCalc.MaxIterations ); Console.WriteLine( "Number of iterations: " + parameterCalc.Iterations ); Console.WriteLine( "Newton Raphson converged: " + parameterCalc.Converged ); return; } // Parameter calculation succeeded. Print out the model parameter estimates // and related information. var parameterEstimates = lr.ParameterEstimates; for ( int i = 0; i < parameterEstimates.Length; i++ ) { var estimate = parameterEstimates[i]; if ( i == 0 ) { Console.WriteLine( "Constant term = {0}, SE = {1}", estimate.Value.ToString( "G3" ), estimate.StandardError.ToString( "G3" ) ); } else { Console.WriteLine( "Coefficient for {0} = {1}, SE = {2}", data[i].Name, estimate.Value.ToString( "G3" ), estimate.StandardError.ToString( "G3" ) ); } } Console.WriteLine(); // We can look at the parameter covariance matrix. Console.WriteLine( "Parameter covariance matrix:" ); Console.WriteLine( lr.ParameterCovarianceMatrix.ToTabDelimited( "G8" ) ); Console.WriteLine(); // Finally, print out some fit information. var fitAnalysis = new LogisticRegressionFitAnalysis<NewtonRaphsonParameterCalc>( lr ); Console.WriteLine( "Log likelihood = " + fitAnalysis.LogLikelihood.ToString( "G3" ) ); Console.WriteLine( "G-statistic = " + fitAnalysis.GStatistic.ToString( "G3" ) ); var pValue = fitAnalysis.GStatisticPValue; Console.WriteLine( "Pr[X^2({0}) > {1}] = {2}", lr.NumberOfPredictors, fitAnalysis.GStatistic.ToString( "G3" ), fitAnalysis.GStatisticPValue.ToString( "G3" ) ); // Predict the probability of a 29 year old white women weighing 159 pounds and with // 5 physician visits during pregnancy. DoubleVector subject = new DoubleVector( 29.0, 159.0, 0.0, 0.0, 5.0 ); double prob = lr.PredictedProbability( subject ); Console.WriteLine( "Estimated probability of a white woman age {0}, weighing {1} lbs, {2} Dr. visits is {3}", subject[0], subject[1], subject[4], prob.ToString( "G5" ) ); } static void Crime() { DataFrame crimeData = DataFrame.Load( "crime.dat", true, false, " ", true ); string[] columnNames = new string[] { "CrimeRat", "MaleTeen", "South", "Educ", "Police59" }; int[] columns = new int[columnNames.Length]; for ( int i = 0; i < columnNames.Length; i++ ) { columns[i] = crimeData.IndexOfColumn( columnNames[i] ); } Subset s = new Subset( columns ); var data = crimeData.GetColumns( s ); var matrixData = data.ToDoubleMatrix(); var lr = new LogisticRegression<NewtonRaphsonParameterCalc>( matrixData, 0, x => x >= 110.0, true ); Console.WriteLine( "lr is good: " + lr.IsGood ); var paramEst = lr.ParameterEstimates; for ( int i = 0; i < paramEst.Length; i++ ) { Console.WriteLine( paramEst[i].ToString() ); } var fit = new LogisticRegressionFitAnalysis<NewtonRaphsonParameterCalc>( lr ); var pearson = fit.PearsonStatistic(); Console.WriteLine( "Pearson Statistic -" ); Console.WriteLine( Environment.NewLine + "Pearson: " + pearson ); Console.WriteLine(); // Calculate the Hosmer Lemeshow statistic using 10 groups. Console.WriteLine( "Hosmer Lemeshow Statistic -" ); var hosmerLemeshowStat = fit.HLStatistic( 10 ); Console.WriteLine( hosmerLemeshowStat ); } } }← All NMath Code Examples