C# Logistic Regression Example

← All NMath Stats Code Examples

 

using System;
using System.Collections.Generic;
using System.IO;

using CenterSpace.NMath.Core;
using CenterSpace.NMath.Stats;

namespace LogisticRegressionExample
{
  /// <summary>
  /// A .NET example in C# showing how to perform logistic regression.
  /// </summary>
  class LogisticRegressionExample
  {
    static void Main( string[] args )
    {
      Console.WriteLine();
      Console.WriteLine( "Coronary Heart Disease Example -----------------" );
      Console.WriteLine( Environment.NewLine );
      CoronaryHeartDiseaseAge();
      Console.WriteLine( Environment.NewLine );

      Console.WriteLine( "Low Birth Weight Example -----------------------" );
      Console.WriteLine( Environment.NewLine );
      LowBirthWeight();
      Console.WriteLine( Environment.NewLine );

      Console.WriteLine( "Crime Example -----------------------------------" );
      Console.WriteLine( Environment.NewLine );
      Crime();
      Console.WriteLine( Environment.NewLine );

      Console.WriteLine( "Press Enter Key" );
      Console.Read();
    }


    /// <summary>
    /// Example relating the presence of coronary heart disease and age. The data consist of subjects'
    /// age and the whether or not the subject displays evidence of coronary heart disease
    /// (1 for present, 0 for not present).
    /// </summary>
    static void CoronaryHeartDiseaseAge()
    {
      // The data for this example are stored in a matrix. The first column contains the independent,
      // or predictor, variable values. The second column contains the observed outcome values (0 or 1),
      // where 1 indicates the presence of coronary heart disease, and 0 denotes its absence.
      var chdDataAll = new DoubleMatrix( new StreamReader( new FileStream( "chdage.mat", FileMode.Open ) ) );

      // The first column is patient ID.  We need last two columns.
      var chdData = chdDataAll[Slice.All, new Slice( 1, 2 )];

      // A logistic regression can be constructed from data in the following format: a matrix whose
      // rows contain the predictor variable values, and a vector of booleans for the observed values.
      var obs = new bool[chdData.Rows];
      for ( int i = 0; i < chdData.Rows; i++ )
      {
        obs[i] = chdData[i, 1] != 0.0;
      }
      DoubleMatrix regMat = chdData[Slice.All, new Slice( 0, 1 )];

      // The logistic regression class takes a class parameter indicating the parameter calculation
      // algorithm to use. Here we use a Newton-Raphson calculator class, essentially an iteratively
      // reweighted least squares. Since we want our model to have an intercept parameter, we set
      // the last argument to true.
      var logReg = new LogisticRegression<NewtonRaphsonParameterCalc>( regMat, obs, true );

      // First we check that parameter calculation is successful. If not, we
      // print out some diagnostic information and exit.
      if ( !logReg.IsGood )
      {
        Console.WriteLine( "Logistic regression parameter calculation failed:" );
        Console.WriteLine( logReg.ParameterCalculationErrorMessage );
        var parameterCalc = logReg.ParameterCalculator;
        Console.WriteLine( "Maximum iterations: " + parameterCalc.MaxIterations );
        Console.WriteLine( "Number of iterations: " + parameterCalc.Iterations );
        Console.WriteLine( "Newton Raphson converged: " + parameterCalc.Converged );
        return;
      }

      // Parameter calculation is successful. The fit analysis class is still
      // under construction and will contain more statistics. For now we look
      // at the G-statistic.
      var fitAnalysis = new LogisticRegressionFitAnalysis<NewtonRaphsonParameterCalc>( logReg );
      Console.WriteLine( "Log likelihood: " + fitAnalysis.LogLikelihood.ToString( "G3" ) );
      Console.WriteLine( "G-statistic: " + fitAnalysis.GStatistic.ToString( "G3" ) );
      Console.WriteLine( "G-statistic P-value: " + fitAnalysis.GStatisticPValue.ToString( "G3" ) );
      Console.WriteLine();

      // Print out the parameter values and related statistics:
      var parameterEstimates = logReg.ParameterEstimates;
      Console.WriteLine( "Intercept Parameter:" );
      Console.WriteLine( parameterEstimates[0].ToString() );
      Console.WriteLine();
      Console.WriteLine( "Age Coefficient:" );
      Console.WriteLine( parameterEstimates[1].ToString() );
      Console.WriteLine();

      // Predict the probability of the presence of coronary heart disease for some ages.
      var ages = new DoubleMatrix( "5x1 [29.0 37.0 48.0 64.0 78.0]" );
      var probabilities = logReg.PredictedProbabilities( ages );
      for ( int i = 0; i < ages.Rows; i++ )
      {
        Console.WriteLine( "The probability of the presence of coronary heart disease at age {0} is {1}",
          ages[i, 0], probabilities[i].ToString( "G3" ) );
      }
    }

    /// <summary>
    /// Example applying logistic regression to a study of low birth weights. The goal of this study was
    /// to identify risk factors associated with giving birth to a low birth weight baby. There are four
    /// variables under consideration: Age, Weight of subject, Race, and Number of physician visits during
    /// pregnancy.
    /// </summary>
    static void LowBirthWeight()
    {
      DataFrame data = DataFrame.Load( "lowbwt.dat", true, true, " ", true );

      // Logistic regression provides a convenience method for producing design, or dummy, variables
      // using "reference cell coding". If the categorical variable has k levels, there will be k - 1
      // design variables created. Reference cell coding involves setting all the design variable
      // values to 0 for the reference group, and then setting a single design variable equal to 1 for each of
      // the other groups.

      // We first create a data frame containing the design variables and their values 
      // constructed from the Race column of the data. Since the race variable has
      // 3 levels there will be two design variables. By default they will be named
      // Race_0 and Race_1.
      int raceColIndex = data.IndexOfColumn( "Race" );
      DataFrame raceDesignVars = LogisticRegression<NewtonRaphsonParameterCalc>.DesignVariables( data[raceColIndex] );

      // Next we remove the Race column from our input data and replace it with 
      // the two design variable columns.
      data.RemoveColumn( raceColIndex );
      for ( int c = 0; c < raceDesignVars.Cols; c++ )
      {
        data.InsertColumn( raceColIndex + c, raceDesignVars[c] );
      }

      // Now convert the data frame's data to a matrix of floating point values.
      DoubleMatrix matrixDat = data.ToDoubleMatrix();

      // The first column of the data is patient ID and the second column of the data contains the
      // observed condition of low birth weight. A 1 in the observation column indicates low birth weight
      // and a 0 indicated normal birth weight. We want to exclude the first column of patient ID's from the
      // regression data.
      DoubleMatrix A = matrixDat[Range.All, new Range( 1, Position.End )];

      // We now construct the logistic regression. This constructor allows
      // you to leave the column of observed values in the data matrix. 
      // However you must supply the constructor with the index of the 
      // observation column and a predicate function object for converting
      // the numerical values to boolean: true if the condition is present
      // and false if it is not. So in constructing the object we pass in
      // the matrix containing the independent, or predictor, variable 
      // values and the observed values. Next we pass in a 0 indicating the
      // matrix column at index 0 contains the observed values. Next we pass
      // in a lambda expression indicating the nonzero values in the observation
      // column indicate the presence of low birth weight. Finally we 
      // include an intercept parameter as indicated by the final true 
      // argument.
      var lr = new LogisticRegression<NewtonRaphsonParameterCalc>( matrixDat, 0, x => x != 0, true );

      // Check to see if parameter calculation succeeded. If not print out diagnostics
      // and exit.
      Console.WriteLine( "LR good? " + lr.IsGood );
      if ( !lr.IsGood )
      {
        Console.WriteLine( "Logistic regression parameter calculation failed:" );
        Console.WriteLine( lr.ParameterCalculationErrorMessage );
        var parameterCalc = lr.ParameterCalculator;
        Console.WriteLine( "Maximum iterations: " + parameterCalc.MaxIterations );
        Console.WriteLine( "Number of iterations: " + parameterCalc.Iterations );
        Console.WriteLine( "Newton Raphson converged: " + parameterCalc.Converged );
        return;
      }

      // Parameter calculation succeeded. Print out the model parameter estimates
      // and related information.
      var parameterEstimates = lr.ParameterEstimates;
      for ( int i = 0; i < parameterEstimates.Length; i++ )
      {
        var estimate = parameterEstimates[i];
        if ( i == 0 )
        {
          Console.WriteLine( "Constant term = {0}, SE = {1}", estimate.Value.ToString( "G3" ),
            estimate.StandardError.ToString( "G3" ) );
        }
        else
        {
          Console.WriteLine( "Coefficient for {0} = {1}, SE = {2}", data[i].Name, estimate.Value.ToString( "G3" ),
            estimate.StandardError.ToString( "G3" ) );
        }
      }
      Console.WriteLine();

      // We can look at the parameter covariance matrix.
      Console.WriteLine( "Parameter covariance matrix:" );
      Console.WriteLine( lr.ParameterCovarianceMatrix.ToTabDelimited( "G8" ) );
      Console.WriteLine();

      // Finally, print out some fit information.
      var fitAnalysis = new LogisticRegressionFitAnalysis<NewtonRaphsonParameterCalc>( lr );
      Console.WriteLine( "Log likelihood = " + fitAnalysis.LogLikelihood.ToString( "G3" ) );
      Console.WriteLine( "G-statistic = " + fitAnalysis.GStatistic.ToString( "G3" ) );
      var pValue = fitAnalysis.GStatisticPValue;
      Console.WriteLine( "Pr[X^2({0}) > {1}] = {2}", lr.NumberOfPredictors, fitAnalysis.GStatistic.ToString( "G3" ),
        fitAnalysis.GStatisticPValue.ToString( "G3" ) );

      // Predict the probability of a 29 year old white women weighing 159 pounds and with
      // 5 physician visits during pregnancy.
      DoubleVector subject = new DoubleVector( 29.0, 159.0, 0.0, 0.0, 5.0 );
      double prob = lr.PredictedProbability( subject );
      Console.WriteLine( "Estimated probability of a white woman age {0}, weighing {1} lbs, {2} Dr. visits is {3}",
        subject[0], subject[1], subject[4], prob.ToString( "G5" ) );
    }

    static void Crime()
    {
      DataFrame crimeData = DataFrame.Load( "crime.dat", true, false, " ", true );

      string[] columnNames = new string[] { "CrimeRat", "MaleTeen", "South", "Educ", "Police59" };
      int[] columns = new int[columnNames.Length];
      for ( int i = 0; i < columnNames.Length; i++ )
      {
        columns[i] = crimeData.IndexOfColumn( columnNames[i] );
      }

      Subset s = new Subset( columns );
      var data = crimeData.GetColumns( s );
      var matrixData = data.ToDoubleMatrix();
      var lr = new LogisticRegression<NewtonRaphsonParameterCalc>( matrixData, 0, x => x >= 110.0, true );
      Console.WriteLine( "lr is good: " + lr.IsGood );
      var paramEst = lr.ParameterEstimates;
      for ( int i = 0; i < paramEst.Length; i++ )
      {
        Console.WriteLine( paramEst[i].ToString() );
      }

      var fit = new LogisticRegressionFitAnalysis<NewtonRaphsonParameterCalc>( lr );
      var pearson = fit.PearsonStatistic();
      Console.WriteLine( "Pearson Statistic -" );
      Console.WriteLine( Environment.NewLine + "Pearson: " + pearson );
      Console.WriteLine();

      // Calculate the Hosmer Lemeshow statistic using 10 groups.
      Console.WriteLine( "Hosmer Lemeshow Statistic -" );
      var hosmerLemeshowStat = fit.HLStatistic( 10 );
      Console.WriteLine( hosmerLemeshowStat );
    }
  }
}

← All NMath Stats Code Examples
Top