Tcl Library Source Code
Check-in [98e5b7ce4d]
Not logged in
Bounty program for improvements to Tcl and certain Tcl packages.
Tcl 2018 Conference, Houston/TX, US, Oct 15-19
Send your abstracts to tclconference@googlegroups.com or submit via the online form
by Aug 20.

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Polished the source code for the multivariate regression a bit
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 98e5b7ce4d98fd56e9feb7aa8e25508bb3761f74
User & Date: arjenmarkus 2007-03-05 20:46:39
Context
2007-03-07
22:28
* compiler_peg_mecpu.tcl: Fixed typo in name of required package * pkgIndex.tcl: ('gasm;, was incorrectly 'gas'). Bumped to version 0.1.1. check-in: 2977d8c358 user: andreas_kupries tags: trunk
2007-03-05
20:46
Polished the source code for the multivariate regression a bit check-in: 98e5b7ce4d user: arjenmarkus tags: trunk
2007-02-27
23:24
* sets/s.c (from_any): Crashing bug in the Critcl implementation of 'struct::set'. Remembered the old object type X in the from_any conversion function, then converted to type 'list', and at the end tried to release the list using the freeintrep function of type X instead of type 'list'. Fixed by moving the code to remember the type after the conversion to a 'list'. check-in: 615847e351 user: andreas_kupries tags: trunk
Changes
Unified Diff Ignore Whitespace Patch
Changes to modules/math/ChangeLog.





1
2
3
4
5
6
7





2007-02-27  Arjen Markus  <arjenmarkus@users.sourceforge.net>

	* statistics.man : added description of multivariate linear regression procedures
                    (contribution by Eric Kemp-Benedict)
	* statistics.tcl : sources "mvlinreg.tcl" now
	* mvlinreg.tcl   : original source code from Eric, still needs some polishing
                    (the test case needs to be integrated too)
>
>
>
>
>







1
2
3
4
5
6
7
8
9
10
11
12
2007-03-05  Arjen Markus  <arjenmarkus@users.sourceforge.net>

	* mvlinreg.tcl   : polished the source code (adding standard headers)
                    Still to do: test cases

2007-02-27  Arjen Markus  <arjenmarkus@users.sourceforge.net>

	* statistics.man : added description of multivariate linear regression procedures
                    (contribution by Eric Kemp-Benedict)
	* statistics.tcl : sources "mvlinreg.tcl" now
	* mvlinreg.tcl   : original source code from Eric, still needs some polishing
                    (the test case needs to be integrated too)
Changes to modules/math/mvlinreg.tcl.


1
2
3
4
5
6
7
8
9


10

11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43



44





45


46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82

83
84
85

86
87
88
89

90
91
92
93
94







95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222

223
224


225
226
227
228

229
230
231
232
233







234
235
236
237
238
239
240
241


# Copyright 2007 Eric Kemp-Benedict
# Released under the BSD license under any terms
# that allow it to be compatible with tcllib

package provide mvlinreg 0.1

package require math::linearalgebra 1.0
package require math::statistics 0.1.1



# mvlinreg = Multivariate Linear Regression

namespace eval mvlinreg {
    variable epsilon 1.0e-7
    
    namespace export tstat wls ols
    
    namespace import -force \
        ::math::linearalgebra::mkMatrix \
        ::math::linearalgebra::mkVector \
        ::math::linearalgebra::mkIdentity \
        ::math::linearalgebra::mkDiagonal \
        ::math::linearalgebra::getrow \
        ::math::linearalgebra::setrow \
        ::math::linearalgebra::getcol \
        ::math::linearalgebra::setcol \
        ::math::linearalgebra::getelem \
        ::math::linearalgebra::setelem \
        ::math::linearalgebra::dotproduct \
        ::math::linearalgebra::matmul \
        ::math::linearalgebra::add \
        ::math::linearalgebra::sub \
        ::math::linearalgebra::solveGauss
    
    # NOTE: The authors of math::linearalgebra seem to have 
    #   forgotten to export ::math::linearalgebra::transpose
        
    ########################################
    #
    # t-stats
    #
    ########################################
    # mvlinreg::tstat n ?alpha?
    # Returns inverse of the single-tailed t distribution
    #  given number of degrees of freedom & confidence



    # Defaults to alpha = 0.05





    # Iterates until result is within epsilon of the target


    proc tstat {n {alpha 0.05}} {
        variable epsilon
        variable tvals
        
        if [info exists tvals($n:$alpha)] {
            return $tvals($n:$alpha)
        }
        
        set deltat [expr {100 * $epsilon}]
        # For one-tailed distribution, 
        set ptarg [expr {1.000 - $alpha/2.0}]
        set maxiter 100
        
        # Initial value for t
        set t 2.0
        
        set niter 0
        while {abs([::math::statistics::cdf-students-t $n $t] - $ptarg) > $epsilon} {
            set pstar [::math::statistics::cdf-students-t $n $t]
            set pl [::math::statistics::cdf-students-t $n [expr {$t - $deltat}]]
            set ph [::math::statistics::cdf-students-t $n [expr {$t + $deltat}]]
            
            set t [expr {$t + 2.0 * $deltat * ($ptarg - $pstar)/($ph - $pl)}]
            
            incr niter
            if {$niter == $maxiter} {
                error "mvlinreg::tstat: Did not converge after $niter iterations"
                return {}
            }
        }
        
        # Cache the result to shorten the call in future
        set tvals($n:$alpha) $t
        
        return $t
    }
        

    ########################################
    #
    # Weighted Least Squares

    #
    ########################################
    # mvlinreg::wls w [list y x's] w [list y x's] ...
    # Returns:

    #   R-squared
    #   Adj R-squared
    #   coefficients of x's in fit
    #   standard errors of the coefficients
    #   95% confidence bounds for coefficients







    proc wls {args} {
        
        # Fill the matrices of x & y values, and weights
        # For n points, k coefficients
        
        # The number of points is equal to half the arguments (n weights, n points)
        set n [expr {[llength $args]/2}]
        
        set firstloop true
        # Sum up all y values to take an average
        set ysum 0
        # Add up the weights
        set wtsum 0
        # Count over rows (points) as you go
        set point 0
        foreach {wt pt} $args {
            
            # Check inputs
            if {[string is double $wt] == 0} {
                error "mvlinreg::wls: Weight \"$wt\" is not a number"
                return {}
            }
            
            ## -- Check dimensions, initialize
            if $firstloop {
                # k = num of vals in pt = 1 + number of x's (because of constant)
                set k [llength $pt]
                if {$n <= [expr {$k + 1}]} {
                    error "mvlinreg::wls: Too few degrees of freedom: $k variables but only $n points"
                    return {}
                }
                set X [mkMatrix $n $k]
                set y [mkVector $n]
                set I_x [mkIdentity $k]
                set I_y [mkIdentity $n]
                
                set firstloop false
                
            } else {
                # Have to have same number of x's for all points
                if {$k != [llength $pt]} {
                    error "mvlinreg::wls: Point \"$pt\" has wrong number of values (expected $k)"
                    # Clean up
                    return {}
                }
            }
            
            ## -- Extract values from set of points
            # Make a list of y values
            set yval [expr {double([lindex $pt 0])}]
            setelem y $point $yval
            set ysum [expr {$ysum + $wt * $yval}]
            set wtsum [expr {$wtsum + $wt}]
            # Add x-values to the x-matrix
            set xrow [lrange $pt 1 end]
            # Add the constant (value = 1.0)
            lappend xrow 1.0
            setrow X $point $xrow
            # Create list of weights & square root of weights
            lappend w [expr {double($wt)}]
            lappend sqrtw [expr {sqrt(double($wt))}]
            
            incr point
            
        }

        set ymean [expr {double($ysum)/$wtsum}]
        set W [mkDiagonal $w]
        set sqrtW [mkDiagonal $sqrtw]
        
        # Calculate sum os square differences for x's
        for {set r 0} {$r < $k} {incr r} {
            set xsqrsum 0.0
            set xmeansum 0.0
            # Calculate sum of squared differences as: sum(x^2) - (sum x)^2/n
            for {set t 0} {$t < $n} {incr t} {
                set xval [getelem $X $t $r]
                set xmeansum [expr {$xmeansum + double($xval)}]
                set xsqrsum [expr {$xsqrsum + double($xval * $xval)}]
            }
            lappend xsqr [expr {$xsqrsum - $xmeansum * $xmeansum/$n}]
        }
        
        ## -- Set up the X'WX matrix
        set XtW [matmul [::math::linearalgebra::transpose $X] $W]
        set XtWX [matmul $XtW $X]
        
        # Invert
        set M [solveGauss $XtWX $I_x]
        
        set beta [matmul $M [matmul $XtW $y]]
        
        ### -- Residuals & R-squared
        # 1) Generate list of diagonals of the hat matrix
        set H [matmul $X [matmul $M $XtW]]
        for {set i 0} {$i < $n} {incr i} {
            lappend h_ii [getelem $H $i $i]
        }

        set R [matmul $sqrtW [matmul [sub $I_y $H] $y]]
        set yhat [matmul $H $y]
        
        # 2) Generate list of residuals, sum of squared residuals, r-squared
        set sstot 0.0
        set ssreg 0.0
        # Note: Relying on representation of Vector as a list for y, yhat
        foreach yval $y wt $w yhatval $yhat {
            set sstot [expr {$sstot + $wt * ($yval - $ymean) * ($yval - $ymean)}]
            set ssreg [expr {$ssreg + $wt * ($yhatval - $ymean) * ($yhatval - $ymean)}]
        }
        set r2 [expr {double($ssreg)/$sstot}]
        set adjr2 [expr {1.0 - (1.0 - $r2) * ($n - 1)/($n - $k)}]
        set sumsqresid [dotproduct $R $R]
        set s2 [expr {double($sumsqresid) / double($n - $k)}]
        
        ### -- Confidence intervals for coefficients
        set tvalue [tstat [expr {$n - $k}]]
        for {set i 0} {$i < $k} {incr i} {
            set stderr [expr {sqrt($s2 * [getelem $M $i $i])}]
            set mid [lindex $beta $i]
            lappend stderrs $stderr
            lappend confinterval [list [expr {$mid - $tvalue * $stderr}] [expr {$mid + $tvalue * $stderr}]]
        }

        return [list $r2 $adjr2 $beta $stderrs $confinterval]
    }
    
    ########################################

    #
    # Ordinary (unweighted) Least Squares


    #
    ########################################
    # mvlinreg::ols [list y x's] [list y x's] ...
    # Returns:

    #   R-squared
    #   Adj R-squared
    #   coefficients of x's in fit
    #   standard errors of the coefficients
    #   95% confidence bounds for coefficients







    proc ols {args} {
        set newargs {}
        foreach pt $args {
            lappend newargs 1 $pt
        }
        return [eval wls $newargs]
    }
}
>
>
|
|
|

<
<

|

>
>
|
>
|

|
|
|















|
<
<
|
|
<
|
|
<
<
<
|
|
>
>
>
|
>
>
>
>
>
|
>
>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<
|
|
|
|
|
|
|
|
|
>
|
|
|
>
|
<
<
|
>
|
|
|
|
|
>
>
>
>
>
>
>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|

|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|

|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|

|
|
|
|
>
|
<
>
>
|
<
<
|
>
|
|
|
|
|
>
>
>
>
>
>
>
|
|
|
|
|
|
<

1
2
3
4
5
6


7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34


35
36

37
38



39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79

80
81
82
83
84
85
86
87
88
89
90
91
92
93
94


95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238

239
240
241


242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261

262
# mvreglin.tcl --
#     Addition to the statistics package
#     Copyright 2007 Eric Kemp-Benedict
#     Released under the BSD license under any terms
#     that allow it to be compatible with tcllib



package require math::linearalgebra 1.0
package require math::statistics 0.4

# ::math::statistics --
#     This file adds:
#     mvlinreg = Multivariate Linear Regression
#
namespace eval ::math::statistics {
    variable epsilon 1.0e-7

    namespace export tstat mv-wls mv-ols

    namespace import -force \
        ::math::linearalgebra::mkMatrix \
        ::math::linearalgebra::mkVector \
        ::math::linearalgebra::mkIdentity \
        ::math::linearalgebra::mkDiagonal \
        ::math::linearalgebra::getrow \
        ::math::linearalgebra::setrow \
        ::math::linearalgebra::getcol \
        ::math::linearalgebra::setcol \
        ::math::linearalgebra::getelem \
        ::math::linearalgebra::setelem \
        ::math::linearalgebra::dotproduct \
        ::math::linearalgebra::matmul \
        ::math::linearalgebra::add \
        ::math::linearalgebra::sub \
        ::math::linearalgebra::solveGauss \


        ::math::linearalgebra::transpose
}


# tstats --



#     Returns inverse of the single-tailed t distribution
#     given number of degrees of freedom & confidence
#
# Arguments:
#     n        Number of degrees of freedom
#     alpha    Confidence level (defaults to 0.05)
#
# Result:
#     Inverse of the t-distribution
#
# Note:
#     Iterates until result is within epsilon of the target.
#     It is possible that the iteration does not converge
#
proc ::math::statistics::tstat {n {alpha 0.05}} {
    variable epsilon
    variable tvals

    if [info exists tvals($n:$alpha)] {
        return $tvals($n:$alpha)
    }

    set deltat [expr {100 * $epsilon}]
    # For one-tailed distribution,
    set ptarg [expr {1.000 - $alpha/2.0}]
    set maxiter 100

    # Initial value for t
    set t 2.0

    set niter 0
    while {abs([::math::statistics::cdf-students-t $n $t] - $ptarg) > $epsilon} {
        set pstar [::math::statistics::cdf-students-t $n $t]
        set pl [::math::statistics::cdf-students-t $n [expr {$t - $deltat}]]
        set ph [::math::statistics::cdf-students-t $n [expr {$t + $deltat}]]

        set t [expr {$t + 2.0 * $deltat * ($ptarg - $pstar)/($ph - $pl)}]

        incr niter
        if {$niter == $maxiter} {
            return -code error "::math::statistics::tstat: Did not converge after $niter iterations"

        }
    }

    # Cache the result to shorten the call in future
    set tvals($n:$alpha) $t

    return $t
}

# mv-wls --
#     Weighted Least Squares
#
# Arguments:
#     args       Alternating list of weights and observations
#


# Result:
#     List containing:
#     * R-squared
#     * Adjusted R-squared
#     * Coefficients of x's in fit
#     * Standard errors of the coefficients
#     * 95% confidence bounds for coefficients
#
# Note:
#     The observations are lists starting with the dependent variable y
#     and then the values of the independent variables (x1, x2, ...):
#
#     mv-wls w [list y x's] w [list y x's] ...
#
proc ::math::statistics::mv-wls {args} {

    # Fill the matrices of x & y values, and weights
    # For n points, k coefficients

    # The number of points is equal to half the arguments (n weights, n points)
    set n [expr {[llength $args]/2}]

    set firstloop true
    # Sum up all y values to take an average
    set ysum 0
    # Add up the weights
    set wtsum 0
    # Count over rows (points) as you go
    set point 0
    foreach {wt pt} $args {

        # Check inputs
        if {[string is double $wt] == 0} {
            return -code error "::math::statistics::mv-wls: Weight \"$wt\" is not a number"
            return {}
        }

        ## -- Check dimensions, initialize
        if $firstloop {
            # k = num of vals in pt = 1 + number of x's (because of constant)
            set k [llength $pt]
            if {$n <= [expr {$k + 1}]} {
                return -code error "::math::statistics::mv-wls: Too few degrees of freedom: $k variables but only $n points"
                return {}
            }
            set X [mkMatrix $n $k]
            set y [mkVector $n]
            set I_x [mkIdentity $k]
            set I_y [mkIdentity $n]

            set firstloop false

        } else {
            # Have to have same number of x's for all points
            if {$k != [llength $pt]} {
                return -code error "::math::statistics::mv-wls: Point \"$pt\" has wrong number of values (expected $k)"
                # Clean up
                return {}
            }
        }

        ## -- Extract values from set of points
        # Make a list of y values
        set yval [expr {double([lindex $pt 0])}]
        setelem y $point $yval
        set ysum [expr {$ysum + $wt * $yval}]
        set wtsum [expr {$wtsum + $wt}]
        # Add x-values to the x-matrix
        set xrow [lrange $pt 1 end]
        # Add the constant (value = 1.0)
        lappend xrow 1.0
        setrow X $point $xrow
        # Create list of weights & square root of weights
        lappend w [expr {double($wt)}]
        lappend sqrtw [expr {sqrt(double($wt))}]

        incr point

    }

    set ymean [expr {double($ysum)/$wtsum}]
    set W [mkDiagonal $w]
    set sqrtW [mkDiagonal $sqrtw]

    # Calculate sum os square differences for x's
    for {set r 0} {$r < $k} {incr r} {
        set xsqrsum 0.0
        set xmeansum 0.0
        # Calculate sum of squared differences as: sum(x^2) - (sum x)^2/n
        for {set t 0} {$t < $n} {incr t} {
            set xval [getelem $X $t $r]
            set xmeansum [expr {$xmeansum + double($xval)}]
            set xsqrsum [expr {$xsqrsum + double($xval * $xval)}]
        }
        lappend xsqr [expr {$xsqrsum - $xmeansum * $xmeansum/$n}]
    }

    ## -- Set up the X'WX matrix
    set XtW [matmul [::math::linearalgebra::transpose $X] $W]
    set XtWX [matmul $XtW $X]

    # Invert
    set M [solveGauss $XtWX $I_x]

    set beta [matmul $M [matmul $XtW $y]]

    ### -- Residuals & R-squared
    # 1) Generate list of diagonals of the hat matrix
    set H [matmul $X [matmul $M $XtW]]
    for {set i 0} {$i < $n} {incr i} {
        lappend h_ii [getelem $H $i $i]
    }

    set R [matmul $sqrtW [matmul [sub $I_y $H] $y]]
    set yhat [matmul $H $y]

    # 2) Generate list of residuals, sum of squared residuals, r-squared
    set sstot 0.0
    set ssreg 0.0
    # Note: Relying on representation of Vector as a list for y, yhat
    foreach yval $y wt $w yhatval $yhat {
        set sstot [expr {$sstot + $wt * ($yval - $ymean) * ($yval - $ymean)}]
        set ssreg [expr {$ssreg + $wt * ($yhatval - $ymean) * ($yhatval - $ymean)}]
    }
    set r2 [expr {double($ssreg)/$sstot}]
    set adjr2 [expr {1.0 - (1.0 - $r2) * ($n - 1)/($n - $k)}]
    set sumsqresid [dotproduct $R $R]
    set s2 [expr {double($sumsqresid) / double($n - $k)}]

    ### -- Confidence intervals for coefficients
    set tvalue [tstat [expr {$n - $k}]]
    for {set i 0} {$i < $k} {incr i} {
        set stderr [expr {sqrt($s2 * [getelem $M $i $i])}]
        set mid [lindex $beta $i]
        lappend stderrs $stderr
        lappend confinterval [list [expr {$mid - $tvalue * $stderr}] [expr {$mid + $tvalue * $stderr}]]
    }

    return [list $r2 $adjr2 $beta $stderrs $confinterval]
}

# mv-ols --
#     Ordinary Least Squares
#

# Arguments:
#     args       List of observations
#


# Result:
#     List containing:
#     * R-squared
#     * Adjusted R-squared
#     * Coefficients of x's in fit
#     * Standard errors of the coefficients
#     * 95% confidence bounds for coefficients
#
# Note:
#     The observations are lists starting with the dependent variable y
#     and then the values of the independent variables (x1, x2, ...):
#
#     mv-ols [list y x's] [list y x's] ...
#
proc ::math::statistics::mv-ols {args} {
    set newargs {}
    foreach pt $args {
        lappend newargs 1 $pt
    }
    return [eval wls $newargs]

}