_priv_/bim-ss/lowess_8h_source.html

00001 #ifndef _LOWESS_H_INCLUDED
00002 #define _LOWESS_H_INCLUDED
00003
00004 #include <cstdlib>
00005 #include <cmath>
00006 #include <algorithm>    // std::min, std::max
00007
00008 /** Templated class for LOWESS (locally weighted scatterplot smoothing).
00009  *
00010  *  The lowess code is translated from `RATFOR` lowess code written by W. S.
00011  *  Cleveland as obtained from `NETLIB`.
00012  *  It is based on two functions written in ratfor: `lowest`
00013  *  and `lowess`. The code has since been refactored and commented further.
00014  *
00015  *  @authors   W.S. Cleveland, Hannes Roest
00016  *  @copyright Copyright (c) 2015. All rights reserved.
00017  *             This software is released under a three-clause BSD license.
00018  */
00019 template <typename Container, typename T>
00020 class Lowess
00021 {
00022     inline T pow2(T x) { return x * x;  }
00023     inline T pow3(T x) { return x * x * x;  }
00024
00025     /// Return the median of a sequence of numbers defined by the random
00026     /// access iterators begin and end.  The sequence must not be empty
00027     /// (median is undefined for an empty set).
00028     ///
00029     /// The numbers must be convertible to double.
00030     ///
00031     template <class RandAccessIter>
00032     T median(RandAccessIter begin, RandAccessIter end)
00033     {
00034       std::size_t size = end - begin;
00035       std::size_t middleIdx = size / 2;
00036       RandAccessIter target = begin + middleIdx;
00037       std::nth_element(begin, target, end);
00038
00039       if (size % 2 != 0)
00040       {
00041         //Odd number of elements
00042         return *target;
00043       }
00044       else
00045       {
00046         //Even number of elements
00047         double a = *target;
00048         RandAccessIter targetNeighbor = target - 1;
00049         targetNeighbor = std::max_element(begin, target);
00050         return (a + *targetNeighbor) / 2.0;
00051       }
00052     }
00053
00054     /// Calculate weights for weighted regression.
00055     bool calculate_weights(const Container& x,
00056                            const size_t n,
00057                            const T current_x,
00058                            const bool use_resid_weights,
00059                            const size_t nleft,
00060                            const Container& resid_weights,
00061                            Container& weights,
00062                            size_t& nrt,
00063                            const T h)
00064     {
00065       T r;
00066       size_t j;
00067
00068       T h9 = .999 * h;
00069       T h1 = .001 * h;
00070       T a = 0.0; // sum of weights
00071
00072       // compute weights (pick up all ties on right)
00073       for (j = nleft; j < n; j++)
00074       {
00075         // Compute the distance measure, then apply the tricube
00076         // function on the distance to get the weight.
00077         // use_resid_weights will be False on the first iteration, then True
00078         // on the subsequent ones, after some residuals have been calculated.
00079         weights[j] = 0.0;
00080         r = std::abs(x[j] - current_x);
00081         if (r <= h9)
00082         {
00083           if (r > h1)
00084           {
00085             // small enough for non-zero weight
00086             // compute tricube function: ( 1 - (r/h)^3 )^3
00087             weights[j] = pow3(1.0 - pow3(r / h));
00088           }
00089           else
00090           {
00091             weights[j] = 1.0;
00092           }
00093
00094           if (use_resid_weights)
00095           {
00096             weights[j] = resid_weights[j] * weights[j];
00097           }
00098
00099           a += weights[j];
00100         }
00101         else if (x[j] > current_x)
00102         {
00103           // get out at first zero wt on right
00104           break;
00105         }
00106       }
00107
00108       // rightmost pt (may be greater than nright because of ties)
00109       nrt = j - 1;
00110       if (a <= 0.0)
00111       {
00112         return false;
00113       }
00114       else
00115       {
00116         // normalize weights (make sum of w[j] == 1)
00117         for (j = nleft; j <= nrt; j++)
00118         {
00119           weights[j] = weights[j] / a;
00120         }
00121
00122         return true;
00123       }
00124     }
00125
00126     /// Calculate smoothed/fitted y-value by weighted regression.
00127     void calculate_y_fit(const Container& x,
00128                          const Container& y,
00129                          const T current_x,
00130                          const size_t n,
00131                          const size_t nleft,
00132                          const size_t nrt,
00133                          const T h,
00134                          T& ys,
00135                          Container& weights)
00136     {
00137       T range = x[n - 1] - x[0];
00138
00139       if (h > 0.0)
00140       {
00141         // use linear fit
00142
00143         // No regression function (e.g. lstsq) is called. Instead a "projection
00144         // vector" p_i_j is calculated, and y_fit[i] = sum(p_i_j * y[j]) = y_fit[i]
00145         // for j s.t. x[j] is in the neighborhood of x[i]. p_i_j is a function of
00146         // the weights, x[i], and its neighbors.
00147         // To save space, p_i_j is computed in place using the weight vector.
00148
00149         // find weighted center of x values
00150         T sum_weighted_x = 0.0; // originally variable a
00151         for (size_t j = nleft; j <= nrt; j++)
00152         {
00153           sum_weighted_x += weights[j] * x[j];
00154         }
00155
00156         T b = current_x - sum_weighted_x; // originally variable b
00157         T weighted_sqdev = 0.0; // originally variable c
00158         for (size_t j = nleft; j <= nrt; j++)
00159         {
00160           weighted_sqdev += weights[j] *
00161                             (x[j] - sum_weighted_x) * (x[j] - sum_weighted_x);
00162         }
00163
00164         if (sqrt(weighted_sqdev) > .001 * range)
00165         {
00166           // points are spread out enough to compute slope
00167           b = b / weighted_sqdev;
00168           for (size_t j = nleft; j <= nrt; j++)
00169           {
00170             // Compute p_i_j in place
00171             weights[j] = weights[j] * (1.0 + b * (x[j] - sum_weighted_x));
00172           }
00173         }
00174       }
00175
00176       ys = 0.0;
00177       for (size_t j = nleft; j <= nrt; j++)
00178       {
00179         ys += weights[j] * y[j];
00180       }
00181     }
00182
00183     bool lowest(const Container& x,
00184                 const Container& y,
00185                 size_t n,
00186                 T current_x, //xs
00187                 T& ys,
00188                 size_t nleft,
00189                 size_t nright,
00190                 Container& weights, // vector w
00191                 bool use_resid_weights,  // userw
00192                 const Container& resid_weights)
00193     {
00194       T h;
00195       size_t nrt; // rightmost pt (may be greater than nright because of ties)
00196
00197       h = std::max(current_x - x[nleft], x[nright] - current_x);
00198
00199       // Calculate the weights for the regression in this neighborhood.
00200       // Determine if at least some weights are positive, so a regression
00201       // is ok.
00202       bool fit_ok = calculate_weights(x, n, current_x, use_resid_weights,
00203                                       nleft, resid_weights,
00204                                       weights, nrt, h);
00205       if (!fit_ok)
00206       {
00207         return fit_ok;
00208       }
00209
00210       // If it is ok to fit, run the weighted least squares regression
00211       calculate_y_fit(x, y, current_x, n, nleft, nrt, h, ys, weights);
00212
00213       return fit_ok;
00214     }
00215
00216     /// Find the indices bounding the k-nearest-neighbors of the current point.
00217     void update_neighborhood(const Container& x,
00218                              const size_t n,
00219                              const size_t i,
00220                              size_t& nleft,
00221                              size_t& nright)
00222     {
00223       T d1, d2;
00224       // A subtle loop. Start from the current neighborhood range:
00225       // [nleft, nright). Shift both ends rightwards by one
00226       // (so that the neighborhood still contains ns points), until
00227       // the current point is in the center (or just to the left of
00228       // the center) of the neighborhood. This neighborhood will
00229       // contain the ns-nearest neighbors of x[i].
00230       //
00231       // Once the right end hits the end of the data, hold the
00232       // neighborhood the same for the remaining x[i]s.
00233       while (nright < n - 1)
00234       {
00235         // move nleft, nright to right if radius decreases
00236         d1 = x[i] - x[nleft];
00237         d2 = x[nright + 1] - x[i];
00238         // if d1 <= d2 with x[nright+1] == x[nright], lowest fixes
00239         if (d1 <= d2) break;
00240         // radius will not decrease by move right
00241         nleft++;
00242         nright++;
00243       }
00244     }
00245
00246     /// Update the counters of the local regression.
00247     void update_indices(const Container& x,
00248                         const size_t n,
00249                         const T delta,
00250                         size_t& i,
00251                         size_t& last,
00252                         Container& ys)
00253     {
00254       // For most points within delta of the current point, we skip the
00255       // weighted linear regression (which save much computation of
00256       // weights and fitted points). Instead, we'll jump to the last
00257       // point within delta, fit the weighted regression at that point,
00258       // and linearly interpolate in between.
00259
00260       // the last point actually estimated
00261       last = i;
00262
00263       // This loop increments until we fall just outside of delta distance,
00264       // copying the results for any repeated x's along the way.
00265       T cut = x[last] + delta;
00266       for (i = last + 1; i < n; i++)
00267       {
00268         // find close points
00269         if (x[i] > cut) break;
00270
00271         // i one beyond last pt within cut
00272         if (x[i] == x[last])
00273         {
00274           // exact match in x
00275           // if tied with previous x-value, just use the already
00276           // fitted y, and update the last-fit counter.
00277           ys[i] = ys[last];
00278           last = i;
00279         }
00280       }
00281
00282
00283       // the next point to fit the regression at is either one prior to i (since
00284       // i should be the first point outside of delta) or it is "last + 1" in the
00285       // case that i never got incremented. This insures we always step forward.
00286       // -> back 1 point so interpolation within delta, but always go forward
00287       i = std::max(last + 1, i - 1);
00288     }
00289
00290     /// Calculate smoothed/fitted y by linear interpolation between the current
00291     /// and previous y fitted by weighted regression.
00292     void interpolate_skipped_fits(const Container& x,
00293                                   const size_t i,
00294                                   const size_t last,
00295                                   Container& ys)
00296     {
00297       // skipped points -- interpolate
00298       T alpha;
00299       T denom = x[i] - x[last]; // non-zero - proof?
00300       for (size_t j = last + 1; j < i; j = j + 1)
00301       {
00302         alpha = (x[j] - x[last]) / denom;
00303         ys[j] = alpha * ys[i] + (1.0 - alpha) * ys[last];
00304       }
00305     }
00306
00307     /// Calculate residual weights for the next `robustifying` iteration.
00308     void calculate_residual_weights(const size_t n,
00309                                     const Container& weights,
00310                                     Container& resid_weights)
00311     {
00312       T r;
00313
00314       for (size_t i = 0; i < n; i++)
00315       {
00316         resid_weights[i] = std::abs(weights[i]);
00317       }
00318
00319       // ***********************************
00320       // Compute pseudo-median (take average even if we have an odd number of
00321       // elements), following the original implementation. We could also use a
00322       // true median calculation here:
00323       // T cmad = 6.0 * median(resid_weights.begin(), resid_weights.end());
00324       // ***********************************
00325
00326       size_t m1 = n / 2; // FORTRAN starts with one, CPP with zero
00327       // size_t m1 = 1 + n / 2; // original FORTRAN code
00328       // size_t m2 = n - m1 + 1; // see below, we don't explicitly sort but use max_element
00329
00330       // Use nth element to find element m1, which produces a partially sorted
00331       // vector. This means we can get element m2 by looking for the maximum in the
00332       // remainder.
00333       typename Container::iterator it_m1 = resid_weights.begin() + m1;
00334       std::nth_element(resid_weights.begin(), it_m1, resid_weights.end());
00335       typename Container::iterator it_m2 = std::max_element(
00336         resid_weights.begin(), it_m1);
00337       T cmad = 3.0 * (*it_m1 + *it_m2);
00338       T c9 = .999 * cmad;
00339       T c1 = .001 * cmad;
00340
00341       for (size_t i = 0; i < n; i++)
00342       {
00343         r = std::abs(weights[i]);
00344         if (r <= c1)
00345         {
00346           // near 0, avoid underflow
00347           resid_weights[i] = 1.0;
00348         }
00349         else if (r > c9)
00350         {
00351           // near 1, avoid underflow
00352           resid_weights[i] = 0.0;
00353         }
00354         else
00355         {
00356           resid_weights[i] = pow2(1.0 - pow2(r / cmad));
00357         }
00358       }
00359     }
00360
00361 public:
00362     int lowess(const Container& x,
00363                const Container& y,
00364                double frac,    // parameter f
00365                int nsteps,
00366                T delta,
00367                Container& ys,
00368                Container& resid_weights,   // vector rw
00369                Container& weights   // vector res
00370                )
00371     {
00372       bool fit_ok;
00373       size_t i, last, nleft, nright, ns;
00374
00375       size_t n = x.size();
00376       if (n < 2)
00377       {
00378         ys[0] = y[0];
00379         return 1;
00380       }
00381
00382       // how many points around estimation point should be used for regression:
00383       // at least two, at most n points
00384       size_t tmp = (size_t)(frac * (double)n);
00385       ns = std::max(std::min(tmp, n), (size_t)2);
00386
00387       // robustness iterations
00388       for (int iter = 1; iter <= nsteps + 1; iter++)
00389       {
00390         // start of array in C++ at 0 / in FORTRAN at 1
00391         nleft = 0;
00392         nright = ns - 1;
00393         last = -1;          // index of prev estimated point
00394         i = 0;              // index of current point
00395
00396         // Fit all data points y[i] until the end of the array
00397         do
00398         {
00399           // Identify the neighborhood around the current x[i]
00400           // -> get the nearest ns points
00401           update_neighborhood(x, n, i, nleft, nright);
00402
00403           // Calculate weights and apply fit (original lowest function)
00404           fit_ok = lowest(x, y, n, x[i], ys[i], nleft, nright,
00405                           weights, (iter > 1), resid_weights);
00406
00407           // if something went wrong during the fit, use y[i] as the
00408           // fitted value at x[i]
00409           if (!fit_ok) ys[i] = y[i];
00410
00411           // If we skipped some points (because of how delta was set), go back
00412           // and fit them by linear interpolation.
00413           if (last < i - 1)
00414           {
00415             interpolate_skipped_fits(x, i, last, ys);
00416           }
00417
00418           // Update the last fit counter to indicate we've now fit this point.
00419           // Find the next i for which we'll run a regression.
00420           update_indices(x, n, delta, i, last, ys);
00421         }
00422         while (last < n - 1);
00423
00424         // compute current residuals
00425         for (i = 0; i < n; i++)
00426         {
00427           weights[i] = y[i] - ys[i];
00428         }
00429
00430         // compute robustness weights except last time
00431         if (iter > nsteps) break;
00432
00433         calculate_residual_weights(n, weights, resid_weights);
00434       }
00435       return 0;
00436     }
00437 };
00438
00439 #endif // _LOWESS_H_INCLUDED