# HG changeset patch # User Jordi Gutiérrez Hermoso # Date 1462835548 14400 # Node ID 45fb52ba30574f59110621125ce648180a497c80 # Parent 4fb3b87b8610f3bd3dd4bd685d98c61d13082aa2 Add D implementation of the medcouple Currently slow, gotta figure out why. diff --git a/medcouple.d b/medcouple.d new file mode 100644 --- /dev/null +++ b/medcouple.d @@ -0,0 +1,229 @@ +// medcouple.d --- + +// Copyright © 2016 Jordi Gutiérrez Hermoso + +// Author: Jordi Gutiérrez Hermoso + +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 3 +// of the License, or (at your option) any later version. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + +import std.stdio; +import std.array; +import std.conv: to; +import std.range: iota, zip; +import std.string: strip; +import std.typecons: Tuple; +import std.algorithm: filter, map, max, reduce, sort, topN; +import std.math: abs; + +// Could also just be reduce!"a+b", they call this "string lambda" +alias sum = reduce!((a,b) => a + b); + +int signum(T)(T val) { + return (cast(T)(0) < val ) - (val < cast(T)(0)); +} + +double wmedian(double[] A, long[] W) { + auto AW = zip(A,W).array; + long n = AW.length; + auto wtot = sum(W); + + typeof(n) + beg = 0, + end = n - 1; + + while(true) { + auto mid = (beg+end)/2; + topN!((x, y) => x[0] > y[0])(AW, mid); + + auto trial = AW[mid][0]; + + typeof(n) wleft = 0, wright = 0; + + foreach(aw; AW) { + auto a = aw[0]; + auto w = aw[1]; + if (a > trial) + wleft += w; + else + wright += w; + } + + if (2*wleft > wtot) + end = mid; + else if (2*wright < wtot) + beg = mid; + else + return trial; + } +} + +double medcouple(const double[] X, + double eps1 = double.epsilon, + double eps2 = double.min) +{ + long n = X.length, n2 = (n-1)/2; + if (n < 3) + return 0; + + auto Z = X.dup; + sort!((a,b) => a > b)(Z); + + auto Zmed = n % 2 == 1 ? Z[n2] : (Z[n2] + Z[n2 + 1])/2; + + // Check if the median is at the edges up to relative epsilon + if (abs(Z[0] - Zmed) < eps1*(eps1 + abs(Zmed))) + return -1.0; + if (abs(Z[n - 1] - Zmed) < eps1*(eps1 + abs(Zmed))) + return 1.0; + + // Centre Z wrt median, so that median(Z) = 0. + Z[] -= Zmed; + + // Scale inside [-0.5, 0.5], for greater numerical stability. + auto Zden = 2*max(Z[0], -Z[n - 1]); + Z[] /= Zden; + Zmed /= Zden; + + + auto + Zeps = eps1*(eps1 + abs(Zmed)), + + // These overlap on the entries that are tied with the median + Zplus = filter!(z => z >= -Zeps)(Z).array, + Zminus = filter!(z => Zeps >= z)(Z).array; + + long + n_plus = Zplus.length, + n_minus = Zminus.length; + + + /* + Kernel function h for the medcouple, closing over the values of + Zplus and Zminus just defined above. + + In case a and be are within epsilon of the median, the kernel + is the signum of their position. + */ + auto h_kern(long i, long j){ + auto + a = Zplus[i], + b = Zminus[j]; + + if (abs(a - b) <= 2*eps2) + return signum(n_plus - 1 - i - j); + else + return (a+b)/(a-b); + } + + + auto + // Init left and right borders + L = replicate([0L], n_plus), + R = replicate([n_minus - 1], n_plus), + + Ltot = 0L, + Rtot = n_minus*n_plus, + medc_idx = Rtot/2; + + // kth pair algorithm (Johnson & Mizoguchi) + while (Rtot - Ltot > n_plus) { + + // First, compute the weighted median of row medians inside the + // given bounds + auto + I = filter!(i => L[i] <= R[i])(iota(0, n_plus-1)), + A = map!(i => h_kern(i, (L[i] + R[i])/2))(I).array, + W = map!(i => R[i] - L[i] + 1)(I).array, + + Am = wmedian(A, W), + Am_eps = eps1*(eps1 + abs(Am)); + + // Compute new left and right boundaries, based on the weighted + // median + auto P = new long[n_plus], Q = new long[n_plus]; + { + auto j = 0L; + for (auto i = n_plus - 1; i >= 0; i--) { + while (j < n_minus && h_kern(i, j) - Am > Am_eps) + j++; + P[i] = j - 1; + } + } + + { + auto j = n_minus - 1; + for (auto i = 0; i < n_plus; i++) { + while (j >= 0 && h_kern(i, j) - Am < -Am_eps) + j--; + Q[i] = j + 1; + } + } + + // Discard left or right entries, depending on which side the + // medcouple is. + auto + sumP = sum(P) + n_plus, + sumQ = sum(Q); + + if (medc_idx <= sumP - 1) { + R = P; + Rtot = sumP; + } + else { + if (medc_idx > sumQ - 1) { + L = Q; + Ltot = sumQ; + } + else + return Am; + } + } + + // Didn't find the median, but now we have a very small search space + // to find it in, just between the left and right boundaries. This + // space is of size Rtot - Ltot which is <= n_plus + auto + A = new double[Rtot - Ltot], + idx = 0L; + + for (auto i = 0L; i < n_plus; i++) { + for (auto j = L[i]; j <= R[i]; j++){ + A[idx] = h_kern(i, j); + ++idx; + } + } + + topN!((x,y) => x > y)(A, medc_idx - Ltot); + return A[medc_idx - Ltot]; + +} + +int main(string[] argv) { + if (argv.length < 2) { + stderr.writeln("Please provide input file."); + return 1; + } + + auto fname = argv[1]; + double[] data; + + foreach(line; File(fname).byLine) { + if (! line.empty) + data ~= to!(double)(line.strip); + } + + writefln("%0.16f", medcouple(data)); + + return 0; +}