LaundrySorcery

Log | Files | Refs

commit d7f8520c8fa5855910fa0db7c376707ab695df91
parent 66665f2565d66fee6c3b4727c0d2e9111dc449f8
Author: Dominik Schmidt <das1993@hotmail.com>
Date:   Thu, 28 Jun 2018 21:31:04 +0000

Add a clustering algorithm to analyze the logfiles

This is basically k-means, which increases the cluster count until the per-cluster spread (the std-deviation/mean ratio) is small enough

Diffstat:
src/clustering.d | 162+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 162 insertions(+), 0 deletions(-)

import std.stdio;
import std.range;
import std.algorithm;
import std.typecons;
import std.math;
import std.traits;

alias Point = uint;

/**
 * Returns the mean and variance of the points given in the range.
 *
 * Uses the one-pass formula var = E[x^2] - E[x]^2. Callers must not
 * pass an empty range: the element count is used as a divisor.
 */
auto gaussian(T = double, R)(R r) if (isForwardRange!R && isNumeric!(ElementType!R))
{
    // Accumulate (sum, sum of squares, count) in a single pass.
    auto acc = r.fold!((a, b) => tuple!(T, T, size_t)(a[0] + b, a[1] + b ^^ 2, a[2] + 1))
                      (tuple!(T, T, size_t)(0, 0, 0));
    return tuple!(T, "mean", T, "variance")(acc[0] / acc[2],
                                            acc[1] / acc[2] - (acc[0] / acc[2]) ^^ 2);
}
unittest
{
    auto res = gaussian([3, 4, 5]);
    assert(res.mean == 4.0);
    assert(res.variance == 2.0);
}

/**
 * A cluster of points together with its mean and variance.
 */
struct Cluster
{
    Point[] points;
    // Explicit 0 defaults: D initializes doubles to NaN, which previously
    // made calculate_delta() return NaN on the first k-means iteration.
    double mean = 0;
    double variance = 0;

    /// Append a point to this cluster.
    void add(Point p)
    {
        points ~= p;
    }

    /// Drop all points; the last computed statistics are kept.
    void reset()
    {
        points.length = 0;
    }

    /**
     * Recompute mean and variance from the current points.
     *
     * An empty cluster keeps its previous statistics instead of dividing
     * by zero and propagating NaN into closest()/autokmeans().
     */
    void calculate()
    {
        if (points.empty)
            return;
        auto g = gaussian(points);
        mean = g.mean;
        variance = g.variance;
    }

    /**
     * Recompute the statistics and return the squared change of
     * (mean, variance) — used by kmeans() as a convergence measure.
     */
    float calculate_delta()
    {
        immutable m = mean, v = variance;
        calculate();
        return (m - mean) ^^ 2 + (v - variance) ^^ 2;
    }
}

/**
 * Returns a pointer to the cluster whose mean is closest to point p.
 *
 * Seeded with the first cluster so the result is never null, even if
 * every distance compares false (e.g. a NaN mean slipping through).
 */
auto closest(Cluster[] clusters, Point p)
in
{
    assert(clusters.length > 0);
}
out (res)
{
    assert(res != null);
}
do
{
    Cluster* res = &clusters[0];
    float mindist = abs(p - clusters[0].mean);
    foreach (ref c; clusters[1 .. $])
    {
        immutable dist = abs(p - c.mean);
        if (dist < mindist)
        {
            res = &c;
            mindist = dist;
        }
    }
    return res;
}

/**
 * Adds the point p to the cluster whose mean is closest to it.
 */
void addToClosest(Cluster[] clusters, Point p)
{
    closest(clusters, p).add(p);
}

import std.random;

/**
 * The standard k-means algorithm.
 *
 * Cluster means are seeded with a random sample of the input points,
 * then points are re-assigned and statistics recomputed until the
 * total squared change of (mean, variance) drops below 1e-6 or
 * maxiter iterations have run. Requires points.length >= clusters.length
 * (randomSample cannot draw more samples than there are points).
 */
void kmeans(Point[] points, ref Cluster[] clusters, uint maxiter = 100)
{
    // Seed each cluster mean with a distinct randomly sampled point.
    foreach (i, p; points.randomSample(clusters.length).enumerate)
        clusters[i].mean = p;

    foreach (iteration; 0 .. maxiter)
    {
        foreach (ref c; clusters)
            c.reset();
        points.each!(p => addToClosest(clusters, p));
        // Sum of squared statistic changes over all clusters.
        immutable s = clusters.map!((ref a) => a.calculate_delta()).sum;
        if (s < 1e-6)
            return;
    }
}

/**
 * Runs k-means with increasing cluster counts until the maximal
 * std-deviation/mean ratio over all clusters is below cutoff.
 *
 * The cluster count is additionally capped at points.length, since
 * k-means cannot seed more clusters than there are points; an empty
 * input yields an empty result instead of throwing.
 */
Cluster[] autokmeans(Point[] points, float cutoff = 0.5, uint maxclusters = 10)
{
    Cluster[] res;
    immutable limit = min(cast(size_t) maxclusters, points.length);
    foreach (nc; iota(1, limit + 1))
    {
        res = new Cluster[nc];
        kmeans(points, res);
        if (res.map!(a => sqrt(a.variance) / a.mean).fold!"max(a,b)"(0.0) < cutoff)
            return res;
    }
    return res;
}
unittest
{
    Cluster c;
    c.points = [5, 2];
    c.calculate();
    assert(c.mean == 3.5);
    assert(c.variance == 2.25);
}

import std.file;

/**
 * Reads "start end" timestamp pairs from the given log file (or stdin
 * when the argument is "-"), keeps the durations longer than ten
 * minutes, clusters them, and prints one line per cluster:
 * point count, mean and variance, sorted by mean.
 */
int main(string[] args)
{
    File f;
    if (args.length != 2)
    {
        stderr.writeln("Usage: ", args[0], " </path/to/log/file>");
        return 1;
    }
    if (args[1] == "-")
    {
        f = stdin;
    }
    else if (!exists(args[1]))
    {
        stderr.writeln(args[1], " does not exist");
        return 1;
    }
    else
    {
        f.open(args[1]);
    }

    auto points = f
        .byRecord!(uint, uint)("%s %s")
        .map!(a => a[1] - a[0])   // duration of each record
        .filter!(a => a > 10 * 60) // ignore runs shorter than ten minutes
        .array;

    auto res = points.autokmeans();
    res.sort!"a.mean<b.mean";
    res
        .each!(a => writeln(a.points.length, "\t", a.mean, "\t", a.variance));

    return 0;
}