commit d7f8520c8fa5855910fa0db7c376707ab695df91
parent 66665f2565d66fee6c3b4727c0d2e9111dc449f8
Author: Dominik Schmidt <das1993@hotmail.com>
Date: Thu, 28 Jun 2018 21:31:04 +0000
Add a clustering algorithm to analyze the logfiles
This is basically k-means which increases cluster-count until the variance is small enough
Diffstat:
src/clustering.d | 162 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 162 insertions(+), 0 deletions(-)
diff --git a/src/clustering.d b/src/clustering.d
@@ -0,0 +1,162 @@
+import std.stdio;
+import std.range;
+import std.algorithm;
+import std.typecons;
+import std.math;
+import std.traits;
+
+alias Point=uint;
+
+/**
+ * Returns mean and variance of the points given in the range
+ */
+auto gaussian(T=double, R)(R r) if(isForwardRange!R && isNumeric!(ElementType!R)){
+ auto p=r
+ .fold!((a,b){return tuple!(T,T,size_t)(a[0]+b, a[1]+b^^2,a[2]+1);})(tuple!(T,T,size_t)(0,0,0));
+ return tuple!(T,"mean",T,"variance")(p[0]/p[2],p[1]/p[2]-(p[0]/p[2])^^2);
+}
+unittest{
+ auto res=gaussian([3,4,5]);
+ assert(res.mean==4.0);
+ assert(res.variance==2.0);
+}
+
+//struct Cluster(T) if(hasLength!T && isForwardRange!T && is(ElementType!T == Point)){
+// T points;
+struct Cluster{
+ Point[] points;
+ double mean;
+ double variance;
+
+ void add(Point p){
+ points~=p;
+ }
+ void reset(){
+ points.length=0;
+ }
+
+ void calculate(){
+ auto g=gaussian(points);
+ mean=g[0];
+ variance=g[1];
+ }
+
+ float calculate_delta(){
+ auto m=mean,v=variance;
+ calculate();
+ return (m-mean)^^2+(v-variance)^^2;
+ }
+}
+
+/**
+ * Returns a pointer to the cluster that is closest to point p
+ */
+auto closest(Cluster[] clusters, Point p)
+in{
+ assert(clusters.length>0);
+}
+out(res){
+ assert(res!=null);
+}
+do{
+ Cluster *res;
+ float mindist=float.max;
+ foreach(ref c; clusters){
+ auto dist=abs(p-c.mean);
+ if(dist<mindist){
+ res=&c;
+ mindist=dist;
+ }
+ }
+ return res;
+}
+
+/**
+ * Adds the point p to the cluster whose mean is closest to it
+ */
+void addToClosest(Cluster[] clusters, Point p){
+ closest(clusters,p).add(p);
+}
+
+import std.random;
+
+/**
+ * The standard k-means algorithm
+ */
+void kmeans(Point[] points, ref Cluster[] clusters,uint maxiter=100){
+ foreach(v;enumerate(points.randomSample(clusters.length))){
+ clusters[v[0]].mean=v[1];
+ }
+
+ void reset(){
+ foreach(ref c; clusters){
+ c.reset();
+ }
+ }
+
+ foreach(iteration; iota(0,maxiter)){
+ reset();
+ points.each!(a=>addToClosest(clusters,a));
+ auto s=clusters.map!((ref a)=>a.calculate_delta()).sum;
+ if(s<1e-6){
+ return;
+ }
+ }
+}
+
+/**
+ * This does k-means with increasing cluster sizes until the maximal
+ * std-deviation/mean ratio is below cutoff.
+ */
+
+Cluster[] autokmeans(Point[] points, float cutoff=0.5, uint maxclusters=10){
+ Cluster[] res;
+ foreach(nc; iota(1,maxclusters+1)){
+ res=new Cluster[nc];
+ kmeans(points, res);
+ if(res.map!(a=>sqrt(a.variance)/a.mean).fold!"max(a,b)"(0.0)<cutoff){
+ return res;
+ }
+ }
+ return res;
+}
+unittest{
+ Cluster c;
+ c.points=[5,2];
+ c.calculate();
+ assert(c.mean==3.5);
+ assert(c.variance==2.25);
+}
+
+import std.file;
+
+int main(string[] args){
+ File f;
+ if(args.length!=2){
+ stderr.writeln("Usage: ", args[0], " </path/to/log/file>");
+ return 1;
+ }
+ if(args[1]=="-"){
+ f=stdin;
+ }
+ else if(!exists(args[1])){
+ stderr.writeln(args[1], " does not exist");
+ return 1;
+ }
+ else{
+ f.open(args[1]);
+ }
+
+ auto points=f
+ .byRecord!(uint, uint)("%s %s")
+ .map!(a=>a[1]-a[0])
+ .filter!(a=>a>10*60)
+ .array;
+
+ auto res=points.autokmeans();
+ res.sort!"a.mean<b.mean";
+ res
+ .each!(a=>writeln(a.points.length, "\t", a.mean, "\t", a.variance));
+
+ return 0;
+}