commit d7f8520c8fa5855910fa0db7c376707ab695df91
parent 66665f2565d66fee6c3b4727c0d2e9111dc449f8
Author: Dominik Schmidt <das1993@hotmail.com>
Date: Thu, 28 Jun 2018 21:31:04 +0000
Add a clustering algorithm to analyze the logfiles
This is basically k-means which increases cluster-count until the variance is small enough
Diffstat:
src/clustering.d | 162 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 162 insertions(+), 0 deletions(-)
diff --git a/src/clustering.d b/src/clustering.d
@@ -0,0 +1,162 @@
+import std.stdio;
+import std.range;
+import std.algorithm;
+import std.typecons;
+import std.math;
+import std.traits;
+
+alias Point=uint;
+
+/**
+ * Returns mean and variance of the points given in the range
+ */
+auto gaussian(T=double, R)(R r) if(isForwardRange!R && isNumeric!(ElementType!R)){
+ auto p=r
+ .fold!((a,b){return tuple!(T,T,size_t)(a[0]+b, a[1]+b^^2,a[2]+1);})(tuple!(T,T,size_t)(0,0,0));
+ return tuple!(T,"mean",T,"variance")(p[0]/p[2],p[1]/p[2]-(p[0]/p[2])^^2);
+}
+unittest{
+ auto res=gaussian([3,4,5]);
+ assert(res.mean==4.0);
+ assert(res.variance==2.0);
+}
+
+//struct Cluster(T) if(hasLength!T && isForwardRange!T && is(ElementType!T == Point)){
+// T points;
+struct Cluster{
+ Point[] points;
+ double mean;
+ double variance;
+
+ void add(Point p){
+ points~=p;
+ }
+ void reset(){
+ points.length=0;
+ }
+
+ void calculate(){
+ auto g=gaussian(points);
+ mean=g[0];
+ variance=g[1];
+ }
+
+ float calculate_delta(){
+ auto m=mean,v=variance;
+ calculate();
+ return (m-mean)^^2+(v-variance)^^2;
+ }
+}
+
+/**
+ * Returns a pointer to the cluster that is closest to point p
+ */
+auto closest(Cluster[] clusters, Point p)
+in{
+ assert(clusters.length>0);
+}
+out(res){
+ assert(res!=null);
+}
+do{
+ Cluster *res;
+ float mindist=float.max;
+ foreach(ref c; clusters){
+ auto dist=abs(p-c.mean);
+ if(dist<mindist){
+ res=&c;
+ mindist=dist;
+ }
+ }
+ return res;
+}
+
+/**
+ * Adds the point p to the cluster whose mean is closest to it
+ */
+void addToClosest(Cluster[] clusters, Point p){
+ closest(clusters,p).add(p);
+}
+
+import std.random;
+
+/**
+ * The standard k-means algorithm
+ */
+void kmeans(Point[] points, ref Cluster[] clusters,uint maxiter=100){
+ foreach(v;enumerate(points.randomSample(clusters.length))){
+ clusters[v[0]].mean=v[1];
+ }
+
+ void reset(){
+ foreach(ref c; clusters){
+ c.reset();
+ }
+ }
+
+ foreach(iteration; iota(0,maxiter)){
+ reset();
+ points.each!(a=>addToClosest(clusters,a));
+ auto s=clusters.map!((ref a)=>a.calculate_delta()).sum;
+ if(s<1e-6){
+ return;
+ }
+ }
+}
+
+/**
+ * This does k-means with increasing cluster sizes until the maximal
+ * std-deviation/mean ratio is below cutoff.
+ */
+
+Cluster[] autokmeans(Point[] points, float cutoff=0.5, uint maxclusters=10){
+ Cluster[] res;
+ foreach(nc; iota(1,maxclusters+1)){
+ res=new Cluster[nc];
+ kmeans(points, res);
+ if(res.map!(a=>sqrt(a.variance)/a.mean).fold!"max(a,b)"(0.0)<cutoff){
+ return res;
+ }
+ }
+ return res;
+}
+unittest{
+ Cluster c;
+ c.points=[5,2];
+ c.calculate();
+ assert(c.mean==3.5);
+ assert(c.variance==2.25);
+}
+
+import std.file;
+
+int main(string[] args){
+ File f;
+ if(args.length!=2){
+ stderr.writeln("Usage: ", args[0], " </path/to/log/file>");
+ return 1;
+ }
+ if(args[1]=="-"){
+ f=stdin;
+ }
+ else if(!exists(args[1])){
+ stderr.writeln(args[1], " does not exist");
+ return 1;
+ }
+ else{
+ f.open(args[1]);
+ }
+
+ auto points=f
+ .byRecord!(uint, uint)("%s %s")
+ .map!(a=>a[1]-a[0])
+ .filter!(a=>a>10*60)
+ .array;
+
+ auto res=points.autokmeans();
+ res.sort!"a.mean<b.mean";
+ res
+ .each!(a=>writeln(a.points.length, "\t", a.mean, "\t", a.variance));
+
+ return 0;
+}