LaundrySorcery

git clone git://xatko.vsos.ethz.ch/LaundrySorcery.git
Log | Files | Refs

laundryclustery.d (3913B)


      1 import std.stdio;
      2 import std.range;
      3 import std.algorithm;
      4 import std.typecons;
      5 import std.math;
      6 import std.traits;
      7 
      8 alias Point=uint;
      9 
     10 /**
     11  * Returns mean and variance of the points given in the range
     12  */
     13 auto gaussian(T=double, R)(R r) if(isForwardRange!R && isNumeric!(ElementType!R)){
     14 	auto p=r
     15 		.fold!((a,b){return tuple!(T,T,size_t)(a[0]+b, a[1]+b^^2,a[2]+1);})(tuple!(T,T,size_t)(0,0,0));
     16 	return tuple!(T,"mean",T,"variance")(p[0]/p[2],p[1]/p[2]-(p[0]/p[2])^^2);
     17 }
     18 unittest{
     19 	auto res=gaussian([3,4,5]);
     20 	assert(res.mean==4.0);
     21 	assert(res.variance==2.0);
     22 }
     23 
     24 //struct Cluster(T) if(hasLength!T && isForwardRange!T && is(ElementType!T == Point)){
     25 //	T points;
     26 struct Cluster{
     27 	Point[] points;
     28 	double mean;
     29 	double variance;
     30 
     31 	void add(Point p){
     32 		points~=p;
     33 	}
     34 	void reset(){
     35 		points.length=0;
     36 	}
     37 	
     38 	void calculate(){
     39 		auto g=gaussian(points);
     40 		mean=g[0];
     41 		variance=g[1];
     42 	}
     43 	
     44 	float calculate_delta(){
     45 		auto m=mean,v=variance;
     46 		calculate();
     47 		return (m-mean)^^2+(v-variance)^^2;
     48 	}
     49 }
     50 
     51 /**
     52  * Returns a pointer to the cluster that is closest to point p
     53  */
     54 auto closest(Cluster[] clusters, Point p)
     55 in{
     56 	assert(clusters.length>0);
     57 }
     58 out(res){
     59 	assert(res[0]!=null);
     60 }
     61 do{
     62 	Cluster *res;
     63 	float mindist=float.max;
     64 	foreach(ref c; clusters){
     65 		auto dist=abs(p-c.mean);
     66 		if(dist<mindist){
     67 			res=&c;
     68 			mindist=dist;
     69 		}
     70 	}
     71 	return tuple(res,mindist);
     72 }
     73 
     74 /**
     75  * Adds the point p to the cluster whose mean is closest to it
     76  */
     77 void addToClosest(Cluster[] clusters, Point p){
     78 	closest(clusters,p)[0].add(p);
     79 }
     80 
     81 
     82 /**
     83  * The k-means++ algorithm
     84  */
     85 void kmeans(Point[] points, ref Cluster[] clusters,uint maxiter=100){
     86 	import std.random;
     87 	
     88 	//K-means++ initialization
     89 	clusters[0].mean=points.choice();
     90 	ulong[] distances=new ulong[points.length];
     91 	for(uint i=1; i<clusters.length; i++){
     92 		auto initialized=clusters[0..i];
     93 		ulong sum=0;
     94 		points
     95 			.map!(a=>cast(ulong)closest(initialized,a)[1]^^2)
     96 			.tee!(a=>sum+=a)
     97 			.copy(distances);
     98 		auto rand=uniform(0,sum);
     99 		foreach(ii,dist;enumerate(distances)){
    100 			if(rand<dist){
    101 				clusters[i].mean=points[ii];
    102 				break;
    103 			}
    104 			rand-=dist;
    105 		}
    106 	}
    107 	
    108 	void reset(){
    109 		foreach(ref c; clusters){
    110 			c.reset();
    111 		}
    112 	}
    113 	
    114 	foreach(iteration; iota(0,maxiter)){
    115 		reset();
    116 		points.each!(a=>addToClosest(clusters,a));
    117 		auto s=clusters.map!((ref a)=>a.calculate_delta()).sum;
    118 		if(s<1e-6){
    119 			return;
    120 		}
    121 	}
    122 }
    123 
    124 /**
    125  * This does k-means with increasing cluster sizes until the maximal 
    126  * std-deviation/mean ratio is below cutoff.
    127  */
    128 
    129 Cluster[] autokmeans(Point[] points, float reltol=0.1, float abstol=30*60, uint maxclusters=10){
    130 	Cluster[] res;
    131 	foreach(nc; iota(1,maxclusters+1)){
    132 		res=new Cluster[nc];
    133 		kmeans(points, res);
    134 		auto tol=res
    135 			.map!((a){auto stdev=sqrt(a.variance); return tuple(stdev/a.mean, stdev);})
    136 			.map!(a=>tuple(a[0]<reltol,a[1]<abstol))
    137 			.fold!((a,b)=>a = a && b[0] && b[1])(true);
    138 		if(tol){
    139 			return res;
    140 		}
    141 	}
    142 	return res;
    143 }
    144 unittest{
    145 	Cluster c;
    146 	c.points=[5,2];
    147 	c.calculate();
    148 	assert(c.mean==3.5);
    149 	assert(c.variance==2.25);
    150 }
    151 
    152 import std.file;
    153 
    154 int main(string[] args){
    155 	File f;
    156 	import std.getopt;
    157 	import std.format;
    158 	string plotfile;
    159 	auto helpWanted=getopt(args,
    160 		"plotfile|p", &plotfile,
    161 	);
    162 	if(args.length!=2){
    163 		stderr.writeln("Usage: ", args[0], " </path/to/log/file>");
    164 		return 1;
    165 	}
    166 	if(args[1]=="-"){
    167 		f=stdin;
    168 	}
    169 	else if(!exists(args[1])){
    170 		stderr.writeln(args[1], " does not exist");
    171 		return 1;
    172 	}
    173 	else{
    174 		f.open(args[1]);
    175 	}
    176 	
    177 	auto points=f
    178 		.byRecord!(uint, uint)("%s %s")
    179 		.map!(a=>a[1]-a[0])
    180 		.filter!(a=>a>10*60)
    181 		.filter!(a=>a<5*60*60)
    182 		.array;
    183 		
    184 	auto res=points.autokmeans();
    185 	res.sort!"a.mean<b.mean";
    186 	res
    187 		.filter!(a=>a.points.length>0)
    188 		.each!(a=>writeln(a.points.length, "\t", a.mean, "\t", a.variance));
    189 
    190 	if(plotfile){
    191 		File cf=File(plotfile,"w+");
    192 		scope(exit) cf.close();
    193 		foreach(i,c;enumerate(res)){
    194 			c.points.each!(a=>cf.writeln(i,"\t ",a));
    195 		}
    196 	}
    197 	return 0;
    198 }