laundryclustery.d (3913B)
1 import std.stdio; 2 import std.range; 3 import std.algorithm; 4 import std.typecons; 5 import std.math; 6 import std.traits; 7 8 alias Point=uint; 9 10 /** 11 * Returns mean and variance of the points given in the range 12 */ 13 auto gaussian(T=double, R)(R r) if(isForwardRange!R && isNumeric!(ElementType!R)){ 14 auto p=r 15 .fold!((a,b){return tuple!(T,T,size_t)(a[0]+b, a[1]+b^^2,a[2]+1);})(tuple!(T,T,size_t)(0,0,0)); 16 return tuple!(T,"mean",T,"variance")(p[0]/p[2],p[1]/p[2]-(p[0]/p[2])^^2); 17 } 18 unittest{ 19 auto res=gaussian([3,4,5]); 20 assert(res.mean==4.0); 21 assert(res.variance==2.0); 22 } 23 24 //struct Cluster(T) if(hasLength!T && isForwardRange!T && is(ElementType!T == Point)){ 25 // T points; 26 struct Cluster{ 27 Point[] points; 28 double mean; 29 double variance; 30 31 void add(Point p){ 32 points~=p; 33 } 34 void reset(){ 35 points.length=0; 36 } 37 38 void calculate(){ 39 auto g=gaussian(points); 40 mean=g[0]; 41 variance=g[1]; 42 } 43 44 float calculate_delta(){ 45 auto m=mean,v=variance; 46 calculate(); 47 return (m-mean)^^2+(v-variance)^^2; 48 } 49 } 50 51 /** 52 * Returns a pointer to the cluster that is closest to point p 53 */ 54 auto closest(Cluster[] clusters, Point p) 55 in{ 56 assert(clusters.length>0); 57 } 58 out(res){ 59 assert(res[0]!=null); 60 } 61 do{ 62 Cluster *res; 63 float mindist=float.max; 64 foreach(ref c; clusters){ 65 auto dist=abs(p-c.mean); 66 if(dist<mindist){ 67 res=&c; 68 mindist=dist; 69 } 70 } 71 return tuple(res,mindist); 72 } 73 74 /** 75 * Adds the point p to the cluster whose mean is closest to it 76 */ 77 void addToClosest(Cluster[] clusters, Point p){ 78 closest(clusters,p)[0].add(p); 79 } 80 81 82 /** 83 * The k-means++ algorithm 84 */ 85 void kmeans(Point[] points, ref Cluster[] clusters,uint maxiter=100){ 86 import std.random; 87 88 //K-means++ initialization 89 clusters[0].mean=points.choice(); 90 ulong[] distances=new ulong[points.length]; 91 for(uint i=1; i<clusters.length; i++){ 92 auto initialized=clusters[0..i]; 93 ulong sum=0; 94 points 95 .map!(a=>cast(ulong)closest(initialized,a)[1]^^2) 96 .tee!(a=>sum+=a) 97 .copy(distances); 98 auto rand=uniform(0,sum); 99 foreach(ii,dist;enumerate(distances)){ 100 if(rand<dist){ 101 clusters[i].mean=points[ii]; 102 break; 103 } 104 rand-=dist; 105 } 106 } 107 108 void reset(){ 109 foreach(ref c; clusters){ 110 c.reset(); 111 } 112 } 113 114 foreach(iteration; iota(0,maxiter)){ 115 reset(); 116 points.each!(a=>addToClosest(clusters,a)); 117 auto s=clusters.map!((ref a)=>a.calculate_delta()).sum; 118 if(s<1e-6){ 119 return; 120 } 121 } 122 } 123 124 /** 125 * This does k-means with increasing cluster sizes until the maximal 126 * std-deviation/mean ratio is below cutoff. 127 */ 128 129 Cluster[] autokmeans(Point[] points, float reltol=0.1, float abstol=30*60, uint maxclusters=10){ 130 Cluster[] res; 131 foreach(nc; iota(1,maxclusters+1)){ 132 res=new Cluster[nc]; 133 kmeans(points, res); 134 auto tol=res 135 .map!((a){auto stdev=sqrt(a.variance); return tuple(stdev/a.mean, stdev);}) 136 .map!(a=>tuple(a[0]<reltol,a[1]<abstol)) 137 .fold!((a,b)=>a = a && b[0] && b[1])(true); 138 if(tol){ 139 return res; 140 } 141 } 142 return res; 143 } 144 unittest{ 145 Cluster c; 146 c.points=[5,2]; 147 c.calculate(); 148 assert(c.mean==3.5); 149 assert(c.variance==2.25); 150 } 151 152 import std.file; 153 154 int main(string[] args){ 155 File f; 156 import std.getopt; 157 import std.format; 158 string plotfile; 159 auto helpWanted=getopt(args, 160 "plotfile|p", &plotfile, 161 ); 162 if(args.length!=2){ 163 stderr.writeln("Usage: ", args[0], " </path/to/log/file>"); 164 return 1; 165 } 166 if(args[1]=="-"){ 167 f=stdin; 168 } 169 else if(!exists(args[1])){ 170 stderr.writeln(args[1], " does not exist"); 171 return 1; 172 } 173 else{ 174 f.open(args[1]); 175 } 176 177 auto points=f 178 .byRecord!(uint, uint)("%s %s") 179 .map!(a=>a[1]-a[0]) 180 .filter!(a=>a>10*60) 181 .filter!(a=>a<5*60*60) 182 .array; 183 184 auto res=points.autokmeans(); 185 res.sort!"a.mean<b.mean"; 186 res 187 .filter!(a=>a.points.length>0) 188 .each!(a=>writeln(a.points.length, "\t", a.mean, "\t", a.variance)); 189 190 if(plotfile){ 191 File cf=File(plotfile,"w+"); 192 scope(exit) cf.close(); 193 foreach(i,c;enumerate(res)){ 194 c.points.each!(a=>cf.writeln(i,"\t ",a)); 195 } 196 } 197 return 0; 198 }