00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 package de.picana.clusterer;
00014
00015 import de.picana.control.*;
00016 import de.picana.logging.*;
00017 import de.picana.math.*;
00018
00019 import java.io.*;
00020 import java.util.*;
00021
00022 import weka.core.*;
00023
00024
00031 public class AML extends GenericML {
00032
00033 private double rho;
00034 private boolean pruning = true;
00035
00036
00038 public AML() {
00039 }
00040
00041
00042 public void init(ParameterSet params, Logger logger) {
00043
00044 super.init(params, logger);
00045
00046 rho = 0.0;
00047 try {
00048 rho = Double.parseDouble((String)params.getParameter("rho"));
00049 } catch (NumberFormatException nfe) {}
00050 }
00051
00052
00053 protected void buildFirst() {
00054
00055 statwriter.println("algo_clusterer_name:AML");
00056 statwriter.println("algo_clusterer_rho:" + rho);
00057
00058 int i, j;
00059 MLVector vec, vec_i, vec_j;
00060 HashMap max_index = new HashMap();
00061
00062 double max_dist = 0.0;
00063 double dist = 0.0;
00064
00065 for (i=0; i < freq_table.size(); i++) {
00066
00067 vec_i = (MLVector)freq_table.get(i);
00068
00069 for (j=0; j < i; j++) {
00070
00071 vec_j = (MLVector)freq_table.get(j);
00072
00073 dist = Distance.weighted_euklidian(
00074 vec_i.value, vec_i.freq, vec_j.value, vec_j.freq,
00075 rho, training_set.numInstances());
00076
00077
00078
00079 if (dist > max_dist) {
00080 max_index.clear();
00081 max_dist = dist;
00082
00083 max_index.put(new IntegerPair(i, j), new Integer(1));
00084 logger.info(LOGSRC, "Found new maximum distance " + max_dist);
00085
00086 } else if (dist == max_dist) {
00087 max_index.put(new IntegerPair(i, j), new Integer(1));
00088 }
00089 }
00090 }
00091
00092 Iterator keys = max_index.keySet().iterator();
00093 while (keys.hasNext()) {
00094 IntegerPair pair = (IntegerPair)keys.next();
00095 MLVector vec_a = (MLVector)freq_table.get(pair.a);
00096 MLVector vec_b = (MLVector)freq_table.get(pair.b);
00097 logger.debug(LOGSRC, "(" + pair.a + ") " + vec_a.toString() + " - " +
00098 "(" + pair.b + ") " + vec_b.toString() + " = " + max_dist);
00099 }
00100
00101 IntegerPair pair = (IntegerPair)getRandomElement(max_index.keySet());
00102 MLVector vec_a = (MLVector)freq_table.get(pair.a);
00103 MLVector vec_b = (MLVector)freq_table.get(pair.b);
00104 centroids.add(vec_a);
00105 logger.info(LOGSRC, "centroid[0] = " + vec_a.toString());
00106 centroids.add(vec_b);
00107 logger.info(LOGSRC, "centroid[1] = " + vec_b.toString());
00108 }
00109
00110
00111 protected void buildRest() {
00112
00113 int i, j, k;
00114 MLVector vec;
00115 MLVector vec_a;
00116 MLVector vec_b;
00117 List max_index = new ArrayList();
00118
00119 double max_dist;
00120 double min_dist;
00121 double act_dist;
00122
00123 for (i=0; i < num_clusters-2; i++) {
00124
00125 max_dist = 0.0;
00126 max_index.clear();
00127
00128 for (j=0; j < freq_table.size(); j++) {
00129
00130 vec_a = (MLVector)freq_table.get(j);
00131
00132 min_dist = Double.MAX_VALUE;
00133
00134 for (k=0; k < centroids.size(); k++) {
00135
00136 vec_b = (MLVector)centroids.get(k);
00137
00138 act_dist = Distance.weighted_euklidian(
00139 vec_a.value, vec_a.freq, vec_b.value,
00140 rho, training_set.numInstances());
00141
00142
00143
00144 if (act_dist < min_dist)
00145 min_dist = act_dist;
00146 }
00147
00148 if (min_dist > max_dist) {
00149
00150 max_index.clear();
00151 max_index.add(new Integer(j));
00152 max_dist = min_dist;
00153
00154 } else if (min_dist == max_dist) {
00155
00156 max_index.add(new Integer(j));
00157 }
00158 }
00159
00160 Integer index = (Integer)getRandomElement(max_index);
00161 vec = (MLVector)freq_table.get(index.intValue());
00162 centroids.add(vec);
00163 logger.info(LOGSRC, "centroid[" + (i+2) + "] = " + vec.toString());
00164 }
00165 }
00166 }