/* $Id: popularity-index.c,v 1.13 2003/07/23 19:03:44 wessels Exp $ */

/*
 * 
 * So, here's the deal:
 * 
 * 1) input is a stream of URLs, e.g. one field from a cache access.log
 * 
 * 2) we count how many times each URL is accessed
 * 
 * 3) sort the counts and optionally output the list for plotting.
 * Presumably, if you plot the data on a log-log scale you get
 * a more-or-less straight line.
 * 
 * 4) extract a sample of the data points, take thier logarithms,
 * and apply a linear least-squares fit to get the slope.
 * We'll call the slope the 'index of dispersion' for lack of
 * anything more meaningful yet.
 * 
 * If the logarithms fit the line y = mx + b, then the real data
 * fits the hyperbolic distribution:
 * 
 * Y = e^b * X^m
 * 
 * This says that the Xth most popular URL will be accessed about
 * Y times.
 * 
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>

#include <md5.h>

/*
 * use 1<<16 because Digest structure is made of shorts and
 * the first short[] is our hash table index
 */
#define HASH_SIZE 65536
#define HASH_SIZE_mask 65535

#define MAGIC1 10.0
#define MAGIC2 10.0

typedef int (*QSCMP) (const void *, const void *);
typedef struct { unsigned short shorts[8]; } Digest;

static void set_limits(void);

typedef struct _foo {
    Digest digest;
    int idx;
    int cnt;
    struct _foo *next;
} hash_link;
hash_link *table[HASH_SIZE];

int
digest_compare(const Digest *a, const Digest *b)
{
	int i;
	for (i=0;i<8; i++)
		if (a->shorts[i] != b->shorts[i])
			return 0;
	return 1;
}

hash_link *
hash_add(const Digest *D)
{
    int n = D->shorts[0];
    hash_link *h = malloc(sizeof(hash_link));
    assert(h != NULL);
    h->digest = *D;
    h->cnt = 0;
    h->next = table[n];
    table[n] = h;
    return h;
}

hash_link *
hash_find(const Digest *D)
{
    int n = D->shorts[0];
    hash_link *h;
    for (h = table[n]; h; h = h->next) {
	if (1 == digest_compare(D, &h->digest))
	    return h;
    }
    return NULL;
}

int hash_count(void)
{
    int count = 0;
    int n;
    hash_link *h;
    for (n = 0; n < HASH_SIZE; n++)
	for (h = table[n]; h; h = h->next)
	    count++;
    return count;
}

int intcompare(int *a, int *b)
{
    return *b - *a;
}

int hash_link_compare(hash_link *a, hash_link *b)
{
    return b->cnt - a->cnt;
}

void lsfit(double X[], double Y[], int n, double *slope, double *offset)
{
    int i;
    double sumx = 0.0;
    double sumy = 0.0;
    double sumxx = 0.0;
    double sumxy = 0.0;
    double D = 0.0;
    for (i = 0; i < n; i++) {
	sumx += X[i];
	sumy += Y[i];
	sumxx += (X[i] * X[i]);
	sumxy += (X[i] * Y[i]);
    }
    D = sumxx * n - sumx * sumx;
    *offset = (sumxx * sumy - sumx * sumxy) / D;
    *slope = (sumxy * n - sumx * sumy) / D;
}

int main()
{
    char buf[4096];
    char *t;
    int n;
    int i;
    int nurls;
    int *counts;
    hash_link *h;
    double base;
    int next_i;
    int j;
    double X[200];
    double Y[200];
    double B;			/* lsfit offset */
    double M;			/* lsfit slope */
#if TOPTEN_BROKEN_BY_MD5
    hash_link TOPTEN[10];
#endif

    set_limits();
    setbuf(stdout, NULL);

    /* SUCK IN ALL THE URLS */
    j = 0;
    while (fgets(buf, 4096, stdin)) {
	MD5_CTX md5;
	Digest d;
	if ((t = strchr(buf, '\n')))
	    *t = '\0';
	MD5Init(&md5);
	MD5Update(&md5, buf, strlen(buf));
	MD5Final((void*) &d.shorts[0], &md5);
	if ((h = hash_find(&d)) == NULL)
	    h = hash_add(&d);
	h->cnt++;
	j++;
    }
    printf("# %d Accesses\n", j);

    /* HOW MANY URLS DO WE HAVE? */
    nurls = hash_count();
    printf("# %d unique URLs\n", nurls);

    /* MAKE AN ARRAY OF THE COUNTS AND SORT IT */
    counts = calloc(nurls, sizeof(int));
    assert(counts != NULL);
    i = 0;
    for (n = 0; n < HASH_SIZE; n++)
	for (h = table[n]; h; h = h->next)
	    *(counts + i++) = h->cnt;
    printf("# sorting...\n");
    qsort(counts, nurls, sizeof(int), (QSCMP) intcompare);


#if TOPTEN_BROKEN_BY_MD5
    i = 0;
    j = nurls < 10 ? nurls : 10;
    /* SHOW THE TOP 10 */
    for (n = 0; n < HASH_SIZE; n++) {
	for (h = table[n]; h; h = h->next) {
	    if (h->cnt < *(counts+j-1))
		continue;
	    TOPTEN[i++] = *h;
	    if (i == 10)
		break;
	}
	if (i == 10)
	    break;
    }
    qsort(TOPTEN, i, sizeof(hash_link), (QSCMP) hash_link_compare);
    for (j=0; j<i; j++)
        printf("URL %9d %s\n", TOPTEN[j].cnt, TOPTEN[j].key);
#endif

    /* EXTRACT SELECT POINTS FOR THE LS FIT */
    base = exp((1.0 / MAGIC1) * log(MAGIC2));
    j = 0;
    next_i = (int) (pow(base, (double) j));
    for (i = 0; i < nurls; i++) {
	if ((i+1) < next_i)
	    continue;
	X[j] = log((double) i + 1);
	Y[j] = log((double) *(counts + i));
	printf ("FIT [%03d] = %d,%d\n", j, i+1, *(counts + i));
	j++;
	next_i = (int) (pow(base, (double) j));
    }
    lsfit(X, Y, j, &M, &B);
    printf("# Fit of y = Ax^p\n");
    printf("A = %0.5f\n", exp(B));
    printf("p = %0.5f\n", M);

    free(counts);
    return 0;
}



static void
set_limits(void)
{
    struct rlimit rl;
    if (getrlimit(RLIMIT_DATA, &rl) < 0) {
	perror("getrlimit RLIMIT_DATA");
	return;
    }
    if (rl.rlim_max > rl.rlim_cur) {
	rl.rlim_cur = rl.rlim_max;
	if (setrlimit(RLIMIT_DATA, &rl) < 0) {
	    perror("setrlimit RLIMIT_DATA");
	    return;
        }
    }
}
