Archive for January, 2013

Leptonica: Cosine Similarity for Pix comparison

Lately I have been researching various methods of image comparison and classification. One simple method that is often used for comparing documents is known as cosine similarity. The cosine similarity of two vectors is their dot product divided by the product of their Euclidean norms: $\cos\theta = \frac{A \cdot B}{\lVert A \rVert \, \lVert B \rVert}$. Wikipedia has more information on the actual definition.


/**
 * Computes the cosine similarity of two images over their overlapping region.
 *
 * The low bit of each byte in a raster line is used as one vector component,
 * so every component is 0 or 1. The region compared is the smallest width and
 * height of the two images, as reported by getSmallestDimensions().
 *
 * NOTE(review): raster lines are indexed byte-by-byte and masked with 0x1,
 * exactly as before. Whether that matches the real pixel layout depends on
 * the image depth and on platform byte order -- confirm against the Pix
 * depth the callers pass in.
 *
 * @param pixA first image
 * @param pixB second image
 * @return dot(A, B) / (|A| * |B|); 0.0 if either vector is all zeros (the
 *         quotient would otherwise be undefined) or if the line pointers
 *         could not be obtained
 */
double cosineSimilarity(Pix* pixA, Pix* pixB) {
	int width = 0;
	int height = 0;
	getSmallestDimensions(pixA, pixB, width, height);

	// pixGetLinePtrs allocates the pointer array; it has to be released here.
	l_uint8** linePtrs_A = (l_uint8**) pixGetLinePtrs(pixA, NULL);
	l_uint8** linePtrs_B = (l_uint8**) pixGetLinePtrs(pixB, NULL);
	if (linePtrs_A == NULL || linePtrs_B == NULL) {
		if (linePtrs_A != NULL) {
			lept_free(linePtrs_A);
		}
		if (linePtrs_B != NULL) {
			lept_free(linePtrs_B);
		}
		return 0.0;
	}

	double numerator = 0.0;      // sum of A[i] * B[i]
	double denominator_A = 0.0;  // sum of A[i]^2
	double denominator_B = 0.0;  // sum of B[i]^2

	// A single pass accumulates the dot product and both squared norms.
	for (int i = 0; i < height; ++i) {
		const l_uint8 *line_A = linePtrs_A[i];
		const l_uint8 *line_B = linePtrs_B[i];
		for (int k = 0; k < width; ++k) {
			const int val_A = line_A[k] & 0x1;
			const int val_B = line_B[k] & 0x1;
			numerator += val_A * val_B;
			// The previous code evaluated "(value, 2)", a comma expression
			// that is always 2; the square of the component is what belongs
			// in the norm.
			denominator_A += val_A * val_A;
			denominator_B += val_B * val_B;
		}
	}

	lept_free(linePtrs_A);
	lept_free(linePtrs_B);

	const double denominator = sqrt(denominator_A) * sqrt(denominator_B);
	if (denominator == 0.0) {
		// At least one image has no set bits in the compared region.
		return 0.0;
	}
	return numerator / denominator;
}

/**
 * Reports the dimensions of the region shared by two images.
 *
 * @param pixA   first image
 * @param pixB   second image
 * @param width  out: the smaller of the two image widths
 * @param height out: the smaller of the two image heights
 */
void getSmallestDimensions(Pix* pixA, Pix* pixB, int &width, int &height) {
	int w1 = 0;
	int w2 = 0;
	int h1 = 0;
	int h2 = 0;

	pixGetDimensions(pixA, &w1, &h1, NULL);
	// Query the second image here; querying pixA twice made the result
	// always equal to pixA's own dimensions.
	pixGetDimensions(pixB, &w2, &h2, NULL);

	width = (w1 < w2) ? w1 : w2;
	height = (h1 < h2) ? h1 : h2;
}

Comments Trackbacks / Pingbacks (139)

Weka Arff Generator

Recently I have been playing around with the Weka engine. For those who have never heard of Weka, it is a collection of algorithms used in various types of analysis, including data mining, machine learning, and biological research.

My purpose for using weka was quite trivial, and simply involved using intraday tick data to observe stock patterns. The hardest part by far was obtaining tick data. Please read the comments in the code for information on how to obtain the stock data.


import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;

/**
 * Author: Ronald Grant
 *
 * This file parses the stock data into ARFF format.
 * File does one category at a time. Output has to be manually copied and
 * pasted into an ARFF file.
 * NOTE(review): printHeader opens "<name>.arff" and "<name>-Test.arff" and
 * main writes the generated data to them, so the manual copy/paste step
 * described above may be outdated -- confirm.
 *
 * Usage: This program is designed specifically for parsing tick data obtained
 * from http://hopey.netfonds.no
 * Tick data can be downloaded from the following link:
 * http://hopey.netfonds.no/tradedump.php?date=201212/05&paper=GOOG.O&csv_format=txt
 *
 * The link above is for Google Nasdaq: GOOG for the date 12-05-2012
 * Data for the last 5 trading days is usually available for Nasdaq stocks.
 * All tick files should be saved as .txt files in a folder specifically
 * designated to hold the tick data files (the folder is the hard-coded
 * txtDir field).
 * It is recommended that the folder does not contain any other files, other
 * than the txt files to be parsed. The Filter class accepts every file whose
 * name ends with "txt".
 *
 * For each input file, main skips the first line as a header, replaces tabs
 * with commas, removes ",,," runs, and splits each remaining line on commas.
 *
 * NOTE(review): in this copy the per-column loop in main reads
 * "for (int i = 0; i 0 && i < 2)" and "Math.random() * 3 = 2", which looks
 * like text was lost (probably around '<' characters). The class will not
 * compile until that loop is restored from the original source.
 * NOTE(review): randomValues is never assigned in the visible code, and
 * generateTestFiles() has an empty body.
 * NOTE(review): the output writers are closed only on the success path and
 * the input reader (dis) is never closed in the visible code.
 *
*/ class ArffGenerator { private static String txtDir = "C:/Users/rgrant/school/datamining/projectb/stockdata/"; private static StringBuilder outfileName; private static StringBuilder testOutfileName; private static FileReader fr = null; private static BufferedReader dis = null; private static File outputFile; private static File testOutputFile; private static FileOutputStream fos; private static FileOutputStream testFos; private static OutputStreamWriter out; private static OutputStreamWriter testOut; private static boolean randomValues; public static void main(String[] args) throws IOException { File directory = new File(txtDir); File[] files = directory.listFiles(new Filter()); for (File file : files) { System.out.print("Generating Arff files for: " + file.getName() + "..."); printHeader(file.getName()); fr = new FileReader(file); dis = new BufferedReader(fr); String line; StringBuilder arffData = new StringBuilder(); StringBuilder arffTestData = new StringBuilder(); dis.readLine(); // get rid of header String[] dataArray; while ((line = dis.readLine()) != null) { dataArray = line.replaceAll("\\t", ",").replaceAll(",,,", "").split(","); for (int i = 0; i 0 && i < 2) { arffData.append(dataArray[i] + ","); if (randomValues) { if (Math.random() * 3 = 2) { arffData.append(dataArray[i] + "\n"); if (randomValues) { if (Math.random() * 2 < 1) { arffTestData.append(dataArray[i] + "\n"); } else { arffTestData.append("?\n"); } } else { arffTestData.append("?\n"); } } } } out.write(arffData.toString()); out.flush(); out.close(); testOut.write(arffTestData.toString()); testOut.flush(); testOut.close(); System.out.print("Done\n"); } } public static void printHeader(String filename) throws IOException { outfileName = new StringBuilder(filename); testOutfileName = new StringBuilder(filename); testOutfileName.replace(outfileName.indexOf(".txt"), outfileName.length(), "-Test.arff"); outfileName.replace(outfileName.indexOf(".txt"), outfileName.length(), ".arff"); outputFile = new 
File(outfileName.toString()); testOutputFile = new File(testOutfileName.toString()); fos = new FileOutputStream(outputFile); testFos = new FileOutputStream(testOutputFile); testOut = new OutputStreamWriter(testFos, "UTF-8"); out = new OutputStreamWriter(fos, "UTF-8"); out.write("@relation " + outfileName + "\n\n"); out.write("@attribute time DATE \"yyyy-MM-dd HH:mm:ss\" \n@attribute price REAL\n@attribute quantity NUMERIC\n\n@data\n"); testOut.write("@relation " + testOutfileName + "\n\n"); testOut .write("@attribute time DATE \"yyyy-MM-dd HH:mm:ss\" \n@attribute price REAL\n@attribute quantity NUMERIC\n\n@data\n"); } public static void generateTestFiles() { } } class Filter implements FileFilter { public boolean accept(File file) { return file.getName().endsWith("txt"); } }

Comments Trackbacks / Pingbacks (11)