Added a Bloomfilter and fixed some bugs

This commit is contained in:
Ziver Koc 2009-02-08 21:32:06 +00:00
parent 9e3de28d45
commit 017a27931a
8 changed files with 425 additions and 24 deletions

View file

@ -5,9 +5,10 @@ import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.BitSet;
public class Converter {
/**
* Converts an object to an array of bytes.
*
@ -26,7 +27,7 @@ public class Converter {
}
return baos.toByteArray();
}
/**
* Converts an array of bytes back to its constituent object. The
* input array is assumed to have been created from the original object.
@ -49,12 +50,13 @@ public class Converter {
}
return object;
}
/**
* Checks if the given interface is implemented in the object
* @param object The object to look for the interface
* @param interf The interface to look for
* @return True if the interface is implemented else false
*
* @param object the object to look for the interface
* @param interf the interface to look for
* @return true if the interface is implemented else false
*/
@SuppressWarnings("unchecked")
public static boolean isInstanceOf(Object object, Class interf){
@ -66,4 +68,85 @@ public class Converter {
}
return false;
}
// Lookup table needed by toHexString: maps a 4-bit value (0-15) to its upper-case hex digit
private static final char[] HEX_CHARS = {'0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F'};
/**
 * Converts a byte array to a hex String. Every byte produces exactly two
 * upper-case hex digits, most significant nibble first (0x1F becomes "1F").
 *
 * @param raw the byte array to convert
 * @return a hex String that is twice as long as the input array
 */
public static String toHexString(byte[] raw){
    StringBuilder ret = new StringBuilder(raw.length * 2);
    for(byte b : raw){
        // high nibble first; the mask removes the sign-extension bits of the promoted byte
        ret.append(HEX_CHARS[(b >>> 4) & 0x0F]);
        ret.append(HEX_CHARS[b & 0x0F]);
    }
    return ret.toString();
}
/**
 * Renders a single byte as eight binary digits, most significant bit first.
 *
 * @param raw the byte to convert
 * @return a String of exactly eight '0'/'1' characters
 */
public static String toString(byte raw){
    char[] digits = new char[8];
    for(int pos = 0; pos < digits.length; pos++){
        // pos 0 holds bit 7, pos 7 holds bit 0
        int bit = (raw >> (7 - pos)) & 1;
        digits[pos] = (bit == 1) ? '1' : '0';
    }
    return new String(digits);
}
/**
 * Renders a byte array as binary digits. Each byte contributes eight
 * characters, most significant bit first, in array order.
 *
 * @param raw the byte array to convert
 * @return a String of '0'/'1' characters, eight per input byte
 */
public static String toString(byte[] raw){
    StringBuilder digits = new StringBuilder();
    for(int index = 0; index < raw.length; index++){
        byte current = raw[index];
        for(int shift = 7; shift >= 0; shift--){
            if(((current >> shift) & 1) == 1){
                digits.append('1');
            }
            else{
                digits.append('0');
            }
        }
    }
    return digits.toString();
}
/**
 * Converts a BitSet to an int. Bit index i of the BitSet maps to bit i of
 * the result (index 0 is the least significant bit), so index 31 is the
 * sign bit and the result of {@code toBitSet(int)} converts back to the
 * same number. Set bits at index 32 or above do not fit in an int and
 * are ignored.
 *
 * @param bits the BitSet to convert
 * @return the int whose binary representation matches the lowest 32 bits of the BitSet
 */
public static int toInt(BitSet bits){
    int ret = 0;
    // Pure integer arithmetic: adding Math.pow(2, i) went through double and
    // saturated at Integer.MAX_VALUE as soon as bit 31 was involved.
    for (int i = bits.nextSetBit(0); i >= 0 && i < Integer.SIZE; i = bits.nextSetBit(i+1)) {
        ret |= 1 << i;
    }
    return ret;
}
/**
 * Converts an int to a BitSet. Bit i of the number (bit 0 being the least
 * significant) becomes index i of the BitSet; zero gives an empty BitSet.
 *
 * @param num the int to convert
 * @return a BitSet with one set index per 1-bit of the number
 */
public static BitSet toBitSet(int num){
    BitSet result = new BitSet();
    int remaining = num;
    // unsigned shift so that negative numbers terminate after 32 steps
    for(int index = 0; remaining != 0; index++, remaining >>>= 1){
        if((remaining & 1) != 0){
            result.set(index);
        }
    }
    return result;
}
}

View file

@ -109,15 +109,17 @@ public class FileFinder {
String[] temp = dir.list();
File file;
for(int i=0; i<temp.length ;i++){
file = new File(dir.getPath()+File.separator+temp[i]);
if(file.isDirectory()){
search(new File(dir.getPath()+File.separator+temp[i]+File.separator),fileList);
if(temp != null){
for(int i=0; i<temp.length ;i++){
file = new File(dir.getPath()+File.separator+temp[i]);
if(file.isDirectory()){
search(new File(dir.getPath()+File.separator+temp[i]+File.separator),fileList);
}
else if(file.isFile()){
MultiPrintStream.out.println("File Found: "+file);
fileList.add(file);
}
}
else if(file.isFile()){
MultiPrintStream.out.println("File Found: "+file);
fileList.add(file);
}
}
return fileList;

View file

@ -9,10 +9,8 @@ import java.math.BigInteger;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import sun.misc.BASE64Encoder;
public class Hasher {
/**
* Returns a hash of a file
*
@ -40,16 +38,46 @@ public class Hasher {
throw new RuntimeException("Unable to process file for MD5", e);
}
is.close();
MultiPrintStream.out.println("File Hash: "+output);
return output;
}
/**
 * Hashes the given object with the "MD5" algorithm by delegating to
 * {@code hash(object, "MD5")}.
 *
 * @param object the object to hash
 * @return the String returned by {@code hash}, or null if a
 *         NoSuchAlgorithmException was thrown (its stack trace is printed)
 */
public static String MD5(Serializable object){
    String digest = null;
    try {
        digest = hash(object, "MD5");
    } catch (NoSuchAlgorithmException e) {
        e.printStackTrace();
    }
    return digest;
}
/**
 * Hashes the given object with the "SHA-1" algorithm by delegating to
 * {@code hash(object, "SHA-1")}.
 *
 * @param object the object to hash
 * @return the String returned by {@code hash}, or null if a
 *         NoSuchAlgorithmException was thrown (its stack trace is printed)
 */
public static String SHA1(Serializable object){
    String digest = null;
    try {
        digest = hash(object, "SHA-1");
    } catch (NoSuchAlgorithmException e) {
        e.printStackTrace();
    }
    return digest;
}
/**
* Returns the hash of the given object
*
* @param object The object to hash
* @param hashType The hash method
* @param hashType The hash method (MD2, MD5, SHA-1, SHA-256, SHA-384, SHA-512 )
* @return String containing the hash
* @throws NoSuchAlgorithmException
*/
@ -59,6 +87,61 @@ public class Hasher {
md.update(Converter.toBytes(object));
byte raw[] = md.digest();
return (new BASE64Encoder()).encode(raw);
return Converter.toHexString(raw);//(new BASE64Encoder()).encode(raw);
}
}
/**
 * MurmurHash2 (32 bit) ported from the cpp source. The object is first
 * turned into a byte array with Converter.toBytes and that array is hashed.
 *
 * @param object The Key
 * @param seed Seed
 * @return A MurmurHash of the key
 */
public static int MurmurHash(Serializable object, int seed){
    byte[] data = Converter.toBytes(object);
    int length = data.length;

    // Mixing constants of the algorithm
    final int m = 0x5bd1e995;
    final int r = 24;

    // Initialize the hash to a 'random' value
    int h = seed ^ length;

    // Mix four bytes at a time into the hash. The bound is inclusive so that
    // the last complete word is also mixed in when length is a multiple of 4.
    int i = 0;
    for(; i+4 <= length ;i+=4){
        // assemble the next 4 bytes as a little-endian int, bytes treated as unsigned
        int k = (data[i] & 0xff)
                | ((data[i+1] & 0xff) << 8)
                | ((data[i+2] & 0xff) << 16)
                | ((data[i+3] & 0xff) << 24);

        k *= m;
        k ^= k >>> r;
        k *= m;

        h *= m;
        h ^= k;
    }

    // Handle the last few bytes of the input. i now points at the first
    // unprocessed byte; the cases fall through on purpose. Bytes are masked
    // because the cpp source works on unsigned chars.
    switch(length - i){
        case 3: h ^= (data[i+2] & 0xff) << 16;
        case 2: h ^= (data[i+1] & 0xff) << 8;
        case 1: h ^= (data[i] & 0xff);
                h *= m;
    }

    // Final mixing so that the last few bytes are well incorporated
    h ^= h >>> 13;
    h *= m;
    h ^= h >>> 15;

    return h;
}
}

View file

@ -1,4 +1,4 @@
package zutil.network.nio.service;
package zutil.network.nio.service.chat;
/**
* This is a listener class for new chat messages

View file

@ -1,4 +1,4 @@
package zutil.network.nio.service;
package zutil.network.nio.service.chat;
import java.nio.channels.SocketChannel;
import java.util.HashMap;
@ -8,6 +8,7 @@ import zutil.MultiPrintStream;
import zutil.network.nio.NioNetwork;
import zutil.network.nio.message.ChatMessage;
import zutil.network.nio.message.Message;
import zutil.network.nio.service.NetworkService;
public class ChatService extends NetworkService{
private HashMap<String,LinkedList<SocketChannel>> rooms;

View file

@ -11,8 +11,8 @@ import zutil.network.nio.message.SyncMessage;
import zutil.network.nio.message.type.EchoMessage;
import zutil.network.nio.message.type.ResponseRequestMessage;
import zutil.network.nio.response.ResponseEvent;
import zutil.network.nio.service.ChatService;
import zutil.network.nio.service.NetworkService;
import zutil.network.nio.service.chat.ChatService;
import zutil.network.nio.service.sync.SyncService;

View file

@ -0,0 +1,168 @@
package zutil.struct;
import java.io.Serializable;
import java.util.BitSet;
import java.util.Collection;
import java.util.Iterator;
import java.util.Set;
import zutil.Hasher;
/**
 * An implementation of a bloom filter: a set-like structure that only
 * remembers hashed bit positions of the added elements. A lookup can give
 * false positives but never false negatives. Elements cannot be listed or
 * removed, so those Set operations throw UnsupportedOperationException.
 *
 * @author Ziver
 *
 */
public class BloomFilter<T extends Serializable> implements Set<T>, Serializable{
    private static final long serialVersionUID = 1L;

    /** The bit table of the filter */
    private BitSet bits;
    /** Number of add() calls since creation or the last clear() */
    private int content_size;
    /** The expected amount of data given to the constructor */
    private int optimal_size;
    /** Number of hash rounds (bits set/checked) per element */
    private int k;

    /**
     * Creates a bloom filter
     *
     * @param size The amount of bits in the filter
     * @param expected_data_count The estimated amount of data to
     *          be inserted(a bigger number is better than a smaller)
     * @throws IllegalArgumentException if any of the arguments is not positive
     */
    public BloomFilter(int size, int expected_data_count){
        if(size <= 0)
            throw new IllegalArgumentException("size must be positive: "+size);
        if(expected_data_count <= 0)
            throw new IllegalArgumentException("expected_data_count must be positive: "+expected_data_count);

        bits = new BitSet(size);
        // k = (m/n)*ln(2), calculated with floating point division. At least one
        // hash round is needed, with zero rounds contains() would always be true.
        k = Math.max(1, (int)(((double)size / expected_data_count) * Math.log(2)));
        content_size = 0;
        optimal_size = expected_data_count;
    }

    /**
     * Adds an object to the filter. Note that the return value is
     * NOT the "set was changed" flag of the Set interface.
     *
     * @param e A Serializable object
     * @return If the optimal size has been reached
     */
    public boolean add(T e) {
        content_size++;
        int hash = 0;
        for(int i=0; i<k ;i++){
            // the bit index of the previous round is the seed of the next round
            hash = toIndex(Hasher.MurmurHash(e, hash));
            bits.set(hash, true);
        }
        return isFull();
    }

    /**
     * Adds a collection to the bloom filter
     *
     * @param c The collection with the objects to add
     * @return If the optimal size has been reached
     */
    public boolean addAll(Collection<? extends T> c) {
        for(T t : c){
            add(t);
        }
        return isFull();
    }

    /**
     * Clears the filter
     */
    public void clear() {
        content_size = 0;
        bits.clear();
    }

    /**
     * @param o The Serializable object to search for
     * @return true if the object might be contained in the filter
     *          (false positives are possible), false if it is
     *          definitely not contained or if the Object is not Serializable
     */
    public boolean contains(Object o) {
        if(!(o instanceof Serializable))return false;
        int hash = 0;
        for(int i=0; i<k ;i++){
            hash = toIndex(Hasher.MurmurHash((Serializable)o, hash));
            if(!bits.get(hash))
                return false;
        }
        return true;
    }

    /**
     * Checks if the whole collection is contained in the filter
     *
     * @param c The collection
     * @return true if contains() is true for every object in the collection
     */
    public boolean containsAll(Collection<?> c) {
        for(Object o : c){
            if(!contains(o)) return false;
        }
        return true;
    }

    /**
     * @return If the bloom filter is empty
     */
    public boolean isEmpty() {
        return content_size == 0;
    }

    /**
     * @return If more data than the expected amount has been added
     */
    public boolean isFull() {
        return content_size > optimal_size;
    }

    /**
     * @return The number of data added
     */
    public int size() {
        return content_size;
    }

    /**
     * @return The false positive probability of the current state of
     *          the filter, 0 if nothing has been added
     */
    public double falsePosetiveProbability(){
        // an empty filter can not answer true, this also avoids a division by zero
        if(content_size == 0)
            return 0.0;
        return Math.pow(0.6185, (double)bits.size() / content_size);
    }

    /**
     * Set the hash count. Should be set before adding elements
     * or the already added elements will be lost
     *
     * @param k The hash count
     */
    public void setHashCount(int k){
        this.k = k;
    }

    /**
     * Maps a hash value to a bit index in the filter.
     *
     * @param hash The hash value
     * @return An index between 0 (inclusive) and bits.size() (exclusive)
     */
    private int toIndex(int hash){
        // Math.abs(Integer.MIN_VALUE) is still negative and would give a negative index
        int positive = (hash == Integer.MIN_VALUE ? 0 : Math.abs(hash));
        return positive % bits.size();
    }

    //*********************************************************************
    //   Set operations that a bloom filter can not support
    //*********************************************************************

    public Object[] toArray() {
        throw new UnsupportedOperationException();
    }

    public <E> E[] toArray(E[] a) {
        throw new UnsupportedOperationException();
    }

    public Iterator<T> iterator() {
        throw new UnsupportedOperationException();
    }

    public boolean remove(Object o) {
        throw new UnsupportedOperationException();
    }

    public boolean removeAll(Collection<?> c) {
        throw new UnsupportedOperationException();
    }

    public boolean retainAll(Collection<?> c) {
        throw new UnsupportedOperationException();
    }
}

View file

@ -0,0 +1,64 @@
package zutil.test;
import java.text.DecimalFormat;
import java.util.HashSet;
import java.util.Random;
import zutil.struct.BloomFilter;
import junit.framework.TestCase;
/**
 * This code may be used, modified, and redistributed provided that the
 * author tag below remains intact.
 *
 * @author Ian Clarke <ian@uprizer.com>
 */
public class BloomFilterTest extends TestCase {

    /**
     * Fills a 400000 bit filter with 60000 up to 100000 random numbers and
     * checks that no added number is reported as missing and that the
     * measured false positive rate is within 10% of the predicted rate.
     */
    public void testBloomFilter() {
        final int bitCount = 400000;
        DecimalFormat rateFormat = new DecimalFormat("0.00000");
        // fixed seed, every run draws the same numbers
        Random random = new Random(124445L);

        System.out.println("Testing " + bitCount + " bit SimpleBloomFilter");
        for (int insertions = 60000; insertions <= 100000; insertions += 10000) {
            BloomFilter<Integer> filter = new BloomFilter<Integer>(bitCount, insertions);

            HashSet<Integer> inserted = new HashSet<Integer>();
            for (int n = 0; n < insertions; n++) {
                inserted.add(random.nextInt());
            }
            filter.addAll(inserted);

            assertTrue("Assert that there are no false negatives",
                    filter.containsAll(inserted));

            int hits = 0;
            for (int n = 0; n < insertions; n++) {
                int candidate = random.nextInt();
                // Only numbers that were never added can be false positives
                if (!inserted.contains(candidate) && filter.contains(candidate)) {
                    hits++;
                }
            }

            double predictedRate = filter.falsePosetiveProbability();
            double measuredRate = (double) hits / (double) insertions;
            System.out.println("Got " + hits
                    + " false positives out of " + insertions + " added items, rate = "
                    + rateFormat.format(measuredRate) + ", expected = "
                    + rateFormat.format(predictedRate));

            double quotient = predictedRate / measuredRate;
            assertTrue(
                    "Assert that the actual false positive rate doesn't deviate by more than 10% from what was predicted",
                    quotient > 0.9 && quotient < 1.1);
        }
    }
}