(** Holds a set of sequences *) open Printf (** Datatype for holding a whole database *) type t = Sequence.t list (** Get the count of sequences in the DB *) let size db = List.length db (** Dump a database to stdout, for testing *) let rec dump_db db = match db with | [] -> () | seq::remaining_sequences -> Sequence.dumpSequence seq; dump_db remaining_sequences (** Loads a file in ascii format * * Format: * - One sequence per line * - Each line contains space-separated positive integers * - First integer is the count of itemsets, followed by the itemsets * - Each itemset starts with the count of items, followed by the items * - Each item is an integer *) let load_from_ascii filename = let sequence_channel = open_in filename in let db = ref [] in begin try while true do let sequence = Sequence.read_sequence sequence_channel in db := sequence::!db done; with _ -> () end; db let rec find_item_counts' db item_count_hash = match db with | [] -> item_count_hash | sequence::remaining_sequences -> let item_count_hash = Sequence.find_item_counts sequence item_count_hash in find_item_counts' remaining_sequences item_count_hash (** Using a hashtable, count how many of each item we have. *) let find_item_counts db = let item_count_hash = BatHashtbl.create 100 in let item_count_hash = find_item_counts' db item_count_hash in item_count_hash let merge_item_count_hash_entry inserts_item_count_hash updates_item_count_hash key = try let insert_value = BatHashtbl.find inserts_item_count_hash key in let update_value = BatHashtbl.find updates_item_count_hash key in BatHashtbl.replace inserts_item_count_hash key (insert_value + update_value) with Not_found -> () let merge_item_count_hashes inserts_item_count_hash updates_item_count_hash = BatHashtbl.iter ( fun key value -> merge_item_count_hash_entry inserts_item_count_hash updates_item_count_hash key ) inserts_item_count_hash; inserts_item_count_hash let find_frequent_items db min_sup = let item_count_hash = find_item_counts db in (* Narrow to just the ones with enough support *) let item_count_hash = BatHashtbl.filter (fun v -> v >= min_sup) item_count_hash in let items = Util.hash_keys_to_list item_count_hash in items let find_frequent_items_combined inserts updates min_sup = let inserts_item_count_hash = find_item_counts inserts in let updates_item_count_hash = find_item_counts updates in let item_count_hash = merge_item_count_hashes inserts_item_count_hash updates_item_count_hash in (* Narrow to just the ones with enough support *) let item_count_hash = BatHashtbl.filter (fun v -> v >= min_sup) item_count_hash in let items = Util.hash_keys_to_list item_count_hash in items let rec prune_infrequent_items' db frequent_items = match db with | [] -> [] | sequence::remaining_sequences -> let sequence = Sequence.prune_infrequent_items sequence frequent_items in if Sequence.is_empty sequence then prune_infrequent_items' remaining_sequences frequent_items else sequence::(prune_infrequent_items' remaining_sequences frequent_items) let prune_infrequent_items db min_sup = let frequent_items = find_frequent_items db min_sup in prune_infrequent_items' db frequent_items let rec prune_and_merge_infrequent_items db1 db2 frequent_items = let db = db1 @ db2 in prune_infrequent_items' db frequent_items let rec project' db last_item = match db with | [] -> [] | sequence::remaining_sequences -> let subsequences = Sequence.partial_split_at sequence last_item in printf "split_at sequence [%s] last_item [%s]\n" (Sequence.to_string sequence) (Item.to_string last_item); printf "Subsequences:\n"; dump_db subsequences; if subsequences = [[[]],Sequence.Complete] then project' remaining_sequences last_item else let remaining_projection = project' remaining_sequences last_item in subsequences @ remaining_projection (** Project a db over a sequence, given the desired minimum support *) (* For efficiency, we should actually take a DB and then return a list of projected DBs, one for each frequent item. But I'm not doing that yet. *) let project db min_sup sequence = let last_item = Sequence.last_item sequence in let db = project' db last_item in db let rec project_over_sequence sequence items item_projected_db = match items with | [] -> item_projected_db | item::remaining_items -> let subsequence, is_start_item = Sequence.get_subsequence sequence item in if subsequence = ([], Sequence.Complete) then project_over_sequence sequence remaining_items item_projected_db else let key = (item, is_start_item) in try let support, sequences = BatHashtbl.find item_projected_db key in let support = support + 1 in let sequences = (subsequence::sequences) in BatHashtbl.replace item_projected_db key (support, sequences); project_over_sequence sequence remaining_items item_projected_db with Not_found -> BatHashtbl.add item_projected_db key (1, (sequence::[])); project_over_sequence sequence remaining_items item_projected_db let rec project_over' db items item_projected_db = match db with | [] -> item_projected_db | sequence::remaining_sequences -> (* printf "%d\n" (List.length remaining_sequences); flush stdout; *) let item_projected_db = project_over_sequence sequence items item_projected_db in project_over' remaining_sequences items item_projected_db let project_over db items = (* printf "Projecting over DB:\n"; dump_db db; printf "Items: %s\n" (Util.join " " (List.map string_of_int items)); *) let item_projected_db = BatHashtbl.create 100 in let item_projected_db = project_over' db items item_projected_db in Util.hash_to_pairs_list item_projected_db (* let project db min_sup = match db with | [] -> [] | sequence::remaining_sequences -> *)