#!/usr/bin/env python
import sys
import hashlib

from the_semantic_db_code import *
from the_semantic_db_functions import *
from the_semantic_db_processor import *

C = context_list("fragment documents")

# split the string s on each fragment in turn, returning the list of pieces:
def fragment_string(s, fragments):
    r = [s]
    for frag in fragments:
        pieces = r
        r = []
        for piece in pieces:
            r += piece.split(frag)
    return r

file_table = {
    "eztv-1": "web-pages/eztv-1.html",
    "eztv-2": "web-pages/eztv-2.html",
    "diary-1": "web-pages/k5-diary-1.html",
    "diary-2": "web-pages/k5-diary-2.html",
    "wc-comments-1": "web-pages/wc-comments-1.html",
    "wc-comments-2": "web-pages/wc-comments-2.html",
    "slashdot-1": "web-pages/slashdot-1.html",
    "slashdot-2": "web-pages/slashdot-2.html",
    "slashdot-3": "web-pages/slashdot-3.html",
    "semantic-1": "web-pages/semantic-db-1.html",
    "semantic-2": "web-pages/semantic-db-2.html",
}

#fragments = ["<", "|", ">"]
# we don't need | in the fragments list if we don't include fragment text inside ket labels.
fragments = ["<", ">"]

def load_fragments(filename, fragments):
    result = superposition()
    with open(filename, 'r') as f:
        text = f.read()
        for sequence in fragment_string(text, fragments):
            sequence = sequence.strip()
            if len(sequence) > 0:
                result += ket(sequence)
    return result

# we accumulate counts in a dict first, and only then convert to a superposition, for speed:
# 86 min for 5 web-pages building the superposition directly, vs 2 seconds for the same
# pages using this dict-to-sp version!
def dict_to_sp(d):
    result = superposition()
    for x in d:
        result.data.append(ket(x, d[x]))
    return result

def dict_load_fragments(filename, fragments):
    counts = {}
    with open(filename, 'r') as f:
        text = f.read()
        for sequence in fragment_string(text, fragments):
            sequence = sequence.strip()
            if len(sequence) > 0:
                if sequence not in counts:
                    counts[sequence] = 1
                else:
                    counts[sequence] += 1
    return dict_to_sp(counts)

def load_fragment_lengths(filename, fragments):
    result = superposition()
    with open(filename, 'r') as f:
        text = f.read()
        for sequence in fragment_string(text, fragments):
            sequence = sequence.strip()
            result += ket(str(len(sequence)))
#            if len(sequence) > 0:
#                result += ket(str(len(sequence)))
    return result

def dict_load_fragment_lengths(filename, fragments):
    counts = {}
    with open(filename, 'r') as f:
        text = f.read()
        for sequence in fragment_string(text, fragments):
            length = str(len(sequence.strip()))
            if length not in counts:
                counts[length] = 1
            else:
                counts[length] += 1
    return dict_to_sp(counts)

# in testing so far, this works great! Much more discriminating power
# (by roughly 10 points) than fragment-lengths.
# the last 2 hex digits of the SHA-1 give 256 hash buckets:
def dict_load_fragment_hash(filename, fragments):
    counts = {}
    with open(filename, 'r') as f:
        text = f.read()
        for sequence in fragment_string(text, fragments):
            h = hashlib.sha1(sequence.strip().encode('utf-8')).hexdigest()[-2:]
            if h not in counts:
                counts[h] = 1
            else:
                counts[h] += 1
    return dict_to_sp(counts)

# same idea, but return a plain 256-bucket list instead of a superposition:
def list_load_fragment_hash(filename, fragments):
    array = [0] * 256
    with open(filename, 'r') as f:
        text = f.read()
        for sequence in fragment_string(text, fragments):
            h = hashlib.sha1(sequence.strip().encode('utf-8')).hexdigest()[-2:]
            x = int(h, 16)
            array[x] += 1
    return array

# let's try to plot these!
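# First, a quick text-mode fallback in case matplotlib isn't handy. This is
# just a sketch, not part of the original pipeline (text_plot is a new helper):
# it prints one row per non-empty hash bucket, with a bar of '#' scaled to at
# most `width` characters.
def text_plot(array, width=60):
    top = max(array)
    if top == 0:
        return
    for i, count in enumerate(array):
        if count > 0:
            print("%3d %s" % (i, "#" * max(1, count * width // top)))

#text_plot(list_load_fragment_hash("web-pages/slashdot-1.html", fragments))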
#import matplotlib.pyplot as plt

for topic in file_table:
    file = file_table[topic]
    print("topic: " + topic)
    print("file: " + file)
    x = topic
#    C.learn("fragments", x, load_fragments(file, fragments).coeff_sort())
#    C.learn("fragment-lengths", x, load_fragment_lengths(file, fragments).coeff_sort())
#    C.learn("fragments", x, dict_load_fragments(file, fragments).coeff_sort())
    C.learn("fragment-lengths", x, dict_load_fragment_lengths(file, fragments).coeff_sort())
    C.learn("fragment-hash", x, dict_load_fragment_hash(file, fragments).coeff_sort())
#    print(topic, list_load_fragment_hash(file, fragments))
#    x = list_load_fragment_hash(file, fragments)
#    plt.bar(range(0, 256), x)
#    plt.show()
#    break

#print(C.dump_universe())

# insert these rules into the context:
# simm |*> #=> 100 similar[fragment-lengths] |_self>
# hs |*> #=> 100 similar[fragment-hash] |_self>
C.learn("simm", "*", stored_rule("100 similar[fragment-lengths] |_self>"))
C.learn("hs", "*", stored_rule("100 similar[fragment-hash] |_self>"))

name = "web-pages/fragment-documents-4.sw"
save_sw(C, name)

sys.exit(0)

# everything below here is dead code, kept for reference.

# print out a slashdot page:
file = "web-pages/slashdot-1.html"
#result = dict_load_fragment_lengths(file, fragments).coeff_sort().long_display()
#print(result)

# fragment-length counts as a plain list, indexed by length, truncated to the first 100:
def list_load_fragment_lengths(filename, fragments):
    counts = {}
    with open(filename, 'r') as f:
        text = f.read()
        for sequence in fragment_string(text, fragments):
            length = str(len(sequence.strip()))
            if length not in counts:
                counts[length] = 1
            else:
                counts[length] += 1
    biggest = 0
    for x in counts:
        biggest = max(biggest, int(x))
    print("biggest:", biggest)
    array = [0] * (biggest + 1)
    for x in counts:
        array[int(x)] = counts[x]
    return array[:100]

file = "web-pages/slashdot-2.html"
#file = "web-pages/k5-diary-2.html"
#file = "web-pages/eztv-1.html"
file = "web-pages/wc-comments-1.html"
result = list_load_fragment_lengths(file, fragments)
#print(result)
for x in result:
    print(x)
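# A plain-Python sketch of the similarity idea behind the simm/hs rules above.
# This is an assumption, not the library's implementation: similar[] is the
# real operator; the rescaled-overlap form below is just one common way to
# compare two non-negative histograms, scaled to 100 to match the rules.
def hash_simm(f, g):
    sf = float(sum(f))
    sg = float(sum(g))
    if sf == 0 or sg == 0:
        return 0
    # rescale each histogram to sum to 1, then sum the bucket-wise overlap:
    return 100 * sum(min(a / sf, b / sg) for a, b in zip(f, g))

#f = list_load_fragment_hash("web-pages/slashdot-1.html", fragments)
#g = list_load_fragment_hash("web-pages/slashdot-2.html", fragments)
#print("slashdot-1 vs slashdot-2:", hash_simm(f, g))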