#!/usr/bin/env python

import sys
from the_semantic_db_code import *
from the_semantic_db_functions import *
from the_semantic_db_processor import *

C = context_list("fragment documents big hash")

def fragment_string(s, fragments):
    r = [s]
    for frag in fragments:
        list = r
        r = []
        for s in list:
            r += s.split(frag)
    # let's strip out the empty strings:
    return [s.strip() for s in r if len(s.strip()) > 0]

file_table = {
    "bigger-eztv-1"        : "web-pages/eztv-1.html",
    "bigger-eztv-2"        : "web-pages/eztv-2.html",
    "bigger-diary-1"       : "web-pages/k5-diary-1.html",
    "bigger-diary-2"       : "web-pages/k5-diary-2.html",
    "bigger-wc-comments-1" : "web-pages/wc-comments-1.html",
    "bigger-wc-comments-2" : "web-pages/wc-comments-2.html",
    "bigger-slashdot-1"    : "web-pages/slashdot-1.html",
    "bigger-slashdot-2"    : "web-pages/slashdot-2.html",
    "bigger-slashdot-3"    : "web-pages/slashdot-3.html",
    "bigger-semantic-1"    : "web-pages/semantic-db-1.html",
    "bigger-semantic-2"    : "web-pages/semantic-db-2.html",
}

#fragments = ["<","|",">"]
# we don't need | in the fragments list if we don't include fragment text inside ket-labels.
fragments = ["<",">"]

def load_fragments(filename, fragments):
    result = superposition()
    with open(filename, 'r') as f:
        text = f.read()
        for sequence in fragment_string(text, fragments):
            sequence = sequence.strip()
            if len(sequence) > 0:
                result += ket(sequence)
    return result

# we start with a dict, and then convert the result to a superposition, because of speed.
# 86 min for 5 web-pages with the direct superposition version, vs 2 seconds for the same pages using this dict-to-sp version!
def dict_to_sp(dict):
    result = superposition()
    for x in dict:
        result.data.append(ket(x, dict[x]))
    return result

def dict_load_fragments(filename, fragments):
    dict = {}
    with open(filename, 'r') as f:
        text = f.read()
        for sequence in fragment_string(text, fragments):
            if sequence not in dict:
                dict[sequence] = 1
            else:
                dict[sequence] += 1
    return dict_to_sp(dict)

def load_fragment_lengths(filename, fragments):
    result = superposition()
    with open(filename, 'r') as f:
        text = f.read()
        for sequence in fragment_string(text, fragments):
            sequence = sequence.strip()
            result += ket(str(len(sequence)))
#            if len(sequence) > 0:
#                result += ket(str(len(sequence)))
    return result

def dict_load_fragment_lengths(filename, fragments):
    dict = {}
    with open(filename, 'r') as f:
        text = f.read()
        for sequence in fragment_string(text, fragments):
            length = str(len(sequence))
            if length not in dict:
                dict[length] = 1
            else:
                dict[length] += 1
    return dict_to_sp(dict)

import hashlib

# in testing so far, the fragment-hash approach works great!
# It has much more discriminating power (roughly 10 points more) than fragment lengths.
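# A quick sketch of the hashing step the two loaders below rely on: each fragment is
# bucketed by the last 4 hex digits of its SHA-1 digest, giving 16**4 = 65536 possible
# buckets (which is why list_load_fragment_hash allocates an array of that size).
# This helper is purely illustrative and is not called anywhere in this script; the
# loaders below inline the same expression.
def fragment_hash_bucket(sequence):
    return hashlib.sha1(sequence.encode('utf-8')).hexdigest()[-4:]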
def dict_load_fragment_hash(filename, fragments):
    dict = {}
    with open(filename, 'r') as f:
        text = f.read()
        for sequence in fragment_string(text, fragments):
            hash = hashlib.sha1(sequence.encode('utf-8')).hexdigest()[-4:]
            if hash not in dict:
                dict[hash] = 1
            else:
                dict[hash] += 1
    return dict_to_sp(dict)

def list_load_fragment_hash(filename, fragments):
    array = [0] * 65536
    with open(filename, 'r') as f:
        text = f.read()
        for sequence in fragment_string(text, fragments):
            hash = hashlib.sha1(sequence.encode('utf-8')).hexdigest()[-4:]
            x = int(hash, 16)
            array[x] += 1
    return array

for topic in file_table:
    file = file_table[topic]
    print("topic: " + topic)
    print("file: " + file)

    x = topic
    C.learn("fragment-hash-bigger", x, dict_load_fragment_hash(file, fragments).coeff_sort())

    x = list_load_fragment_hash(file, fragments)
    name = "web-pages/" + topic + ".dat"
    file = open(name, 'w')
    file.write("# " + topic + "\n")
    for e in x:
        file.write(str(e) + "\n")
    file.close()

#sys.exit(0)
#print(C.dump_universe())

# insert these rules into context:
# simm |*> #=> 100 similar[fragment-lengths] |_self>
# hs |*> #=> 100 similar[fragment-hash] |_self>
#C.learn("simm","*",stored_rule("100 similar[fragment-lengths] |_self>"))
C.learn("hs", "*", stored_rule("100 similar[fragment-hash-bigger] |_self>"))

name = "sw-examples/fragment-documents-bigger-hash.sw"
#name = "tmp-frag.sw"
save_sw(C, name)

sys.exit(0)

# print out a slashdot page:
file = "web-pages/slashdot-1.html"
#result = dict_load_fragment_lengths(file,fragments).coeff_sort().long_display()
#print(result)

def list_load_fragment_lengths(filename, fragments):
    dict = {}
    with open(filename, 'r') as f:
        text = f.read()
        for sequence in fragment_string(text, fragments):
            length = str(len(sequence.strip()))
            if length not in dict:
                dict[length] = 1
            else:
                dict[length] += 1
    biggest = 0
    for x in dict:
        biggest = max(biggest, int(x))
    print("biggest:", biggest)
    array = [0] * (biggest + 1)
    for x in dict:
        array[int(x)] = dict[x]
    return array[:100]

file = "web-pages/slashdot-2.html"
#file = "web-pages/k5-diary-2.html"
#file = "web-pages/eztv-1.html"
file = "web-pages/wc-comments-1.html"
result = list_load_fragment_lengths(file, fragments)
#print(result)
for x in result:
    print(x)
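# The .dat files written above hold one 65536-bucket count array per web page. Below is
# a rough, illustrative way to compare two such arrays with a simple min/max overlap
# ratio (1.0 = identical counts, 0.0 = no buckets in common). This is only a sketch for
# inspecting the hash-count output by hand; the similar[fragment-hash-bigger] operator
# used above may compute similarity differently.
def bucket_overlap(a, b):
    overlap = sum(min(x, y) for x, y in zip(a, b))
    norm = max(sum(a), sum(b))
    return overlap / norm if norm > 0 else 0

#a = list_load_fragment_hash("web-pages/slashdot-1.html", fragments)
#b = list_load_fragment_hash("web-pages/slashdot-2.html", fragments)
#print("overlap:", bucket_overlap(a, b))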