#!/usr/bin/env python
"""Fragment a small set of saved web pages and learn the results.

For each topic in ``file_table`` the page text is split on the separator
characters in ``fragments``; the resulting pieces (and their lengths) are
learned as superpositions in a context, which is finally saved to a .sw file.
"""
import sys

from the_semantic_db_code import *
from the_semantic_db_functions import *
from the_semantic_db_processor import *

C = context_list("fragment documents")


def fragment_string(s, fragments):
    """Successively split *s* on every separator in *fragments*.

    Each separator is applied to all pieces produced by the previous one,
    so the result is the list of maximal runs of text containing none of
    the separators. Empty pieces are kept (callers decide what to do with
    them).
    """
    # Note: previous version shadowed the builtin `list` and its own
    # parameter `s` inside the loop; the flat comprehension avoids both.
    pieces = [s]
    for frag in fragments:
        pieces = [part for chunk in pieces for part in chunk.split(frag)]
    return pieces


# topic -> path of the saved HTML page to fragment
file_table = {
    "eztv-1": "web-pages/eztv-1.html",
    "eztv-2": "web-pages/eztv-2.html",
    "diary-1": "web-pages/k5-diary-1.html",
    "diary-2": "web-pages/k5-diary-2.html",
    "wc-comments-1": "web-pages/wc-comments-1.html",
}

# separators used to fragment the page text
fragments = ["<", "|", ">"]


def load_fragments(filename, fragments):
    """Return a superposition of the non-empty, stripped fragments of *filename*.

    Repeated fragments accumulate their coefficients via superposition +=.
    """
    result = superposition()
    with open(filename, 'r') as f:
        text = f.read()
    for sequence in fragment_string(text, fragments):
        sequence = sequence.strip()
        if sequence:  # skip empty fragments
            result += ket(sequence)
    return result


def load_fragment_lengths(filename, fragments):
    """Return a superposition of the stripped-fragment lengths (as ket strings).

    Deliberately includes zero-length fragments, so "0" records how many
    empty fragments the page produced.
    """
    result = superposition()
    with open(filename, 'r') as f:
        text = f.read()
    for sequence in fragment_string(text, fragments):
        result += ket(str(len(sequence.strip())))
    return result


for topic, path in file_table.items():
    print("topic: " + topic)
    print("file: " + path)
    # coeff_sort() orders each superposition by descending coefficient
    # before it is learned against the topic ket.
    C.learn("fragments", topic, load_fragments(path, fragments).coeff_sort())
    C.learn("fragment-lengths", topic, load_fragment_lengths(path, fragments).coeff_sort())

name = "web-pages/fragment-documents-2.sw"
save_sw(C, name)