#!/usr/bin/env python
import sys
import hashlib

from the_semantic_db_code import *
from the_semantic_db_functions import *
from the_semantic_db_processor import *

C = context_list("fragment documents")

# split the string s on each fragment in turn, returning the list of pieces:
def fragment_string(s, fragments):
    r = [s]
    for frag in fragments:
        pieces = r
        r = []
        for piece in pieces:
            r += piece.split(frag)
    return r

file_table = {
    "eztv-1": "web-pages/eztv-1.html",
    "eztv-2": "web-pages/eztv-2.html",
    "diary-1": "web-pages/k5-diary-1.html",
    "diary-2": "web-pages/k5-diary-2.html",
    "wc-comments-1": "web-pages/wc-comments-1.html",
    "wc-comments-2": "web-pages/wc-comments-2.html",
    "slashdot-1": "web-pages/slashdot-1.html",
    "slashdot-2": "web-pages/slashdot-2.html",
    "slashdot-3": "web-pages/slashdot-3.html",
    "semantic-1": "web-pages/semantic-db-1.html",
    "semantic-2": "web-pages/semantic-db-2.html",
}

#fragments = ["<", "|", ">"]
# we don't need | in the fragments list if we don't include fragment text inside ket labels.
fragments = ["<", ">"]

def load_fragments(filename, fragments):
    result = superposition()
    with open(filename, 'r') as f:
        text = f.read()
        for sequence in fragment_string(text, fragments):
            sequence = sequence.strip()
            if len(sequence) > 0:
                result += ket(sequence)
    return result

# we accumulate counts in a dict first, and only then convert to a superposition, for speed:
# 86 min for 5 web-pages building the superposition directly, vs 2 seconds for the same
# pages using this dict-to-sp version!
def dict_to_sp(d):
    result = superposition()
    for x in d:
        result.data.append(ket(x, d[x]))
    return result

def dict_load_fragments(filename, fragments):
    counts = {}
    with open(filename, 'r') as f:
        text = f.read()
        for sequence in fragment_string(text, fragments):
            sequence = sequence.strip()
            if len(sequence) > 0:
                if sequence not in counts:
                    counts[sequence] = 1
                else:
                    counts[sequence] += 1
    return dict_to_sp(counts)

def load_fragment_lengths(filename, fragments):
    result = superposition()
    with open(filename, 'r') as f:
        text = f.read()
        for sequence in fragment_string(text, fragments):
            sequence = sequence.strip()
            result += ket(str(len(sequence)))
#            if len(sequence) > 0:
#                result += ket(str(len(sequence)))
    return result

def dict_load_fragment_lengths(filename, fragments):
    counts = {}
    with open(filename, 'r') as f:
        text = f.read()
        for sequence in fragment_string(text, fragments):
            length = str(len(sequence.strip()))
            if length not in counts:
                counts[length] = 1
            else:
                counts[length] += 1
    return dict_to_sp(counts)

# in testing so far, this works great! Much more discriminating power
# (by roughly 10 points) than fragment-lengths.
# the last 2 hex digits of the SHA-1 give 256 hash buckets:
def dict_load_fragment_hash(filename, fragments):
    counts = {}
    with open(filename, 'r') as f:
        text = f.read()
        for sequence in fragment_string(text, fragments):
            h = hashlib.sha1(sequence.strip().encode('utf-8')).hexdigest()[-2:]
            if h not in counts:
                counts[h] = 1
            else:
                counts[h] += 1
    return dict_to_sp(counts)

# same idea, but return a plain 256-bucket list instead of a superposition:
def list_load_fragment_hash(filename, fragments):
    array = [0] * 256
    with open(filename, 'r') as f:
        text = f.read()
        for sequence in fragment_string(text, fragments):
            h = hashlib.sha1(sequence.strip().encode('utf-8')).hexdigest()[-2:]
            x = int(h, 16)
            array[x] += 1
    return array

# let's try to plot these!
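# First, a quick text-mode fallback in case matplotlib isn't handy. This is
# just a sketch, not part of the original pipeline (text_plot is a new helper):
# it prints one row per non-empty hash bucket, with a bar of '#' scaled to at
# most `width` characters.
def text_plot(array, width=60):
    top = max(array)
    if top == 0:
        return
    for i, count in enumerate(array):
        if count > 0:
            print("%3d %s" % (i, "#" * max(1, count * width // top)))

#text_plot(list_load_fragment_hash("web-pages/slashdot-1.html", fragments))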
#import matplotlib.pyplot as plt

for topic in file_table:
    file = file_table[topic]
    print("topic: " + topic)
    print("file: " + file)
    x = topic
#    C.learn("fragments", x, load_fragments(file, fragments).coeff_sort())
#    C.learn("fragment-lengths", x, load_fragment_lengths(file, fragments).coeff_sort())
#    C.learn("fragments", x, dict_load_fragments(file, fragments).coeff_sort())
    C.learn("fragment-lengths", x, dict_load_fragment_lengths(file, fragments).coeff_sort())
    C.learn("fragment-hash", x, dict_load_fragment_hash(file, fragments).coeff_sort())
#    print(topic, list_load_fragment_hash(file, fragments))
#    x = list_load_fragment_hash(file, fragments)
#    plt.bar(range(0, 256), x)
#    plt.show()
#    break

#print(C.dump_universe())

# insert these rules into the context:
# simm |*> #=> 100 similar[fragment-lengths] |_self>
# hs |*> #=> 100 similar[fragment-hash] |_self>
C.learn("simm", "*", stored_rule("100 similar[fragment-lengths] |_self>"))
C.learn("hs", "*", stored_rule("100 similar[fragment-hash] |_self>"))

name = "web-pages/fragment-documents-4.sw"
save_sw(C, name)

sys.exit(0)

# everything below here is dead code, kept for reference.

# print out a slashdot page:
file = "web-pages/slashdot-1.html"
#result = dict_load_fragment_lengths(file, fragments).coeff_sort().long_display()
#print(result)

# fragment-length counts as a plain list, indexed by length, truncated to the first 100:
def list_load_fragment_lengths(filename, fragments):
    counts = {}
    with open(filename, 'r') as f:
        text = f.read()
        for sequence in fragment_string(text, fragments):
            length = str(len(sequence.strip()))
            if length not in counts:
                counts[length] = 1
            else:
                counts[length] += 1
    biggest = 0
    for x in counts:
        biggest = max(biggest, int(x))
    print("biggest:", biggest)
    array = [0] * (biggest + 1)
    for x in counts:
        array[int(x)] = counts[x]
    return array[:100]

file = "web-pages/slashdot-2.html"
#file = "web-pages/k5-diary-2.html"
#file = "web-pages/eztv-1.html"
file = "web-pages/wc-comments-1.html"
result = list_load_fragment_lengths(file, fragments)
#print(result)
for x in result:
    print(x)
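# A plain-Python sketch of the similarity idea behind the simm/hs rules above.
# This is an assumption, not the library's implementation: similar[] is the
# real operator; the rescaled-overlap form below is just one common way to
# compare two non-negative histograms, scaled to 100 to match the rules.
def hash_simm(f, g):
    sf = float(sum(f))
    sg = float(sum(g))
    if sf == 0 or sg == 0:
        return 0
    # rescale each histogram to sum to 1, then sum the bucket-wise overlap:
    return 100 * sum(min(a / sf, b / sg) for a, b in zip(f, g))

#f = list_load_fragment_hash("web-pages/slashdot-1.html", fragments)
#g = list_load_fragment_hash("web-pages/slashdot-2.html", fragments)
#print("slashdot-1 vs slashdot-2:", hash_simm(f, g))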