#!/opt/python3/bin/python3

import sys

from the_semantic_db_code import *
from the_semantic_db_functions import *
from the_semantic_db_processor import *

# Shared context used for all in-memory operator lookups (apply_op).
C = context_list("find all average movie rating")

#one = sys.argv[1]


def file_recall(filename, op, label):
    """Look up the rule "op |label> => ..." directly in a .sw file.

    Scans `filename` line by line for the first rule whose left-hand side
    matches the operator `op` applied to `label`, and returns that rule's
    literal superposition, scaled by the ket's coefficient.

    filename -- path to a .sw rules file
    op       -- operator name, e.g. "imdb-rating-self"
    label    -- either a ket (coefficient and label are taken from it)
                or a plain string label (coefficient defaults to 1)

    Returns the matching superposition multiplied by the coefficient, or
    the empty ket ket("", 0) if no rule matches.
    """
    if isinstance(label, ket):
        coeff = label.value
        ket_label = label.label
    else:
        coeff = 1
        ket_label = label

    # Left-hand side of the rule we are searching for, e.g.
    #   imdb-rating-self |movie: Alien> =>
    pattern = op + " |" + ket_label + "> => "
    n = len(pattern)
    with open(filename, 'r') as f:
        for line in f:
            if line.startswith(pattern):
                # Only the first matching rule is used; the rest of the
                # file is not scanned.
                return extract_literal_superposition(line[n:])[0].multiply(coeff)
    return ket("", 0)


#imdb_sw = "sw-examples/imdb.sw"
# imdb-sans-actors should be half the size, and so twice the speed:
imdb_sw = "sw-examples/imdb-sans-actors.sw"

# this file only has movies with 10k or more votes:
#ratings_sw = "sw-examples/imdb-ratings.sw"
# this is the full set:
#votes_sw = "sw-examples/complete-imdb-ratings.sw"
# full set, but only imdb-votes-self entries, again for speed:
votes_sw = "sw-examples/imdb-votes-self-only.sw"
# full set, but only imdb-rating-self entries, roughly 4x faster:
ratings_sw = "sw-examples/imdb-ratings-self-only.sw"

# Pre-load votes and ratings into the in-memory context: much faster than
# re-scanning the .sw files per actor via file_recall above.
load_sw(C, votes_sw)
load_sw(C, ratings_sw)

actors_file = "imdb-actors-list.txt"
# NOTE(review): the small test file below overrides the full list above —
# presumably left in from testing; remove this line for a full run.
actors_file = "short-actors.txt"

ave_file = "all-actors-average.txt"
wave_file = "all-actors-weighted-average.txt"

# Output files are closed at the end of the main loop below.
ave_f = open(ave_file, 'w')
wave_f = open(wave_file, 'w')

# Iterating a list of actors and re-scanning the full imdb file for each one
# would be O(n^2).  Instead the main loop below walks the imdb.sw file once,
# which is much faster.
imdb_data = "sw-examples/imdb-sans-movies.sw"
#imdb_data = "sw-examples/trial.sw"

with open(imdb_data, 'r') as f:
    for line in f:
        try:
            # Each rule line has the shape:
            #   movies |actor: NAME> => |movie: A> + |movie: B> + ...
            head, tail = line.rstrip().split(" => ", 1)
            # Strip the fixed 'movies |actor: ' prefix (15 chars) and the
            # trailing '>' to recover the bare actor name.
            actor = head[15:-1]

            # Parse the right-hand side into a superposition of movie kets.
            raw_movies = superposition()
            for movie_label in tail[1:-1].split("> + |"):
                raw_movies += ket(movie_label)
            # Normalise all coefficients to 1 before applying operators.
            raw_movies = raw_movies.apply_sigmoid(clean)

            # In-memory lookups against the pre-loaded context C (see the
            # load_sw calls above); much faster than per-movie file_recall.
            movie_ratings = raw_movies.apply_op(C, "imdb-rating-self")
            movie_votes = raw_movies.apply_op(C, "imdb-votes-self")

            # Plain average movie rating for this actor (0 if no ratings).
            count = movie_ratings.count()
            count_sum = movie_ratings.count_sum()
            average = count_sum / count if count > 0 else 0

            # Vote-weighted average: weight each rating by its vote count.
            weighted_votes = multiply(movie_votes, movie_ratings)
            weighted_count = movie_votes.count_sum()
            weighted_count_sum = weighted_votes.count_sum()
            weighted_average = weighted_count_sum / weighted_count if weighted_count > 0 else 0

            # One tab-separated "score<TAB>actor" line per actor.
            ave_f.write("%.2f\t%s\n" % (average, actor))
            wave_f.write("%.2f\t%s\n" % (weighted_average, actor))
        except Exception:
            # Malformed lines (no " => " separator, empty superposition,
            # etc.) are skipped rather than aborting the whole run.  Was a
            # bare `except:`, which also swallowed KeyboardInterrupt.
            continue

ave_f.close()
wave_f.close()

sys.exit(0)

# --- unreachable single-actor debug output below ---
# NOTE(review): this block runs only if the sys.exit(0) above is removed,
# and it requires `one` (the commented-out sys.argv[1] near the top) plus
# the loop variables from the final iteration.
print("============================")
print("actor:", one)
print("number of movies:", len(movie_ratings))
print("ratings:")
print(movie_ratings.long_display())
print("votes:")
print(movie_votes.long_display())
print("average movie rating:", "%.2f" % average)
print("weighted average move rating:", "%.2f" % weighted_average)